# Data Prep 

In [1]:
import numpy as np
import pandas as pd
import re
import time
import ipyparallel as ipp
import seaborn as sns
import matplotlib.pyplot as plt
import functools

In [2]:
from sklearn.preprocessing import LabelEncoder

### Helper Methods

In [3]:
# Time how long an import takes
def timed_df(path):
    """Read a CSV into a DataFrame and report how long the read took.

    Parameters
    ----------
    path
        Handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    began = time.time()
    frame = pd.read_csv(path)
    elapsed = time.time() - began
    print(f'Finished reading dataframe in {elapsed:.2f} seconds')
    return frame

### Data Minimization
Remove redundant columns to make search more efficient

In [111]:
# Import our error-free but inefficient dataframe (the full cleaned export);
# timed_df prints how long the read takes so it can be compared with the
# minimized file written later in this notebook
df = timed_df('data/fixed_data.csv')

Finished reading dataframe in 60.31 seconds


In [122]:
# Purge redundant columns: only 'product' and 'basket_id' are kept,
# every other column is dropped from df
df = df[['product', 'basket_id']]

In [123]:
# Replace basket identifier with a numeric value.
# LabelEncoder assigns each distinct basket_id an integer code; the fitted
# encoder `le` holds the mapping but is not written to disk in this cell.
le = LabelEncoder()
df['basket_id'] = le.fit_transform(df['basket_id'])

In [170]:
# Preview the first rows to check the two remaining columns and the encoded basket ids
df.head()

Unnamed: 0,product,basket_id
0,Special Gasoline,749985
1,Special Diesel,2752347
2,Special Diesel,3482050
3,Lubricant,3508856
4,Diesel Auto Clean,3508856


In [125]:
# Write this data out; index=False keeps the row index out of the CSV
df.to_csv('data/products.csv', index=False)

### Data Import and Organization

#### Notice how the dataframe reads ~13 times faster (60.31 s → 4.66 s)

In [4]:
# Read in minimized data (the two-column file written by the minimization step);
# this rebinds df, replacing whatever frame was loaded before
df = timed_df('data/products.csv')

Finished reading dataframe in 4.66 seconds


##### ~27 million entries 

In [5]:
# (rows, columns) of the minimized frame
df.shape

(26951165, 2)

In [53]:
df = df.iloc[:1000000] # limit to the first 1,000,000 rows (1m)

In [54]:
# Collect the distinct product names and the distinct basket ids
products = df['product'].unique()
baskets = df['basket_id'].unique()

In [55]:
# Each unique basket id counts as one transaction
print(f'There are {len(baskets):,} unique transactions.')

There are 473,641 unique transactions.


# Parallel Vector Extraction 

#### Run the `cluster.sh` script with an integer argument to start that many engines before executing the cells below

In [33]:
# Get engines: connect to the already-running cluster and take a view (`dv`)
# over every engine it exposes; len(dv) is the number of engines in that view
rc = ipp.Client()
dv = rc[:]
print(f'Running {len(dv):,} nodes')

Running 10 nodes


In [56]:
# Push global variables to all engines.
# block=True waits until every engine has received `df` and `products`, so a
# failed transfer raises here instead of being hidden in a discarded
# AsyncResult, and the transfer time is not counted by the timing cells below.
dv.push(dict(df=df, products=products), block=True)

<AsyncResult: _push>

In [57]:
# Engine-side worker: turns one basket id into a 0/1 purchase vector over `products`
@dv.parallel(block=True)
def gen_vector(basket_id):
    """Return a list with one entry per item in `products`: 1 if that product
    appears in the given basket, 0 otherwise.

    `df` and `products` are not parameters; they are looked up as globals
    wherever this function executes.
    """
    # Rows of df that belong to this basket
    in_basket = df.basket_id == basket_id
    # Product names bought in the basket, as a plain array
    purchased = df.loc[in_basket]['product'].values
    # Membership flag for every known product, in `products` order
    return [int(p in purchased) for p in products]

In [86]:
def estimate(num_tests):
    """
    Estimates how long it will take (hours) to generate
    a binary purchase matrix for a given dataset
    by extrapolating the results from a timed test
    on a subsection of the total transactions.

    Parameters
    ----------
    num_tests : int
        How many baskets, taken from the front of the global ``baskets``,
        to run through ``gen_vector`` for the timed test. Must be >= 1.

    Returns
    -------
    float
        Projected hours to process every entry of ``baskets``.

    Raises
    ------
    ValueError
        If ``num_tests`` is below 1 or ``baskets`` is empty, since there is
        then nothing to time and no per-basket cost to extrapolate from.
    """
    if num_tests < 1:
        raise ValueError(f'num_tests must be at least 1, got {num_tests}')

    sample = baskets[:num_tests]
    if len(sample) == 0:
        raise ValueError('baskets is empty; nothing to time')

    start = time.time()
    # Compiles the results from all engines; the array itself is discarded,
    # but building it is part of the work being timed
    np.array(gen_vector.map(sample))
    elapsed = time.time() - start

    # Divide by the number of baskets actually processed: when num_tests
    # exceeds len(baskets) the slice is shorter than num_tests
    return ((elapsed / len(sample)) * len(baskets)) / 3600

In [87]:
# Time a 1,000-basket sample and extrapolate to the full basket list (result is in hours)
e = estimate(1000)

In [88]:
# Report the projected runtime in hours and minutes, with the engine count and row count used
print(
    f'It will take {e:,.3f} hour(s) ({e*60:,.2f} minutes) to compute this '
    f'with {len(dv)} engines for a dataset with {df.shape[0]:,} entries.'
)

It will take 0.093 hour(s) (5.58 minutes) to compute this with 10 engines for a dataset with 1,000,000 entries.


In [None]:
# Build the purchase vectors for every basket and time the whole run
start = time.time()

# One gen_vector result per basket, stacked into a single array
output = np.array(gen_vector.map(baskets))

# The subtraction must be parenthesized: without it only `start` is divided by
# 60 and the printed value is not an elapsed time at all
print(f'Finished in {(time.time()-start)/60:.2f} minutes')