In [136]:
import ipyparallel as ipp
import pandas as pd
import numpy as np
import math
import time

# Start Clusters

Before connecting, start the cluster by running the following command, where `X` is the number of engines you want to launch:
`!ipcluster start --n=X`

In [159]:
# Connect to the ipyparallel cluster and take a view (`rc[:]`) over the
# client's engines; `dv` is what the later cells use to push data and to
# decorate the parallel function.
rc = ipp.Client()
dv = rc[:]
print(f'Running {len(dv)} nodes')

Running 50 nodes


In [6]:
# Load the full dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/fixed_data.csv')

In [160]:
# Work on the first one million rows only, to keep the benchmark manageable.
data = df.iloc[:1_000_000]

In [161]:
# Sanity check: (rows, columns) of the working subset.
data.shape

(1000000, 15)

In [162]:
# Distinct basket ids and distinct product labels found in the working subset.
baskets = data['basket_id'].unique()
products = data['product'].unique()

print(f'There are {len(baskets):,} unique baskets.')
print(f'There are {len(products):,} unique products.')

There are 473,641 unique baskets.
There are 40 unique products.


In [163]:
def in_basket_single(b_id):
    """Build the product-presence indicator row for a single basket.

    Reads the module-level ``data`` frame and ``products`` sequence.

    Args:
        b_id: Value to match against ``data.basket_id``.

    Returns:
        list[int]: One entry per element of ``products``, in the same order;
        1 when that product occurs in the rows of basket ``b_id``, else 0.
        A ``b_id`` that matches no rows yields all zeros.
    """
    # One .loc call selects rows and column together (no chained indexing),
    # and the underlying array is extracted once instead of once per product.
    basket_products = data.loc[data.basket_id == b_id, 'product'].values
    return [1 if p in basket_products else 0 for p in products]

In [164]:
@dv.parallel(block=True)
def in_basket(b_id):
    """Parallel counterpart of ``in_basket_single``, dispatched through ``dv``.

    The names ``data`` and ``products`` are looked up as globals where the
    function body executes, so they must already have been pushed to the
    engines (see the ``dv.push`` cell) before this is called.

    Args:
        b_id: Value to match against ``data.basket_id``.

    Returns:
        list[int]: One entry per element of ``products``, in the same order;
        1 when that product occurs in the rows of basket ``b_id``, else 0.
    """
    # One .loc call selects rows and column together (no chained indexing),
    # and the underlying array is extracted once instead of once per product.
    basket_products = data.loc[data.basket_id == b_id, 'product'].values
    return [1 if p in basket_products else 0 for p in products]

In [165]:
# Ship the globals that `in_basket` reads to the engines.
# `output_log` was dropped from the payload: it is never defined in this
# notebook (NameError on Restart & Run All) and no function reads it.
dv.push(dict(data=data, products=products))

<AsyncResult: _push>

In [172]:
def speed_test(num_tests):
    """Time serial vs. parallel basket encoding on the first ``num_tests`` baskets.

    Runs ``in_basket_single`` in a local loop and ``in_basket.map`` over the
    same baskets, checks that both produce identical results, then prints each
    elapsed time and a linear extrapolation to all of ``baskets``.

    Args:
        num_tests: How many leading entries of ``baskets`` to benchmark; must
            be between 1 and ``len(baskets)`` inclusive.

    Raises:
        ValueError: If ``num_tests`` is outside that range.
        AssertionError: If the serial and parallel results differ.
    """
    # Fail up front instead of dividing by zero (num_tests == 0) or indexing
    # past the end of `baskets` halfway through the benchmark.
    if not 1 <= num_tests <= len(baskets):
        raise ValueError(
            f'num_tests must be between 1 and {len(baskets)}, got {num_tests}'
        )
    sample = baskets[:num_tests]

    # Linear
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    temp_set = [in_basket_single(b_id) for b_id in sample]
    timed_test = time.perf_counter() - start

    print(f'Finished linear in {round(timed_test, 2)}')

    # Parallel
    start = time.perf_counter()
    res_set = in_basket.map(sample)
    end_parallel = time.perf_counter() - start

    print(f'Finished parallel in {round(end_parallel, 2)}')

    assert res_set == temp_set, 'parallel and linear results differ'

    def per_test(elapsed):
        # Scale the per-basket time to every basket and convert seconds to hours.
        return ((elapsed / num_tests) * len(baskets)) / 3600

    lin_est, par_est = per_test(timed_test), per_test(end_parallel)
    print(f'\nEstimated Completion Time (All {len(baskets)} Baskets):\nLinear: {round(lin_est, 1)} hours\nParallel ({len(dv)} nodes): {round(par_est, 1)} hours')

In [173]:
speed_test(1000)  # benchmark on the first 1,000 baskets; this run used 50 engines

Finished linear in 48.93
Finished parallel in 13.62

Estimated Completion Time (All 473641 Baskets):
Linear: 6.4 hours
Parallel (50 nodes): 1.8 hours
