# Load Packages

In [1]:
%load_ext watermark

In [2]:
%watermark -m -v -n -p numpy,sklearn

Fri Apr 03 2020 

CPython 3.7.6
IPython 7.13.0

numpy 1.18.1
sklearn 0.22.2.post1

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.3.0
machine    : x86_64
processor  : i386
CPU cores  : 6
interpreter: 64bit


# Load Data

In [3]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans

In [4]:
# Seed NumPy's global RNG so the random matrices generated below are the same on every run.
np.random.seed(2020)

In [5]:
# Synthetic benchmark inputs: four uniform-random matrices with 30 columns
# each and 1,000,000 / 100,000 / 10,000 / 1,000 rows respectively.
# All four draws consume NumPy's global RNG in sequence, so the order of
# these statements determines which values end up in which array.
X_1m = np.random.rand(1_000_000, 30)
X_100k = np.random.rand(100_000, 30)
X_10k = np.random.rand(10_000, 30)
X_1k = np.random.rand(1_000, 30)

## Export Data For Other Languages

In [6]:
# Write every benchmark matrix to a comma-delimited text file, largest first,
# so the same inputs can be read by the benchmarks written in other languages.
exports = (
    ('data_1m.csv', X_1m),
    ('data_100k.csv', X_100k),
    ('data_10k.csv', X_10k),
    ('data_1k.csv', X_1k),
)
for csv_path, matrix in exports:
    np.savetxt(csv_path, matrix, delimiter=',')

# Benchmarks

## Sklearn Multi-Thread Benchmark

In [7]:
def test_multicore_speed(x, k_values=range(2, 11), max_iter=1000, random_state=None):
    """
    Fit one MiniBatchKMeans model per candidate cluster count and collect the
    inertia of each fit, e.g. as input for choosing k with the elbow method.

    Parameters
    ----------
    x : array-like
        Data handed unchanged to ``MiniBatchKMeans.fit``.
    k_values : iterable of int, optional
        Cluster counts to try, in order. Defaults to 2 through 10 inclusive.
    max_iter : int, optional
        Forwarded to ``MiniBatchKMeans(max_iter=...)``. Defaults to 1000.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``MiniBatchKMeans(random_state=...)``. The default
        ``None`` is the estimator's own default, so calls that omit it behave
        as before; pass an int to make the inertia values repeatable.

    Returns
    -------
    list
        The ``inertia_`` attribute of each fitted model, in the same order as
        ``k_values``.

    Notes
    -----
    NOTE(review): the estimator is built without any explicit thread or job
    setting, so whether the fits actually use all available cores depends on
    the installed scikit-learn / BLAS / OpenMP configuration -- confirm before
    reporting these timings as "multi-thread".
    """
    # Only the inertia is kept; each fitted model is discarded right away.
    return [
        MiniBatchKMeans(
            n_clusters=k,
            init='k-means++',
            max_iter=max_iter,
            random_state=random_state,
        ).fit(x).inertia_
        for k in k_values
    ]

In [8]:
# Time the full k = 2..10 sweep on the 1,000,000-row matrix.
%timeit test_multicore_speed(X_1m)

30.1 s ± 2.48 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Time the full k = 2..10 sweep on the 100,000-row matrix.
%timeit test_multicore_speed(X_100k)

3.75 s ± 405 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Time the full k = 2..10 sweep on the 10,000-row matrix.
%timeit test_multicore_speed(X_10k)

613 ms ± 45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Time the full k = 2..10 sweep on the 1,000-row matrix.
%timeit test_multicore_speed(X_1k)

201 ms ± 14.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Sklearn Multi-Thread Results

In [12]:
# Display the inertia obtained for each k from 2 to 10 on the 1,000,000-row matrix
# (this refits every model; it does not reuse the timed runs above).
test_multicore_speed(X_1m)

[2447844.622163797,
 2410065.3115257025,
 2382870.705802254,
 2357328.667334357,
 2340856.721208137,
 2323608.3094369015,
 2308229.727174064,
 2290479.2838455164,
 2277841.943680881]

In [13]:
# Display the inertia obtained for each k from 2 to 10 on the 100,000-row matrix.
test_multicore_speed(X_100k)

[244844.82795721738,
 240826.34373045812,
 238007.0239377424,
 235703.69897529576,
 233968.6199870226,
 231772.27469308046,
 230237.945394652,
 228795.61081951743,
 228469.0079741194]

In [14]:
# Display the inertia obtained for each k from 2 to 10 on the 10,000-row matrix.
test_multicore_speed(X_10k)

[24348.763173523337,
 24001.890978658197,
 23684.223413130192,
 23454.91855690728,
 23269.213047246023,
 23120.88619672152,
 22868.42252728833,
 22738.258512791606,
 22573.987681140177]

In [15]:
# Display the inertia obtained for each k from 2 to 10 on the 1,000-row matrix.
test_multicore_speed(X_1k)

[2427.133269064922,
 2385.534105444017,
 2344.84192128042,
 2310.5402444073848,
 2285.2629922427886,
 2270.897261587013,
 2242.0384501089516,
 2226.0452902526335,
 2204.5812780067554]