# Load Packages

In [15]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [16]:
%watermark -m -v -n -p numpy,sklearn

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.22.0

numpy  : 1.20.2
sklearn: 0.24.1

Compiler    : Clang 11.0.1 
OS          : Darwin
Release     : 20.4.0
Machine     : x86_64
Processor   : i386
CPU cores   : 6
Architecture: 64bit



# Load Data

In [17]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans

In [18]:
# Seed NumPy's global RNG so the synthetic arrays generated below are the same
# on every fresh Restart & Run All.
# NOTE(review): MiniBatchKMeans is created without random_state in this
# notebook, so it presumably draws from this same global state — confirm.
np.random.seed(2020)

In [19]:
# Synthetic benchmark inputs: uniform random values in [0, 1) with 30 columns,
# at four row counts (1,000,000 / 100,000 / 10,000 / 1,000).
# Statement order matters: each call advances the global NumPy RNG, so
# reordering or re-running these lines without reseeding yields different arrays.
X_1m = np.random.rand(1_000_000, 30)
X_100k = np.random.rand(100_000, 30)
X_10k = np.random.rand(10_000, 30)
X_1k = np.random.rand(1_000, 30)

# Benchmarks

## Sklearn Multi-Thread Benchmark

In [20]:
def test_multicore_speed(x):
    """
    Just a convenient function to select the number of cluster groups based 
    on the elbow method. Requries testing 2 to 10 k ranges using all available cores.
    """
    ss = []
    for i in range(2, 11):

        model = MiniBatchKMeans(n_clusters=i,
                                init='k-means++',
                                tol=1e-6, 
                                max_no_improvement=10,
                                max_iter=100_000).fit(x)
        
        ss.append(model.inertia_)
        
    return ss

In [21]:
%timeit test_multicore_speed(X_1m)

11.2 s ± 1.42 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%timeit test_multicore_speed(X_100k)

1.41 s ± 168 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%timeit test_multicore_speed(X_10k)

317 ms ± 32.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%timeit test_multicore_speed(X_1k)

141 ms ± 1.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Sklearn Multi-Thread Results

In [25]:
# Inertia for k = 2..10 on the 1,000,000-row dataset (one value per k, in order).
# NOTE(review): nothing reseeds the RNG between the timing cells and this call,
# so these values presumably depend on how many fits %timeit ran beforehand and
# may not reproduce exactly on a re-run — confirm.
test_multicore_speed(X_1m)

[2445867.479686552,
 2412165.7171342387,
 2385104.140258503,
 2357469.335790009,
 2340276.2532835295,
 2321510.4619055036,
 2311584.688176011,
 2290377.712416416,
 2277367.844858403]

In [26]:
# Inertia for k = 2..10 on the 100,000-row dataset (one value per k, in order).
test_multicore_speed(X_100k)

[244482.59885062292,
 240816.03776186175,
 237910.97134228173,
 236027.32053503324,
 234021.78786939755,
 231831.28918362648,
 230429.8308173524,
 229400.1630608017,
 227886.7518947741]

In [27]:
# Inertia for k = 2..10 on the 10,000-row dataset (one value per k, in order).
test_multicore_speed(X_10k)

[24392.745661181747,
 23987.646437792417,
 23697.726153370553,
 23426.952006343166,
 23276.90790158953,
 23064.17230258268,
 22954.647209537172,
 22722.318449295362,
 22708.263604056046]

In [28]:
# Inertia for k = 2..10 on the 1,000-row dataset (one value per k, in order).
test_multicore_speed(X_1k)

[2431.933005539147,
 2382.703412642692,
 2350.0513505820563,
 2317.982091752885,
 2294.633919025983,
 2274.200812255565,
 2242.548182527589,
 2221.944884143639,
 2205.5533905720613]