# Load Packages

In [1]:
%load_ext watermark

In [2]:
%watermark -m -v -n -p numpy,sklearn

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.22.0

numpy  : 1.20.2
sklearn: 0.24.1

Compiler    : Clang 11.0.1 
OS          : Darwin
Release     : 20.4.0
Machine     : x86_64
Processor   : i386
CPU cores   : 6
Architecture: 64bit



# Load Data

In [3]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [4]:
# Seed NumPy's legacy global RNG so the np.random.rand datasets generated
# below come out the same on every fresh Restart & Run All.
np.random.seed(2020)

In [5]:
# Four uniform-random benchmark matrices, each with 30 columns. They are drawn
# from largest to smallest so the sequence of RNG draws stays the same.
X_1m, X_100k, X_10k, X_1k = (
    np.random.rand(n_rows, 30)
    for n_rows in (1_000_000, 100_000, 10_000, 1_000)
)

## Export Data For Other Languages

In [6]:
# Write each benchmark matrix to a comma-separated file in the working
# directory so the same data can be read by the other-language benchmarks.
for csv_name, matrix in (
    ('data_1m.csv', X_1m),
    ('data_100k.csv', X_100k),
    ('data_10k.csv', X_10k),
    ('data_1k.csv', X_1k),
):
    np.savetxt(csv_name, matrix, delimiter=',')

In [7]:
%ls

ClusteringJL, Mlpack, & ParallelKMeans Benchmarks Final.ipynb
Sklearn Benchmark-Final-MiniBatch.ipynb
Sklearn Benchmark-Final.ipynb
data_100k.csv
data_10k.csv
data_1k.csv
data_1m.csv
knor_final.Rmd
knor_final.pdf


# Benchmarks

## Sklearn Multi-Thread Benchmark

In [8]:
def test_multicore_speed(x, k_values=range(2, 11), random_state=None):
    """
    Fit one KMeans model per candidate cluster count and collect the inertias.

    The returned inertia curve is the input to the elbow method; this function
    does not pick a cluster count itself. KMeans is left at its default
    threading configuration (per the original note, recent scikit-learn
    releases use all available cores by default).

    Parameters
    ----------
    x : array-like
        Data passed unchanged to ``KMeans.fit``.
    k_values : iterable of int, default ``range(2, 11)``
        Cluster counts to try, in order. The default reproduces the original
        hard-coded sweep of k = 2..10.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans``. ``None`` keeps the original unseeded
        behaviour; pass an int to make the inertias repeatable across calls.

    Returns
    -------
    list of float
        ``inertia_`` of each fitted model, in the same order as ``k_values``.
    """
    return [
        KMeans(
            n_clusters=k,
            init='k-means++',
            # Very high iteration cap with a tight tolerance, so fits are
            # expected to stop on the tolerance rather than the cap.
            max_iter=100_000,
            tol=1e-6,
            random_state=random_state,
        ).fit(x).inertia_
        for k in k_values
    ]

In [9]:
# Time a single run with %time here: per the original note, %timeit (which
# repeats the call several times) takes days on the 1M-row dataset.
%time test_multicore_speed(X_1m)

CPU times: user 1h 18min 54s, sys: 58.2 s, total: 1h 19min 52s
Wall time: 14min 14s


[2437341.7270931103,
 2399351.127290113,
 2368831.2054143585,
 2346201.222251802,
 2326790.6041440214,
 2309585.672550795,
 2293413.1789655103,
 2280334.331886036,
 2267961.2758927774]

In [10]:
%timeit test_multicore_speed(X_100k)

1min 27s ± 2.37 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit test_multicore_speed(X_10k)

6.11 s ± 407 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit test_multicore_speed(X_1k)

719 ms ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Sklearn Multi-Thread Results

In [13]:
test_multicore_speed(X_1m)

[2437324.235389248,
 2399413.0830678497,
 2368818.8267027903,
 2346299.9410698186,
 2326735.440058622,
 2309516.949678505,
 2293478.379922331,
 2280295.348438886,
 2267882.6124851117]

In [14]:
test_multicore_speed(X_100k)

[243589.2873161867,
 239764.75364587895,
 236710.75522105594,
 234408.93214474779,
 232424.07131403798,
 230670.0302232603,
 229006.70085774802,
 227697.49490115896,
 226417.04919758317]

In [15]:
test_multicore_speed(X_10k)

[24258.313108868635,
 23866.423212883026,
 23549.056659001235,
 23281.743553533157,
 23075.514042452258,
 22868.84837785622,
 22703.89545928599,
 22543.180090006244,
 22388.36033844091]

In [16]:
test_multicore_speed(X_1k)

[2416.426793583685,
 2363.360485189129,
 2323.6350017094774,
 2289.0280235978544,
 2257.950426505983,
 2238.313140590653,
 2212.847741952131,
 2191.168916233091,
 2172.246861197593]