# Load Packages

In [1]:
%load_ext watermark

In [2]:
%watermark -m -v -n -p numpy,sklearn

Wed Apr 01 2020 

CPython 3.7.6
IPython 7.13.0

numpy 1.18.1
sklearn 0.22.2.post1

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.3.0
machine    : x86_64
processor  : i386
CPU cores  : 6
interpreter: 64bit


# Load Data

In [3]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [4]:
# Seed NumPy's global RNG so the np.random.rand draws that build the
# benchmark datasets are repeatable from a fresh kernel.
np.random.seed(2020)

In [5]:
# Four uniform-random datasets with 30 columns each, drawn largest-first from
# NumPy's global RNG (1M, 100k, 10k and 1k rows respectively).
X_1m, X_100k, X_10k, X_1k = (
    np.random.rand(n_rows, 30)
    for n_rows in (1_000_000, 100_000, 10_000, 1_000)
)

## Export Data For Other Languages

In [6]:
# Write every dataset to a comma-separated file in the working directory so
# the same inputs can be loaded by the benchmarks in other languages.
csv_exports = {
    'data_1m.csv': X_1m,
    'data_100k.csv': X_100k,
    'data_10k.csv': X_10k,
    'data_1k.csv': X_1k,
}
for csv_name, dataset in csv_exports.items():
    np.savetxt(csv_name, dataset, delimiter=',')

In [7]:
%ls

ClusteringJL & ParallelKMeans Benchmarks.ipynb
Sklearn Benchmark-Final.ipynb
Sklearn Benchmark.ipynb
data_100k.csv
data_10k.csv
data_1k.csv
data_1m.csv
knor_benchmark.ipynb


# Benchmarks

## Sklearn Multi-Thread Benchmark

In [8]:
def test_multicore_speed(x):
    """
    Benchmark helper: fit KMeans on ``x`` once for each k in 2..10 and
    collect the inertia of every fitted model.

    The returned values are the ones an elbow-method plot would use to choose
    the number of clusters. Every fit is run with ``n_jobs=-1`` so that all
    available cores are requested.

    Parameters
    ----------
    x : data passed unchanged to ``KMeans.fit``.

    Returns
    -------
    list
        ``inertia_`` of each fitted model, ordered by k = 2, 3, ..., 10.
    """
    return [
        KMeans(n_clusters=k, init='k-means++',
               max_iter=1000, tol=1e-6, n_jobs=-1).fit(x).inertia_
        for k in range(2, 11)
    ]

In [9]:
%timeit test_multicore_speed(X_1m)



23min 50s ± 37.3 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit test_multicore_speed(X_100k)

2min 26s ± 10.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit test_multicore_speed(X_10k)

5.77 s ± 190 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%timeit test_multicore_speed(X_1k)

344 ms ± 26.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Sklearn Multi-Thread Results

In [13]:
test_multicore_speed(X_1m)

[2437387.415492945,
 2399339.992032868,
 2368871.465710383,
 2346462.0070933346,
 2326813.1069245334,
 2309414.654713539,
 2293462.201313399,
 2280445.3099815217,
 2267931.587844557]

In [14]:
test_multicore_speed(X_100k)

[243589.31245874695,
 239767.31925266725,
 236726.38550223835,
 234442.93793080037,
 232422.69835603604,
 230691.21321680304,
 229058.01186924183,
 227682.08176776656,
 226431.94988426092]

In [15]:
test_multicore_speed(X_10k)

[24258.330630805453,
 23867.89675877502,
 23551.140681476474,
 23293.288967365683,
 23061.19983962278,
 22875.664074825218,
 22696.46474595464,
 22537.26870504501,
 22386.635745851694]

In [16]:
test_multicore_speed(X_1k)

[2415.7676706737066,
 2363.5037202047565,
 2325.6642970049497,
 2290.4172039328205,
 2259.749823733678,
 2233.0460105250836,
 2209.553788815628,
 2183.6662205376624,
 2172.285836431226]