# Load Packages

In [1]:
%load_ext watermark

In [2]:
%watermark -m -v -n -p numpy,sklearn,seaborn

Thu Mar 12 2020 

CPython 3.7.6
IPython 7.13.0

numpy 1.18.1
sklearn 0.22.2.post1
seaborn 0.10.0

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.3.0
machine    : x86_64
processor  : i386
CPU cores  : 6
interpreter: 64bit


# Load Data

In [3]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [4]:
# Seed NumPy's global RNG so the synthetic data generated below is repeatable.
np.random.seed(2020)

In [5]:
# Synthetic benchmark input: 1,000,000 rows x 30 columns drawn with np.random.rand
# (uniform samples over [0, 1)).
X = np.random.rand(1_000_000, 30)

In [6]:
X.shape

(1000000, 30)

In [7]:
X

array([[0.98627683, 0.87339195, 0.50974552, ..., 0.45611937, 0.15585136,
        0.47604897],
       [0.16970244, 0.89625834, 0.37339376, ..., 0.9360322 , 0.80302764,
        0.69730515],
       [0.46138428, 0.66243461, 0.74966564, ..., 0.90240675, 0.44920447,
        0.61818198],
       ...,
       [0.46241048, 0.02711373, 0.98297598, ..., 0.63149313, 0.68530809,
        0.93077828],
       [0.85256642, 0.68951042, 0.08835872, ..., 0.60046939, 0.3804147 ,
        0.17530941],
       [0.81275259, 0.80919492, 0.67010203, ..., 0.32749441, 0.94242564,
        0.81643605]])

## Export Data For Julia

In [8]:
# Write X as comma-separated text to data.csv so the same data can be read from Julia.
np.savetxt('data.csv', X, delimiter=',')

In [9]:
%ls

Sklearn Benchmark.ipynb  data.csv


# Benchmarks

## Sklearn Single-Thread Benchmark

In [9]:
def test_speed(x):
    """
    Just a convenient function to select the number of cluster groups based 
    on the elbow method. Requries testing 2 to 10 k ranges using 1 core.
    """
    ss = []
    for i in range(2, 11):

        model = KMeans(n_clusters=i, init='k-means++',
                       max_iter=300, tol=0.0001, n_jobs=1).fit(x)
        
        ss.append(model.inertia_)
    return ss

In [10]:
# Time the full single-core sweep over every candidate cluster count.
%timeit test_speed(X)

26min 1s ± 1min 7s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Sklearn Single-Thread Results

In [11]:
test_speed(X)

[2437344.710396418,
 2399284.054008447,
 2369173.5347967194,
 2346620.4948332976,
 2326796.9289331255,
 2309666.2464365195,
 2293318.060627062,
 2280686.245286427,
 2268077.5249787876]

## Sklearn Multi-Thread Benchmark

In [12]:
def test_multicore_speed(x):
    """
    Just a convenient function to select the number of cluster groups based 
    on the elbow method. Requries testing 2 to 10 k ranges using all available cores.
    """
    ss = []
    for i in range(2, 11):

        model = KMeans(n_clusters=i, init='k-means++',
                       max_iter=300, tol=0.0001, n_jobs=-1).fit(x)
        
        ss.append(model.inertia_)
        
    return ss

In [13]:
# Time the full all-cores sweep over every candidate cluster count.
%timeit test_multicore_speed(X)



9min 23s ± 17.2 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Sklearn Multi-Thread Results

In [14]:
test_multicore_speed(X)

[2437462.6906016935,
 2399335.1366852117,
 2368934.8531181137,
 2346427.8617889704,
 2326975.168462808,
 2309499.4518277273,
 2293430.3159716832,
 2280704.7902194555,
 2267943.237770229]