In [None]:
# EXECUTION TIME: 1m12s

# Avoid a Python 3 ImportError: put the current directory on the module search path
import sys
sys.path.append('.')

import time

import numpy as np
import pandas as pd

import seaborn as sns

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.datasets.samples_generator import make_blobs


# #############################################################################
# Comparison Matrix
#
# Benchmark KMeans against MiniBatchKMeans on synthetic blob data of growing
# size. For every dataset size the table records both fit times (seconds) and
# the absolute difference between the two final inertias.
n_samples = np.linspace(2000, 150000, 100)
df = pd.DataFrame(index=n_samples, columns=[
                  "KMeans Execution Time",
                  "MiniBatchKMeans Execution Time",
                  "Inertia Delta"])

centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)

for samples in n_samples:
    # Re-seed on every iteration so each dataset is generated reproducibly.
    np.random.seed(0)

    # np.linspace yields floats; the sample count and batch size must be ints.
    sample_count = int(samples)
    batch_size = sample_count // 100

    # Generate sample data whose size follows the loop variable. (Previously
    # the size was fixed at 50000, so the benchmark never varied the dataset.)
    X, labels_true = make_blobs(
        n_samples=sample_count, centers=centers, cluster_std=0.7)

    # Compute clustering with KMeans
    k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    k_means.fit(X)
    t_batch = time.perf_counter() - t0

    # Compute clustering with MiniBatchKMeans
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                          batch_size=batch_size, n_init=10,
                          max_no_improvement=10, verbose=0)
    t0 = time.perf_counter()
    mbk.fit(X)
    t_mini_batch = time.perf_counter() - t0

    # The row is keyed by the (float) linspace value used as the index label.
    df.loc[samples] = [t_batch, t_mini_batch,
                       np.abs(k_means.inertia_ - mbk.inertia_)]

print('\n', df.to_latex(), '\n')
# NOTE: the target directory must already exist; to_csv does not create it.
df.to_csv('assets/3.1/kmeans/complexity.csv')

In [None]:
import plotly
import plotly.graph_objs as go

# Enable plotly's offline notebook rendering before drawing anything.
plotly.offline.init_notebook_mode(connected=True)

# The two timing curves use the default (left-hand) y axis.
trace_kmeans = go.Scatter(
    x=n_samples, y=df['KMeans Execution Time'], name='KMeans')
trace_minibatchkmeans = go.Scatter(
    x=n_samples, y=df['MiniBatchKMeans Execution Time'],
    name='MiniBatchKMeans')

# The inertia gap is on a different scale, so it is bound to a second y axis.
trace_inertiadiff = go.Scatter(
    x=n_samples, y=df['Inertia Delta'], name='Inertia Difference',
    yaxis='y2')

data = [trace_kmeans, trace_minibatchkmeans, trace_inertiadiff]

layout = go.Layout(
    title='KMeans vs MiniBatchKMeans Execution Times',
    xaxis={'title': 'Number of Training Samples'},
    yaxis={'title': 'Time/s'},
    # Secondary axis drawn over the primary one, labelled on the right.
    yaxis2={
        'title': 'Difference in Inertia',
        'overlaying': 'y',
        'side': 'right',
    },
    legend={'x': 0.4, 'y': 1},
)

figure = go.Figure(data=data, layout=layout)

# Show the figure inline, then export the same figure as a static image.
plotly.offline.iplot(figure)
plotly.io.write_image(figure, 'assets/3.1/kmeans/minibatchtime.png')
