### Generate a synthetic dataset

In [37]:
%matplotlib inline

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

In [8]:
def GaussMix(R, k, n=10000, d=15, seed=None):
    """Draws a synthetic Gaussian-mixture dataset with k clusters.

    Parameters:
    -----------
    R : variance of the zero-mean, isotropic Gaussian the cluster centers are drawn from
    k : number of cluster centers to draw
    n : total number of points to generate
    d : dimensionality of every point
    seed : optional random seed; it seeds numpy's global RNG and is also
           forwarded to make_blobs as random_state

    Returns:
    --------
    X : generated points, as returned by make_blobs
    y : cluster label of each point, as returned by make_blobs
    """
    # Seed the global RNG so that the centers drawn below are reproducible.
    np.random.seed(seed)

    # Centers are sampled from N(0, R * I_d).
    center_mean = np.zeros(d)
    center_cov = np.diag(R * np.ones(d))
    centers = np.random.multivariate_normal(mean=center_mean, cov=center_cov, size=k)

    # Points are scattered around the sampled centers; no cluster_std is
    # passed, so make_blobs applies its own default spread.
    points, labels = make_blobs(
        n_samples=n,
        n_features=d,
        centers=centers,
        random_state=seed,
    )
    return points, labels

### k-means++

#### helper functions

In [18]:
def distance(xs, centroid):
    """Builds the table of squared Euclidean distances between points and centroids.

    Parameters:
    -----------
    xs : ndarray of n points in d dimensional Euclidean space (nxd)
    centroid: ndarray of k centroids in d dimensional Euclidean space (kxd)

    Returns:
    --------
    distance: matrix of squared distances (nxk); entry [i, j] is the squared
              distance from point i to centroid j
    """
    # Insert a middle axis so (n, 1, d) broadcasts against (k, d) -> (n, k, d).
    offsets = xs[:, np.newaxis, :] - centroid

    # Collapse the coordinate axis to get one squared distance per pair.
    squared = offsets ** 2
    return squared.sum(axis=-1)

In [19]:
def cost(xs, centroid):
    """Evaluates the clustering cost of a point set for a given set of centroids.

    The cost is the sum, over all points, of the squared distance from the
    point to its nearest centroid.

    Parameters:
    -----------
    xs : ndarray of n points in d dimensional Euclidean space (nxd)
    centroid: ndarray of k centroids in d dimensional Euclidean space (kxd)

    Returns:
    --------
    cost: cost with respect to centroids
    """
    # For every point keep only the squared distance to its closest centroid ...
    nearest_sq_dist = distance(xs, centroid).min(axis=1)

    # ... and add those up over the whole dataset.
    return nearest_sq_dist.sum()

In [20]:
def centroid_weights(d, centroid):
    """Computes weights as defined in step 7 of the k-means|| algorithm

    The weight of a centroid is the number of points that are closer to it
    than to any other centroid (ties go to the lowest centroid index, which
    is how np.argmin breaks them).

    Parameters:
    -----------
    d : matrix of squared distances (nxk); likely returned from distance() function
    centroid: ndarray of k centroids in d dimensional Euclidean space (kxd);
              not used by the computation, kept so existing callers keep working

    Returns:
    --------
    w_x: 1-D float ndarray of length k; w_x[j] is the number of points whose
         closest centroid is j
    """
    d = np.asarray(d)

    #identify closest centroid to each point
    closest = np.argmin(d, axis=1)

    #compute the weights: count how often each centroid index was chosen.
    #minlength keeps a zero entry for centroids that attract no point, and the
    #float cast matches the dtype of the one-hot-sum formulation.
    w_x = np.bincount(closest, minlength=d.shape[1]).astype(float)
    return w_x

#### k-means++ initialization algorithm

In [21]:
def k_means_pp(xs, k, seed=None):
    """
    Implements the K_means++ Initialization algorithm

    The first centroid is drawn uniformly at random from xs. Every further
    centroid is drawn from xs with probability proportional to the squared
    distance between the point and its nearest already-chosen centroid.

    Side effects: seeds numpy's global RNG with `seed` and prints a progress
    line every 10 iterations.

    Parameters:
    -----------
    xs: input dataset, ndarray of n points in d dimensional space (nxd)
    k: the number of output clusters
    seed: an optional random seed

    Returns:
    --------
    C: ndarray of the chosen centroids, one row per centroid, used to
       initialize the k-means algorithm. At least one centroid is always
       returned, even when k < 1.
    """
    #initialization
    np.random.seed(seed)
    n_points = xs.shape[0]
    C = xs[np.random.choice(n_points, 1), :]
    loop = 0

    while len(C) < k:

        if (loop % 10 == 0):
            print("The current loop is:", loop)

        # Squared distance of every point to its nearest chosen centroid.
        # Computed once and reused for both the sampling weights and the
        # normalising cost (which is exactly what cost(xs, C) would return).
        min_dist = np.min(distance(xs, C), axis=1)
        cst = np.sum(min_dist)

        if cst == 0:
            # Every point coincides with an already-chosen centroid, so the
            # D^2 distribution is undefined (0/0). Sample uniformly instead
            # of handing NaN probabilities to np.random.choice.
            probs_x = None
        else:
            probs_x = min_dist / cst

        C_new = xs[np.random.choice(n_points, 1, p=probs_x), :]

        C = np.vstack((C, C_new))

        loop += 1

    return C

#### k-means algorithm

In [83]:
def k_means(X, k, centroids, max_iter=10000):
    """
        This function will separate X into k clusters using the classic k-means
        algorithm.

        Each iteration assigns every point to its nearest centroid and moves
        each centroid to the mean of its assigned points. Iteration stops when
        the centroids no longer change (exact equality) or after max_iter
        updates. A progress line is printed every 5 updates.

        Parameters:
        -----------
        X: ndarray of n points in d dimensional space (nxd)
        k: number of clusters; expected to equal the number of rows of centroids
        centroids: ndarray of initial centroids (kxd)
        max_iter: upper bound on the number of centroid updates (default 10000)

        Returns:
        --------
        dict with keys
            "Centroids": final centroids (kxd)
            "Cluster Indices": index of the nearest final centroid for each point
            "Number of Iterations": number of centroid updates performed
            "Total Cost": sum of squared distances from each point to its
                          nearest final centroid
    """
    step = 0

    ## run the algorithm
    while step < max_iter:
        ### sort the data in terms of clusters
        cluster_indices = np.argmin(distance(X, centroids), axis=1)

        ### update centroids
        # Start from a float copy of the current centroids so that a cluster
        # with no assigned points keeps its previous position. Taking the mean
        # of an empty slice would yield NaN, and a NaN centroid can never
        # satisfy the equality test below, so the loop would spin to max_iter.
        update_centroids = np.array(centroids, dtype=float)
        for i in range(k):
            members = X[cluster_indices == i, :]
            if len(members) > 0:
                update_centroids[i, :] = np.mean(members, axis=0)

        ### check conditions
        if np.array_equal(update_centroids, centroids):
            break

        centroids = update_centroids

        if (step % 5 == 0):
            print("We are currently at {} step".format(step))

        step += 1

    # Derive labels and cost from the centroids that are actually returned, so
    # the outputs stay consistent even when the loop ends by hitting max_iter.
    # The cost is the same quantity cost(X, centroids) computes; reusing the
    # distance matrix avoids building it a second time.
    final_dist = distance(X, centroids)
    cluster_indices = np.argmin(final_dist, axis=1)
    total_cost = np.sum(np.min(final_dist, axis=1))

    return {"Centroids": centroids,
            "Cluster Indices": cluster_indices,
            "Number of Iterations": step,
            "Total Cost": total_cost}

#### generate synthetic dataset

In [84]:
# Baseline dataset: 10000 points in 15 dimensions around 50 centers drawn with variance R=10.
synthetic_data, synthetic_data_true_labels = GaussMix(R=10, k=50, n=10000, d=15, seed=2018)

In [76]:
# Inspect the generated points (numpy abbreviates the display of large arrays).
synthetic_data

array([[-1.86138968, -0.23923958, -0.41357835, ..., -3.68704194,
        -2.97831231, -1.46541168],
       [ 5.05002218, -4.23808721,  0.70508808, ...,  2.41356994,
         5.28438716,  2.13176893],
       [-5.0757495 , -1.49734056, -4.83912418, ..., -1.91355102,
        -9.08379913, -0.66437582],
       ...,
       [-3.86208243, -4.1686748 , -2.33285155, ..., -2.09367202,
        -1.4248969 , -0.43013637],
       [ 4.00113826, -4.11981762, -1.8441766 , ..., -6.90744421,
        -6.21712508,  1.91209808],
       [ 0.0732189 , -2.13458364,  0.78276181, ..., -2.94964197,
         0.2176415 ,  4.74044444]])

In [77]:
# Inspect the ground-truth cluster label of each generated point.
synthetic_data_true_labels

array([33, 24, 11, ..., 18,  1, 32])

In [85]:
%%time

# Step 1: pick 50 starting centroids with k-means++ seeding.
initial_centroids_pp = k_means_pp(synthetic_data, k=50, seed=2018)
# Step 2: refine those centroids with k-means iterations until they stop changing.
k_means_output = k_means(synthetic_data, k=50, centroids=initial_centroids_pp)

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
We are currently at 0 step
We are currently at 5 step
We are currently at 10 step
We are currently at 15 step
We are currently at 20 step
We are currently at 25 step
CPU times: user 3.5 s, sys: 1.1 s, total: 4.6 s
Wall time: 4.62 s


In [86]:
# Cluster index assigned to each point by k_means.
synthetic_data_predicted_labels = k_means_output["Cluster Indices"]

#### Evaluation of clustering performance using sklearn package

In [87]:
from sklearn import metrics
# Compare the predicted clustering against the ground-truth labels with the adjusted Rand score.
metrics.adjusted_rand_score(synthetic_data_true_labels, synthetic_data_predicted_labels)

0.904545447528751

#### Evaluation of clustering performance using cost function

In [88]:
# Sum of squared distances from each point to its nearest final centroid.
k_means_output["Total Cost"]

193819.89598231384

### Reproduce the paper result using only k-means++

#### R=1 — paper-reported k-means++ costs: Seed=23, Final=14 (scaled down by $10^4$)

In [89]:
# Regenerate the dataset with center variance R=1; this rebinds synthetic_data and its labels.
synthetic_data, synthetic_data_true_labels = GaussMix(R=1, k=50, n=10000, d=15, seed=23)

In [90]:
%%time

# k-means++ seeding followed by k-means refinement on the R=1 dataset.
initial_centroids_pp = k_means_pp(synthetic_data, k=50, seed=23)
k_means_output = k_means(synthetic_data, k=50, centroids=initial_centroids_pp)

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
We are currently at 0 step
We are currently at 5 step
We are currently at 10 step
We are currently at 15 step
We are currently at 20 step
We are currently at 25 step
We are currently at 30 step
We are currently at 35 step
CPU times: user 4.18 s, sys: 1.05 s, total: 5.24 s
Wall time: 5.25 s


In [91]:
# Final clustering cost for the R=1 run (unscaled).
k_means_output["Total Cost"]

142718.2786790511

#### R=10 — paper-reported k-means++ costs: Seed=62, Final=31 (scaled down by $10^4$)

In [92]:
# Regenerate the dataset with center variance R=10; this rebinds synthetic_data and its labels.
synthetic_data, synthetic_data_true_labels = GaussMix(R=10, k=50, n=10000, d=15, seed=62)

In [95]:
%%time

# k-means++ seeding followed by k-means refinement on the R=10 dataset.
initial_centroids_pp = k_means_pp(synthetic_data, k=50, seed=62)
k_means_output = k_means(synthetic_data, k=50, centroids=initial_centroids_pp)

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
We are currently at 0 step
We are currently at 5 step
We are currently at 10 step
CPU times: user 2.82 s, sys: 784 ms, total: 3.6 s
Wall time: 3.6 s


In [96]:
# Final clustering cost for the R=10 run (unscaled).
k_means_output["Total Cost"]

185193.07153342268

#### R=100 — paper-reported k-means++ costs: Seed=30, Final=15 (scaled down by $10^4$)

In [100]:
# Regenerate the dataset with center variance R=100; this rebinds synthetic_data and its labels.
synthetic_data, synthetic_data_true_labels = GaussMix(R=100, k=50, n=10000, d=15, seed=30)

In [101]:
%%time

# k-means++ seeding followed by k-means refinement on the R=100 dataset.
initial_centroids_pp = k_means_pp(synthetic_data, k=50, seed=30)
k_means_output = k_means(synthetic_data, k=50, centroids=initial_centroids_pp)

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
We are currently at 0 step
We are currently at 5 step
CPU times: user 2.56 s, sys: 680 ms, total: 3.24 s
Wall time: 3.24 s


In [102]:
# Final clustering cost for the R=100 run (unscaled).
k_means_output["Total Cost"]

324350.2169674193