### K-means++ Algorithm Reproduction

#### k-means++ initialization

In [19]:
import timeit
import numpy as np
import scipy.linalg as la
from numba import jit

In [20]:
# helper functions
def min_distSQ(x, C):
    """
        x: vector input
        C: list input
    """
    distSQ = np.zeros(len(C))
    for c_index in range(len(C)):
        distSQ[c_index] = np.sum((x - C[c_index])**2)
    return min(distSQ)

In [21]:
def k_means_pp(X, k):
    """
    Implements the K_means++ Initialization algorithm
    
    Parameters:
    -----------
    X: input dataset
    k: the number of output clusters
    
    Returns:
    --------
    C: the reclustered k centroids used to initialize the k-means algorithm
    """
    init = X[np.random.randint(low=0, high=X.shape[0], size=1)]
    C = [init]
    loop = 0
    
    while len(C)<k:
        
        if (loop % 10 == 0):
            print("The current loop is:", loop)
        
        cost = 1e-16
        d2 = []
        for row_index in range(X.shape[0]):
            x = X[row_index,]
            dist = min_distSQ(x, C)
            d2.append(dist)
            cost += dist
        prob = d2/cost
        new_c_index = np.random.choice(np.arange(X.shape[0]), size=1, p=prob)
        new_c = X[new_c_index,]
        C.append(new_c)
        
        loop += 1
        
    return C

In [None]:
%%file k_means_PlusPlus.cpp


#### k-means Algorithm

In [None]:
# helper functions
def distance(X, C):
    """
        This function computes the distance between all the points in X and all the 
        centroids in C.
    """
    return np.sum((X[:, np.newaxis, :] - C)**2, axis=2)

In [None]:
# k-means algorithm
def k_means(X, k):
    """
        This function will separate X into k clusters using the classic k-means
        algorithm.
    """
    ## parameters
    max_iter = 1000
    step = 0
    n, p = X.shape
    
    ## initialize the centroids
    initial_centroid_indices = np.random.choice(n, k)
    centroids = X[initial_centroid_indices,:] # (k,p)-dimension
    
    ## run the algorithm
    while step < max_iter:
        ### sort the data in terms of clusters
        dist = distance(X, centroids)
        cluster_indices = np.argmin(dist, axis=1)
        
        ### update centroids
        update_centroids = np.zeros(centroids.shape)
        for i in range(k):
            update_centroids[i,:] = np.mean(X[cluster_indices==i,:], axis=0)
        
        ### check conditions
        if np.array_equal(update_centroids, centroids):
            break
        else:
            centroids = update_centroids
            
            if (step % 5 == 0):
                print("We are currently at {} step".format(step))
            
            step += 1
    
    total_dist = distance(X, centroids)
            
    return {"Centroids": centroids,
            "Cluster Indices": cluster_indices,
            "Number of Iterations": step}

In [22]:
# R = 1
np.random.seed(2018)
N = 10000
R = 1
cov = R * np.eye(15) 
GaussMixture1 = np.random.multivariate_normal(np.zeros(15), cov, N)

In [23]:
%%time

k = 50
C_R1=k_means_pp(GaussMixture1, k)
C_R1

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
CPU times: user 1min 33s, sys: 646 ms, total: 1min 34s
Wall time: 1min 36s


In [24]:
%prun -q -D work.prof k_means_pp(GaussMixture1, k)

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
 
*** Profile stats marshalled to file 'work.prof'. 


In [25]:
import pstats
p = pstats.Stats('work.prof')
p.print_stats()
pass

Sat Apr  7 18:25:34 2018    work.prof

         51941119 function calls in 115.793 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   490049    0.081    0.000    0.081    0.000 {method 'append' of 'list' objects}
       98    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000  115.793  115.793 {built-in method builtins.exec}
 12250020    2.994    0.000    2.994    0.000 {built-in method builtins.isinstance}
      147    0.000    0.000    0.000    0.000 {built-in method builtins.issubclass}
   980050    0.139    0.000    0.139    0.000 {built-in method builtins.len}
   490000    2.599    0.000    2.599    0.000 {built-in method builtins.min}
        5    0.000    0.000    0.001    0.000 {built-in method builtins.print}
       25    0.000    0.000    0.000    0.000 {method 'acquire' of '_thread.lock' objects}
       20    0.000    0.000    0.000    0.000 {built-in method posix.ge

In [29]:
p.sort_stats('ncalls').print_stats(5)
pass

Sat Apr  7 18:25:34 2018    work.prof

         51941119 function calls in 115.793 seconds

   Ordered by: call count
   List reduced from 35 to 5 due to restriction <5>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 12250049   31.058    0.000   31.058    0.000 {method 'reduce' of 'numpy.ufunc' objects}
 12250020    2.994    0.000    2.994    0.000 {built-in method builtins.isinstance}
 12250000   19.231    0.000   56.739    0.000 /anaconda3/lib/python3.6/site-packages/numpy/core/fromnumeric.py:1778(sum)
 12250000    3.456    0.000   34.513    0.000 /anaconda3/lib/python3.6/site-packages/numpy/core/_methods.py:31(_sum)
   980050    0.139    0.000    0.139    0.000 {built-in method builtins.len}




In [41]:
%%cython -a 

import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pow


UsageError: Cell magic `%%cython` not found.


In [39]:
%%cython -a 

# helper functions
def min_distSQ(x, C):
    """
        x: vector input
        C: list input
    """
    distSQ = np.zeros(len(C))
    for c_index in range(len(C)):
        distSQ[c_index] = np.sum((x - C[c_index])**2)
    return min(distSQ)

UsageError: Cell magic `%%cython` not found.


In [38]:
def k_means_pp(X, k):
    init = X[np.random.randint(low=0, high=X.shape[0], size=1)]
    C = [init]
    loop = 0
    
    while len(C)<k:
        
        if (loop % 10 == 0):
            print("The current loop is:", loop)
        
        cost = 1e-16
        d2 = []
        for row_index in range(X.shape[0]):
            x = X[row_index,]
            dist = min_distSQ(x, C)
            d2.append(dist)
            cost += dist
        prob = d2/cost
        new_c_index = np.random.choice(np.arange(X.shape[0]), size=1, p=prob)
        new_c = X[new_c_index,]
        C.append(new_c)
        
        loop += 1
        
    return C

In [36]:
# R = 1
np.random.seed(2018)
N = 10000
R = 1
cov = R * np.eye(15) 
GaussMixture1 = np.random.multivariate_normal(np.zeros(15), cov, N)

In [37]:
%%time

k = 50
C_R1=k_means_pp(GaussMixture1, k)
C_R1

The current loop is: 0
The current loop is: 10
The current loop is: 20
The current loop is: 30
The current loop is: 40
CPU times: user 1min 42s, sys: 1.04 s, total: 1min 44s
Wall time: 1min 47s


1. write the algorithm
2. optimize (C++/cython)
3. Sparks (parallel)
4. Show all the intermediate steps when optimizing our functions
5. reproduce the results using datasets (synthetic/realistc)