## K-Means Algorithm and Initializations


### Helper Functions

In [1]:
%%file helper_func.py
import numpy as np

def distance(xs, centroid, weights=None):
    """Computes matrix of (optionally weighted) squared distances from each
    point to each centroid.

    Parameters:
    -----------
    xs : ndarray of n points in d dimensional Euclidean space (nxd)
    centroid: ndarray of k centroids in d dimensional Euclidean space (kxd)
    weights: an optional input of weights applied to each data point; either
        one weight per point (length n) or a single value that is applied to
        every point. None (the default) leaves the distances unweighted.

    Returns:
    --------
    distance: matrix of squared distances (nxk); row i is scaled by the
        weight of point i
    """
    xs = np.asarray(xs)

    if weights is None:
        weights = np.ones(xs.shape[0])
    else:
        # Always honour weights that were supplied. The previous check
        # `weights.all() == 1` was true for ANY all-non-zero weight vector and
        # silently replaced it with ones. A length-1 array such as the legacy
        # default np.array([1]) still broadcasts over every point.
        weights = np.asarray(weights, dtype=float).reshape(-1)

    # (n,1,d) - (k,d) broadcasts to (n,k,d); summing the last axis gives (n,k)
    sq_dist = np.sum((xs[:, None, :] - centroid) ** 2, axis=-1)
    return weights[:, None] * sq_dist

def cost(d):
    """Total clustering cost: every point contributes its squared distance
    to whichever centroid lies closest to it.

    Parameters:
    -----------
    d : matrix of squared distances (nxk) returned from distance() function

    Returns:
    --------
    cost: sum over all points of the smallest entry in that point's row
    """
    # one value per point: the smallest squared distance in its row
    nearest = np.min(d, axis=1)
    return nearest.sum()

def centroid_weights(d):
    """Counts, for every centroid, how many points have it as their nearest
    centroid (the weights of step 7 of the k-means|| algorithm).

    Parameters:
    -----------
    d : matrix of squared distances (nxk) returned from distance() function

    Returns:
    --------
    w_x: ndarray of weights applied to centroids (length k, float)
    """
    n_centroids = d.shape[1]

    # index of the nearest centroid for each point (ties go to the lowest index)
    nearest = np.argmin(d, axis=1)

    # tally the points per centroid; minlength keeps centroids nobody chose
    counts = np.bincount(nearest, minlength=n_centroids)
    return counts.astype(np.float64)

Overwriting helper_func.py


In [2]:
%%file kmeans_pp_init.py
import numpy as np
from helper_func import distance, cost

def k_means_pp(xs, k, seed=None, verbose=False, weights = np.array([1])):
    """
    Implements the K_means++ Initialization algorithm

    The first centroid is drawn uniformly at random; every further centroid
    is drawn with probability proportional to the (weighted) squared distance
    of each point to its nearest already-chosen centroid.

    Parameters:
    -----------
    xs: input dataset, ndarray of n points (nxd)
    k: the number of output clusters
    seed: an optional random seed; passed to np.random.seed, so it reseeds
        NumPy's global random state
    verbose: an optional argument to show progress of algorithm
    weights: an optional input of weights applied to each data point

    Returns:
    --------
    C: the k centroids used to initialize the k-means algorithm (kxd).
       At least one centroid is always returned, even if k < 1.
    """
    #initialization
    np.random.seed(seed)
    n_points = xs.shape[0]
    C = xs[np.random.choice(n_points, 1), :]
    loop = 0

    while len(C) < k:

        if verbose and loop % 10 == 0:
            print("The current loop is:", loop)

        dist = distance(xs, C, weights = weights)
        cst = cost(dist)

        if cst == 0:
            # Every point coincides with a chosen centroid (or carries zero
            # weight), so the sampling distribution would be 0/0 = NaN and
            # np.random.choice would raise. Fall back to a uniform draw.
            new_idx = np.random.choice(n_points, 1)
        else:
            probs_x = np.min(dist, axis = 1) / cst
            new_idx = np.random.choice(n_points, 1, p=probs_x)

        C = np.vstack((C, xs[new_idx, :]))

        loop += 1

    return C

Overwriting kmeans_pp_init.py


In [3]:
%%file kmeans_ll_init.py
import numpy as np
from helper_func import distance, cost, centroid_weights
from kmeans_pp_init import k_means_pp


def K_Means_ll(xs, k, l, seed=None, max_iter=None):
    """Implements the K_means || algorithm

    Starting from one uniformly drawn point, each round samples every point
    independently with probability min(1, l * d^2 / cost) and adds the sampled
    points to the candidate set. The candidates are then weighted by the
    number of points closest to them and reclustered to k centroids with
    k-means++.

    Parameters:
    -----------
    xs : ndarray of n points in d dimensional Euclidean space (nxd)
    k: the number of output clusters
    l: the oversampling factor; the expected number of centroids to sample at each iteration
    seed: an optional random seed; passed to np.random.seed, so it reseeds
        NumPy's global random state
    max_iter: an optional argument to set the number of iterations; when
        omitted, round(log10(initial cost)) iterations are run

    Returns:
    --------
    C: the reclustered k centroids used to initialize the k-means algorithm

    Raises:
    -------
    ValueError: if fewer than k candidate centers were collected before
        reclustering
    """

    #initialization
    np.random.seed(seed)
    centroid = xs[np.random.choice(xs.shape[0],1),:]
    cost_int = cost(distance(xs,centroid))

    if max_iter is not None:
        n_iter = max_iter
    elif cost_int > 0:
        n_iter = np.round(np.log10(cost_int))
    else:
        # log10(0) is -inf and cannot be used as an iteration count; with zero
        # initial cost there is nothing left to sample anyway
        n_iter = 0

    for _ in np.arange(n_iter):
        dist = distance(xs,centroid)
        cst = cost(dist)

        if cst == 0:
            # all points already coincide with a candidate; the sampling
            # probabilities would be 0/0 = NaN, so stop early
            break

        # independent sampling probability per point, capped at 1
        probs_x = np.minimum(l*np.min(dist, axis = 1)/cst, 1)

        centroid_new = xs[np.random.binomial(1, p = probs_x) == 1,:]

        centroid = np.vstack((centroid,centroid_new))

    if centroid.shape[0] < k:
        # raise instead of printing and returning None, which only deferred
        # the failure to the caller
        raise ValueError(
            'Number of centers selected before reclustering is less than k '
            '({} < {}); adjust hyperparameters (k,l) or run for more '
            'iterations'.format(centroid.shape[0], k))

    dist = distance(xs,centroid)
    w_x = centroid_weights(dist)

    #Implement k-means++ to recluster weighted points in C
    C = k_means_pp(centroid,k,seed=seed,weights=w_x)

    return C

Overwriting kmeans_ll_init.py


In [4]:
%%file kmeans_func.py
import numpy as np
from helper_func import distance, cost

def k_means(X, k, centroids, verbose=False, max_iter=10000):
    """This function will separate X into k clusters
    using the classic k-means algorithm.

    Parameters:
    -----------
    X : ndarray of n points in d dimensional Euclidean space (nxd)
    k: the number of output clusters
    centroids: ndarray of initial centroids (kxd)
    verbose: an optional argument to show progress of algorithm
    max_iter: an optional cap on the number of centroid updates (default 10000)

    Returns:
    --------
    A dict with the keys
    Centroids: k centers after clustering
    Cluster Indices: cluster classification for each data point, computed
        against the returned centroids
    Number of Iterations: number of centroid updates performed
    Total Cost: cost of the returned centroids
    """
    step = 0

    ## run the algorithm
    while step < max_iter:
        ### sort the data in terms of clusters
        dist = distance(X, centroids)
        cluster_indices = np.argmin(dist, axis=1)

        ### update centroids
        # start from the current centroids so a cluster that received no
        # points simply keeps its position
        update_centroids = np.array(centroids, dtype=float)
        for i in range(k):
            members = cluster_indices == i
            if members.any():
                update_centroids[i,:] = np.mean(X[members,:], axis=0)

        ### check conditions
        # converged once an update leaves every centroid exactly unchanged
        if np.array_equal(update_centroids, centroids):
            break

        centroids = update_centroids

        if verbose and step % 5 == 0:
            print("We are currently at {} step".format(step))

        step += 1

    total_dist = distance(X, centroids)
    # Recompute the labels from the final centroids: when the loop stops on
    # max_iter, the labels from the last pass belong to the previous centroids.
    cluster_indices = np.argmin(total_dist, axis=1)
    total_cost = cost(total_dist)

    return {"Centroids": centroids,
            "Cluster Indices": cluster_indices,
            "Number of Iterations": step,
            "Total Cost": total_cost}

Overwriting kmeans_func.py
