In [1]:
import numpy as np
import pandas as pd

### Reading Spam dataset

In [10]:
# The file is comma-separated and has no header row, so columns are numbered 0..N-1.
spam = pd.read_csv("./spam_data/spambase.data", sep=",", header=None)

In [15]:
# Columns 0-56 go to the data matrix; column 57 is kept separately as the true labels.
spam_data, spam_true_labels = spam.iloc[:, :57], spam.iloc[:, 57]

In [21]:
# Pull out the underlying NumPy arrays; the clustering functions below index with ndarray semantics.
spam_numpy_data, spam_numpy_true_labels = spam_data.values, spam_true_labels.values

In [22]:
# Sanity check: both objects should be plain ndarrays.
tuple(type(arr) for arr in (spam_numpy_data, spam_numpy_true_labels))

(numpy.ndarray, numpy.ndarray)

In [24]:
# Sanity check: the data matrix and the label vector must have the same number of rows.
np.shape(spam_numpy_data), np.shape(spam_numpy_true_labels)

((4601, 57), (4601,))

### K-means++ Algorithm (adapted from previous work)

In [25]:
def distance(xs, centroid):
    """Squared Euclidean distance between every point and every centroid.

    Parameters:
    -----------
    xs : ndarray of n points in d dimensional Euclidean space (nxd)
    centroid: ndarray of k centroids in d dimensional Euclidean space (kxd)

    Returns:
    --------
    ndarray (nxk) whose entry [i, j] is the squared distance from xs[i]
    to centroid[j]
    """
    # Insert a length-1 axis so broadcasting pairs each point with each
    # centroid: (n, 1, d) - (k, d) -> (n, k, d).
    offsets = xs[:, np.newaxis, :] - centroid

    # Collapse the coordinate axis to get one squared distance per pair.
    return np.square(offsets).sum(axis=-1)

In [26]:
def cost(d):
    """Total clustering cost: sum of each point's squared distance to its
    nearest centroid.

    Parameters:
    -----------
    d : matrix of squared distances (nxk); likely returned from distance() function

    Returns:
    --------
    cost: scalar sum over all rows of the row-wise minimum of d
    """
    # Row-wise minimum = squared distance to the closest centroid per point.
    nearest_sq_dist = np.asarray(d).min(axis=1)
    return nearest_sq_dist.sum()

In [27]:
def centroid_weights(d):
    """Computes weights as defined in step 7 of the k-means|| algorithm:
    for each centroid, the number of points that are closer to it than to
    any other centroid.

    Parameters:
    -----------
    d : matrix of squared distances (nxk); likely returned from distance() function

    Returns:
    --------
    w_x: ndarray of shape (k,), dtype float; w_x[j] is the number of rows
         of d whose minimum lies in column j (ties go to the lowest column
         index, as with np.argmin). Centroids that are nearest to no point
         get weight 0.
    """
    d = np.asarray(d)

    # index of the closest centroid for each point
    closest = np.argmin(d, axis=1)

    # Count how many points chose each centroid. minlength guarantees one
    # entry per centroid even when trailing centroids attract no points.
    w_x = np.bincount(closest, minlength=d.shape[1]).astype(float)
    return w_x

In [28]:
def k_means_pp(xs, k, seed=None):
    """
    Implements the K_means++ Initialization algorithm

    The first centroid is drawn uniformly at random from xs. Every further
    centroid is drawn from xs with probability proportional to the squared
    distance between a point and its nearest already-chosen centroid.

    Parameters:
    -----------
    xs: input dataset, ndarray (nxd)
    k: the number of output clusters
    seed: an optional random seed. Sampling uses a private
          np.random.RandomState(seed), so the global NumPy random state is
          left untouched; a given seed yields the same draws as seeding the
          global generator with it would.

    Returns:
    --------
    C: ndarray of the chosen centroids (rows of xs) used to initialize the
       k-means algorithm. Has k rows when k >= 1.
    """
    #initialization
    rng = np.random.RandomState(seed)
    n_points = xs.shape[0]
    C = xs[rng.choice(n_points, 1), :]
    loop = 0

    # Squared distance from every point to its nearest chosen centroid.
    # Maintained incrementally: only the newest centroid can lower it, so
    # the full (n x len(C)) distance matrix never has to be rebuilt.
    nearest = np.full(n_points, np.inf)
    latest = C

    while len(C) < k:

        if (loop % 10 == 0):
            print("The current loop is:", loop)

        nearest = np.minimum(nearest, distance(xs, latest)[:, 0])
        cst = nearest.sum()

        if cst == 0:
            # Every point coincides with a chosen centroid (fewer distinct
            # points than requested centroids); the weights would be 0/0,
            # so fall back to a uniform draw.
            picked = rng.choice(n_points, 1)
        else:
            picked = rng.choice(n_points, 1, p=nearest / cst)

        latest = xs[picked, :]
        C = np.vstack((C, latest))

        loop += 1

    return C

In [29]:
def k_means(X, k, centroids, max_iter=10000):
    """
    Separate X into k clusters using the classic k-means (Lloyd) algorithm.

    Alternates between assigning each point to its nearest centroid and
    moving each centroid to the mean of its assigned points, until the
    centroids stop changing or max_iter updates have been performed.

    Parameters:
    -----------
    X: input dataset, ndarray (nxd)
    k: the number of clusters; centroids is expected to have k rows
    centroids: ndarray (kxd) of initial centroids
    max_iter: maximum number of centroid updates (default 10000)

    Returns:
    --------
    dict with keys
        "Centroids": final centroids (kxd)
        "Cluster Indices": index of the nearest final centroid for each point (n,)
        "Number of Iterations": number of centroid updates performed
        "Total Cost": sum of squared distances from each point to its
                      nearest final centroid
    """
    step = 0

    ## run the algorithm
    while step < max_iter:
        ### assign every point to its nearest centroid
        dist = distance(X, centroids)
        cluster_indices = np.argmin(dist, axis=1)

        ### update centroids
        # Start from the current centroids so that a cluster with no
        # members keeps its position. Taking the mean of an empty slice
        # would give NaN, which can never compare equal (so the loop would
        # not converge) and would capture every point in the next argmin.
        update_centroids = np.array(centroids, dtype=float)
        for i in range(k):
            members = X[cluster_indices == i, :]
            if len(members) > 0:
                update_centroids[i, :] = np.mean(members, axis=0)

        ### check conditions
        if np.array_equal(update_centroids, centroids):
            break

        centroids = update_centroids

        if (step % 5 == 0):
            print("We are currently at {} step".format(step))

        step += 1

    # Recompute the assignment against the centroids actually returned, so
    # labels, centroids and cost agree even when max_iter was reached (or
    # the loop body never ran).
    total_dist = distance(X, centroids)
    cluster_indices = np.argmin(total_dist, axis=1)
    total_cost = cost(total_dist)

    return {"Centroids": centroids,
            "Cluster Indices": cluster_indices,
            "Number of Iterations": step,
            "Total Cost": total_cost}

In [30]:
%%time

# One cluster count feeds both the seeding and the clustering call, so the
# two can never disagree about k.
N_CLUSTERS = 20
SEED = 2018

initial_centroids_pp = k_means_pp(spam_numpy_data, k=N_CLUSTERS, seed=SEED)
k_means_output = k_means(spam_numpy_data, k=N_CLUSTERS, centroids=initial_centroids_pp)

The current loop is: 0
The current loop is: 10
We are currently at 0 step
We are currently at 5 step
We are currently at 10 step
We are currently at 15 step
We are currently at 20 step
CPU times: user 808 ms, sys: 200 ms, total: 1.01 s
Wall time: 1.01 s


#### Ignore this performance measure for now: the true labels take only the values 0 and 1, whereas the predicted labels range over the k clusters chosen above

In [35]:
from sklearn import metrics

# Predicted label = index of the cluster each row was assigned to by k_means.
spam_numpy_predicted_labels = k_means_output["Cluster Indices"]

# Adjusted Rand index between the true labels and the cluster assignment.
metrics.adjusted_rand_score(
    labels_true=spam_numpy_true_labels,
    labels_pred=spam_numpy_predicted_labels,
)

0.13542936092919955

In [34]:
# Cluster index assigned to each row by k_means (taken from its "Cluster Indices" entry).
spam_numpy_predicted_labels

array([15, 12, 16, ..., 19, 19,  0])

In [33]:
# Sum over all points of the squared distance to the nearest final centroid.
k_means_output["Total Cost"]

25315637.30823114