# K-means and K-medoids

## Problem


Assume we have a 2D dataset consisting of (0,−6),(4,4),(0,0),(−5,2). We wish to do k-means and k-medoids clustering with k=2. We initialize the cluster centers with (−5,2),(0,−6).

For this small dataset, in choosing between two equally valid exemplars for a cluster in k-medoids, choose them with priority in the order given above (i.e. all other things being equal, you would choose (0,−6) as a center over (−5,2)).

For the following scenarios, give the clusters and cluster centers after the algorithm converges.

## Aims for solution

1. Define helper functions for standard/simple operations
2. Define the initialization
3. Define the steps of the k-means/k-medoids algorithm



In [1]:
import numpy as np

In [21]:
# data set: one row per 2D point
X = np.array([[0, -6], [4, 4], [0, 0], [-5, 2]])
# initial cluster centers: one row per center
Z = np.array([[-5, 2], [0, -6]])
X.shape

(4, 2)

## Define helper functions

In [3]:
def dist(x_1, x_2, norm="l_1"):
    """
    returns the distance between two points
    input parameters:
    x_1, x_2 - array-likes holding the coordinates of the two points
    norm - "l_1" (l_1 norm, sum of absolute differences) or
           "l_2" (l_2 norm, Euclidean distance)
    output:
    norm of (x_1 - x_2)
    raises:
    ValueError - if norm is neither "l_1" nor "l_2"
    """
    # convert first so that plain lists/tuples can be subtracted element-wise
    difference = np.asarray(x_1) - np.asarray(x_2)

    if norm == "l_1":
        return np.linalg.norm(difference, ord=1)
    if norm == "l_2":
        return np.linalg.norm(difference)

    # fail loudly: silently returning None for an unknown norm would only
    # surface later as a hard-to-trace error in the calling code
    raise ValueError('unknown norm {!r}; expected "l_1" or "l_2"'.format(norm))

In [4]:
def abs(x):
    """returns the absolute value of scalar x"""
    # negative inputs get their sign flipped, everything else passes through
    return -x if x < 0 else x

In [5]:
def list_to_array(x_list):
    """stack a list of m equally long rows (length n) into one (m x n) numpy array"""
    n_rows = len(x_list)
    n_cols = len(x_list[0])

    # allocate the result once, then copy the rows in one by one
    stacked = np.empty((n_rows, n_cols))
    for row_index, row in enumerate(x_list):
        stacked[row_index, :] = row
    return stacked

In [6]:
def labelled_X(X_l):
    """return the points with one extra trailing column of dummy labels (all zeros)"""
    n_points = len(X_l)
    # temporary placeholder labels: one zero per point
    dummy_labels = np.zeros((n_points, 1))
    return np.append(X_l, dummy_labels, axis=1)

In [72]:
def medoid(points, norm="l_1"):
    """
    calculates the medoid of clustered points
    input parameters:
    points - numpy.array (n x m), n: no. of points, m: dim of points
    norm - "l_1" or "l_2"
    output:
    medoid of points as float numpy.array of length m
    (the element of points with the least summed distance to all points;
    on ties the element that comes first in points wins)
    raises:
    ValueError - if norm is unknown or points is not a non-empty 2-D array
    """
    if norm == "l_1":
        order = 1
    elif norm == "l_2":
        order = 2
    else:
        raise ValueError('unknown norm {!r}; expected "l_1" or "l_2"'.format(norm))

    pts = np.asarray(points, dtype=float)
    if pts.ndim != 2 or len(pts) == 0:
        raise ValueError("points must be a non-empty 2-D array (n x m)")

    # pairwise[i, j] = distance between point i and point j
    # (broadcasting builds all n x n difference vectors at once)
    differences = pts[:, np.newaxis, :] - pts[np.newaxis, :, :]
    pairwise = np.linalg.norm(differences, ord=order, axis=-1)

    # cumulative distance from each candidate to every point of the cluster
    total_dists = pairwise.sum(axis=1)

    # argmin returns the first minimum, so equally good candidates are
    # resolved in favour of the one listed first
    # copy so the caller never gets a view into the input array
    return pts[np.argmin(total_dists)].copy()

# quick check: a cluster holding the single point (1, 1)
test = np.ones((1, 2))
medoid(test)

array([1., 1.])

In [59]:
def mean(points, norm="l_1"):
    """
    returns the center point of points
    input:
    points - numpy.array (n x m), n: no. of points, m: no. of dim.
    norm - "l_1" or "l_2"
    output:
    centerpoint - np.array of length m
    (coordinate-wise median for "l_1", coordinate-wise mean for "l_2")
    raises:
    ValueError - if norm is neither "l_1" nor "l_2"
    """
    # the center depends on the cost function: the coordinate-wise median
    # minimises the summed l_1 distances, the arithmetic mean minimises the
    # summed squared l_2 distances
    if norm == "l_1":
        return np.median(points, axis=0)
    if norm == "l_2":
        return np.mean(points, axis=0)

    # fail loudly instead of silently handing None back to the caller
    raise ValueError('unknown norm {!r}; expected "l_1" or "l_2"'.format(norm))

# quick check: center of the two rows [0..3] and [4..7] under the l_1 norm
test = np.arange(8).reshape(2, 4)
mean(test, "l_1")

array([2., 3., 4., 5.])

## Define functions of algorithm

In [53]:
def assign_points_to_clusters(points, centers, norm="l_1"):
    """
    returns clusters (each point is assigned to the cluster whose center is nearest)
    input parameters:
    points - numpy array (n x m); n: no. of points, m: dim of points
    centers - numpy array (k x m); k: no. of clusters, m: dim of points
    norm - "l_1" or "l_2" for type of cost function
    output:
    list of k float numpy arrays; entry i holds the points of cluster i in
    their original order. A cluster that attracts no point is returned as an
    empty (0 x m) array.
    raises:
    ValueError - if norm is neither "l_1" nor "l_2"
    """
    if norm == "l_1":
        order = 1
    elif norm == "l_2":
        order = 2
    else:
        raise ValueError('unknown norm {!r}; expected "l_1" or "l_2"'.format(norm))

    pts = np.asarray(points, dtype=float)
    ctrs = np.asarray(centers, dtype=float)

    # distances[i, j] = distance from point i to center j
    # (broadcasting builds all n x k difference vectors at once)
    differences = pts[:, np.newaxis, :] - ctrs[np.newaxis, :, :]
    distances = np.linalg.norm(differences, ord=order, axis=-1)

    # the column index of the row minimum is the cluster index; argmin returns
    # the first minimum, so equidistant points go to the center listed first
    nearest = np.argmin(distances, axis=1)

    # boolean masks keep the point order and give (0 x m) arrays for clusters
    # without members instead of failing on them
    return [pts[nearest == i_center] for i_center in range(len(ctrs))]

#assign_points_to_clusters(X, Z, "l_1")

In [75]:
def assign_centers_to_clusters(clusters, c_type="kmeans", norm="l_1"):
    """
    calculates the centers of the given clusters
    input values:
    clusters - list of numpy arrays (arrays with points corresponding to each cluster)
    c_type - parameter for algorithm: "kmeans" - K-Means, "kmedoids" - K-Medoids
    norm - parameter for calculating the distance between points (loss function)
    output values:
    centers - numpy array (k x m); k: no. of clusters, m: dim of points
    raises:
    ValueError - if c_type is neither "kmeans" nor "kmedoids"
    """
    # validate first: without this check an unknown c_type would skip both
    # branches and hand back the uninitialised buffer as if it were centers
    if c_type not in ("kmeans", "kmedoids"):
        raise ValueError(
            'unknown c_type {!r}; expected "kmeans" or "kmedoids"'.format(c_type)
        )

    # result array for the centers; the dimension m is taken from the first cluster
    centers = np.empty([len(clusters), np.shape(clusters[0])[1]])

    for i, cluster in enumerate(clusters):
        if c_type == "kmeans":
            # K-Means: center is the mean/median point of the cluster
            centers[i] = mean(cluster, norm)
        else:
            # K-Medoids: center is a member of the cluster
            centers[i] = medoid(cluster, norm)

    return centers

#test = assign_points_to_clusters(X, Z, "l_1")
#assign_centers_to_clusters(test, "kmeans", "l_1")

In [80]:
# test the k-means algorithm:
max_iter = 10  # upper bound on the number of assign/update rounds
points = X
centers = Z

for iteration in range(max_iter):
    # step 1: assign every point to its nearest center
    clusters = assign_points_to_clusters(points, centers, "l_1")
    # step 2: recompute the center of each cluster
    # (pass "kmedoids" instead of "kmeans" to run k-medoids)
    new_centers = assign_centers_to_clusters(clusters, "kmeans", "l_1")

    # convergence criterion: once the centers no longer move, the next round
    # would reproduce the same clusters and centers, so further rounds are redundant
    converged = np.array_equal(new_centers, centers)
    centers = new_centers
    if converged:
        break

print(clusters)
print(centers)

[array([[ 4.,  4.],
       [-5.,  2.]]), array([[ 0., -6.],
       [ 0.,  0.]])]
[[-0.5  3. ]
 [ 0.  -3. ]]
