## K-means MapReduce(random initialization)

In [150]:
import numpy as np
import math

def KmeansMapReduce(data, K, Maxiter=50, ObjV =math.inf, threshold = 0.01, centroids = False):
    centroids = initializeCentroid(data, K)
    for i in range(Maxiter):
        data_plus_clusterid, New_ObjV = mapper(data, centroids)
        centroids, labels = reducer(data, K, centroids)
        if ObjV-New_ObjV > threshold:
            ObjV = New_ObjV
    return labels, centroids, ObjV
    
## initialize k centroids randomly
def initializeCentroid(data, K):
    return data[np.random.choice(len(data), K, replace=False)]

## the map process
def mapper(data,centroids):
    nearest_centroid_matrix = np.array([centroids[np.argmin(np.sqrt(np.sum((data[i]-centroids)**2, axis=1)))] for i in range(len(data))])
    cluster_id = np.array([np.argmin(np.sqrt(np.sum((data[i]-centroids)**2, axis=1))) for i in range(len(data))])
    objV = np.sum([np.sum((data[i]-centroids)**2) for i in range(len(data))])
    data_plus_clusterid = np.hstack([cluster_id.reshape(len(cluster_id),1), data])
    return data_plus_clusterid, objV

## the reduce process
def reducer(data, K, centroids):
    data_plus_clusterid, objV = mapper(data, centroids)
    labels = data_plus_clusterid[:,0].reshape(len(data),1)
    labels = np.array(labels).reshape(-1)
    updated_centroid = np.array([np.mean(data_plus_clusterid[:,1:][labels==k], axis=0) for k in range(K)])
    return updated_centroid, labels

In [153]:
from tensorflow.keras.datasets import mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [160]:
labels, centroids, ObjV = KmeansMapReduce(X_test.reshape(10000,784), 10, Maxiter=50)

In [161]:
from sklearn.metrics import accuracy_score
accuracy_score(labels, Y_test)

0.2336

In [141]:
Y_test[0:100]

array([7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6,
       6, 5, 4, 0, 7, 4, 0, 1, 3, 1, 3, 4, 7, 2, 7, 1, 2, 1, 1, 7, 4, 2,
       3, 5, 1, 2, 4, 4, 6, 3, 5, 5, 6, 0, 4, 1, 9, 5, 7, 8, 9, 3, 7, 4,
       6, 4, 3, 0, 7, 0, 2, 9, 1, 7, 3, 2, 9, 7, 7, 6, 2, 7, 8, 4, 7, 3,
       6, 1, 3, 6, 9, 3, 1, 4, 1, 7, 6, 9], dtype=uint8)

In [135]:
X_test.shape

(10000, 28, 28)