In [None]:
# Loading all the important datasets used in q1
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, rand_score

# Each dataset is stored as a pair of .npy files next to the notebook:
# an x_* array and a matching y_* array. The y_* arrays are used as class
# labels by the evaluation cells further down.
# NOTE(review): presumably "raw" is the untransformed features and "pca" a PCA
# projection of them -- confirm against the notebook that produced these files.
x_raw_data = np.load('./x_raw_data.npy')
y_raw_data = np.load('./y_raw_data.npy')
x_pca_data = np.load('./x_pca_data.npy')
y_pca_data = np.load('./y_pca_data.npy')

# The "ten" variants are loaded here but not referenced in the cells visible
# in this part of the notebook.
x_ten_data = np.load('./x_ten_data.npy')
y_ten_data = np.load('./y_ten_data.npy')
x_ten_pca_data = np.load('./x_ten_pca_data.npy')
y_ten_pca_data = np.load('./y_ten_pca_data.npy')

In [None]:
def closest_images(kmeans, x_data):
    """Return, for every cluster, the member sample nearest to its centroid.

    Parameters
    ----------
    kmeans : fitted estimator exposing ``labels_`` and ``cluster_centers_``
        ``labels_`` must hold one cluster id per row of ``x_data``.
    x_data : np.ndarray of shape (n_samples, n_features)
        The data the estimator was fitted on.

    Returns
    -------
    np.ndarray of shape (n_clusters, n_features), dtype float64
        Row ``i`` is the member of cluster ``i`` with the smallest Euclidean
        distance to centroid ``i``.

    Raises
    ------
    ValueError
        If a cluster has no members (``np.argmin`` of an empty array).
    """
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    # Take the cluster count from the model instead of assuming 10.
    n_clusters = centroids.shape[0]

    # Preallocate instead of growing the array with np.append on every pass.
    representatives = np.empty((n_clusters, x_data.shape[1]))
    for i in range(n_clusters):
        cluster_data_points = x_data[labels == i]
        distances = np.linalg.norm(cluster_data_points - centroids[i], axis=1)
        # argmin is a position inside the cluster subset, so the row has to be
        # looked up in that subset -- indexing the full x_data with it returns
        # an unrelated sample.
        representatives[i] = cluster_data_points[np.argmin(distances)]

    print(representatives)
    return representatives

In [None]:
# x_raw_data
def cluster(x_data, n_clusters=10, random_state=0):
    """Fit a k-means model on ``x_data`` and return the fitted estimator.

    Parameters
    ----------
    x_data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    n_clusters : int, default 10
        Number of clusters to form.
    random_state : int or None, default 0
        Seed for centroid initialisation. A fixed seed keeps cluster ids and
        every downstream metric identical across "Restart & Run All"; pass
        ``None`` to get a fresh random initialisation on each call.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator (``labels_``, ``cluster_centers_``, ``predict``).
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(x_data)
    return kmeans

In [None]:
# Cluster the raw feature matrix, then call closest_images to pick one
# representative sample per cluster from that same matrix.
kmeans_raw = cluster(x_raw_data)
closest_images_raw = closest_images(kmeans_raw, x_raw_data)

In [None]:
# Same pipeline restricted to the first 10 columns of x_pca_data.
# NOTE(review): presumably these are the 10 leading principal components --
# confirm the column ordering of x_pca_data.
kmeans_ten = cluster(x_pca_data[:,:10])
closest_images_ten = closest_images(kmeans_ten, x_pca_data[:,:10])

In [None]:
# Same pipeline restricted to the first 26 columns of x_pca_data.
# NOTE(review): the "pov_90" suffix suggests 26 components reach a 90%
# proportion of variance -- confirm against the PCA explained-variance ratios.
kmeans_pov_90 = cluster(x_pca_data[:,:26])
closest_images_pov_90 = closest_images(kmeans_pov_90, x_pca_data[:,:26])

2b.

In [None]:
def calculate_sse(kmeans,x_data):
    """Sum of squared distances from every sample to its assigned centroid.

    Parameters
    ----------
    kmeans : fitted estimator exposing ``labels_`` and ``cluster_centers_``
        ``labels_`` must hold one cluster id per row of ``x_data``.
    x_data : np.ndarray of shape (n_samples, n_features)
        The data the estimator was fitted on.

    Returns
    -------
    numpy scalar
        Total within-cluster sum of squared errors over all clusters.
    """
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_

    # centroids[labels] lines up each sample with its own centroid, so the sum
    # covers however many clusters the model has rather than a fixed 10.
    residuals = x_data - centroids[labels]
    return np.sum(residuals ** 2)

In [None]:
# Purity measures the extent to which all the data points in a cluster belong to the same class
def calculate_purity(kmean,x_data,y_data):
    """Purity of the clustering that ``kmean`` predicts for ``x_data``.

    Each cluster is credited with the size of its largest single-class group;
    purity is the sum of those credits divided by the number of samples.

    Parameters
    ----------
    kmean : fitted estimator exposing ``predict``
    x_data : array-like of shape (n_samples, n_features)
        Samples to assign to clusters.
    y_data : array-like of shape (n_samples,)
        True class label of every sample. Labels and cluster ids are indexed
        independently, so they need not share a type or value range and no
        cluster-to-label mapping is required.

    Returns
    -------
    numpy.float64
        Purity in the range (0, 1].

    Raises
    ------
    ValueError
        If there are no samples, or the label and prediction counts differ.
    """
    y_pred = np.ravel(kmean.predict(x_data))
    y_true = np.ravel(y_data)
    if y_true.size == 0:
        raise ValueError("purity is undefined for an empty dataset")
    if y_true.size != y_pred.size:
        raise ValueError("y_data and the predicted clusters differ in length")

    # Re-index classes and clusters to 0..k-1 separately, then count the
    # co-occurrences: rows are true classes, columns are predicted clusters.
    _, class_idx = np.unique(y_true, return_inverse=True)
    _, cluster_idx = np.unique(y_pred, return_inverse=True)
    contingency = np.zeros((class_idx.max() + 1, cluster_idx.max() + 1), dtype=np.int64)
    np.add.at(contingency, (class_idx, cluster_idx), 1)

    purity = np.sum(np.amax(contingency, axis=0)) / np.sum(contingency)

    return purity

In [None]:
# Rand-index is a measure of similarity between true and predicted clusterings
def calculate_rand_index(kmean,x_data,y_data):
    """Rand index between ``y_data`` and the clusters predicted for ``x_data``.

    ``kmean.predict`` assigns every row of ``x_data`` to a cluster; the result
    is compared with the true labels ``y_data`` via ``rand_score``.
    """
    predicted_clusters = kmean.predict(x_data)
    return rand_score(y_data, predicted_clusters)

In [None]:
# Evaluate the 26-column PCA clustering against the PCA labels: purity first,
# then the Rand index. Both metrics re-predict cluster ids from the same
# x_pca_data[:,:26] slice the model was fitted on.
purity = calculate_purity(kmeans_pov_90,x_pca_data[:,:26],y_pca_data)
print(purity)

rand_index = calculate_rand_index(kmeans_pov_90,x_pca_data[:,:26],y_pca_data)
print(rand_index)

In [None]:
def label_clusters(kmean, y_data):
    """Map every cluster to the most frequent true label among its members.

    Prints the per-cluster label histogram and the resulting mapping.

    Parameters
    ----------
    kmean : fitted estimator exposing ``labels_`` and ``cluster_centers_``
        ``labels_`` must hold one cluster id per entry of ``y_data``.
    y_data : array-like of shape (n_samples,)
        Numeric true label of every sample, in the same order as the data the
        estimator was fitted on.

    Returns
    -------
    np.ndarray of shape (n_clusters,), dtype float64
        Entry ``i`` is the majority label of cluster ``i`` (the smallest label
        wins a tie); ``nan`` for a cluster without members.
    """
    # Use the labels that were passed in, not a notebook-level global.
    y_data = np.asarray(y_data)
    labels = kmean.labels_
    n_clusters = len(kmean.cluster_centers_)

    cluster_to_label = np.full(n_clusters, np.nan)
    for i in range(n_clusters):
        # One sorted pass gives both the histogram and the mode; list.count
        # per element was quadratic in the cluster size.
        values, occurrences = np.unique(y_data[labels == i], return_counts=True)
        counts = dict(zip(values.tolist(), occurrences.tolist()))
        print('cluster', i, '-->', counts)

        if values.size:
            # values is sorted, so argmax settles ties on the smallest label
            cluster_to_label[i] = values[np.argmax(occurrences)]

    print(cluster_to_label)
    return cluster_to_label

# Print the label histogram of every cluster for the raw and the 10-column
# PCA clusterings.
# NOTE(review): kmeans_ten was fitted on columns of x_pca_data, yet it is
# paired with y_raw_data here -- confirm both datasets share the same row
# order, or pass y_pca_data instead.
label_clusters(kmeans_raw, y_raw_data)
label_clusters(kmeans_ten, y_raw_data)