In [None]:
import numpy as np

from starter import read_data, inplace_min_max_scaling, kmeans_helper, calculate_downsample

from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, adjusted_rand_score


KMEANS WITH RAW DATA

In [3]:
# --- Data preparation (full-resolution data) -------------------------------
# Read the three MNIST CSV splits in the order train, valid, test.
train_data, valid_data, test_data = (
    read_data(csv_name)
    for csv_name in ('mnist_train.csv', 'mnist_valid.csv', 'mnist_test.csv')
)

# Min-max scale each split. The return values are discarded, so the helper is
# relied on to modify its argument in place (as its name indicates).
inplace_min_max_scaling(train_data)
inplace_min_max_scaling(test_data)
inplace_min_max_scaling(valid_data)

# Every training record is indexed positionally: element 1 is the data that
# gets clustered, element 0 is kept as the reference label for scoring.
cluster_data = [record[1] for record in train_data]
true_labels = [record[0] for record in train_data]

# Distance-metric names handed to kmeans_helper by the clustering cell.
# NOTE(review): 'cosim' presumably means cosine similarity — confirm in starter.
metrics = ['euclidean', 'cosim']

In [5]:
# Run the custom k-means once per entry in `metrics` and score each clustering.
#
# Inputs (`cluster_data`, `true_labels`, `metrics`) come from the preceding
# data-preparation cell. One block of three scores is printed per metric, in
# the same order as `metrics`.

# Number of clusters requested from kmeans_helper; identical for every metric,
# so it is set once rather than on every iteration.
k = 10

# The array form of the data does not depend on the metric, so convert once
# instead of once per loop iteration.
# NOTE(review): assumes kmeans_helper does not modify cluster_data in place — confirm.
data = np.array(cluster_data)

for metric in metrics:
    print()

    cluster_memberships = kmeans_helper(cluster_data, metric, k=k)

    # Silhouette score for custom KMeans.
    # NOTE(review): no `metric=` is passed, so silhouette_score uses its default
    # distance for both runs, including the 'cosim' clustering.
    custom_score = silhouette_score(data, cluster_memberships)

    # Agreement between the reference labels and the cluster assignments.
    cami = adjusted_mutual_info_score(true_labels, cluster_memberships)
    cari = adjusted_rand_score(true_labels, cluster_memberships)

    print(f"Silhouette Score with k={k} (custom KMeans):", custom_score)
    print("Adjusted Mutual Information (AMI):", cami)
    print("Adjusted Rand Index (ARI):", cari)
    print()


Silhouette Score with k=10 (custom KMeans): 0.07038219828897542
Adjusted Mutual Information (AMI): 0.4947745456009607
Adjusted Rand Index (ARI): 0.38945316287005266


Silhouette Score with k=10 (custom KMeans): 0.06038206107697131
Adjusted Mutual Information (AMI): 0.5109744105664488
Adjusted Rand Index (ARI): 0.393551748373689



KMEANS WITH DOWNSAMPLED DATA

In [6]:
# --- Data preparation (downsampled data) ------------------------------------
# Re-read the three MNIST CSV splits (train, valid, test) from disk.
train_data, valid_data, test_data = (
    read_data(csv_name)
    for csv_name in ('mnist_train.csv', 'mnist_valid.csv', 'mnist_test.csv')
)

# Min-max scale each split. The return values are discarded, so the helper is
# relied on to modify its argument in place (as its name indicates).
inplace_min_max_scaling(train_data)
inplace_min_max_scaling(test_data)
inplace_min_max_scaling(valid_data)

# Downsample each split; results are not captured, so this is also in place.
# Original author's note: "reduces the dataset by half exactly".
# NOTE(review): the helper is defined in starter and not visible here — confirm
# what exactly is halved.
calculate_downsample(train_data)
calculate_downsample(valid_data)
calculate_downsample(test_data)

# Every training record is indexed positionally: element 1 is the data that
# gets clustered, element 0 is kept as the reference label for scoring.
cluster_data = [record[1] for record in train_data]
true_labels = [record[0] for record in train_data]

# Distance-metric names handed to kmeans_helper by the clustering cell.
# NOTE(review): 'cosim' presumably means cosine similarity — confirm in starter.
metrics = ['euclidean', 'cosim']

In [7]:
# Run the custom k-means once per entry in `metrics` on the downsampled data
# and score each clustering.
#
# Inputs (`cluster_data`, `true_labels`, `metrics`) come from the preceding
# data-preparation cell, which applies calculate_downsample before extracting
# them. One block of three scores is printed per metric, in the same order as
# `metrics`.

# Number of clusters requested from kmeans_helper; identical for every metric,
# so it is set once rather than on every iteration.
k = 10

# The array form of the data does not depend on the metric, so convert once
# instead of once per loop iteration.
# NOTE(review): assumes kmeans_helper does not modify cluster_data in place — confirm.
data = np.array(cluster_data)

for metric in metrics:
    print()

    cluster_memberships = kmeans_helper(cluster_data, metric, k=k)

    # Silhouette score for custom KMeans.
    # NOTE(review): no `metric=` is passed, so silhouette_score uses its default
    # distance for both runs, including the 'cosim' clustering.
    custom_score = silhouette_score(data, cluster_memberships)

    # Agreement between the reference labels and the cluster assignments.
    cami = adjusted_mutual_info_score(true_labels, cluster_memberships)
    cari = adjusted_rand_score(true_labels, cluster_memberships)

    print(f"Silhouette Score with k={k} (custom KMeans):", custom_score)
    print("Adjusted Mutual Information (AMI):", cami)
    print("Adjusted Rand Index (ARI):", cari)
    print()


Silhouette Score with k=10 (custom KMeans): 0.07681420743363736
Adjusted Mutual Information (AMI): 0.5045506029552029
Adjusted Rand Index (ARI): 0.40018290495180836


Silhouette Score with k=10 (custom KMeans): 0.0634570424654387
Adjusted Mutual Information (AMI): 0.5065747278342634
Adjusted Rand Index (ARI): 0.39660189647244537

