K-Means performance evaluation

CIFAR-10

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import torch
import warnings
warnings.filterwarnings('ignore')
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from scipy.spatial.distance import cdist
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import seaborn as sns

np.random.seed(0)

def flatten_data(x):
    """Collapse every axis after the first into one feature axis.

    Args:
        x: Array-like with a ``shape`` attribute and a ``reshape`` method;
            the first axis indexes samples.

    Returns:
        ``x`` reshaped to ``(x.shape[0], product of x.shape[1:])``.
    """
    n_samples = x.shape[0]
    # An integer dtype keeps the product an int even when there are no
    # trailing axes (the empty product is 1).
    n_features = int(np.prod(x.shape[1:], dtype=np.int64))
    return x.reshape(n_samples, n_features)

def pca_encode(pca_model, X):
    """Project ``X`` with ``pca_model.transform`` and return the result."""
    encoded = pca_model.transform(X)
    return encoded

def pca_decode(pca_model, X):
    """Map ``X`` back with ``pca_model.inverse_transform`` and return the result."""
    decoded = pca_model.inverse_transform(X)
    return decoded

def clusters_to_index(cluster_labels):
    """Group sample positions by their cluster label.

    Args:
        cluster_labels: Iterable of hashable cluster labels, one per sample.

    Returns:
        A plain ``dict`` mapping each label to the list of positions (in
        ascending order) at which that label occurs in ``cluster_labels``.
        Keys appear in order of first occurrence.
    """
    # A dedicated local name avoids shadowing the ``dict`` builtin.
    index_by_cluster = {}
    for position, label in enumerate(cluster_labels):
        index_by_cluster.setdefault(label, []).append(position)
    return index_by_cluster

# reshape flattened rows back into images: N*3072 -> N*32*32*3, or N*1024 -> N*32*32
def construct_image_from_flattern(x, colored = True):
    """Reshape flattened rows into 32x32 images and cast them to ``uint8``.

    Args:
        x: Array whose first axis indexes samples.
        colored: When true the target shape is ``(N, 32, 32, 3)``,
            otherwise ``(N, 32, 32)``.

    Returns:
        The reshaped array converted with ``astype("uint8")``.
    """
    # NOTE(review): the uint8 cast discards fractional parts, so inputs scaled
    # to [0, 1] would come out as 0/1 only -- confirm callers pass 0-255 data.
    image_shape = (32, 32, 3) if colored else (32, 32)
    return x.reshape(x.shape[0], *image_shape).astype("uint8")

# visualize image data: n images drawn at random from x, laid out on a grid with ncol columns
def visualize_data(x, n, ncol, plt_size):
    """Show ``n`` randomly chosen images from ``x`` in a grid.

    Images are drawn with ``np.random.randint``, i.e. with replacement, so
    the same image may appear more than once.

    Args:
        x: Indexable collection of images accepted by ``plt.imshow``.
        n: Number of tiles to draw.
        ncol: Number of grid columns.
        plt_size: Side length of the square figure, passed to ``figsize``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Ceiling division gives exactly enough rows for n tiles; the previous
    # ``n // ncol + 1`` left an empty row whenever n was a multiple of ncol.
    nrow = -(-n // ncol)
    plt.figure(figsize=(plt_size, plt_size))
    for i in range(n):
        idx = np.random.randint(0, len(x))
        plt.subplot(nrow, ncol, i + 1)
        plt.axis("off")
        plt.imshow(x[idx])
    plt.tight_layout()
    plt.show()
    
# show KMeans centroids as images after mapping them back through the PCA model
def visualize_kmeans_centre(kmeans_model, pca_model, n, ncol, colored = True):
    """Decode the cluster centres to image space and display a sample of them.

    Args:
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        pca_model: Model passed to ``pca_decode`` to invert the projection.
        n: Number of tiles forwarded to ``visualize_data``.
        ncol: Number of grid columns forwarded to ``visualize_data``.
        colored: Forwarded to ``construct_image_from_flattern``.
    """
    centres_decoded = pca_decode(pca_model, kmeans_model.cluster_centers_)
    centre_images = construct_image_from_flattern(centres_decoded, colored)
    # The figure size is fixed at 15 for centroid grids.
    visualize_data(centre_images, n, ncol, 15)



def bench_k_means(kmeans, name, data, labels):
    """Fit ``kmeans`` on ``data``, print one row of clustering scores and return it.

    Args:
        kmeans: Estimator whose ``fit(data)`` returns a model with ``labels_``.
        name: Row label; formatted with ``{:9s}``.
        data: Feature matrix that is clustered and used for the internal scores.
        labels: Ground-truth labels compared against the cluster assignment.

    Returns:
        ``[name, fit_time, NMI, ARI, V-measure, silhouette, Calinski-Harabasz]``.
    """
    started = time()
    fitted = kmeans.fit(data)
    elapsed = time() - started
    assigned = fitted.labels_

    # Scores that compare the cluster assignment with the true labels.
    label_scores = [
        score(labels, assigned)
        for score in (
            metrics.normalized_mutual_info_score,
            metrics.adjusted_rand_score,
            metrics.v_measure_score,
        )
    ]

    # Scores computed from the features and the assignment only.
    internal_scores = [
        metrics.silhouette_score(data, assigned, metric="euclidean"),
        metrics.calinski_harabasz_score(data, assigned),
    ]

    results = [name, elapsed, *label_scores, *internal_scores]
    row_template = "{:9s}\t{:.3f}s\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t\t{:.3f}"
    print(row_template.format(*results))
    return results


def load_data(path):
    """Load a class-grouped image archive into flat sample and label arrays.

    The archive's ``arr_0`` entry is iterated group by group; the position of
    each group is used as the integer label of every image inside it.

    Args:
        path: Location of an ``.npz`` archive that contains ``arr_0``.

    Returns:
        Tuple ``(data, labels)``. ``data`` holds every image divided by 255
        as ``float64`` (presumably scaling 8-bit pixels into [0, 1]);
        ``labels`` holds the matching group index per image, in archive order.
    """
    # The context manager closes the archive once the array has been read.
    with np.load(path) as archive:
        images_by_class = archive['arr_0']

    data = []
    labels = []
    for classno, classimg in enumerate(images_by_class):
        for image in classimg:
            # np.float64 replaces the ``np.float`` alias, which was removed
            # in NumPy 1.24 and raises AttributeError there.
            data.append((image / 255).astype(np.float64))
            labels.append(classno)

    return np.array(data), np.array(labels)


def make_elbow_plot(train_data, val_data, vec_sizes, min_k=1, max_k=15, metric=False):
    """Run a KMeans elbow sweep on PCA-projected validation data.

    For every size in ``vec_sizes`` a PCA is fitted on ``train_data``,
    ``val_data`` is projected with it, and a ``KElbowVisualizer`` wrapping
    ``KMeans(random_state=0)`` is fitted on the projection.

    Args:
        train_data: Samples used to fit each PCA.
        val_data: Samples that are projected and clustered.
        vec_sizes: Iterable of PCA ``n_components`` values.
        min_k: Lower bound handed to the visualizer as ``k=(min_k, max_k)``.
        max_k: Upper bound handed to the visualizer as ``k=(min_k, max_k)``.
        metric: When truthy the visualizer scores with ``'calinski_harabasz'``;
            otherwise the visualizer's default metric is used.

    Returns:
        Tuple ``(scores, elbow_values)`` with one ``k_scores_`` entry and one
        ``elbow_value_`` entry per element of ``vec_sizes``.
    """
    # Only the non-default scoring needs an explicit ``metric`` keyword.
    scoring_kwargs = {'metric': 'calinski_harabasz'} if metric else {}

    scores = []
    elbow_values = []
    for vec_size in vec_sizes:
        projector = PCA(n_components=vec_size).fit(train_data)
        val_data_pca = projector.transform(val_data)
        print("Feature vector size: ", val_data_pca.shape)

        visualizer = KElbowVisualizer(
            KMeans(random_state=0), k=(min_k, max_k), **scoring_kwargs
        )
        visualizer.fit(val_data_pca)
        scores.append(visualizer.k_scores_)
        elbow_values.append(visualizer.elbow_value_)

    return scores, elbow_values


def plot_all(model_name, vec_sizes, scores, elbow, min_k=1, max_k=15):
    """Plot one score curve per feature-vector size and mark the elbow.

    Args:
        model_name: Used in the title and in the saved file name.
        vec_sizes: Feature-vector sizes; ``vec_sizes[i]`` labels ``scores[i]``.
        scores: Sequence of score curves, each plotted against
            ``np.arange(min_k, max_k)``.
        elbow: x position of the dashed vertical elbow marker.
        min_k: First cluster count on the x axis.
        max_k: End of the x axis range (exclusive, as in ``np.arange``).

    The figure is saved to ``{model_name}_elbow_plot.png`` and then shown.
    """
    sns.set_theme(style="darkgrid")
    plt.figure(figsize=(6,5))

    # The elbow marker is added first, so it is the first legend entry.
    axis = plt.axes()
    axis.axvline(x = elbow, ls='--', lw=2, c='black', label=f"Elbow K={elbow}")

    ks = np.arange(min_k, max_k)
    for i, curve in enumerate(scores):
        plt.plot(
            ks,
            curve,
            label=f"Feature Vector Size = {vec_sizes[i]}",
            marker='o',
            linewidth=1.5,
            markersize=4,
        )

    plt.legend(fontsize=8)
    plt.title(model_name + '+KMeans Elbow Plot', fontsize=12)
    plt.xlabel('Number of Clusters', fontsize=10)
    plt.xticks(ks)
    plt.ylabel('Distortion Score', fontsize=10)
    plt.savefig(f'{model_name}_elbow_plot.png')
    plt.show()

In [None]:
# Raw images and labels for the cifar4_* archives (train / val / test).
cifar4_train_data, cifar4_train_labels = load_data('CIFARTrainData.npz')
cifar4_val_data, cifar4_val_labels = load_data('CIFARValData.npz')
cifar4_test_data, cifar4_test_labels = load_data('CIFARTestData.npz')

# Raw images and labels for the cifar10_* archives (train / val / test).
cifar10_train_data, cifar10_train_labels = load_data('CIFARTrainData10Class.npz')
cifar10_val_data, cifar10_val_labels = load_data('CIFARValData10Class.npz')
cifar10_test_data, cifar10_test_labels = load_data('CIFARTestData10Class.npz')

# One shape per line, in the order train / val / test with cifar4 before cifar10.
print(
    *(
        split.shape
        for split in (
            cifar4_train_data,
            cifar10_train_data,
            cifar4_val_data,
            cifar10_val_data,
            cifar4_test_data,
            cifar10_test_data,
        )
    ),
    sep="\n",
)

# PCA output dimensionalities swept by the cells below.
vec_sizes = [10, 50, 200, 500, 1000]

In [None]:
# Distinct label values present in each split, one set per line.
print(
    *(
        set(split_labels)
        for split_labels in (
            cifar4_train_labels,
            cifar10_train_labels,
            cifar4_val_labels,
            cifar10_val_labels,
            cifar4_test_labels,
            cifar10_test_labels,
        )
    ),
    sep="\n",
)

In [None]:
# cifar4 elbow sweep on raw pixels: PCA fitted on train, KMeans scored on val
train_data, val_data = (
    flatten_data(split) for split in (cifar4_train_data, cifar4_val_data)
)
pca_cifar4_scores, pca_cifar4_elbows = make_elbow_plot(
    train_data=train_data,
    val_data=val_data,
    vec_sizes=vec_sizes,
    min_k=1,
    max_k=15,
)

In [None]:
# Use the elbow reported most often across the feature-vector sizes.
elbow = np.bincount(pca_cifar4_elbows).argmax()
plot_all(
    model_name='CIFAR4_PCA',
    vec_sizes=vec_sizes,
    scores=pca_cifar4_scores,
    elbow=elbow,
    min_k=1,
    max_k=15,
)

In [None]:
# cifar10 elbow sweep on raw pixels: PCA fitted on train, KMeans scored on val
train_data, val_data = (
    flatten_data(split) for split in (cifar10_train_data, cifar10_val_data)
)
pca_cifar10_scores, pca_cifar10_elbows = make_elbow_plot(
    train_data=train_data,
    val_data=val_data,
    vec_sizes=vec_sizes,
    min_k=5,
    max_k=20,
)

In [None]:
# Use the elbow reported most often across the feature-vector sizes.
elbow = np.bincount(pca_cifar10_elbows).argmax()
plot_all(
    model_name='CIFAR10_PCA',
    vec_sizes=vec_sizes,
    scores=pca_cifar10_scores,
    elbow=elbow,
    min_k=5,
    max_k=20,
)

In [None]:
# Precomputed feature arrays for the cifar4_* splits.
cifar4_simclr_train = np.load('CIFARDataFeature.npz')['arr_0']
cifar4_simclr_val = np.load('CIFARDataValFeature.npz')['arr_0']
cifar4_simclr_test = np.load('CIFARDataTestFeature.npz')['arr_0']

# Precomputed feature arrays for the cifar10_* splits.
cifar10_simclr_train = np.load('CIFARDataFeature10Class.npz')['arr_0']
cifar10_simclr_val = np.load('CIFARDataValFeature10Class.npz')['arr_0']
cifar10_simclr_test = np.load('CIFARDataTestFeature10Class.npz')['arr_0']

# One shape per line: cifar4 train / val / test, then cifar10 train / val / test.
print(
    *(
        features.shape
        for features in (
            cifar4_simclr_train,
            cifar4_simclr_val,
            cifar4_simclr_test,
            cifar10_simclr_train,
            cifar10_simclr_val,
            cifar10_simclr_test,
        )
    ),
    sep="\n",
)

In [None]:
# cifar4 elbow sweep on SimCLR features: PCA fitted on train, KMeans scored on val
cifar4_simclr_scores, cifar4_simclr_elbows = make_elbow_plot(
    train_data=cifar4_simclr_train,
    val_data=cifar4_simclr_val,
    vec_sizes=vec_sizes,
    min_k=1,
    max_k=15,
)

In [None]:
# Use the elbow reported most often across the feature-vector sizes.
elbow = np.bincount(cifar4_simclr_elbows).argmax()
plot_all(
    model_name='CIFAR4_SimCLR_PCA',
    vec_sizes=vec_sizes,
    scores=cifar4_simclr_scores,
    elbow=elbow,
    min_k=1,
    max_k=15,
)

In [None]:
# cifar10 elbow sweep on SimCLR features. PCA is fitted on the training
# features (the first argument) and KMeans is scored on the validation
# features, matching the cifar4 SimCLR sweep; previously the validation
# features were passed for both roles.
cifar10_simclr_scores, cifar10_simclr_elbows = make_elbow_plot(
    cifar10_simclr_train, cifar10_simclr_val, vec_sizes, min_k=5, max_k=20
)

In [None]:
# Use the elbow reported most often across the feature-vector sizes.
elbow = np.bincount(cifar10_simclr_elbows).argmax()
plot_all(
    model_name='CIFAR10_SimCLR_PCA',
    vec_sizes=vec_sizes,
    scores=cifar10_simclr_scores,
    elbow=elbow,
    min_k=5,
    max_k=20,
)

In [None]:
# ---- raw pixels, cifar4: PCA fitted on train, 4-cluster KMeans scored on val ----
print("Evaluating PCA on cifar4...")
train_vec, val_vec = flatten_data(cifar4_train_data), flatten_data(cifar4_val_data)
print(val_vec.shape)

print("_" * 100)
print("Model\t\tTime\tNMI\tARI\tV-meas\tSilhouette\tCalinski")

for vec_size in vec_sizes:
    pca_model = PCA(n_components=vec_size).fit(train_vec)
    encoded = pca_model.transform(val_vec)
    kmeans = KMeans(n_clusters=4, random_state=0)
    bench_k_means(kmeans, name=f"PCA-n4v{vec_size}", data=encoded, labels=cifar4_val_labels)

print("_" * 100)


# ---- raw pixels, cifar10: PCA fitted on train, 10-cluster KMeans scored on val ----
print("\nEvaluating PCA on cifar10...")
train_vec, val_vec = flatten_data(cifar10_train_data), flatten_data(cifar10_val_data)
print(val_vec.shape)

print("_" * 100)
print("Model\t\tTime\tNMI\tARI\tV-meas\tSilhouette\tCalinski")

for vec_size in vec_sizes:
    pca_model = PCA(n_components=vec_size).fit(train_vec)
    encoded = pca_model.transform(val_vec)
    kmeans = KMeans(n_clusters=10, random_state=0)
    bench_k_means(kmeans, name=f"PCA-n10v{vec_size}", data=encoded, labels=cifar10_val_labels)

print("_" * 100)

In [None]:
# ---- SimCLR features, CIFAR4: PCA fitted on train, 4-cluster KMeans scored on val ----
print("\nEvaluating SimCLR on CIFAR4...")
train_vec = cifar4_simclr_train
val_vec = cifar4_simclr_val
print(val_vec.shape)

print(100 * "_")
print("Model\t\tTime\tNMI\tARI\tV-meas\tSilhouette\tCalinski")
# cifar4
for vec_size in vec_sizes:
    kmeans = KMeans(n_clusters=4, random_state=0)
    # Fit the projection on the training features, as the raw-pixel PCA
    # benchmark does; previously train_vec was unused and PCA was fitted
    # on the validation features being scored.
    pca_model = PCA(n_components=vec_size).fit(train_vec)
    encoded = pca_model.transform(val_vec)
    bench_k_means(kmeans, name="SimCLR-" + "n4" + "v" + str(vec_size), data=encoded, labels=cifar4_val_labels)

# Close the CIFAR4 table the same way the other result tables are closed.
print(100 * "_")


# ---- SimCLR features, CIFAR10: PCA fitted on train, 10-cluster KMeans scored on val ----
print("\nEvaluating SimCLR on CIFAR10...")
train_vec = cifar10_simclr_train
val_vec = cifar10_simclr_val
print(val_vec.shape)

print(100 * "_")
print("Model\t\tTime\tNMI\tARI\tV-meas\tSilhouette\tCalinski")
# cifar10
for vec_size in vec_sizes:
    kmeans = KMeans(n_clusters=10, random_state=0)
    # Same correction as above: fit on train, transform val.
    pca_model = PCA(n_components=vec_size).fit(train_vec)
    encoded = pca_model.transform(val_vec)
    bench_k_means(kmeans, name="SimCLR-" + "n10" + "v" + str(vec_size), data=encoded, labels=cifar10_val_labels)

print(100 * "_")

In [None]:
# Test-split evaluation with a fixed 50-component PCA.
# NOTE: each PCA here is fitted on the test split it transforms (fit_transform),
# not on the corresponding training split.
data_vec1 = flatten_data(cifar4_test_data)
data_vec2 = cifar4_simclr_test
data_vec3 = flatten_data(cifar10_test_data)
data_vec4 = cifar10_simclr_test

print("Model\t\tTime\tNMI\tARI\tV-meas\tSilhouette\tCalinski")

# cifar4, raw pixels
encoded = PCA(n_components=50).fit_transform(data_vec1)
kmeans = KMeans(n_clusters=4, random_state=0)
bench_k_means(kmeans, name="PCA-n4v50", data=encoded, labels=cifar4_test_labels)

# cifar4, SimCLR features
encoded = PCA(n_components=50).fit_transform(data_vec2)
kmeans = KMeans(n_clusters=4, random_state=0)
bench_k_means(kmeans, name="SimCLR-n4v50", data=encoded, labels=cifar4_test_labels)

# cifar10, raw pixels
encoded = PCA(n_components=50).fit_transform(data_vec3)
kmeans = KMeans(n_clusters=10, random_state=0)
bench_k_means(kmeans, name="PCA-n10v50", data=encoded, labels=cifar10_test_labels)

# cifar10, SimCLR features
encoded = PCA(n_components=50).fit_transform(data_vec4)
kmeans = KMeans(n_clusters=10, random_state=0)
bench_k_means(kmeans, name="SimCLR-n10v50", data=encoded, labels=cifar10_test_labels)

print("_" * 100)

In [None]:
# Baseline without PCA: cluster the cifar10 validation SimCLR features at full width.
kmeans = KMeans(n_clusters=10, random_state=0)
bench_k_means(
    kmeans,
    name=f"SimCLR-n10v{cifar10_simclr_val.shape[1]}",
    data=cifar10_simclr_val,
    labels=cifar10_val_labels,
)

In [None]:
# SimCLR features -> 10-component PCA fitted on train -> 4-cluster KMeans on test
pca_model = PCA(n_components=10).fit(cifar4_simclr_train)
encoded = pca_model.transform(cifar4_simclr_test)
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans_simclr = kmeans.fit(encoded)

# Show nine randomly drawn test images for every cluster, in ascending cluster id.
# NOTE(review): assumes cifar4_simclr_test rows are in the same order as
# cifar4_test_data -- confirm against how the feature files were produced.
cluster_index_dict = clusters_to_index(kmeans_simclr.labels_)
for cluster in sorted(cluster_index_dict):
    print(cluster)
    image_index = cluster_index_dict[cluster]
    images = cifar4_test_data[image_index]
    visualize_data(images, n=9, ncol=3, plt_size=6)