In [2]:
!pip install sklearn matplotlib pandas numpy autograd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os

def unpickle(file):
    """Load and return the object pickled in the file at path `file`.

    The file is opened in binary mode and decoded with the 'latin1'
    encoding.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files from a trusted source.
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')

def load_cifar_data(file_dir):
    """Load the pickled "train" and "test" batches found in `file_dir`.

    Each batch is read with `unpickle`; its "data" entry is reshaped from
    flat rows of 3*32*32 values (channel-first) into N*32*32*3 uint8 images.
    The number of samples N is inferred from the data rather than being
    hard-coded, so subsets of the dataset load as well.

    Returns:
        (x_train, x_test): two uint8 arrays of shape N*32*32*3.
    """
    def _to_images(batch):
        # -1 lets numpy infer the sample count; the transpose moves the
        # channel axis last (N,3,32,32 -> N,32,32,3).
        return batch["data"].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")

    x_train = _to_images(unpickle(os.path.join(file_dir, "train")))
    x_test = _to_images(unpickle(os.path.join(file_dir, "test")))
    return x_train, x_test

# flatten CIFAR data from N*32*32*3 to N*3072
def flattern_data(x):
    """Collapse every axis of `x` after the first into a single axis.

    Returns `x` reshaped to (x.shape[0], product of the remaining dims).
    """
    n_rows = x.shape[0]
    # The product of an empty tuple of dims is 1, so 1-D input becomes (N, 1).
    n_cols = int(np.prod(x.shape[1:], dtype=np.int64))
    return x.reshape(n_rows, n_cols)

# reconstruct data from N*3072 to N*32*32*3
def construct_image_from_flattern(x, colored = True):
    """Reshape flat rows back into 32*32 images and return them as uint8.

    Args:
        x: array whose first axis indexes samples; each sample must hold
            32*32*3 values when `colored` is true, 32*32 otherwise.
        colored: if true the result has shape N*32*32*3, else N*32*32.

    Returns:
        uint8 array of images. Values outside [0, 255] are clamped to that
        range before the cast.
    """
    samples = x.shape[0]
    if x.dtype != np.uint8:
        # Reconstructed data (e.g. decoded PCA vectors) can fall slightly
        # outside the valid pixel range; casting such values straight to
        # uint8 wraps around (-1 -> 255) and produces speckle artifacts.
        # Float bounds avoid overflow errors on narrow integer dtypes.
        x = np.clip(x, 0.0, 255.0)
    image_shape = (samples, 32, 32, 3) if colored else (samples, 32, 32)
    return x.reshape(image_shape).astype("uint8")

#visualize image data, displayed on row*col grid, x's 1st-dim >= (row*col)
def visualize_data(row, col, plt_size, x):
    """Show the leading images of `x` on a row-by-col grid of subplots.

    Args:
        row: number of grid rows.
        col: number of grid columns.
        plt_size: used as both the width and the height of the figure.
        x: indexable collection of images accepted by `imshow`. At most
            row*col images are drawn, in row-major order; if `x` holds
            fewer, the remaining cells are left blank.
    """
    # squeeze=False keeps the axes array two-dimensional even when row == 1
    # or col == 1; by default subplots() returns a 1-D array (or a single
    # Axes) in those cases, which breaks grid indexing.
    _, axes = plt.subplots(row, col, figsize=(plt_size, plt_size), squeeze=False)
    for i, ax in enumerate(axes.flat):
        # Hide ticks/frames on every cell, including ones with no image.
        ax.set_axis_off()
        if i < len(x):
            ax.imshow(x[i])
    plt.show()

In [None]:
def rgb2gray(rgb):
    """Convert colour data to grayscale with a weighted sum of R, G and B.

    Only the first three entries of the last axis are used, so a trailing
    alpha channel is ignored. The result drops the channel axis.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    color_channels = rgb[..., :3]
    return np.dot(color_channels, channel_weights)

In [4]:
from sklearn.decomposition import PCA

# construct a pca object from given data
def get_pca(component, X, random_state=0):
    """Fit a PCA model on `X` and return the fitted model.

    Args:
        component: forwarded to PCA as `n_components`.
        X: data to fit on, one sample per row.
        random_state: seed forwarded to PCA. PCA can use a randomized solver,
            so a fixed seed (matching the one used for KMeans in this
            notebook) keeps repeated runs reproducible.

    Returns:
        The fitted PCA object.
    """
    pca = PCA(n_components=component, random_state=random_state)
    pca.fit(X)
    return pca

def pca_encode(pca_model, X):
    """Project `X` with the model's `transform` and return the result."""
    encoded = pca_model.transform(X)
    return encoded

def pca_decode(pca_model, X):
    """Map encoded data `X` back with the model's `inverse_transform`."""
    decoded = pca_model.inverse_transform(X)
    return decoded

In [5]:
from sklearn.cluster import KMeans

# generate a kmeans model and label of input data
def generate_kmeans_model(X, k):
    """Fit k-means with `k` clusters on `X`.

    Returns:
        (model, labels): the fitted KMeans object (seeded with
        random_state=0) and the cluster label assigned to each row of `X`.
    """
    model = KMeans(n_clusters=k, random_state=0)
    labels = model.fit_predict(X)
    return model, labels

# predict cluster labels of given data
def kmeans_clustering(model, X):
    """Return the labels that `model.predict` assigns to the rows of `X`."""
    labels = model.predict(X)
    return labels

# transform a list of per-sample cluster labels into a dictionary of {cluster_num: [sample indices]}
def clusters_to_index(cluster_labels):
    """Group positions by label.

    Returns a dictionary mapping each distinct label in `cluster_labels` to
    the list of positions (in increasing order) at which it occurs.
    """
    index_by_cluster = {}
    for position, label in enumerate(cluster_labels):
        # setdefault creates the list the first time a label is seen.
        index_by_cluster.setdefault(label, []).append(position)
    return index_by_cluster

In [6]:
def pca_kmeans_pipeline(X, feature_vector_size, cluster_num):
    """Reduce `X` with PCA, then cluster the reduced data with k-means.

    Args:
        X: input data, one sample per row.
        feature_vector_size: number of PCA components to keep.
        cluster_num: number of k-means clusters.

    Returns:
        (pca_model, kmeans_model, cluster_index_dict), where the dictionary
        maps each cluster label to the row indices of `X` assigned to it.
    """
    reducer = get_pca(feature_vector_size, X)
    latent = pca_encode(reducer, X)
    clusterer, labels = generate_kmeans_model(latent, cluster_num)
    return reducer, clusterer, clusters_to_index(labels)

In [7]:
# visualizing kmeans cluster centers
def visualize_kmeans_centre(kmeans_model, pca_model, row, col, colored = True):
    """Decode the k-means cluster centers to images and plot them on a grid.

    The centers are mapped back through `pca_model`, reshaped into images
    (`colored` selects 3-channel vs single-channel), and drawn on a
    row-by-col grid with a figure size of 15.
    """
    decoded_centers = pca_decode(pca_model, kmeans_model.cluster_centers_)
    center_images = construct_image_from_flattern(decoded_centers, colored)
    visualize_data(row, col, 15, center_images)


In [8]:
from sklearn.mixture import GaussianMixture

def generate_GMM_model(X, components, random_state=0):
    """Fit a Gaussian mixture with `components` components on `X`.

    Args:
        X: data to fit on, one sample per row.
        components: forwarded to GaussianMixture as `n_components`.
        random_state: seed forwarded to GaussianMixture so that repeated runs
            give the same clustering, consistent with the fixed seed used
            for KMeans in this notebook.

    Returns:
        (model, labels): the fitted GaussianMixture object and the component
        label assigned to each row of `X`.
    """
    GMM_model = GaussianMixture(n_components=components, random_state=random_state)
    labels = GMM_model.fit_predict(X)
    return GMM_model, labels

def GMM_clustering(model, X):
    """Return the labels that `model.predict` assigns to the rows of `X`."""
    labels = model.predict(X)
    return labels

In [9]:
def pca_GMM_pipeline(X, feature_vector_size, cluster_num):
    """Reduce `X` with PCA, then cluster the reduced data with a GMM.

    Args:
        X: input data, one sample per row.
        feature_vector_size: number of PCA components to keep.
        cluster_num: number of mixture components.

    Returns:
        (pca_model, GMM_model, cluster_index_dict), where the dictionary
        maps each component label to the row indices of `X` assigned to it.
    """
    reducer = get_pca(feature_vector_size, X)
    latent = pca_encode(reducer, X)
    mixture, labels = generate_GMM_model(latent, cluster_num)
    return reducer, mixture, clusters_to_index(labels)

In [10]:
def visualize_GMM_means(GMM_model, pca_model, row, col, colored = True):
    """Decode the mixture component means to images and plot them on a grid.

    The means are mapped back through `pca_model`, reshaped into images
    (`colored` selects 3-channel vs single-channel), and drawn on a
    row-by-col grid with a figure size of 15.
    """
    decoded_means = pca_decode(pca_model, GMM_model.means_)
    mean_images = construct_image_from_flattern(decoded_means, colored)
    visualize_data(row, col, 15, mean_images)