In [1]:
!pip install sklearn matplotlib pandas numpy autograd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=c787706acf4dd157da1882e723d8624514bbff0fc6007339424f562ac867367f
  Stored in directory: /root/.cache/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1


In [21]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os

def unpickle(file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file; it is opened in binary mode.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file, using
        ``encoding='latin1'``.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only call this on
    # trusted files.
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')

def load_cifar_100_data(file_dir):
    """Load the CIFAR-100 train and test splits from ``file_dir``.

    The files ``train`` and ``test`` inside ``file_dir`` are unpickled. Their
    ``"data"`` entries are reshaped to (N, 3, 32, 32), transposed to
    (N, 32, 32, 3) and cast to ``uint8``, with N fixed to 50000 for the
    training split and 10000 for the test split.

    Parameters
    ----------
    file_dir : str
        Directory containing the ``train`` and ``test`` pickle files.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``; the label values are the
        ``"coarse_labels"`` entries exactly as stored in the files.
    """
    def _as_images(flat, count):
        # channel-first rows -> channel-last uint8 images
        return flat.reshape(count, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")

    train_split = unpickle(os.path.join(file_dir, "train"))
    test_split = unpickle(os.path.join(file_dir, "test"))

    images_train = _as_images(train_split["data"], 50000)
    images_test = _as_images(test_split["data"], 10000)
    return (
        images_train,
        images_test,
        train_split["coarse_labels"],
        test_split["coarse_labels"],
    )

def load_cifar_10_data(file_dir):
    """Load the CIFAR-10 train and test splits from ``file_dir``.

    The five files ``data_batch_1`` .. ``data_batch_5`` are unpickled and
    stacked into one training set; ``test_batch`` provides the test set.
    Image rows are reshaped to (N, 3, 32, 32), transposed to (N, 32, 32, 3)
    and cast to ``uint8``.

    Parameters
    ----------
    file_dir : str
        Directory containing the CIFAR-10 batch files.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. ``y_train`` is a Python list
        built from the ``"labels"`` entry of every training batch, and
        ``y_test`` is a NumPy array built from the test batch's ``"labels"``.
    """
    batch_images = []
    y_train = []
    for i in range(1, 6):
        batch = unpickle(os.path.join(file_dir, "data_batch_" + str(i)))
        batch_images.append(batch["data"].reshape(-1, 3072))
        y_train.extend(batch["labels"])
    # Stack once after the loop instead of re-copying the array on every batch.
    x_train = (
        np.vstack(batch_images)
        .reshape(-1, 3, 32, 32)
        .transpose(0, 2, 3, 1)
        .astype("uint8")
    )

    test_data_dict = unpickle(os.path.join(file_dir, "test_batch"))
    x_test = test_data_dict["data"].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("uint8")
    # Take the labels from the dict already loaded out of file_dir, so images
    # and labels always come from the same test_batch file (the labels used to
    # be re-read from "./test_batch").
    y_test = np.array(test_data_dict["labels"])
    return x_train, x_test, y_train, y_test

def flattern_data(x):
    """Flatten every axis after the first, e.g. N*32*32*3 -> N*3072.

    Parameters
    ----------
    x : array-like exposing ``shape`` and ``reshape``
        Input whose first axis indexes the samples.

    Returns
    -------
    Result of ``x.reshape(n_samples, product_of_remaining_dims)``.
    """
    # The product of an empty shape tail is 1, so 1-D input becomes (N, 1).
    trailing = int(np.prod(x.shape[1:], dtype=np.int64))
    return x.reshape(x.shape[0], trailing)

def construct_image_from_flattern(x, colored = True):
    """Rebuild 32x32 images from flattened rows (N*3072 -> N*32*32*3).

    Parameters
    ----------
    x : ndarray
        Flattened images, one per row.
    colored : bool, default True
        If truthy, reshape to (N, 32, 32, 3); otherwise to (N, 32, 32).

    Returns
    -------
    ndarray
        The reshaped data cast to ``uint8``. The cast is applied directly;
        no clipping or rescaling is performed beforehand.
    """
    count = x.shape[0]
    target_shape = (count, 32, 32, 3) if colored else (count, 32, 32)
    return x.reshape(target_shape).astype("uint8")

def visualize_data(row, col, plt_size, x):
    """Show the first images of ``x`` on a ``row`` x ``col`` grid.

    Images are placed in row-major order. If ``x`` holds fewer than
    ``row * col`` images the remaining cells are left blank; if it holds
    more, the extra images are not drawn.

    Parameters
    ----------
    row, col : int
        Number of grid rows and columns.
    plt_size : float
        Width and height of the (square) figure, passed as ``figsize``.
    x : sequence of images
        Items accepted by ``Axes.imshow``; must support ``len`` and indexing.

    Returns
    -------
    None
    """
    # squeeze=False keeps the axes container 2-D even when row == 1 or
    # col == 1, so single-row / single-column grids no longer fail on indexing.
    fig, axes = plt.subplots(row, col, figsize=(plt_size, plt_size), squeeze=False)
    for i, ax in enumerate(axes.flat):
        # Hide ticks and frame on every cell, including unused ones.
        ax.set_axis_off()
        if i < len(x):
            ax.imshow(x[i])
    plt.show()
    return

In [None]:
def rgb2gray(rgb):
    """Convert RGB(A) pixel data to a single gray channel.

    Parameters
    ----------
    rgb : ndarray
        Array whose last axis holds at least three channels; only the first
        three are used.

    Returns
    -------
    ndarray
        Weighted sum of the first three channels along the last axis, with
        weights 0.2989, 0.5870 and 0.1140.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    color_channels = rgb[..., :3]
    return np.dot(color_channels, channel_weights)

In [None]:
from sklearn.decomposition import PCA

def get_pca(component, X, random_state=0):
    """Construct a PCA object and fit it on the given data.

    Parameters
    ----------
    component : int, float or str
        Forwarded to ``PCA`` as ``n_components``.
    X : array-like of shape (n_samples, n_features)
        Data the PCA is fitted on.
    random_state : int or None, default 0
        Forwarded to ``PCA``. A fixed seed keeps fits repeatable when a
        randomized SVD solver is used, matching the fixed seed already used
        for KMeans in this notebook. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    PCA
        The fitted PCA instance.
    """
    pca = PCA(n_components=component, random_state=random_state)
    pca.fit(X)
    return pca

def pca_encode(pca_model, X):
    """Project ``X`` with a fitted model by calling ``pca_model.transform(X)``.

    Returns whatever ``transform`` returns.
    """
    encoded = pca_model.transform(X)
    return encoded

def pca_decode(pca_model, X):
    """Map encoded data back by calling ``pca_model.inverse_transform(X)``.

    Returns whatever ``inverse_transform`` returns.
    """
    decoded = pca_model.inverse_transform(X)
    return decoded

In [None]:
from sklearn.cluster import KMeans

def generate_kmeans_model(X, k):
    """Fit a KMeans model on ``X`` and return it with the labels of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(model, labels)``: the fitted ``KMeans`` (seeded with
        ``random_state=0``) and the result of ``fit_predict(X)``.
    """
    model = KMeans(n_clusters=k, random_state=0)
    labels = model.fit_predict(X)
    return model, labels

def kmeans_clustering(model, X):
    """Predict cluster labels of ``X`` by calling ``model.predict(X)``."""
    predicted = model.predict(X)
    return predicted

def clusters_to_index(cluster_labels):
    """Group sample positions by their cluster label.

    Parameters
    ----------
    cluster_labels : sequence
        Label of each sample, ordered by sample index.

    Returns
    -------
    dict
        ``{cluster_label: [index, ...]}``. Indices are in ascending order
        and keys appear in order of first occurrence.
    """
    members = {}
    for position, label in enumerate(cluster_labels):
        members.setdefault(label, []).append(position)
    return members

In [None]:
def pca_kmeans_pipeline(X, feature_vector_size, cluster_num):
    """Run PCA on ``X`` and cluster the encoded data with KMeans.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.
    feature_vector_size
        Passed to ``get_pca`` as the number of components.
    cluster_num : int
        Number of KMeans clusters.

    Returns
    -------
    tuple
        ``(pca_model, kmeans_model, cluster_index_dict)``, where the dict
        maps each cluster label to the list of sample indices assigned to it.
    """
    pca_model = get_pca(feature_vector_size, X)
    kmeans_model, sample_labels = generate_kmeans_model(
        pca_encode(pca_model, X), cluster_num
    )
    return pca_model, kmeans_model, clusters_to_index(sample_labels)

In [None]:
def visualize_kmeans_centre(kmeans_model, pca_model, row, col, colored = True):
    """Visualize the KMeans cluster centres as images.

    The centres (``kmeans_model.cluster_centers_``) are decoded back through
    ``pca_model``, reshaped into 32x32 images and drawn on a ``row`` x ``col``
    grid with a fixed figure size of 15.

    Parameters
    ----------
    kmeans_model : fitted model exposing ``cluster_centers_``
    pca_model : fitted model used by ``pca_decode``
    row, col : int
        Grid dimensions of the plot.
    colored : bool, default True
        Forwarded to ``construct_image_from_flattern``.
    """
    decoded_centres = pca_decode(pca_model, kmeans_model.cluster_centers_)
    centre_images = construct_image_from_flattern(decoded_centres, colored)
    visualize_data(row, col, 15, centre_images)


In [None]:
from sklearn.mixture import GaussianMixture

def generate_GMM_model(X, components, random_state=0):
    """Fit a Gaussian mixture on ``X`` and return it with the labels of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    components : int
        Number of mixture components.
    random_state : int or None, default 0
        Forwarded to ``GaussianMixture``. A fixed seed makes the fit
        repeatable across runs, matching the fixed seed already used for
        KMeans in this notebook. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    tuple
        ``(GMM_model, labels)``: the fitted ``GaussianMixture`` and the
        result of ``fit_predict(X)``.
    """
    GMM_model = GaussianMixture(n_components=components, random_state=random_state)
    labels = GMM_model.fit_predict(X)
    return GMM_model, labels

def GMM_clustering(model, X):
    """Predict component labels of ``X`` by calling ``model.predict(X)``."""
    predicted = model.predict(X)
    return predicted

In [None]:
def pca_GMM_pipeline(X, feature_vector_size, cluster_num):
    """Run PCA on ``X`` and cluster the encoded data with a Gaussian mixture.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.
    feature_vector_size
        Passed to ``get_pca`` as the number of components.
    cluster_num : int
        Number of mixture components.

    Returns
    -------
    tuple
        ``(pca_model, GMM_model, cluster_index_dict)``, where the dict maps
        each component label to the list of sample indices assigned to it.
    """
    pca_model = get_pca(feature_vector_size, X)
    GMM_model, sample_labels = generate_GMM_model(
        pca_encode(pca_model, X), cluster_num
    )
    return pca_model, GMM_model, clusters_to_index(sample_labels)

In [None]:
def visualize_GMM_means(GMM_model, pca_model, row, col, colored = True):
    """Visualize the Gaussian mixture means as images.

    The means (``GMM_model.means_``) are decoded back through ``pca_model``,
    reshaped into 32x32 images and drawn on a ``row`` x ``col`` grid with a
    fixed figure size of 15.

    Parameters
    ----------
    GMM_model : fitted model exposing ``means_``
    pca_model : fitted model used by ``pca_decode``
    row, col : int
        Grid dimensions of the plot.
    colored : bool, default True
        Forwarded to ``construct_image_from_flattern``.
    """
    decoded_means = pca_decode(pca_model, GMM_model.means_)
    mean_images = construct_image_from_flattern(decoded_means, colored)
    visualize_data(row, col, 15, mean_images)

In [None]:
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

def bench_k_means(kmeans, name, data, labels):
    """Fit a KMeans estimator, print one row of benchmark scores, return it.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with its initialization
        already configured.
    name : str
        Name of the strategy; printed as the first column of the row.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels used by the metrics that need supervision.

    Returns
    -------
    The object returned by ``kmeans.fit(data)``.

    Notes
    -----
    The printed tab-separated row contains, in order: name, fit time,
    inertia, homogeneity, completeness, V-measure, adjusted Rand index,
    normalized mutual information, silhouette, Calinski-Harabasz and
    Davies-Bouldin scores.
    """
    started = time()
    kmeans_model = kmeans.fit(data)
    fit_time = time() - started
    predicted = kmeans_model.labels_

    # Metrics that compare the true labels against the predicted clusters.
    supervised_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.normalized_mutual_info_score,
    )
    supervised = [scorer(labels, predicted) for scorer in supervised_scorers]

    # Metrics that need only the data and the predicted clusters. The
    # silhouette score is computed on the full dataset (no sample_size).
    unsupervised = [
        metrics.silhouette_score(data, predicted, metric="euclidean"),
        metrics.calinski_harabasz_score(data, predicted),
        metrics.davies_bouldin_score(data, predicted),
    ]

    row = [name, fit_time, kmeans_model.inertia_, *supervised, *unsupervised]
    row_template = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t\t{:.3f}\t{:.3f}"
    )
    print(row_template.format(*row))

    return kmeans_model