# 10.1 Unsupervised Learning with k-Means Clustering

This example illustrates clustering on the MNIST and DIGITS image data using the k-Means clustering algorithm. This is done with Unsupervised Learning, without using the label that is provided for each sample. We will use the labels only for visualization.

In [None]:
# Mount GDrive, change to the project directory and print the current working directory.

import os
from google.colab import drive
from google.colab import files

# Folder on the mounted Google Drive that becomes the working directory.
PROJECT_FOLDER = "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/10. Clustering"

# Attach Google Drive to the Colab runtime before switching into the folder.
# NOTE(review): presumably the `data` and `mllib` packages imported by later
# cells live in this folder - confirm.
drive.mount('/content/gdrive/')
os.chdir(PROJECT_FOLDER)

# Echo the resulting working directory as a quick sanity check.
sCurrentDir = os.getcwd()
print("Current dir: ", sCurrentDir)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Dataset loading
Load either MNIST or DIGITS.

In [None]:
from data.digits import CDIGITSDataSet
from data.mnist import CMNISTDataSet

# _____// Data Hyperparameters \\_____
IS_MNIST = True       # True: load MNIST, False: load DIGITS
SAMPLE_COUNT = 1500   # Upper bound on the number of samples taken from the dataset

# Instantiate only the selected dataset. The dataset-specific variable names
# are kept and a common handle is used for the slicing below.
if IS_MNIST:
  oMNIST = CMNISTDataSet()
  sDataName = "MNIST"
  oDataSet = oMNIST
else:
  sDataName = "DIGITS"
  oDIGITS = CDIGITSDataSet()
  oDataSet = oDIGITS

# Keep the first SAMPLE_COUNT rows of the sample matrix and the matching labels.
nSamples = oDataSet.TSSamples[:SAMPLE_COUNT, :]
nLabels = oDataSet.TSLabels[:SAMPLE_COUNT]

# Summary of what was loaded.
print("Loaded %s dataset" % sDataName)
print("Training set shape:", nSamples.shape)
nClassCount = len(np.unique(nLabels))
print("Class count:", nClassCount)

# Data Preprocessing.
We transform the values of our features with **standardization**, which "centers" each feature so that its mean value becomes 0: values below the mean become negative and values above it become positive. Each feature is also divided by its standard deviation σ, so a standardized value of 1 means that the original value lies exactly one σ above the mean.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardization
# Sample and feature window shown in the before/after printouts below.
nSampleIndex = 0
nFromFeature = 140
nToFeature = 160

# The header is built from the actual feature window; it used to announce a
# hard-coded "40-100" range that did not match the values printed below.
print("First sample in dataset, features %d-%d, before standardization" % (nFromFeature, nToFeature))

# Fit the per-feature mean and scale on all loaded samples and transform them
# in a single step.
oScaler = StandardScaler()
nStandardizedSamples = oScaler.fit_transform(nSamples)

# Report the number of rows that were really used: slicing the dataset with
# SAMPLE_COUNT yields fewer rows when the dataset is smaller than SAMPLE_COUNT.
nUsedSampleCount = nSamples.shape[0]

print("Sample#1 original features %d-%d" % (nFromFeature, nToFeature))
print(nSamples[nSampleIndex,nFromFeature:nToFeature])
print("Mean μ of features %d-%d over %d samples" % (nFromFeature, nToFeature, nUsedSampleCount))
print(oScaler.mean_[nFromFeature:nToFeature])
print("Std σ of features %d-%d over %d samples" % (nFromFeature, nToFeature, nUsedSampleCount))
print(oScaler.scale_[nFromFeature:nToFeature])
print("Sample#1 standardized features %d-%d" % (nFromFeature, nToFeature))
print(nStandardizedSamples[nSampleIndex,nFromFeature:nToFeature])

# Dimensionality Reduction.
In order to understand clustering, we would like to visualize the clusters in a 2D space. For this reason we will reduce the initial dimensionality of each image (28x28 = 784 features for MNIST) to just 2 features, using either the PCA or the t-SNE algorithm.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import inspect

# _____// Dimensionality Reduction Hyperparameters \\_____
COMPONENTS = 2                    # Number of output features per sample
IS_LINEAR_DIM_REDUCTION = False   # True: PCA, False: t-SNE

# ... t-SNE Hyperparameters (not used when PCA is selected) ...
PERPLEXITY      = 100.0
LEARNING_RATE   = 1000.0
EPOCHS          = 1000
GRADIENT_CALCULATION_ALGORITHM = "barnes_hut" #Fast

if IS_LINEAR_DIM_REDUCTION:
  sDimReductionMethod = "PCA"
  nReducedSamples  = PCA(n_components=COMPONENTS).fit_transform(nStandardizedSamples)
else:
  sDimReductionMethod = "t-SNE"

  # scikit-learn 1.5 renamed the iteration-count argument from `n_iter` to
  # `max_iter` and later releases drop the old name. Pick whichever name the
  # installed version accepts so the cell runs on both old and new releases.
  if "max_iter" in inspect.signature(TSNE).parameters:
    sIterationArgName = "max_iter"
  else:
    sIterationArgName = "n_iter"

  # LEARNING_RATE is now passed through; it was declared above but silently
  # ignored before, so changing it had no effect on the embedding.
  oTSNE = TSNE( n_components=COMPONENTS
                ,perplexity=PERPLEXITY
                ,learning_rate=LEARNING_RATE
                ,method=GRADIENT_CALCULATION_ALGORITHM
                ,verbose=2
                ,**{sIterationArgName: EPOCHS}
                )
  nReducedSamples  = oTSNE.fit_transform(nStandardizedSamples)

# Visualization of Known Classes
We visualize the samples as points in the 2D space using a different color for each class.

In [None]:
from mllib.visualization import CMultiScatterPlot

# The plot title names the dataset and the dimensionality reduction method
# that produced the 2D points.
sPlotTitle = "Visualization of %s image samples after reducing dimensionality with %s" % (sDataName, sDimReductionMethod)

# One scatter series holding every reduced sample together with its label.
# NOTE(review): the empty string and the leading 0 passed to Show() are
# specific to CMultiScatterPlot - confirm their meaning in mllib.visualization.
oPlot = CMultiScatterPlot(sPlotTitle)
oPlot.AddData("", nReducedSamples, nLabels)
oPlot.Show(0, "Component1", "Component2")

# Training
Learn a k-Means clustering model.



In [None]:
from sklearn.cluster import KMeans

# _____// Clustering Hyperparameters \\_____
NUM_RUNS_RANDOM_CENTROIDS = 4      # Passed as n_init: number of k-Means runs with different initial centroids
RANDOM_SEED               = 2021   # Passed as random_state: seeds the centroid initialization
CLUSTER_COUNT_K           = 10     # Passed as n_clusters: number of clusters k

# Cast the reduced samples to float64 before fitting.
# NOTE(review): presumably done so k-Means works in double precision - confirm.
nReducedSamples = nReducedSamples.astype(np.float64)

# RANDOM_SEED is now passed as random_state; it was declared but never used
# before, so every execution of this cell could produce different clusters.
oClusteringModel = KMeans(init="k-means++", n_clusters=CLUSTER_COUNT_K, n_init=NUM_RUNS_RANDOM_CENTROIDS
                          ,random_state=RANDOM_SEED, verbose=2)
oClusteringModel.fit(nReducedSamples)

# Visualization of Clusters
Visualize the cluster centroids and partitioning of the 2D representation space of an image (after dimensionality reduction) using a Voronoi diagram.

In [None]:
from mllib.visualization  import CVoronoi2DPlot

# Two-line plot title: the first line names the dataset and the reduction
# method, the second explains the centroid marker.
sVoronoiTitle = ("K-means clustering on %s dataset (%s reduced data)\n"
                 "Centroids are marked with white cross") % (sDataName, sDimReductionMethod)

# Build the plot from the 2D samples and their labels, then render it for the
# fitted k-Means model.
oVoronoi = CVoronoi2DPlot(sVoronoiTitle, nReducedSamples, nLabels, p_nGroundTruthClusterCount=10)
oVoronoi.ShowForKMeans(oClusteringModel)