# 11.2 - t-SNE on DIGITS and MNIST data
We run the t-SNE algorithm on both the MNIST and DIGITS datasets for **non-linear dimensionality reduction**. We create many models with this **Manifold Learning** algorithm, each one with a different number of components, and compare the **K-L divergence error** of each model after the algorithm stops.

In [None]:
# Mount Google Drive, switch the working directory to the project folder and
# print the resulting working directory.

import os
from google.colab import drive
# NOTE(review): `files` is not referenced anywhere in the visible cells - confirm it is still needed.
from google.colab import files

# Project folder, located below the Drive mount point used by drive.mount() further down.
PROJECT_FOLDER = "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/11. Dimensionality Reduction"

# Mount first: PROJECT_FOLDER lives under the mount point, so chdir is done after mounting.
drive.mount('/content/gdrive/')
os.chdir(PROJECT_FOLDER)
print("Current dir: ", os.getcwd())

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from data.digits import CDIGITSDataSet
from data.mnist import CMNISTDataSet

Helper class

In [None]:
# =========================================================================================================================
class CUnsupervisedDimReduction(object):
  """Keeps the PCA and t-SNE models fitted on one dataset, together with a score for each.

  Every fitted model is stored as a three-item record ``[model, components, score]``:
    * PCA records   : score is the sum of the model's ``explained_variance_ratio_``.
    * t-SNE records : score is the model's ``kl_divergence_``.
  """
  # --------------------------------------------------------------------------------------
  def __init__(self, p_sName, p_nSamples):
    """Create an empty model collection.

    Args:
      p_sName    : Display name of the dataset (used as the plot legend label by callers).
      p_nSamples : The samples that the models of this collection are fitted on.
    """
    self.Name       = p_sName
    self.Samples    = p_nSamples
    self.PCAModels  = []   # Records of [model, components, explained variance ratio sum]
    self.TSNEModels = []   # Records of [model, components, K-L divergence]
  # --------------------------------------------------------------------------------------
  def AddPCAModel(self, p_oPCAModel):
    """Register a fitted PCA model; reads its ``n_components_`` and ``explained_variance_ratio_``."""
    self.PCAModels.append([p_oPCAModel, p_oPCAModel.n_components_, np.sum(p_oPCAModel.explained_variance_ratio_)])
  # --------------------------------------------------------------------------------------
  def ClearModels(self, p_bIsClearingTSNEOnly=False):
    """Forget the registered models.

    Args:
      p_bIsClearingTSNEOnly : When True the PCA records are kept and only the t-SNE
                              records are dropped; otherwise both lists are emptied.
    """
    if not p_bIsClearingTSNEOnly:
      self.PCAModels  = []
    # The attribute must be spelled exactly as in __init__ (TSNEModels); a misspelled
    # name would silently create a new attribute and leave the old records in place.
    self.TSNEModels = []
  # --------------------------------------------------------------------------------------
  def AddTSNEModel(self, p_oTSNEModel, p_nComponents):
    """Register a fitted t-SNE model under the given component count; reads its ``kl_divergence_``."""
    self.TSNEModels.append([p_oTSNEModel, p_nComponents, p_oTSNEModel.kl_divergence_])
  # --------------------------------------------------------------------------------------
  @staticmethod
  def _GetPlotSerie(p_oRecords):
    """Split ``[model, components, score]`` records into the lists (components, scores)."""
    nX = [oRec[1] for oRec in p_oRecords]
    nY = [oRec[2] for oRec in p_oRecords]
    return nX, nY
  # --------------------------------------------------------------------------------------
  def GetPCAPlotSerie(self):
    """Return ``(components, explained variance ratio sums)`` of the PCA records, in insertion order."""
    return self._GetPlotSerie(self.PCAModels)
  # --------------------------------------------------------------------------------------
  def GetTSNEPlotSerie(self):
    """Return ``(components, K-L divergences)`` of the t-SNE records, in insertion order."""
    return self._GetPlotSerie(self.TSNEModels)
  # --------------------------------------------------------------------------------------
# =========================================================================================================================


# Data and Hyperparameters
Create two different sets of paired models (PCA, t-SNE), one for each of the two datasets, DIGITS (8x8 grayscale) and MNIST (28x28 grayscale). Each set uses only a fraction of the available samples.

In [None]:
# Load both datasets and report the shape of their sample arrays.
oDIGITS = CDIGITSDataSet()
oMNIST  = CMNISTDataSet()

print(oDIGITS.TSSamples.shape)
print(oMNIST.TSSamples.shape)

# _____// Hyperparameters \\_____
# ... Data Hyperparameters ...
# Number of leading samples taken from each dataset.
SAMPLE_COUNT = 100

# One model collection per dataset, each holding the first SAMPLE_COUNT samples.
oDRModels = [
   CUnsupervisedDimReduction("DIGITS", oDIGITS.TSSamples[:SAMPLE_COUNT,:])
  ,CUnsupervisedDimReduction("MNIST" , oMNIST.TSSamples[:SAMPLE_COUNT,:])
]

# Training all PCA models
For each dataset we train a separate PCA model for every value in a range of component counts; the number of components is our hyperparameter.

In [None]:
# _____// Hyperparameters \\_____
# ... Dimensionality Reduction Hyperparameters ...
# Inclusive range of component counts; one PCA model is fitted per value.
COMPONENTS_FROM = 2
COMPONENTS_TO   = 40

# Fit one PCA model per (dataset, component count) pair
for oDRM in oDRModels:
  # Start from an empty collection so that re-running this cell does not duplicate records.
  oDRM.ClearModels()
  for nComponents in range(COMPONENTS_FROM, COMPONENTS_TO + 1):
    print("-"*25, "PCA on %s with %d Components" % (oDRM.Name, nComponents), "-"*25)
    # fit() returns the fitted estimator itself, so it can be chained onto the constructor.
    oPCA = PCA(n_components=nComponents).fit(oDRM.Samples)
    oDRM.AddPCAModel(oPCA)

# Evaluation
Evaluate how the number of components affects performance, using the ratio of explained variance obtained for each setting of the components hyperparameter.

In [None]:
# Comparison of different PCA models: explained variance ratio against the number of components
plt.title("Principal Component Analysis (PCA)")
plt.xlabel('Components')
plt.ylabel('Ratio of Explained Variance')

# One line per dataset (blue for the first, red for the second); extend the tuple of
# line styles when more datasets are added to oDRModels.
for oDRM, sLineStyle in zip(oDRModels, ("-b", "-r")):
  nX, nY = oDRM.GetPCAPlotSerie()
  plt.plot(nX, nY, sLineStyle, label=oDRM.Name)

plt.legend(loc="upper left")
# Leave some room to the right of the largest component count.
plt.xlim(0.0, COMPONENTS_TO + 5)
plt.show()

# Training all t-SNE models
For each dataset we train a separate t-SNE model for every value in a range of component counts; the number of components is our hyperparameter. Several other hyperparameters are shared by all models.


In [None]:
# _____// Hyperparameters \\_____
# ... Dimensionality Reduction Hyperparameters ...
# Inclusive range of component counts; one t-SNE model is fitted per value.
COMPONENTS_FROM = 2
COMPONENTS_TO   = 40
PERPLEXITY      = 100.0  # Requested perplexity; lowered per dataset below when there are too few samples
LEARNING_RATE   = 200.0
INITIALIZATION  = 'random'
EPOCHS          = 1000
GRADIENT_CALCULATION_ALGORITHM = "exact" # Slow, but required here: barnes_hut only supports fewer than 4 components
#GRADIENT_CALCULATION_ALGORITHM = "barnes_hut" #Fast

# scikit-learn 1.5 renamed the iteration count argument of TSNE from `n_iter` to `max_iter`
# and later releases dropped the old name. Pick whichever name the installed version accepts,
# preferring the new one (versions in the transition period expose both).
ITERATIONS_ARGUMENT = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"

# Learning t-SNE Embeddings
for oDRM in oDRModels:
  oDRM.ClearModels(True)

  # Recent scikit-learn versions reject a perplexity that is not strictly smaller than the
  # number of samples, so cap the requested value at (samples - 1) for this dataset.
  nSampleCount = oDRM.Samples.shape[0]
  nPerplexity  = min(PERPLEXITY, float(nSampleCount - 1))
  if nPerplexity < PERPLEXITY:
    print("Perplexity lowered from %.1f to %.1f for %s (%d samples)" % (PERPLEXITY, nPerplexity, oDRM.Name, nSampleCount))

  for nComponents in range(COMPONENTS_FROM, COMPONENTS_TO + 1):
    print("-"*25, "t-SNE on %s with %d Components" % (oDRM.Name, nComponents), "-"*25)
    oTSNE = TSNE( n_components=nComponents
                 ,perplexity=nPerplexity
                 ,method=GRADIENT_CALCULATION_ALGORITHM
                 ,learning_rate=LEARNING_RATE
                 ,init=INITIALIZATION
                 ,**{ITERATIONS_ARGUMENT: EPOCHS}
                 )
    oTSNE.fit(oDRM.Samples)
    oDRM.AddTSNEModel(oTSNE, nComponents)

# Comparison of Training
Visualize the K-L divergence cost for each different component hyperparameter setting.

In [None]:
# K-L divergence of every t-SNE model against its number of components, one line per dataset
nX1, nY1 = oDRModels[0].GetTSNEPlotSerie()
nX2, nY2 = oDRModels[1].GetTSNEPlotSerie()

# First dataset in blue, second in red; the legend labels are the dataset names.
plt.plot(nX1, nY1, "-b", label=oDRModels[0].Name)
plt.plot(nX2, nY2, "-r", label=oDRModels[1].Name)

# Figure decoration
plt.title("t-Distributed Stochastic Neighbor Embedding (t-SNE)")
plt.xlabel('Components')
plt.ylabel('Kullback–Leibler Divergence (Cost)')
plt.xlim(0.0, COMPONENTS_TO + 5)
plt.legend(loc="upper left")

plt.show()