# 11.1 - PCA on DIGITS and MNIST data
We run the PCA algorithm on both the MNIST and DIGITS datasets to perform **linear dimensionality reduction**. We create many models, each with a different number of components, in order to choose an optimal number of components.


In [None]:
# Mount Google Drive in the Colab runtime, change the working directory to the
# project folder and print the resulting working directory.

import os
from google.colab import drive
from google.colab import files

# Project folder; it is located under the mount point passed to drive.mount below.
PROJECT_FOLDER = "/content/gdrive/My Drive/Colab Notebooks/CS345_SP22/11. Dimensionality Reduction"

# The drive has to be mounted first, because PROJECT_FOLDER lives below '/content/gdrive/'.
drive.mount('/content/gdrive/')
# NOTE(review): presumably needed so that the project-local `data` package imported
# later in the notebook can be resolved from the project folder -- confirm.
os.chdir(PROJECT_FOLDER)
# Print the working directory to confirm the change.
print("Current dir: ", os.getcwd())

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from data.digits import CDIGITSDataSet
from data.mnist import CMNISTDataSet

Helper class

In [None]:
# =========================================================================================================================
class CUnsupervisedDimReduction(object):
  """Groups a named set of samples with the PCA models that were fitted on it.

  Each registered model is kept in `PCAModels` as a record of the form
  `[model, number of components, total explained variance ratio]`, so that the
  explained variance can be plotted against the number of components.
  """
  # --------------------------------------------------------------------------------------
  def __init__(self, p_sName, p_nSamples):
    """Creates an empty collection of PCA models for a set of samples.

    Args:
      p_sName:    Display name of the dataset; stored in `Name`.
      p_nSamples: The samples the models are fitted on; stored as-is in `Samples`.
    """
    self.Name       = p_sName
    self.Samples    = p_nSamples
    self.PCAModels  = []
  # --------------------------------------------------------------------------------------
  def ClearModels(self):
    """Discards all registered models by rebinding `PCAModels` to a new empty list."""
    self.PCAModels  = []
  # --------------------------------------------------------------------------------------
  def AddPCAModel(self, p_oPCAModel):
    """Registers a fitted model together with its summary statistics.

    Args:
      p_oPCAModel: A fitted model that exposes `n_components_` and
                   `explained_variance_ratio_`.
    """
    # The total explained variance is the sum of the per-component ratios.
    nTotalExplainedVariance = np.sum(p_oPCAModel.explained_variance_ratio_)
    self.PCAModels.append([p_oPCAModel, p_oPCAModel.n_components_, nTotalExplainedVariance])
  # --------------------------------------------------------------------------------------
  def GetPlotSerie(self):
    """Returns the data series for plotting explained variance per component count.

    Returns:
      A tuple `(nX, nY)` of two lists in registration order: `nX` holds the number
      of components of each model and `nY` its total explained variance ratio.
      Both lists are empty when no model has been registered.
    """
    nX = [oRec[1] for oRec in self.PCAModels]
    nY = [oRec[2] for oRec in self.PCAModels]
    return nX, nY
  # --------------------------------------------------------------------------------------
# =========================================================================================================================

# Data and Hyperparameters
We load the DIGITS (8x8 grayscale) and MNIST (28x28 grayscale) datasets. We create two different sets of PCA models for the two datasets, each using a fraction of the available samples.

In [None]:
# Load the two datasets.
oDIGITS = CDIGITSDataSet()
oMNIST  = CMNISTDataSet()

# Report the shape of the training samples of each dataset.
print("Training samples shape for DIGITS:", oDIGITS.TSSamples.shape)
print("Training samples shape for MNIST:", oMNIST.TSSamples.shape)

# _____// Hyperparameters \\_____
# ... Data Hyperparameters ...
SAMPLE_COUNT = 1000     # Number of leading training samples taken from each dataset
# ... Dimensionality Reduction Hyperparameters ...
COMPONENTS_FROM = 2     # Lower bound for the number of PCA components
COMPONENTS_TO   = 50    # Upper bound for the number of PCA components

# One model collection per dataset, each holding the first SAMPLE_COUNT training samples.
oDRModels = [
  CUnsupervisedDimReduction("DIGITS", oDIGITS.TSSamples[:SAMPLE_COUNT, :]),
  CUnsupervisedDimReduction("MNIST",  oMNIST.TSSamples[:SAMPLE_COUNT, :]),
]

# Training of all models
For each dataset we train a separate PCA model for every number of components in the range defined by our hyperparameters.

In [None]:
# Fit one PCA model per dataset and per number of components.
for oDRM in oDRModels:
  # Drop models from a previous run of this cell, so they are not registered twice.
  oDRM.ClearModels()
  # The upper bound COMPONENTS_TO is included in the sweep.
  for nComponents in range(COMPONENTS_FROM, COMPONENTS_TO + 1):
    print("-"*25, "PCA on %s with %d Components" % (oDRM.Name, nComponents), "-"*25)
    # fit() returns the estimator itself, so oPCA is the fitted model.
    oPCA = PCA(n_components=nComponents).fit(oDRM.Samples)
    oDRM.AddPCAModel(oPCA)

# Evaluation
Evaluate the performance of the different component counts, using the fraction of variance explained by each setting of the components hyperparameter.

In [None]:
# First model collection: solid blue line of explained variance vs. number of components.
oSerie1 = oDRModels[0].GetPlotSerie()
nX1, nY1 = oSerie1
plt.plot(nX1, nY1, "-b", label=oDRModels[0].Name)

# Second model collection: solid red line on the same axes.
oSerie2 = oDRModels[1].GetPlotSerie()
nX2, nY2 = oSerie2
plt.plot(nX2, nY2, "-r", label=oDRModels[1].Name)

plt.legend(loc="upper left")
# Leave some room to the right of the largest number of components.
nXMax = COMPONENTS_TO + 5
plt.xlim(0.0, nXMax)
plt.show()