# Problem 7

In [1]:
import numpy as np
from sklearn.decomposition import PCA
import cv2
import glob
import matplotlib.pyplot as plt
import copy
import mnist_reader as mnist
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.metrics import accuracy_score as AC
from tqdm.notebook import tqdm

# Combine the train and test sets

In [2]:
# directory that holds the MNIST files read by mnist_reader
DATA_SET_PATH = "mnist/"

# load both splits with the same reader call: 'train' first, then 't10k'
(train_imgs, train_labels), (test_imgs, test_labels) = [
    mnist.load_mnist(DATA_SET_PATH, kind=split) for split in ('train', 't10k')
]

image path mnist/train-images-idx3-ubyte.gz
image path mnist/t10k-images-idx3-ubyte.gz


In [3]:
# stack the train and test images along the first axis
data = np.concatenate([train_imgs, test_imgs], axis=0)

# standard form
# transpose all three arrays in one step (each right-hand side is evaluated before assignment)
data, train_imgs, test_imgs = data.T, train_imgs.T, test_imgs.T

# PCA

In [4]:
class PCA():
    """PCA via eigen-decomposition of the scatter matrix.

    Data matrices are laid out as (n_features, n_samples): every column is one
    sample. Components are kept while their condition number
    (largest |eigenvalue| / |eigenvalue|) stays below ``th``.

    NOTE: this class shadows ``sklearn.decomposition.PCA`` imported at the top
    of the notebook.

    Parameters
    ----------
    th : float
        Condition-number threshold; a component is kept when its condition
        number is strictly smaller than ``th``.
    whiten : bool
        When true, the projected components are rescaled so that the sample
        covariance of the transformed training data is the identity.

    Attributes set by ``fit``
    -------------------------
    m : (n_features, 1) mean of the training data
    S : (n_features, n_features) scatter matrix
    w, V : eigenvalues (descending) and matching eigenvectors (columns) of S
    cn : condition number of every eigenvalue
    mask : boolean selector of the kept components
    D : (k, k) scaling matrix (identity unless whitening)
    T : (k, n_features) transform matrix
    n_features_ : number of kept components k
    features_ : number of input features
    """
    def __init__(self , th = 10**1 , whiten = False):
        self.th = th
        self.whiten = whiten

    def fit(self , X):
        """Estimate mean, eigen-basis and transform matrix from ``X``.

        Parameters
        ----------
        X : array-like of shape (n_features, n_samples)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or holds fewer than two samples.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_features, n_samples)")
        n_features , n_samples = X.shape
        if n_samples < 2:
            raise ValueError("at least two samples are needed to estimate the scatter matrix")

        #mean of data (kept as a column so it broadcasts over samples)
        self.m = np.mean(X , axis = 1 , keepdims = True)

        #scatter matrix; atleast_2d keeps the single-feature case a 1x1 matrix
        self.S = (n_samples - 1) * np.atleast_2d(np.cov(X))

        #eigen values and eigen vectors
        # S is symmetric, so eigh is the right routine: real output, orthonormal
        # eigenvectors. eigh returns ascending order; flip to descending so the
        # strongest component comes first.
        w , V = np.linalg.eigh(self.S)
        self.w = np.ascontiguousarray(w[::-1])
        self.V = np.ascontiguousarray(V[: , ::-1])

        #condition number
        # zero eigenvalues give inf (or nan when every eigenvalue is zero); both
        # compare False against th, so silence the floating-point warnings
        abs_w = np.abs(self.w)
        with np.errstate(divide = 'ignore' , invalid = 'ignore'):
            self.cn = np.max(abs_w) / abs_w

        #mask
        # non-positive eigenvalues carry no variance and cannot be whitened
        self.mask = (self.cn < self.th) & (self.w > 0)
        kept_w = self.w[self.mask]
        n_kept = int(kept_w.size)

        # D matrix for whitening
        if self.whiten :
            # eigenvalues of S are (n_samples - 1) times the variance along each
            # component, so this scaling yields unit sample variance
            self.D = np.diag(np.sqrt((n_samples - 1) / kept_w))
        else :
            # identity sized to the kept components (not to the input dimension)
            self.D = np.eye(n_kept)

        # transform matrix
        self.T = np.matmul(self.D , self.V[: , self.mask].T)

        #final_features
        self.n_features_ = n_kept
        self.features_ = n_features

    def transform(self , X):
        """Center ``X`` with the training mean and project it with ``self.T``.

        ``X`` has shape (n_features, n_samples); the result has shape
        (n_features_, n_samples).
        """
        return np.matmul(self.T , (np.asarray(X) - self.m))

In [5]:
# fit the whitening PCA on the combined data
pca1 = PCA(whiten = True)
pca1.fit(data)

# summary: largest condition number, smallest and largest eigenvalue,
# input dimensionality and number of retained components
(pca1.cn.max() , pca1.w.min() , pca1.w.max() , pca1.features_ , pca1.n_features_)

(154173308.2466142, 109.37509654080506, 16862720473.488724, 784, 6)

# A) Eigenvalues sorted in descending order

In [6]:
# np.sort returns a sorted copy; ndarray.sort() would reorder pca1.w in place and
# break its pairing with the columns of pca1.V and with pca1.mask / pca1.cn.
np.sort(pca1.w)[::-1]

array([1.68627205e+10, 1.03389054e+10, 3.41728510e+09, 2.84574939e+09,
       2.20645070e+09, 1.95402598e+09, 1.34403594e+09, 1.09858228e+09,
       7.72924827e+08, 7.65054468e+08, 5.76883668e+08, 5.21031643e+08,
       4.39539101e+08, 3.74489082e+08, 3.52063107e+08, 3.45748094e+08,
       3.15468934e+08, 3.02127179e+08, 2.65836726e+08, 2.61155813e+08,
       2.53795557e+08, 2.39136799e+08, 2.24184286e+08, 2.15286753e+08,
       2.10220229e+08, 2.01229220e+08, 1.92036243e+08, 1.84303808e+08,
       1.78812383e+08, 1.69562229e+08, 1.59828021e+08, 1.52481606e+08,
       1.50366588e+08, 1.46746991e+08, 1.42248654e+08, 1.35885075e+08,
       1.31650831e+08, 1.29870582e+08, 1.25389274e+08, 1.21080125e+08,
       1.14110546e+08, 1.13057009e+08, 1.12106199e+08, 1.04777348e+08,
       1.02478378e+08, 9.76751090e+07, 9.51675727e+07, 9.40827965e+07,
       9.36602710e+07, 8.92548985e+07, 8.84752699e+07, 8.66382499e+07,
       8.50592999e+07, 8.35912797e+07, 8.12282233e+07, 7.81825758e+07,
      

In [7]:
# sorted copy for display only; pca1.cn itself stays aligned with pca1.w and pca1.mask
np.sort(pca1.cn)[::-1]

array([1.54173308e+08, 3.01065648e+07, 1.85622288e+07, 2.94473930e+06,
       2.66069622e+06, 1.94027268e+06, 1.54900580e+06, 1.05435987e+06,
       8.51060440e+05, 6.65175172e+05, 5.87081390e+05, 4.46664915e+05,
       3.70592246e+05, 3.50271513e+05, 2.96641632e+05, 2.59516270e+05,
       2.36375667e+05, 2.12137214e+05, 1.92964519e+05, 1.78595467e+05,
       1.69763127e+05, 1.58576973e+05, 1.45738864e+05, 1.38654073e+05,
       1.33521338e+05, 1.26602218e+05, 1.22334292e+05, 1.14811557e+05,
       1.05328371e+05, 9.65968969e+04, 9.56119120e+04, 8.85130959e+04,
       8.16193489e+04, 7.53182955e+04, 7.28622686e+04, 6.99523856e+04,
       6.85318425e+04, 6.52277015e+04, 6.48841011e+04, 6.26715904e+04,
       5.57797290e+04, 5.43298971e+04, 5.14766014e+04, 4.96673996e+04,
       4.87024987e+04, 4.79877244e+04, 4.50583587e+04, 4.46972513e+04,
       4.28137432e+04, 4.20991762e+04, 4.12840774e+04, 3.97550709e+04,
       3.84511465e+04, 3.75682981e+04, 3.71289572e+04, 3.62921043e+04,
      

In [11]:
# scatter matrix stored on pca1 by fit (pca1.S)
print(pca1.S)

[[3.06951923e+02 3.05815385e+02 3.03950000e+02 ... 1.28709615e+02
  1.74484615e+02 2.09153846e+02]
 [3.05815385e+02 1.05929108e+03 3.53296800e+03 ... 5.77476492e+03
  1.75338092e+03 2.82910769e+02]
 [3.03950000e+02 3.53296800e+03 2.90090680e+04 ... 3.08465380e+04
  9.79710400e+03 2.14768000e+03]
 ...
 [1.28709615e+02 5.77476492e+03 3.08465380e+04 ... 3.90839061e+06
  1.35845269e+06 1.06585649e+05]
 [1.74484615e+02 1.75338092e+03 9.79710400e+03 ... 1.35845269e+06
  1.15610744e+06 1.23886809e+05]
 [2.09153846e+02 2.82910769e+02 2.14768000e+03 ... 1.06585649e+05
  1.23886809e+05 6.48331077e+04]]
