In [1]:
# PCA: Principal Component Analysis
# It is a dimensionality reduction technique

In [2]:
# Load the iris dataset so we have something to run PCA on
from sklearn.datasets import load_iris
import numpy as np
import pandas as df  # NOTE(review): unconventional alias (usually `pd`); pandas is not used in the visible cells
iris = load_iris()
x, y = iris.data, iris.target  # feature matrix and class labels

x.shape

(150, 4)

- **Step 1: Standardization of data**

Before proceeding with PCA, we need to perform the standardization of the data.

Standardization is a crucial step because the original variables may have different scales. We need to bring them to a similar range so that the covariance analysis is meaningful.

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Learn each feature's mean and standard deviation, then rescale the
# features to zero mean and unit variance
std = StandardScaler().fit(x)
x_new = std.transform(x)

- **Step 2: Computing covariance matrix with standardized data**

The covariance matrix captures how each pair of variables varies together. Because the data has been standardized, its entries are (up to a factor of n/(n-1)) the pairwise correlations, so it shows which variables are strongly correlated with each other.

In [5]:
# Feature-by-feature covariance: rowvar=False tells numpy that each column
# of x_new is a variable and each row is an observation
cov_mat = np.cov(x_new, rowvar=False)
cov_mat

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

- **Step 3: Calculating Eigenvectors and Eigenvalues on the covariance matrix**

The eigenvectors (principal components) represent the directions of the new feature space, and the eigenvalues give the variance of the data along those directions.

Each eigenvector has a corresponding eigenvalue, and the sum of all the eigenvalues is the total variance in the dataset. Computing the eigenvalues is important because they tell us along which directions the most variance lies.

In [6]:
# A covariance matrix is symmetric, so use eigh: it is the solver meant for
# symmetric matrices and always returns real eigenvalues. The general-purpose
# eig gives no ordering guarantee and may return complex values.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# eigh returns the eigenvalues in ascending order; flip both arrays so that
# column i of eig_vecs is the direction with the i-th largest variance.
eig_vals = eig_vals[::-1]
eig_vecs = eig_vecs[:, ::-1]

print('Eigenvectors:\n',eig_vecs)
print('\nEigenvalues:\n',eig_vals)

Eigenvectors:
 [[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]

Eigenvalues:
 [2.93808505 0.9201649  0.14774182 0.02085386]


In [7]:

# Pair every eigenvalue with its eigenvector (column i of eig_vecs)
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort explicitly from largest to smallest eigenvalue: the order in which the
# solver returns eigenvalues is not guaranteed, and picking the "top k"
# components only makes sense on a sorted list.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

for eig_val, eig_vec in eig_pairs:
    print (eig_vec,':',eig_val)

[ 0.52106591 -0.26934744  0.5804131   0.56485654] : 2.9380850501999936
[-0.37741762 -0.92329566 -0.02449161 -0.06694199] : 0.920164904162487
[-0.71956635  0.24438178  0.14212637  0.63427274] : 0.14774182104494776
[ 0.26128628 -0.12350962 -0.80144925  0.52359713] : 0.02085386217646217


- **Step 4: Selecting The Principal Components**

The eigenvectors with the lowest eigenvalues bear the least information about the distribution of the data; those are the ones that can be dropped.

Before going any further, we want to know how much information (variance) is held by each of the principal components. We can get this from the **explained variance**, which is calculated from the eigenvalues.

In [8]:
# Total variance in the data = sum of all eigenvalues
tot = sum(eig_vals)

# Percentage of the total variance explained by each component,
# listed from the most to the least informative component
descending_vals = sorted(eig_vals, reverse=True)
var_exp = [(val / tot) * 100 for val in descending_vals]
print (var_exp)

[72.96244541329987, 22.850761786701774, 3.6689218892828714, 0.5178709107154775]


In [9]:
# Cumulative variance (in %) explained by the first two principal components,
# computed from var_exp instead of retyping rounded numbers by hand
round(sum(var_exp[:2]), 2)

95.81

**Note**

- Now we can select the top k eigenvectors.

- But the variance that corresponds to those k eigenvectors should be enough to describe the data set.

- If we take the first two eigenvectors we keep around 95% of the information (variance).

### Now PCA using sklearn

In [10]:
from sklearn.decomposition import PCA

In [11]:
# n_components accepts an int or a float:
#   int   -> number of components to keep
#   float -> value between 0 and 1: how much of the original data's
#            information (variance) we want to preserve
pca = PCA(n_components=0.95)

In [12]:
# Fit PCA on the standardized data and project the data onto the components it keeps
pca_data = pca.fit_transform(x_new)

In [13]:
# pca.n_components_   # uncomment to show how many components PCA kept

In [14]:
# rows = samples, columns = principal components kept by PCA
pca_data.shape

(150, 2)

In [15]:
# Fraction (0-1, not percent) of the total variance held by each of the retained eigenvectors
pca.explained_variance_ratio_

array([0.72962445, 0.22850762])

In [16]:
# Eigenvalues of the retained eigenvectors (variance along each principal axis)
pca.explained_variance_

array([2.93808505, 0.9201649 ])

In [17]:
# Each row is one retained principal axis (eigenvector); the overall sign of an eigenvector is arbitrary
pca.components_

array([[ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654],
       [ 0.37741762,  0.92329566,  0.02449161,  0.06694199]])

### PCA on MNIST dataset

In [18]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np

In [19]:
# Fetch the MNIST digits dataset ('mnist_784') from OpenML.
# as_frame=True makes the return type explicit: data/target come back as
# pandas objects, which is what the `.values` access in the next cell needs.
mnist = fetch_openml('mnist_784', as_frame=True)

In [20]:
# Pull the underlying arrays out of the pandas containers returned by fetch_openml.
# NOTE: this rebinds `y`, which held the iris targets earlier in the notebook.
X, y = mnist.data.values, mnist.target.values

In [21]:
# rows = images, columns = pixel features
X.shape

(70000, 784)

In [22]:
# one label per image
y.shape

(70000,)

In [23]:
# Lists the distinct labels.
# NOTE(review): the message says "no. of" but the labels themselves are printed, not a count.
print('no. of classes/labels:',np.unique(y))

no. of classes/labels: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']


In [24]:
### display no. of samples for each class/label

In [25]:
# Labels are strings, so cast to int before counting; entry i is the number of samples of digit i
np.bincount(y.astype(int))

array([6903, 7877, 6990, 7141, 6824, 6313, 6876, 7293, 6825, 6958],
      dtype=int64)

#### Let's take two classes only

In [26]:
# Keep only the digits '0' and '1' (the labels are strings in this dataset).
# Build the boolean mask once and reuse it, so X and y are guaranteed to be
# filtered identically and the comparison is not evaluated twice.
# NOTE: this rebinds `x_new`, which held the standardized iris data earlier.
zero_one_mask = (y == '0') | (y == '1')
x_new = X[zero_one_mask]
y_new = y[zero_one_mask]

x_new.shape

(14780, 784)

In [27]:
# should have as many labels as x_new has rows
y_new.shape

(14780,)

In [28]:
# sanity check: only the two selected labels should remain
np.unique(y_new)

array(['0', '1'], dtype=object)

In [29]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x_new,
    y_new,
    test_size=0.2,
    random_state=10,
)

In [30]:
# training portion of the split (80% of the filtered samples)
xtrain.shape

(11824, 784)

In [31]:
# held-out portion of the split (the remaining 20%)
xtest.shape

(2956, 784)

In [32]:
# largest pixel value in the training set; this is the divisor used for scaling below
xtrain.max()

255.0

In [33]:
# smallest pixel value in the training set
xtrain.min()

0.0

In [34]:
# Training pixels lie in [0, 255] (see max/min above); dividing by 255 rescales
# both splits with the same constant so they stay comparable
std_train, std_test = xtrain / 255.0, xtest / 255.0

In [35]:
### PCA

In [36]:
# Fresh PCA that keeps enough components to preserve 95% of the variance.
# NOTE: this rebinds `pca`, which held the PCA fitted on iris earlier.
pca = PCA(n_components=0.95)

In [37]:
# Learn the principal components from the training data only...
pca_train = pca.fit_transform(std_train)

# ...then project the test data onto those same components (no refitting,
# so nothing about the test set leaks into the transformation)
pca_test = pca.transform(std_test)

In [38]:
# number of columns = components PCA kept to reach the 95% variance target
pca_train.shape

(11824, 97)

In [39]:
# projected with the same fitted PCA, so the column count must match pca_train
pca_test.shape

(2956, 97)

In [40]:
# Support vector classifier with a linear kernel
clf = SVC(kernel="linear")

In [41]:
# Train on the PCA-reduced training set
clf.fit(pca_train, ytrain)

# Accuracy on the data the model has seen vs. the held-out data
train_score = clf.score(pca_train, ytrain)
test_score = clf.score(pca_test, ytest)

print(train_score, ',', test_score)

1.0 , 0.9989851150202977
