# PCA

Principal Component Analysis

In [1]:
import numpy as np 
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [6]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [7]:
list(iris.keys())

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename']

In [16]:
# Wrap the raw measurement matrix in a DataFrame with snake_case column names.
iris_df = pd.DataFrame(
    iris.data,
    columns=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)

In [17]:
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


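Standardize each feature to zero mean and unit variance: $z = (x - u) / s$, where $u$ is the feature's mean and $s$ its standard deviation.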

In [18]:
# Standardize every feature column; the scaler is fitted and applied in one step.
iris_scaled = StandardScaler().fit_transform(iris_df.to_numpy())

In [19]:
iris_scaled

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

PCA using sklearn.

In [22]:
# Keep all four components (one per input feature) so the full variance
# breakdown can be inspected before deciding how many to retain.
pca = PCA(n_components=4)

In [35]:
# Fit PCA on the standardized data and project every sample onto the components.
iris_pca = pca.fit_transform(iris_scaled)

The eigenvalues of the covariance matrix, i.e. the variance explained by each component.

In [44]:
pca.explained_variance_

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

Fraction of the total variance explained by each of the selected components.

In [36]:
pca.explained_variance_ratio_

array([0.72962445, 0.22850762, 0.03668922, 0.00517871])

The first two principal components capture most (about 95.8%) of the variance in `iris_scaled`.

In [37]:
# Total of all variance ratios next to the share captured by the first two components.
explained_ratio = pca.explained_variance_ratio_
sum(explained_ratio), sum(explained_ratio[:2])

(0.9999999999999999, 0.9581320720000164)

In [41]:
# Keep only the first two principal components and label them for display.
iris_pca_df = pd.DataFrame(
    iris_pca[:, :2],
    columns=['PC 1', 'PC 2'],
)

In [42]:
iris_pca_df

Unnamed: 0,PC 1,PC 2
0,-2.264703,0.480027
1,-2.080961,-0.674134
2,-2.364229,-0.341908
3,-2.299384,-0.597395
4,-2.389842,0.646835
...,...,...
145,1.870503,0.386966
146,1.564580,-0.896687
147,1.521170,0.269069
148,1.372788,1.011254


PCA using numpy.

Covariance matrix.

In [46]:
# Covariance between the standardized features (columns are the variables).
# np.cov divides by N - 1 by default, which is why the diagonal comes out
# slightly above 1 even though the data was scaled to unit variance.
cov_matrix = np.cov(iris_scaled, rowvar=False)

In [47]:
cov_matrix

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

Compute the eigenvalues and eigenvectors of the covariance matrix.

In [49]:
# The covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# in ascending order, whereas eig makes no ordering guarantee at all.
# Reverse both outputs so that column k of eigen_vectors is the axis with the
# k-th largest variance; the projection cells below rely on this when they
# take the first columns as the leading principal components.
eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)
eigen_values, eigen_vectors = eigen_values[::-1], eigen_vectors[:, ::-1]

The eigenvalues.

In [51]:
eigen_values

array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])

The eigenvectors.

In [53]:
eigen_vectors

array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
       [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
       [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
       [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]])

Percentage of the explained variance.

In [57]:
# Share of the total variance carried by each eigenvalue, in percent.
# Vectorized so the total is computed once instead of once per element;
# .tolist() keeps the result a plain Python list.
var_explained = (eigen_values / eigen_values.sum() * 100).tolist()

In [58]:
var_explained

[72.96244541329983, 22.850761786701817, 3.668921889282866, 0.5178709107154746]

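Create a DataFrame with the first two principal components by projecting the standardized data onto the corresponding eigenvectors. Eigenvectors are only defined up to sign, so a component may come out negated relative to the scikit-learn result above (compare the PC 2 columns); both are equally valid.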

In [98]:
# Project the standardized samples onto the first and second eigenvector
# (each eigenvector is one column of eigen_vectors).
pc1 = iris_scaled @ eigen_vectors[:, 0]
pc2 = iris_scaled @ eigen_vectors[:, 1]

In [102]:
# Assemble the two projected components as named columns. Building the frame
# from a mapping avoids hard-coding the sample count (previously reshape(2, 150)),
# so the cell keeps working if the number of rows changes.
# NOTE: this rebinds iris_pca_df, replacing the scikit-learn result created earlier.
iris_pca_df = pd.DataFrame({'PC 1': pc1, 'PC 2': pc2})

In [103]:
iris_pca_df

Unnamed: 0,PC 1,PC 2
0,-2.264703,-0.480027
1,-2.080961,0.674134
2,-2.364229,0.341908
3,-2.299384,0.597395
4,-2.389842,-0.646835
...,...,...
145,1.870503,-0.386966
146,1.564580,0.896687
147,1.521170,-0.269069
148,1.372788,-1.011254


The same projection, written more simply as a single matrix product.

In [104]:
# Project onto the first two eigenvectors at once; the result has one column per component.
pc = iris_scaled @ eigen_vectors[:, :2]

In [106]:
# Label the two projected columns for display.
iris_pca_df2 = pd.DataFrame(
    pc,
    columns=['PC 1', 'PC 2'],
)

In [107]:
iris_pca_df2

Unnamed: 0,PC 1,PC 2
0,-2.264703,-0.480027
1,-2.080961,0.674134
2,-2.364229,0.341908
3,-2.299384,0.597395
4,-2.389842,-0.646835
...,...,...
145,1.870503,-0.386966
146,1.564580,0.896687
147,1.521170,-0.269069
148,1.372788,-1.011254
