[jupyter notebook](PCA.ipynb)

# Principal Component Analysis

In [1]:
import pandas as pd

# Toy data set: four features (X1..X4) with eleven observations each.
df = pd.DataFrame(
    {
        "X1": [7, 9, 6, -8, 8, 7, 7, -8, 8, -7, 9],
        "X2": [1, 2, 2, 3, 2, 2, 1, 1, 3, 1, 1],
        "X3": [9, -7, 8, 9, -9, 8, 7, -8, 7, -9, 6],
        "X4": [2, 2, 2, 1, 2, 2, 3, 2, 1, 2, 1],
    }
)

# Show the raw table before any scaling is applied.
display(df)

Unnamed: 0,X1,X2,X3,X4
0,7,1,9,2
1,9,2,-7,2
2,6,2,8,2
3,-8,3,9,1
4,8,2,-9,2
5,7,2,8,2
6,7,1,7,3
7,-8,1,-8,2
8,8,3,7,1
9,-7,1,-9,2


### Standardize data by mean and standard deviation

In [2]:
from sklearn.preprocessing import StandardScaler

# Hand the observations to scikit-learn as a plain list of rows.
x = df.to_numpy().tolist()

# Centre every column on zero and rescale it to unit variance.
scaler = StandardScaler(with_std=True, with_mean=True)
xS = scaler.fit(x).transform(x)

# Wrap the standardized array in a labelled frame for display.
dfS = pd.DataFrame(data=xS, columns=["Xs1", "Xs2", "Xs3", "Xs4"])
display(dfS)

Unnamed: 0,Xs1,Xs2,Xs3,Xs4
0,0.516296,-0.970143,0.91619,0.316228
1,0.80754,0.363803,-1.15111,0.316228
2,0.370674,0.363803,0.786984,0.316228
3,-1.668033,1.697749,0.91619,-1.423025
4,0.661918,0.363803,-1.409523,0.316228
5,0.516296,0.363803,0.786984,0.316228
6,0.516296,-0.970143,0.657777,2.05548
7,-1.668033,-0.970143,-1.280317,0.316228
8,0.661918,1.697749,0.657777,-1.423025
9,-1.522411,-0.970143,-1.409523,0.316228


### Apply PCA

In [3]:
from sklearn.decomposition import PCA

# Keep all 4 components so the complete transformation matrix can be inspected.
pca = PCA(n_components=4).fit(xS)

# Report the fitted axes, the variance captured along each axis, and the
# share of the total variance that each axis accounts for.
for banner, values in (
    ("\n********Principal axes (Transformation matrix)********\n", pca.components_),
    ("\n********Explained variance by each Component********\n", pca.explained_variance_),
    ("\n********Explained variance ratio by each Component********\n", pca.explained_variance_ratio_),
):
    print(banner)
    print(values)


********Principal axes (Transformation matrix)********

[[ 0.13895701  0.62860682  0.47653033 -0.59871801]
 [-0.78531844  0.18173158 -0.46718382 -0.36330133]
 [ 0.54556352  0.40341546 -0.73380448 -0.03387322]
 [ 0.257539   -0.63959602 -0.12723642 -0.71302279]]

********Explained variance by each Component********

[1.89118651 1.34104433 0.68726507 0.4805041 ]

********Explained variance ratio by each Component********

[0.42981511 0.3047828  0.15619661 0.10920548]


### Transform values along the components

In [4]:
# Project the standardized observations onto the fitted principal axes.
xC = pca.transform(xS)

# One column per component, in the same order as pca.components_.
dfC = pd.DataFrame(data=xC, columns=["Xc1", "Xc2", "Xc3", "Xc4"])
display(dfC)

Unnamed: 0,Xc1,Xc2,Xc3,Xc4
0,-0.290834,-1.124677,-0.792714,0.411415
1,-0.396968,-0.145167,1.421307,-0.103729
2,0.465887,-0.707535,-0.239214,-0.462835
3,2.124015,1.70743,-0.849221,-0.617381
4,-0.540344,0.089919,1.531485,-0.108353
5,0.486123,-0.821894,-0.159768,-0.425331
6,-1.455298,-1.635824,-0.662004,-0.795832
7,-1.641064,1.616889,-0.372598,0.128341
8,2.324637,-0.001598,0.611539,0.015552
9,-1.6824,1.562892,-0.19834,0.182284


### Calculate $m = x^{T}x$ for singular value decomposition and eigenvectors

In [5]:
import numpy as np

# Gram matrix of the standardized data (features x features).
# It is symmetric, which is what the eigen/SVD comparisons below rely on.
m = xS.T.dot(xS)
print(m)

[[11.          0.4591406   3.26359365  1.24334189]
 [ 0.4591406  11.          2.88301538 -5.90563041]
 [ 3.26359365  2.88301538 11.         -2.51280454]
 [ 1.24334189 -5.90563041 -2.51280454 11.        ]]


### Eigenvectors - Compare the transpose of the matrix of eigenvectors to pca components matrix

In [6]:
# Eigen-decomposition of m: w holds the eigenvalues, the COLUMNS of v hold
# the matching eigenvectors.
w, v = np.linalg.eig( m )

# np.linalg.eig does not promise any particular eigenvalue order, while
# pca.components_ is sorted by decreasing explained variance. Sort the
# eigenpairs by decreasing eigenvalue so that row i of the transposed
# eigenvector matrix really corresponds to row i of pca.components_.
order = np.argsort( w )[::-1]
w, v = w[order], v[:, order]

print( "\n********Transpose of eigenvector matrix********\n" )
print( np.transpose( v ) )  # transpose so each ROW is one eigenvector

print( "\n********Principal axes (Transformation matrix)********\n" )
print( pca.components_ )

# Rows may differ from pca.components_ by an overall sign: an eigenvector and
# its negation describe the same axis, just pointing in the opposite direction.


********Transpose of eigenvector matrix********

[[ 0.13895701  0.62860682  0.47653033 -0.59871801]
 [ 0.78531844 -0.18173158  0.46718382  0.36330133]
 [ 0.54556352  0.40341546 -0.73380448 -0.03387322]
 [ 0.257539   -0.63959602 -0.12723642 -0.71302279]]

********Principal axes (Transformation matrix)********

[[ 0.13895701  0.62860682  0.47653033 -0.59871801]
 [-0.78531844  0.18173158 -0.46718382 -0.36330133]
 [ 0.54556352  0.40341546 -0.73380448 -0.03387322]
 [ 0.257539   -0.63959602 -0.12723642 -0.71302279]]


### Singular value decomposition - Compare the matrix of right-singular vectors to pca components matrix

In [7]:
# SVD of the symmetric matrix m; the rows of vh are the right-singular
# vectors, returned in order of decreasing singular value.
u, s, vh = np.linalg.svd(m)

# Print both matrices one after the other for a row-by-row comparison.
for banner, values in (
    ("\n********Matrix of right-singular vectors********\n", vh),
    ("\n********Principal axes (Transformation matrix)********\n", pca.components_),
):
    print(banner)
    print(values)

# A row whose sign is flipped relative to pca.components_ describes the same
# component, only pointing in the opposite direction.


********Matrix of right-singular vectors********

[[-0.13895701 -0.62860682 -0.47653033  0.59871801]
 [ 0.78531844 -0.18173158  0.46718382  0.36330133]
 [ 0.54556352  0.40341546 -0.73380448 -0.03387322]
 [-0.257539    0.63959602  0.12723642  0.71302279]]

********Principal axes (Transformation matrix)********

[[ 0.13895701  0.62860682  0.47653033 -0.59871801]
 [-0.78531844  0.18173158 -0.46718382 -0.36330133]
 [ 0.54556352  0.40341546 -0.73380448 -0.03387322]
 [ 0.257539   -0.63959602 -0.12723642 -0.71302279]]
