In [1]:
import numpy as np
import pandas as pd

Dataset ( 10 datasamples, 2 features)

In [2]:
A = np.matrix([[1,2],
               [5,5],
               [1,4],
               [5,3],
               [8,1],
               [4,7],
               [8,9],
               [3,7],
               [1,7],
               [3,7]]);

In [3]:
df = pd.DataFrame(A,columns  = ['f1','f2'])
df

Unnamed: 0,f1,f2
0,1,2
1,5,5
2,1,4
3,5,3
4,8,1
5,4,7
6,8,9
7,3,7
8,1,7
9,3,7


Standardize Dataset

In [4]:
df_std  = (df - df.mean()) / (df.std())
df_std

Unnamed: 0,f1,f2
0,-1.096968,-1.223153
1,0.416091,-0.076447
2,-1.096968,-0.458682
3,0.416091,-0.840918
4,1.550886,-1.605389
5,0.037826,0.688024
6,1.550886,1.452494
7,-0.340438,0.688024
8,-1.096968,0.688024
9,-0.340438,0.688024


Covariance population formula (divide by N)

In [5]:
df_cov = np.cov(df_std.T, bias = 1)
df_cov

array([[0.9       , 0.00289172],
       [0.00289172, 0.9       ]])

Covariance sample formula (divide by N-1)

In [6]:
cov_mat = np.cov(df_std.T, bias = 0)
cov_mat

array([[1.        , 0.00321303],
       [0.00321303, 1.        ]])

In [7]:
## verify varinace(f1) is as expected
print('var(f1) (population formula): ',((df_std.f1)**2).sum()/5)
print('var(f1) (sample formula): ',((df_std.f1)**2).sum()/4)

var(f1) (population formula):  1.8000000000000003
var(f1) (sample formula):  2.2500000000000004


In [8]:
## verify varinace(f1) is as expected
print('var(f1) (population formula): ',((df_std.f1)**2).sum()/5)
print('var(f1) (sample formula): ',((df_std.f1)**2).sum()/4)

var(f1) (population formula):  1.8000000000000003
var(f1) (sample formula):  2.2500000000000004


Calculate Eigenvalue and eigen vector

In [9]:
eigen_val, eigen_vectors = np.linalg.eig(cov_mat)

In [10]:
print(eigen_val)

[1.00321303 0.99678697]


In [11]:
print(eigen_vectors)

[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


Sort the eigen values and their correspoding eigen vectors

In [12]:
n_components=1

Pick top k (n_components) eigen values and their corresponding eigen vectors

In [13]:
top_eigen_vectors = eigen_vectors[:,:n_components]

In [14]:
top_eigen_vectors

array([[0.70710678],
       [0.70710678]])

In [15]:
top_eigen_vectors.shape

(2, 1)

In [16]:
np.array(df_std).shape

(10, 2)

Transform the original matrix.

In [17]:
transformed_data = np.matmul(np.array(df_std),top_eigen_vectors)

In [18]:
pd.DataFrame(data = transformed_data
             , columns = ['principal component '+ str(i+1) for i in range(n_components)])

Unnamed: 0,principal component 1
0,-1.640573
1,0.240165
2,-1.100011
3,-0.300398
4,-0.038539
5,0.513254
6,2.12371
7,0.24578
8,-0.289167
9,0.24578


In [19]:
transformed_data.shape

(10, 1)

Result using the Sklearn library

In [20]:
from sklearn.decomposition import PCA
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(df_std)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component '+ str(i+1) for i in range(n_components)])

In [21]:
principalDf

Unnamed: 0,principal component 1
0,-1.640573
1,0.240165
2,-1.100011
3,-0.300398
4,-0.038539
5,0.513254
6,2.12371
7,0.24578
8,-0.289167
9,0.24578
