# Principal Component Analysis (PCA)

# Generate dataframe

In [250]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math

In [251]:
np.random.seed(42)

fratures = [np.random.random(30) for _ in range(10)]
columns = [ "feature_" + str(i) for i in range(10)]

In [252]:
df = pd.DataFrame(fratures,columns).T

In [253]:
df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,0.37454,0.607545,0.388677,0.119594,0.80744,0.908266,0.341066,0.548734,0.940459,0.809361
1,0.950714,0.170524,0.271349,0.713245,0.896091,0.239562,0.113474,0.691895,0.953929,0.810113
2,0.731994,0.065052,0.828738,0.760785,0.318003,0.144895,0.924694,0.651961,0.914864,0.867072
3,0.598658,0.948886,0.356753,0.561277,0.110052,0.489453,0.877339,0.224269,0.370159,0.913241
4,0.156019,0.965632,0.280935,0.770967,0.227935,0.98565,0.257942,0.712179,0.015457,0.511342
5,0.155995,0.808397,0.542696,0.493796,0.427108,0.242055,0.659984,0.237249,0.928319,0.501516
6,0.058084,0.304614,0.140924,0.522733,0.818015,0.672136,0.817222,0.3254,0.428184,0.798295
7,0.866176,0.097672,0.802197,0.427541,0.860731,0.76162,0.555201,0.746491,0.966655,0.649964
8,0.601115,0.684233,0.074551,0.025419,0.006952,0.237638,0.529651,0.649633,0.96362,0.701967
9,0.708073,0.440152,0.986887,0.107891,0.510747,0.728216,0.241852,0.849223,0.853009,0.795793


In [254]:
df.shape

(30, 10)

In [255]:
# Standardization
def stand(x):
    return (x - x.mean()) / x.std()

In [256]:
newdf = stand(df)

In [257]:
newdf

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,-0.226825,0.337706,-0.315043,-1.269522,1.188577,1.537721,-0.509765,0.174595,1.102914,1.055381
1,1.813397,-0.990203,-0.710451,0.758141,1.488393,-0.799221,-1.217,0.739698,1.147166,1.057989
2,1.038912,-1.310687,1.168001,0.920519,-0.466685,-1.130057,1.303834,0.582066,1.018831,1.255497
3,0.566774,1.374887,-0.42263,0.239083,-1.169972,0.074081,1.156683,-1.106167,-0.77065,1.415588
4,-1.000606,1.425772,-0.678147,0.955297,-0.771294,1.80816,-0.768072,0.819765,-1.935926,0.021985
5,-1.000691,0.948007,0.204015,0.008594,-0.097697,-0.790507,0.481259,-1.054931,1.063031,-0.012088
6,-1.347392,-0.582765,-1.149994,0.107432,1.22434,0.712509,0.969871,-0.706974,-0.580023,1.017009
7,1.514049,-1.211568,1.078557,-0.217704,1.368804,1.025232,0.15565,0.955206,1.188974,0.502662
8,0.575472,0.570727,-1.37368,-1.591185,-1.518653,-0.805946,0.076253,0.572875,1.179004,0.682985
9,0.954207,-0.170924,1.70098,-1.309494,0.18517,0.908496,-0.818069,1.360721,0.815624,1.008331


In [258]:
# Covariance Matrix
def coMat(x):
    return np.cov(x.T)

In [259]:
cov = coMat(df)

In [260]:
cov

array([[ 0.079754  , -0.01492926, -0.00970407,  0.00803297, -0.00795922,
         0.00517455,  0.01976306,  0.01890092,  0.03630665,  0.01301234],
       [-0.01492926,  0.10830968, -0.01846831,  0.00793838, -0.0256429 ,
        -0.00347226,  0.00211297,  0.00507083, -0.01441531, -0.02802807],
       [-0.00970407, -0.01846831,  0.08804728, -0.00793968,  0.01117639,
         0.01172478, -0.03474914,  0.01505522,  0.00902347, -0.00529625],
       [ 0.00803297,  0.00793838, -0.00793968,  0.08571759, -0.00772733,
        -0.00469914, -0.01106403, -0.00830523, -0.02448112,  0.01265203],
       [-0.00795922, -0.0256429 ,  0.01117639, -0.00772733,  0.08742977,
         0.00735174,  0.00536358,  0.00906316, -0.00158379,  0.01478564],
       [ 0.00517455, -0.00347226,  0.01172478, -0.00469914,  0.00735174,
         0.08187887, -0.01059775,  0.00065504, -0.0167141 ,  0.00833892],
       [ 0.01976306,  0.00211297, -0.03474914, -0.01106403,  0.00536358,
        -0.01059775,  0.10355921, -0.01081917

In [261]:
# Evalue,Evector
def eigenV(x):
    Evalue,Evector = np.linalg.eig(x)
    #Evector = Evector.T
    return Evalue,Evector

In [262]:
Evalue,Evector=eigenV(cov)

In [263]:
Evalue

array([0.16307655, 0.15036615, 0.02016979, 0.11704867, 0.10723031,
       0.03911653, 0.08626907, 0.07720219, 0.06382984, 0.0503897 ])

In [264]:
def getmatrix(Evalue,Evector,n_com):
    sortV = np.argsort(Evalue)[::-1]
    Evalue = Evalue[sortV]
    Evector = Evector[sortV]
    
    return Evector[:n_com]
    

In [265]:
n_com = 3
mat = getmatrix(Evalue,Evector,n_com)

In [266]:
mat.shape

(3, 10)

In [267]:
mat

array([[-0.22533125,  0.43181644, -0.54841942,  0.09212014,  0.44510545,
         0.32699946,  0.20489747,  0.19962319,  0.25782155, -0.01259238],
       [ 0.6153576 , -0.06730151, -0.22944779, -0.23108688,  0.24017839,
        -0.26865981,  0.44385825, -0.34382586,  0.09620329,  0.237514  ],
       [ 0.1532434 , -0.17003208,  0.32350646,  0.48792316,  0.41842818,
        -0.10523407, -0.28621328,  0.01380182,  0.5665014 ,  0.12526528]])

In [268]:
def transfrom(df,mat):
    return np.dot(df,mat.T)

In [269]:
transfrom(newdf,mat)

array([[ 1.48611046,  0.14617379,  0.4273014 ],
       [ 0.20515299,  1.30968872,  2.43415902],
       [-1.30287053,  1.21311316,  1.50200851],
       [ 0.0229225 ,  1.15286944, -1.26990992],
       [ 1.05587863, -2.25166441, -1.52485167],
       [ 0.38402164,  0.13625075,  0.24635321],
       [ 1.36573526,  0.41108303, -0.47821942],
       [-0.00859449,  0.84381119,  1.85073407],
       [ 0.20940083,  0.96289712, -1.04071511],
       [-0.66120423, -0.20159864,  0.90979276],
       [-1.43271648, -1.1896958 , -0.53662116],
       [ 0.10057529,  1.16583946, -0.99609777],
       [-0.26998278,  2.10675271, -0.87014764],
       [-0.24142946, -1.07567961, -0.97173315],
       [-0.75634175, -0.24435623,  1.10668839],
       [-0.21589872, -1.30532159, -0.84985439],
       [ 0.02328384, -0.48418128,  0.77478333],
       [ 1.08711969,  1.50678551,  0.3147189 ],
       [ 0.11853366,  0.41436632, -1.05343287],
       [ 1.01308768,  0.77404125, -1.46881416],
       [ 1.06206271, -0.23259477,  0.275

# In class format

In [270]:
class PCA():
    
    def __init__(self,n_com):
        self.n_com = n_com
        
    def fit(self,df):
        stan = self._stand(df)
        comM = np.cov(stan.T)
        Evalue,Evector = self._eigenV(comM)
        mat = self._getmatrix(Evalue,Evector,self.n_com)
        
        return np.dot(df,mat.T)

    def _stand(self,df):
        return (df - df.mean()) / df.std()
    
    def _eigenV(self,x):
        Evalue,Evector = np.linalg.eig(x)
        return Evalue,Evector
    
    def _getmatrix(self,Evalue,Evector,n_com):
        sortV = np.argsort(Evalue)[::-1]
        Evalue = Evalue[sortV]
        Evector = Evector[sortV]
    
        return Evector[:n_com] 

In [271]:
model = PCA(6)

In [273]:
pd.DataFrame(model.fit(df))

Unnamed: 0,0,1,2,3,4,5
0,-0.229464,0.10488,1.188121,-0.186995,-1.275237,-0.407506
1,-0.970421,-0.479913,0.81837,0.246399,-1.05206,-0.324495
2,-0.604791,-0.525547,0.746077,0.069711,-1.375029,0.422821
3,0.330329,-0.421329,0.550322,0.060137,-1.604306,-0.035763
4,0.061414,-0.489373,0.664458,-0.194021,-1.455147,-0.601759
5,-0.075843,-0.26836,0.529365,-0.419109,-1.318439,-0.144223
6,-0.188601,0.253131,0.634582,0.222939,-1.332287,-0.487638
7,-0.854072,-0.135775,1.280871,0.100345,-1.39449,0.145664
8,-0.05245,-0.188311,0.627485,-0.514736,-0.968977,0.054701
9,-0.378291,-0.413575,1.507623,-0.201556,-1.129656,0.127651
