<a href="https://colab.research.google.com/github/V-Rang/Machine-Learning-Practice/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
A = np.array([
     [1,2,3,4],
     [5,5,6,7],
     [1,4,2,3],
     [5,3,2,1],
     [8,1,2,2]],dtype=float)

# Procedure (manual PCA):
# A : n X m matrix, n observations (rows), m features (columns)
# Steps:
# 1. Standardize A: A[i][j] = (A[i][j]-mu[j])/s[j] where mu[j] and s[j] are
#    the mean and standard deviation of the jth feature (column)
# 2. Form the covariance matrix (m X m) of the features:
#    cov[i][j] = sum( (x_k-xbar)(y_k-ybar) )/n, where x and y are the ith and
#    jth feature columns and k runs over the n observations
# 3. Eigendecomposition of the covariance matrix
# 4. Rearrange eigvec columns in descending order of eigenvalue, resultant matrix: (m X m)
# 5. Extract the x dominant columns of the eigvec matrix
# 6. Standardized A (n X m) @ eigvec matrix (m X x) => (n X x) matrix

# Compare to the PCA of StandardScaler-scaled A.

# **1 StandardScaler calculates std dev as division by N, not N-1. You can use
#     N-1 in the manual calculation by setting ddof = 1.
# **2 For why we standardize:
# https://stats.stackexchange.com/questions/69157/why-do-we-need-to-normalize-data-before-principal-component-analysis-pca

# **3 When initializing A: use dtype = float. With an integer dtype, values
#     written back into A are truncated to integers instead of keeping the
#     desired floating point values.

# Per-feature (column-wise) mean and population standard deviation.
mu = A.mean(axis=0)
# s = A.std(axis=0, ddof=1)  # sample standard deviation (divide by N-1) instead
s = A.std(axis=0)

# A constant feature has zero standard deviation; scale it by 1 instead of
# dividing by zero (StandardScaler applies the same convention).
s[s == 0] = 1.0

# Standardize every column at once: broadcasting applies mu[j] and s[j] to
# each row of column j.
A = (A - mu) / s

# Preallocate the (m X m) feature covariance matrix; it is filled in below.
covmat = np.zeros((A.shape[1],A.shape[1]),dtype=float)

def covcalc(A,ind1,ind2,ddof=0):
  """Return the covariance between two feature columns of A.

  Args:
    A: 2-D array of shape (n observations, m features).
    ind1: column index of the first feature.
    ind2: column index of the second feature.
    ddof: delta degrees of freedom; the sum of products is divided by
      (n - ddof). The default 0 gives the population covariance (divide by
      n); use 1 for the sample covariance (divide by n - 1). n - ddof is
      expected to be positive.

  Returns:
    The covariance of columns ind1 and ind2 as a floating point scalar.
  """
  col1 = A[:,ind1]
  col2 = A[:,ind2]
  # Deviations of every observation from its feature mean.
  dev1 = col1 - np.mean(col1)
  dev2 = col2 - np.mean(col2)
  # The dot product sums the per-observation products in one vectorized call.
  return np.dot(dev1,dev2)/(A.shape[0] - ddof)


# Fill the (m X m) covariance matrix: entry (i, j) is the covariance between
# feature columns i and j of the standardized A.
for i in range(covmat.shape[0]):
  for j in range(covmat.shape[1]):
    covmat[i][j] = covcalc(A,i,j)

# covmat is symmetric, so use eigh: it is the routine meant for symmetric
# matrices and returns real eigenvalues with orthonormal eigenvectors
# (stored as the columns of v).
w,v = np.linalg.eigh(covmat)

# Sort in descending order of eigenvalue so the dominant directions come first.
idx = np.argsort(w)
idx_rev = np.flip(idx)
w_new = w[idx_rev]
v_new = v[:,idx_rev]

# Project the standardized data onto the 2 dominant eigenvectors.
# The sign of an eigenvector is arbitrary, so a column here may differ from
# the sklearn result by a factor of -1.
pca_self = A@v_new[:,:2]
print(pca_self,'\n')

######################################################
#Using sklearn.decomposition.PCA

import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Start again from the raw data: the manual PCA above overwrote A with its
# standardized values.
A = np.array(
    [[1, 2, 3, 4],
     [5, 5, 6, 7],
     [1, 4, 2, 3],
     [5, 3, 2, 1],
     [8, 1, 2, 2]],
    dtype=float,
)

# Standardize the features with StandardScaler.
scaling = StandardScaler()
scaled_A = scaling.fit_transform(A)

# Fit a 2-component PCA on the standardized data and project that same data.
pca2 = PCA(n_components=2)
pca_A_2 = pca2.fit(scaled_A).transform(scaled_A)
print(pca_A_2)

[[ 1.56561741e-02  8.45205482e-01]
 [-2.85829190e+00 -8.72549250e-01]
 [-5.75566043e-02  1.40104719e+00]
 [ 1.13385419e+00  2.66995807e-04]
 [ 1.76633814e+00 -1.37397042e+00]] 

[[-1.56561741e-02  8.45205482e-01]
 [ 2.85829190e+00 -8.72549250e-01]
 [ 5.75566043e-02  1.40104719e+00]
 [-1.13385419e+00  2.66995807e-04]
 [-1.76633814e+00 -1.37397042e+00]]


In [None]:
import sklearn
from sklearn.decomposition import PCA
# PCA on the unscaled data: fit 2 components on A, then project A onto them.
pca = PCA(n_components=2)
pca_A = pca.fit(A).transform(A)
print(pca_A)

[[-1.40033078e-02  7.55974765e-01]
 [ 2.55653399e+00 -7.80431775e-01]
 [ 5.14801919e-02  1.25313470e+00]
 [-1.01415002e+00  2.38808310e-04]
 [-1.57986086e+00 -1.22891650e+00]]


In [None]:
from sklearn.preprocessing import StandardScaler

# Raw data: 5 observations (rows) of 4 features (columns).
A = np.array(
    [[1, 2, 3, 4],
     [5, 5, 6, 7],
     [1, 4, 2, 3],
     [5, 3, 2, 1],
     [8, 1, 2, 2]],
    dtype=float,
)

# Standardize the features with StandardScaler.
scaling = StandardScaler()
scaled_A = scaling.fit_transform(A)

# Fit a 2-component PCA on the standardized data and project that same data.
pca2 = PCA(n_components=2)
pca_A_2 = pca2.fit(scaled_A).transform(scaled_A)
print(pca_A_2)

[[-1.56561741e-02  8.45205482e-01]
 [ 2.85829190e+00 -8.72549250e-01]
 [ 5.75566043e-02  1.40104719e+00]
 [-1.13385419e+00  2.66995807e-04]
 [-1.76633814e+00 -1.37397042e+00]]


In [None]:
A = np.array([
     [1,2,3,4],
     [5,5,6,7],
     [1,4,2,3],
     [5,3,2,1],
     [8,1,2,2]],dtype=float)

from sklearn.preprocessing import MinMaxScaler
pca3 = PCA(n_components=2)
# Rescale the features with MinMaxScaler.
scaling_2 = MinMaxScaler()
scaled_A_2 = scaling_2.fit_transform(A)
# print(scaled_A_2)
# Fit on the same min-max-scaled matrix that is transformed below. Fitting on
# scaled_A (the StandardScaler output from an earlier cell) would project
# scaled_A_2 with a mean and components learned from different data.
pca3.fit(scaled_A_2)
pca_A_3 = pca3.transform(scaled_A_2)
print(pca_A_3)

[[ 0.57575947 -0.08637201]
 [ 1.61394275 -0.75358655]
 [ 0.59188498  0.11654617]
 [ 0.16947558 -0.42057284]
 [-0.06253542 -0.9363814 ]]
