In [1]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import  train_test_split
import numpy as np

In [2]:
# Load scikit-learn's breast-cancer dataset and hold out a test split.
# The fixed random_state keeps the split reproducible across runs.
bc_dataset = datasets.load_breast_cancer()
x, y = bc_dataset.data, bc_dataset.target
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=123,
)

In [3]:
class Self_PCA:
    """Principal component analysis via eigen-decomposition of the covariance matrix.

    Parameters
    ----------
    n_comp : int or None
        Number of principal components to keep. When None, ``fit`` replaces it
        with ``min(n_samples, n_features)`` of the training data.
    mean_centre : bool
        Whether ``fit`` subtracts the column means before building the
        covariance matrix. ``np.cov`` centres its input on its own, so the
        fitted components are the same either way; ``transform`` always
        subtracts the training mean regardless of this flag.

    Attributes (populated by ``fit``)
    ---------------------------------
    components : ndarray of shape (n_comp, n_features)
        Principal axes, one per row, ordered by decreasing eigenvalue.
    explanined_var : ndarray of shape (n_comp,)
        Eigenvalues of the covariance matrix matching ``components``.
        (The misspelt name is kept so existing callers keep working;
        ``explained_var`` is a correctly spelt read-only alias.)
    """

    def __init__(self,n_comp = None,mean_centre = False):
        self.n_comp = n_comp
        self.components = None
        self.explanined_var = None
        # Honour the caller's choice (it used to be hard-coded to False).
        self._mean_centre = bool(mean_centre)
        self._mean = None

    @property
    def explained_var(self):
        """Correctly spelt alias for ``explanined_var``."""
        return self.explanined_var

    def fit(self,x_train):
        """Learn the principal axes of ``x_train`` (rows = samples, columns = features)."""
        x_train = np.asarray(x_train, dtype=float)

        # The mean is always stored because transform() centres with it.
        self._mean = np.mean(x_train,axis=0)
        if self._mean_centre:
            x_train = x_train - self._mean

        # np.cov returns a 0-d array for single-feature input; the
        # decomposition below needs a square 2-D matrix.
        cov_matrix = np.atleast_2d(np.cov(x_train.T))

        # A covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas eig can return complex values
        # caused purely by round-off.
        eigen_vals , eigen_vector = np.linalg.eigh(cov_matrix)
        # eigh returns eigenvectors as columns; store them as rows.
        eigen_vector = eigen_vector.T

        nSamples , nFeatures = x_train.shape
        if self.n_comp is None:
            self.n_comp = min(nSamples,nFeatures)

        # Order by decreasing eigenvalue (largest variance first).
        sort_idxs = np.argsort(eigen_vals)[::-1]
        eigen_vals = eigen_vals[sort_idxs]
        eigen_vector = eigen_vector[sort_idxs]

        # The sign of an eigenvector is arbitrary. Fix it so the entry with
        # the largest magnitude in every component is positive, which makes
        # the result deterministic.
        largest = np.argmax(np.abs(eigen_vector), axis=1)
        signs = np.sign(eigen_vector[np.arange(eigen_vector.shape[0]), largest])
        signs[signs == 0] = 1.0
        eigen_vector = eigen_vector * signs[:, np.newaxis]

        self.components = eigen_vector[:self.n_comp]
        self.explanined_var = eigen_vals[:self.n_comp]

    def transform(self,x_train):
        """Project ``x_train`` onto the fitted components.

        The training mean stored by ``fit`` is subtracted first. Returns an
        array of shape (n_samples, n_comp).
        """
        centred = np.asarray(x_train) - self._mean
        return np.dot(centred,self.components.T)

    def fit_n_transform(self,x_train):
        """Fit on ``x_train`` and return its projection onto the components."""
        self.fit(x_train)
        return self.transform(x_train)

In [29]:
# Find the smallest number of components whose cumulative eigenvalue sum
# reaches the requested fraction of the total variance.

n = 0
curr_var = 0

# Fit with all components so every eigenvalue is available.
pca = Self_PCA()
pca.fit(x_train)
total_var = np.sum(pca.explanined_var)

# Fraction of the total variance to retain (tunable).
var_to_restore = 0.95

# Bound n by the number of eigenvalues: without it, a threshold that rounding
# error keeps just out of reach would index past the end of the array.
while n < len(pca.explanined_var) and curr_var / total_var < var_to_restore:
    curr_var += pca.explanined_var[n]
    n += 1

print(f"Optimal value of k with restoring {var_to_restore*100}% of variance is ",n)

Optimal value of k with restoring 95.0% of variance is  1


In [22]:
# Hand-written PCA: fit on the training split, keeping the n components
# selected above, and project the training data onto them.
PCA_obj = Self_PCA(n_comp=n, mean_centre=True)
x_train_pca = PCA_obj.fit_n_transform(x_train)

# Show the learned principal axes (one row per component).
print("Self Implemented PCA \n")
print(PCA_obj.components)

Self Implemented PCA 

[[ 5.10120380e-03  2.26911347e-03  3.51226828e-02  5.13955528e-01
   3.55976675e-06  3.94653551e-05  8.03726490e-05  4.74061830e-05
   7.56800010e-06 -2.94958268e-06  3.01243192e-04 -5.18780326e-05
   2.14460525e-03  5.36865589e-02 -9.38812218e-07  5.47687188e-06
   8.08106248e-06  3.13979786e-06 -1.05474179e-06 -1.43235281e-07
   7.19117697e-03  3.22096124e-03  4.97210549e-02  8.53910813e-01
   5.43695454e-06  1.00543242e-04  1.65335175e-04  7.29148775e-05
   2.02960997e-05  1.09658820e-06]]


In [24]:
# scikit-learn PCA with the same number of components, for comparison.
# NOTE(review): whiten=True presumably rescales the projected data, so
# x_train_pca_ would not be directly comparable to the hand-written
# projection even though components_ match — confirm whether it is intended.
PCA_obj_ = PCA(n_components=n, whiten=True)
x_train_pca_ = PCA_obj_.fit_transform(x_train)

# Show the principal axes learned by scikit-learn.
print("InBuilt PCA \n")
print(PCA_obj_.components_)

InBuilt PCA 

[[ 5.10120380e-03  2.26911347e-03  3.51226828e-02  5.13955528e-01
   3.55976675e-06  3.94653551e-05  8.03726490e-05  4.74061830e-05
   7.56800010e-06 -2.94958268e-06  3.01243192e-04 -5.18780326e-05
   2.14460525e-03  5.36865589e-02 -9.38812218e-07  5.47687188e-06
   8.08106248e-06  3.13979786e-06 -1.05474179e-06 -1.43235281e-07
   7.19117697e-03  3.22096124e-03  4.97210549e-02  8.53910813e-01
   5.43695454e-06  1.00543242e-04  1.65335175e-04  7.29148775e-05
   2.02960997e-05  1.09658820e-06]]
