In [77]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from seedtools import load_seed,mapper_auto,dropper
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px

#verbose = 0
import warnings
warnings.filterwarnings("ignore")

## CONTENTS 
- PCA Implementation from scratch 
- PCA code

## PCA (from scratch)

### <center> 1. Data Loading <center />

In [78]:
# Implementation on 3,3 for better visualization
raw  =  load_seed("heart.csv",quiet=True)
data =  raw.data[["age","thalach","chol","target"]]
data.columns = ["f1","f2","f3","target"]

ss =  StandardScaler()

for col in ["f1","f2","f3"]:
    data[col] =  ss.fit_transform(data[col].values.reshape(-1,1))

data.head(3)

Unnamed: 0,f1,f2,f3,target
0,-0.268437,0.821321,-0.659332,0
1,-0.158157,0.255968,-0.833861,0
2,1.716595,-1.048692,-1.396233,0


In [79]:
# original 3d data 
fig = px.scatter_3d(data,x="f1",y="f2",z="f3",color="target")
fig.update_traces(marker=dict(size=12,line=dict(width=2, color='DarkSlateGrey')),selector=dict(mode='markers'))
fig.show()

### <center> 2. Covariance Matrix <center />

In [80]:
## Covariance matrix
cov_matrix =  np.cov([data.iloc[:,0],data.iloc[:,1],data.iloc[:,2]])
cov_matrix

array([[ 1.00097656, -0.39060816,  0.22003721],
       [-0.39060816,  1.00097656, -0.02179335],
       [ 0.22003721, -0.02179335,  1.00097656]])

### <center> 3. Eigen Decomposition <center />

In [81]:
# eigen decomposition
eigen_values,eigen_vectors =  np.linalg.eig(cov_matrix)
print("Eigen Values :",eigen_values)
print("Eigen Vectors :",eigen_vectors)

Eigen Values : [1.4588497  0.56173023 0.98234977]
Eigen Vectors : [[ 0.6995135  -0.71417243 -0.02527065]
 [-0.61414129 -0.61886451  0.48973176]
 [ 0.36539203  0.32705423  0.87150685]]


#### 🔷 1. Eigenvalues:

Eigenvalues = $\begin{bmatrix} 1.4588 & 0.9823 & 0.5617 \end{bmatrix}$


*These tell us how much variance is captured along each principal component.*

| PC (Principal Component) | Eigenvalue | % Variance Explained |
|--------------------------|------------|------------------------|
| PC1                      | 1.4588     | ~47.7%                 |
| PC2                      | 0.9823     | ~32.1%                 |
| PC3                      | 0.5617     | ~20.2%                 |


📌 Total variance = 1.4588 + 0.9823 + 0.5617 ≈ 3.003

#### 🔷 2. Eigenvectors:
*Each eigenvector is a direction (axis) in the original 3D space that defines a principal component.*

Eigen vectors =
$\begin{bmatrix}
  0.6995 & -0.7142 & -0.0253 \\
 -0.6141 & -0.6189 &  0.4897 \\
  0.3654 &  0.3271 &  0.8715
\end{bmatrix}$

​
  
​
 
**Each column is a principal component:**

=> First column = PC1 direction <br />
=> Second column = PC2 direction <br />
=> Third column = PC3 direction



### <center> 4. New Projected Data  <center />

In [82]:
## final touch our projected data 
pc =  eigen_vectors[0:2]
new_data =  np.dot(data.iloc[:,:3],pc.T)  # (N,3) ● (3,2) =>  (N,2)
print("New data shape :",new_data.shape)
new_data[:3]

New data shape : (1025, 2)


array([[-0.75767773, -0.66632399],
       [-0.27236598, -0.46964699],
       [ 1.98501231, -1.0890134 ]])

In [83]:

new_df = pd.DataFrame(new_data,columns=['PC1','PC2'])
new_df["target"] =  data.target.values
new_df.head(2)

Unnamed: 0,PC1,PC2,target
0,-0.757678,-0.666324,0
1,-0.272366,-0.469647,0


In [84]:
## OUR NEW DATA 
fig =  px.scatter(new_df,x="PC1",y="PC2",color="target")
fig.show()

## PCA code 
(For explained_variance_ratio)

In [86]:
## using same data 
x =  raw.data.drop(columns=["target"]).values
y = raw.data["target"].values 

x_train,x_test,y_train,y_test =  train_test_split(x,y,test_size=0.2)
x_train

array([[62.,  0.,  0., ...,  2.,  0.,  2.],
       [40.,  1.,  0., ...,  1.,  0.,  3.],
       [55.,  0.,  1., ...,  2.,  0.,  2.],
       ...,
       [58.,  1.,  1., ...,  1.,  0.,  2.],
       [50.,  1.,  0., ...,  1.,  0.,  3.],
       [64.,  0.,  2., ...,  2.,  0.,  3.]])

In [92]:
# NO OF MAX COMPONENTS = n(PC) <= n
max_components = min(x_train.shape[0], x_train.shape[1])
max_components

13

In [93]:

for c in range(1,14):
    pca =  PCA(n_components=c,svd_solver='auto')
    x_train_tf =  pca.fit_transform(x_train)
    x_test_tf =  pca.transform(x_test)
    
    model =  RandomForestClassifier()
    model.fit(x_train_tf,y_train)
    
    score = model.score(x_test_tf,y_test)
    
    print(f"C : {c} || SCORE : {score} ")

C : 1 || SCORE : 1.0 
C : 2 || SCORE : 1.0 
C : 3 || SCORE : 1.0 
C : 4 || SCORE : 1.0 
C : 5 || SCORE : 1.0 
C : 6 || SCORE : 0.9853658536585366 
C : 7 || SCORE : 1.0 
C : 8 || SCORE : 1.0 
C : 9 || SCORE : 1.0 
C : 10 || SCORE : 1.0 
C : 11 || SCORE : 1.0 
C : 12 || SCORE : 1.0 
C : 13 || SCORE : 1.0 


In [94]:
fig =  px.scatter_3d(data,x=x_train_tf[:,0],y=x_train_tf[:,1],z=x_train_tf[:,2],color=y_train)
fig.show()

In [102]:
pca =  PCA(n_components=3,svd_solver='auto') # lets take 3 , u can take any 
x_train_tf =  pca.fit_transform(x_train)
x_test_tf =  pca.transform(x_test)

In [108]:
eigen_values_ = pca.explained_variance_
eigen_vectors_ = pca.components_
print("Eigne values :\n",eigen_values_)
print("Eigen Vectors shape :\n",eigen_vectors_.shape)

Eigne values :
 [2799.02026136  541.80956593  314.35445453]
Eigen Vectors shape :
 (3, 13)


In [112]:
print("These are percentage of lambda each eigen vector explain the variance ")
print("\nfirst one explains approx 75%")
pca.explained_variance_ratio_

These are percentage of lambda each eigen vector explain the variance 

first one explains approx 75%


array([0.75250339, 0.14566294, 0.08451271])