<a href="https://colab.research.google.com/github/Susovan88/Machine_Learning/blob/main/PCA/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import numpy as np
import pandas as pd

# for reproducibility
np.random.seed(23)

# class 1 data
mu_vec1 = np.array([0, 0, 0])
cov_mat1 = np.array([[1, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 40)

df1 = pd.DataFrame(class1_sample, columns=['feature1', 'feature2', 'feature3'])
df1['target'] = 1


mu_vec2 = np.array([1, 1, 1])
cov_mat2 = np.array([[1, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 40)

df2 = pd.DataFrame(class2_sample, columns=['feature1', 'feature2', 'feature3'])
df2['target'] = 0

df = pd.concat([df1, df2], ignore_index=True)

df = df.sample(frac=1, random_state=23).reset_index(drop=True)

print(df.head(),df.shape)


   feature1  feature2  feature3  target
0  0.190141  0.512137  0.131538       1
1  0.010229  0.437830  1.327788       1
2  0.767934  0.809642  1.525384       0
3  0.973444  1.623219  0.996259       0
4  1.267248  0.173634 -1.223255       1 (80, 4)


In [31]:
import plotly.express as px
#y_train_trf = y_train.astype(str)
fig = px.scatter_3d(df, x=df['feature1'], y=df['feature2'], z=df['feature3'],
              color=df['target'].astype('str'))
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

In [32]:
from sklearn.preprocessing import StandardScaler  ## mean centering
std_scaler=StandardScaler()
df.iloc[:,:3]=std_scaler.fit_transform(df.iloc[:,:3])

In [33]:
covariance_matrix=np.cov([df.iloc[:,0],df.iloc[:,1],df.iloc[:,2]])

covariance_matrix

array([[1.01265823, 0.25665924, 0.03098   ],
       [0.25665924, 1.01265823, 0.04568181],
       [0.03098   , 0.04568181, 1.01265823]])

In [34]:
eigen_values,eigen_vectors=np.linalg.eig(covariance_matrix)  ## get all eigan vector after linear transformation
print(eigen_values)
print(eigen_vectors)

[1.28030501 0.75556906 1.00210061]
[[-0.69022364 -0.70341319 -0.16970918]
 [-0.69579391  0.70957909 -0.11121306]
 [-0.19865082 -0.04132073  0.97919888]]


In [40]:
eigen_values = np.array(eigen_values)
eigen_vectors = np.array(eigen_vectors)

# Combine into DataFrame
eigens = pd.DataFrame({
    'Eigenvalue': eigen_values,
    'Eigenvector_1': eigen_vectors[:, 0],
    'Eigenvector_2': eigen_vectors[:, 1],
    'Eigenvector_3': eigen_vectors[:, 2]
})
eigens

Unnamed: 0,Eigenvalue,Eigenvector_1,Eigenvector_2,Eigenvector_3
0,1.280305,-0.690224,-0.703413,-0.169709
1,0.755569,-0.695794,0.709579,-0.111213
2,1.002101,-0.198651,-0.041321,0.979199


In [42]:
## select k large eigan values to get most variance data
k=2
eigens_sorted = eigens.sort_values(by='Eigenvalue', ascending=False).reset_index(drop=True)
top2_vectors = eigens_sorted.iloc[:k, 1:]
print(top2_vectors)

   Eigenvector_1  Eigenvector_2  Eigenvector_3
0      -0.690224      -0.703413      -0.169709
1      -0.198651      -0.041321       0.979199


In [46]:
top2_vectors=np.array([top2_vectors.iloc[0,:],top2_vectors.iloc[1,:]])
top2_vectors

array([[-0.69022364, -0.70341319, -0.16970918],
       [-0.19865082, -0.04132073,  0.97919888]])

In [47]:
## transformation of df
transform_df=np.dot(df.iloc[:,:3],top2_vectors.T)
new_df=pd.DataFrame(transform_df,columns=['PC1','PC2'])
new_df['target']=df['target']
new_df

Unnamed: 0,PC1,PC2,target
0,0.227102,-0.448270,1
1,0.187825,0.731725,1
2,-0.551379,0.770033,0
3,-1.094929,0.196985,0
4,-0.017532,-1.927592,1
...,...,...,...
75,1.055229,-1.477042,1
76,0.356593,-0.531267,1
77,-0.239779,-0.062621,0
78,-0.629556,-0.134837,0


In [49]:
## show in 2D
new_df['target'] = new_df['target'].astype('str')
fig = px.scatter(x=new_df['PC1'],
                 y=new_df['PC2'],
                 color=new_df['target'],
                 color_discrete_sequence=px.colors.qualitative.G10
                )

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()