<a href="https://www.kaggle.com/code/piyushjain572/principal-component-analysis-pca?scriptVersionId=200772807" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [6]:
import numpy as np
import pandas as pd

np.random.seed(2) 

mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample,columns=['feature1','feature2','feature3'])
df['target'] = 1

mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample,columns=['feature1','feature2','feature3'])

df1['target'] = 0

df = pd.concat([df,df1],ignore_index=True)

df = df.sample(40)
df.head()

Unnamed: 0,feature1,feature2,feature3,target
17,-0.381093,-0.375669,-0.074471,1
23,1.73528,0.34675,1.842456,0
11,0.112727,0.370445,1.359634,1
32,1.380472,0.782865,2.173531,0
3,-0.909008,0.551454,2.292208,1


In [7]:
import plotly.express as px
fig = px.scatter_3d(df, x=df['feature1'], y=df['feature2'], z=df['feature3'],
              color=df['target'].astype('str'))
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show()

Now, we can apply **PCA** on this dataset.

# Step1: Standardize the data.
### **Note:** *You have to split the data first before applying PCA, since data is only 40 rows, i am not doing here.*

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])

# Step2: Find the covariance matrix.

In [9]:
covariance_matrix = np.cov([df.iloc[:,0],df.iloc[:,1],df.iloc[:,2]])
print('Covariance Matrix:\n', covariance_matrix)

Covariance Matrix:
 [[ 1.02564103 -0.10657747  0.26910515]
 [-0.10657747  1.02564103  0.0686515 ]
 [ 0.26910515  0.0686515   1.02564103]]


# Step3: Find the Eigenvalues and Eigenvectors

In [10]:
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
print('eigen values\n',eigen_values)
print('eigen vectors\n',eigen_vectors)

eigen values
 [0.70801342 1.29769746 1.0712122 ]
eigen vectors
 [[ 0.67011227  0.72034775 -0.17902142]
 [ 0.36459414 -0.10935523  0.92472296]
 [-0.64654517  0.68493836  0.33591485]]


In [11]:
# Note: np.linalg.eig() does not guarantee the order of eigenvalues, so you will need to sort them if you require a specific order. 
sorted_indices = np.argsort(eigen_values)[::-1]  # Indices for sorting in descending order
eigen_values = eigen_values[sorted_indices]
eigen_vectors = eigen_vectors[:, sorted_indices]

print('eigen values\n',eigen_values)
print('eigen vectors\n',eigen_vectors)

eigen values
 [1.29769746 1.0712122  0.70801342]
eigen vectors
 [[ 0.72034775 -0.17902142  0.67011227]
 [-0.10935523  0.92472296  0.36459414]
 [ 0.68493836  0.33591485 -0.64654517]]


In [12]:
import plotly.graph_objects as go

# Calculate the mean of the dataset
mean_x = df['feature1'].mean()
mean_y = df['feature2'].mean()
mean_z = df['feature3'].mean()

# Create 3D scatter plot using Plotly
fig = go.Figure()

# Add scatter points for the dataset
fig.add_trace(go.Scatter3d(
    x=df['feature1'],
    y=df['feature2'],
    z=df['feature3'],
    mode='markers',
    marker=dict(size=5, color=df['target'], colorscale='Viridis', opacity=0.8),
    name='Data Points'
))

# Add mean point
fig.add_trace(go.Scatter3d(
    x=[mean_x],
    y=[mean_y],
    z=[mean_z],
    mode='markers',
    marker=dict(size=10, color='red', symbol='cross'),
    name='Mean Point'
))
eigenvector_colors = ['blue', 'green', 'orange', 'purple', 'cyan', 'magenta']
# Assuming you have your eigenvectors in a variable called `eigen_vectors`
# Add eigenvectors
for i in range(len(eigen_vectors)):
    v = eigen_vectors[:, i]
    fig.add_trace(go.Scatter3d(
        x=[mean_x, mean_x + v[0]],
        y=[mean_y, mean_y + v[1]],
        z=[mean_z, mean_z + v[2]],
        mode='lines+text',
        line=dict(color=eigenvector_colors[i % len(eigenvector_colors)], width=5),
        name=f'Eigenvector {i + 1}'
    ))

# Update layout
fig.update_layout(
    scene=dict(
        xaxis_title='Feature 1',
        yaxis_title='Feature 2',
        zaxis_title='Feature 3',
        camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))
    ),
    title='3D Scatter Plot with Eigenvectors',
)

# Show the plot
fig.show()


# Step4: Choose the Top Principal Components

In [13]:
pc = eigen_vectors[0:2]
pc

array([[ 0.72034775, -0.17902142,  0.67011227],
       [-0.10935523,  0.92472296,  0.36459414]])

# Step5: Transform the Data

In [14]:
transformed_df = np.dot(df.iloc[:,0:3],pc.T)

new_df = pd.DataFrame(transformed_df,columns=['PC1','PC2'])
new_df['target'] = df['target'].values
new_df.head()

Unnamed: 0,PC1,PC2,target
0,-0.739196,-0.622107,1
1,1.571545,0.304681,0
2,0.26046,0.335348,1
3,1.462509,0.774155,0
4,0.09488,0.854866,1


In [15]:
new_df['target'] = new_df['target'].astype('str')
fig = px.scatter(x=new_df['PC1'],
                 y=new_df['PC2'],
                 color=new_df['target'],
                 color_discrete_sequence=px.colors.qualitative.G10
                )

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()
     

## Sklearn Implementation

In [16]:
# Synthetic Data Generation
import numpy as np
import pandas as pd

np.random.seed(2) 

mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample,columns=['feature1','feature2','feature3'])
df['target'] = 1

mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample,columns=['feature1','feature2','feature3'])

df1['target'] = 0

df = pd.concat([df,df1],ignore_index=True)

df = df.sample(40)
df.head()

Unnamed: 0,feature1,feature2,feature3,target
17,-0.381093,-0.375669,-0.074471,1
23,1.73528,0.34675,1.842456,0
11,0.112727,0.370445,1.359634,1
32,1.380472,0.782865,2.173531,0
3,-0.909008,0.551454,2.292208,1


In [22]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])

pca = PCA(n_components=2)
X_pca = pca.fit_transform(df.iloc[:,0:3])

new_df = pd.DataFrame(X_pca,columns=['PC1','PC2'])
new_df['target'] = df['target'].values
new_df['target'] = new_df['target'].astype('str')
print(new_df.head())

import plotly.express as px
fig = px.scatter(x=new_df['PC1'],
                 y=new_df['PC2'],
                 color=new_df['target'],
                 color_discrete_sequence=px.colors.qualitative.G10
                )

fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()

        PC1       PC2 target
0 -0.786151 -0.557659      1
1  1.589906  0.192484      0
2  0.274277  0.335194      1
3  1.510178  0.676111      0
4  0.130604  0.895961      1
