<a href="https://colab.research.google.com/github/Pushkar1-GitHub/AI-in-Biology/blob/main/Copy_of_4_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Step 1: toy expression matrix -----------------------------------------
# Each row is one sample (cell); each column is one gene.
# In this data Gene2 is exactly twice Gene1 in every row, so those two
# columns are perfectly correlated. Genes 3-5 hold small counts, but note
# that they also differ between the healthy and the diseased rows.
gene_names = ['Gene1', 'Gene2', 'Gene3', 'Gene4', 'Gene5']

X = np.array([
    # Healthy cells (A, B, C)
    [10, 20, 5, 2, 1],
    [11, 22, 6, 1, 2],
    [12, 24, 5, 2, 1],
    # Diseased cells (D, E)
    [50, 100, 1, 9, 8],
    [52, 104, 2, 8, 9],
])

# --- Step 2: put every gene on a comparable scale ---------------------------
# Without scaling, the genes with the largest raw counts would dominate the
# variance that PCA tries to capture.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Step 3: project the 5 gene dimensions onto 2 principal components ------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# --- Step 4: show the per-sample coordinates in PC space --------------------
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
rounded_scores = df_pca.round(2)
print("--- PCA Result (5 Genes -> 2 PCs) ---")
print(rounded_scores)

# --- Step 5: report how much of the variance the two components keep --------
variance_ratios = pca.explained_variance_ratio_
retained_fraction = sum(variance_ratios)
print("\n--- Explained Variance Ratio ---")
print(variance_ratios)
print(f"Total Info Retained: {retained_fraction*100:.2f}%")

--- PCA Result (5 Genes -> 2 PCs) ---
    PC1   PC2
0 -1.77 -0.24
1 -1.96  0.40
2 -1.68 -0.19
3  2.78 -0.31
4  2.64  0.34

--- Explained Variance Ratio ---
[0.98098487 0.01868491]
Total Info Retained: 99.97%
