# 03 — PCA Analysis

In [None]:

# --- Paths used by this notebook; edit them to match your machine ---
# Output folders, given relative to the current working directory.
save_fig_to = r"./notebooks/figures"
save_models_to = r"./models"
# Folder that holds the input CSV files (currently an absolute Windows path).
base_path = r"D:\IITB\STData\1"

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Create both output folders up front; exist_ok=True makes re-running this cell harmless.
for _out_dir in (save_models_to, save_fig_to):
    os.makedirs(_out_dir, exist_ok=True)

def read_csv(name):
    """Load the CSV file called *name* from ``base_path`` into a DataFrame.

    ``name`` is joined onto the module-level ``base_path`` (looked up at call
    time) with ``os.path.join`` and the resulting path is passed to
    ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(base_path, name))

# Show which data directory this run is configured to read from.
print("Using base_path:", base_path)


In [None]:

from sklearn.decomposition import PCA
import numpy as np, os, pickle, matplotlib.pyplot as plt, pandas as pd

# Load the cleaned feature table as a plain NumPy array.
# NOTE(review): no scaling is applied in this cell; presumably
# processed_clean.csv is already standardized -- confirm.
clean_csv = os.path.join(base_path, "processed_clean.csv")
X = pd.read_csv(clean_csv).values

# Fit PCA keeping every component, then accumulate the per-component variance ratios.
pca = PCA()
pca.fit(X)
cumvar = np.cumsum(pca.explained_variance_ratio_)

# Position of the first entry at or above 0.90, turned into a 1-based component count.
reached_90 = cumvar >= 0.90
k90 = int(reached_90.argmax()) + 1
print("Components for 90% variance:", k90)

# Plot cumulative explained variance against the 1-based component count.
# Calling plt.plot(cumvar) alone starts the x-axis at 0, so the curve would
# read one component lower than the k90 value printed above.
component_counts = np.arange(1, len(cumvar) + 1)
plt.figure(figsize=(8, 4))
plt.plot(component_counts, cumvar)
plt.title("Cumulative Variance (PCA)")
plt.xlabel("Components")
plt.ylabel("Cumulative variance")
plt.tight_layout()
# Write the PNG first, then display the figure.
plt.savefig(os.path.join(save_fig_to, "03_pca_cumvar.png"))
plt.show()

# Pickle the fitted component vectors together with their explained-variance ratios.
pca_pkl_path = os.path.join(save_models_to, "pca_components.pkl")
with open(pca_pkl_path, "wb") as f:
    pickle.dump(
        {"components": pca.components_, "explained": pca.explained_variance_ratio_},
        f,
    )
# Report the real destination: save_models_to is configurable, so a
# hard-coded "models/" in the message could be wrong.
print("Saved PCA components to:", pca_pkl_path)


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Path to your merged feature CSV.
# NOTE: this rebinds base_path, so the code below (and read_csv, which looks
# base_path up at call time) now points here instead of the folder set in the
# first cell.
base_path = r"D:\IITB\STData"
data_path = os.path.join(base_path, "eye_features_all_students.csv")

df = pd.read_csv(data_path)
print("Shape:", df.shape)
# A bare `df.head()` that is not the last expression of a cell is evaluated
# and discarded, so print the preview explicitly.
print(df.head())


# Every column except the identifier is treated as a feature.
X = df.drop("student_id", axis=1)
features = X.columns

# Standardize the features before PCA: fit the scaler, then apply it to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


# A float n_components asks PCA to keep just enough components to explain
# about that share of the variance (here ~95%).
variance_target = 0.95
pca = PCA(n_components=variance_target, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Compare the dimensionality before and after the reduction.
for label, matrix in (("Original shape:", X_scaled), ("Reduced shape:", X_pca)):
    print(label, matrix.shape)


# Cumulative explained variance of the retained components, plotted against
# the 1-based component count so the x-axis agrees with its
# "Number of Components" label (plotting the values alone would start at 0).
explained_cum = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(explained_cum) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_counts, explained_cum, marker='o')
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Explained Variance")
plt.grid(True)
plt.show()


# Scatter of the first two principal components.
# PCA(n_components=0.95) can keep a single component when PC1 alone reaches
# the variance target; X_pca[:, 1] would then raise IndexError, so the plot is
# skipped in that case.
if X_pca.shape[1] >= 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c='blue', alpha=0.6, edgecolor='k')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("PCA Scatter Plot (first 2 components)")
    plt.show()
else:
    print("Skipping PC1/PC2 scatter: PCA kept only", X_pca.shape[1], "component(s)")

# Name the reduced columns PC1..PCk and carry the student identifier along.
pc_names = [f"PC{idx}" for idx in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_names)
pca_df["student_id"] = df["student_id"]

# Write the reduced table into base_path, without the index column.
out_csv = os.path.join(base_path, "eye_features_pca.csv")
pca_df.to_csv(out_csv, index=False)
print("Saved PCA-reduced data to:", out_csv)
