# 04 — LDA Analysis

In [None]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Update base_path if your data isn't under ./data
# (e.g. change it to r"D:\IITB\STData" on Windows if needed).
base_path = r"./data"
# Output locations for pickled models and for figures.
save_models_to = r"./models"
save_fig_to = r"./notebooks/figures"

# Make sure the output folders exist before anything is written to them;
# exist_ok=True keeps re-running this cell harmless.
for _out_dir in (save_models_to, save_fig_to):
    os.makedirs(_out_dir, exist_ok=True)

def read_csv(name):
    """Load the CSV file called *name* from ``base_path`` into a DataFrame.

    Args:
        name: File name (or relative path) joined onto ``base_path``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        options.
    """
    return pd.read_csv(os.path.join(base_path, name))


# Echo the active data directory so a wrong path is easy to spot.
print("Using base_path:", base_path)


In [None]:

import os, pandas as pd, numpy as np, pickle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

# LDA analysis: cross-validate a LinearDiscriminantAnalysis classifier on the
# cleaned feature table, export the 20 largest-magnitude coefficients, and
# pickle the model fitted on all rows.

# Feature matrix (every column of processed_clean.csv is used as a feature).
X = pd.read_csv(os.path.join(base_path, "processed_clean.csv"))

# Build a binary label: use 'Correct' when available, otherwise fall back to
# a median split on 'Engagement'.
df_raw = pd.read_csv(os.path.join(base_path, "processed_merged.csv"))
label_col = "Correct" if "Correct" in df_raw.columns else "Engagement"
if label_col == "Correct":
    y = (df_raw["Correct"] > 0).astype(int).values
else:
    y = (df_raw["Engagement"] > df_raw["Engagement"].median()).astype(int).values

# X and y come from two different files and are paired purely by row
# position, so stop early with a clear message if the row counts disagree.
# NOTE(review): equal counts do not prove the rows are in the same order;
# confirm that the cleaning step preserves the row order of the merged file.
if len(X) != len(y):
    raise ValueError(
        f"processed_clean.csv has {len(X)} rows but processed_merged.csv has "
        f"{len(y)}; features and labels must be row-aligned"
    )

# If the column the label is derived from is also a feature, the classifier
# can read the answer straight from its input and the accuracy is meaningless.
# Only warn (do not drop) so the saved model keeps the same feature layout.
if label_col in X.columns:
    print(
        f"WARNING: feature matrix contains the label source column "
        f"'{label_col}'; CV accuracy is likely inflated by target leakage"
    )

lda = LinearDiscriminantAnalysis()
# 5-fold cross-validation; this scores clones of `lda`, leaving it unfitted.
scores = cross_val_score(lda, X.values, y, cv=5)
print("LDA CV accuracy:", scores.mean().round(3))

# Refit on all rows, then rank features by absolute coefficient size.
lda.fit(X.values, y)
# coef_[0] is the first coefficient row (the only one for a two-class label).
coef = pd.Series(lda.coef_[0], index=X.columns).sort_values(
    key=lambda s: s.abs(), ascending=False
)
coef.head(20).to_csv(os.path.join(base_path, "lda_top_features.csv"), index=True)

with open(os.path.join(save_models_to, "lda_model.pkl"), "wb") as f:
    pickle.dump(lda, f)
print("Saved LDA model")
