# 06 — Model-driven Feature Importance (XGBoost + SHAP)

In [None]:

# Data directory: the CSV files used below are read from here, and the
# feature-importance CSV is written back into the same folder.
# Update this if your data isn't under ./data
base_path = r"./data"  # change to r"D:\IITB\STData" on Windows if needed
# Output directories; both are created below if they do not exist yet.
# Presumably for trained models and saved figures respectively (going by the
# names) — confirm against the rest of the notebook.
save_models_to = r"./models"
save_fig_to = r"./notebooks/figures"

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Make sure both output folders exist before anything tries to write into
# them; exist_ok=True keeps re-running this cell harmless.
for _out_dir in (save_models_to, save_fig_to):
    os.makedirs(_out_dir, exist_ok=True)

def read_csv(name):
    """Load a CSV located under ``base_path`` into a DataFrame.

    Args:
        name: File name (or relative path) that is joined onto ``base_path``.

    Returns:
        The result of ``pd.read_csv`` for the resolved path, parsed with
        pandas' default options.
    """
    return pd.read_csv(os.path.join(base_path, name))

# Echo the data directory in use so a wrong path is easy to spot in the output.
print("Using base_path:", base_path)


In [None]:

import os, pandas as pd, numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import shap

# --- Load features and derive the binary target ------------------------------
# Features come from processed_clean.csv; the target is built from
# processed_merged.csv. Both are loaded through the read_csv helper so the
# path handling lives in one place.
# NOTE(review): the two files are assumed to be row-aligned (same rows, same
# order). Nothing here verifies that — confirm against the preprocessing step.
# NOTE(review): if X still contains the 'Correct' / 'Engagement' column, the
# target leaks into the features — verify the contents of processed_clean.csv.
X = read_csv("processed_clean.csv")
df_raw = read_csv("processed_merged.csv")
if 'Correct' in df_raw.columns:
    # Positive class: any 'Correct' value greater than zero.
    y = (df_raw['Correct'] > 0).astype(int).values
else:
    # Fallback target when 'Correct' is absent: above-median 'Engagement'.
    y = (df_raw['Engagement'] > df_raw['Engagement'].median()).astype(int).values

# --- Cross-validated score ----------------------------------------------------
clf = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    n_jobs=-1,
    eval_metric='logloss',
)
# No `scoring` argument is given, so cross_val_score falls back to the
# classifier's own score method (accuracy for classifiers).
scores = cross_val_score(clf, X.values, y, cv=5)
print("XGB CV accuracy:", scores.mean().round(3))

# --- Fit on all rows and export the built-in feature importances -------------
# The model is fitted on the bare array, so the column names are re-attached
# when building the importance Series.
clf.fit(X.values, y)
imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
imp.to_csv(os.path.join(base_path, "xgb_feature_importance.csv"))

# --- SHAP (summary) -----------------------------------------------------------
expl = shap.TreeExplainer(clf)
shap_vals = expl.shap_values(X)
# show=False leaves the figure open instead of rendering it immediately, so it
# can be written to disk first; previously the plot was never saved and the
# figures directory configured at the top of the notebook went unused.
shap.summary_plot(shap_vals, X, show=False)
shap_fig_path = os.path.join(save_fig_to, "shap_summary.png")
plt.savefig(shap_fig_path, dpi=150, bbox_inches="tight")
print("Saved SHAP summary plot to:", shap_fig_path)
plt.show()  # still render the plot inline after saving
