In [1]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.spatial.distance import cosine
from scipy.stats import entropy
import shap
import torch
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Locations of the input dataset, the saved models, and the results folder.
DATA_PATH = r"C:/Users/LENOVO/Desktop/ByteBuzz/Data/final_dataset.csv"
MODEL_DIR = r"C:/Users/LENOVO/Desktop/ByteBuzz/Models"
RESULTS_DIR = r"C:/Users/LENOVO/Desktop/ByteBuzz/results"

# Create the results folder up front; this is a no-op when it already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

# One results file per method, all stored inside RESULTS_DIR.
BASELINE_CSV, KD_CSV, CONTRAST_CSV = (
    os.path.join(RESULTS_DIR, csv_name)
    for csv_name in ("baseline_results.csv", "kd_results.csv", "contrastive_results.csv")
)

In [3]:
def safe_read_csv(p):
    """Read a cached results CSV, or return None when there is nothing usable.

    Parameters
    ----------
    p : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file does not exist or contains
        no columns to parse.
    """
    if not os.path.exists(p):
        return None
    try:
        return pd.read_csv(p)
    except pd.errors.EmptyDataError:
        # A file without any header/columns (e.g. an empty DataFrame saved with
        # ``index=False``) holds no cached results; report it like a missing file
        # so callers recompute instead of crashing.
        return None

def print_row(r):
    """Print one results row as a fixed-width, pipe-separated line.

    Parameters
    ----------
    r : mapping or pandas.Series
        Must provide ``'Modality'``, ``'Acc'`` and ``'F1'``. ``'ROC_AUC'`` is
        optional and may be ``None`` or NaN; in both cases ``nan`` is printed.
    """
    roc = r.get('ROC_AUC', np.nan)
    # A key that is present but holds None cannot be formatted with ``:.3f``;
    # show it as NaN, exactly like a missing key.
    roc = np.nan if pd.isna(roc) else float(roc)
    print(f"{r['Modality']:<6} | Acc: {r['Acc']:.3f} | F1: {r['F1']:.3f} | ROC-AUC: {roc:.3f}")

# Load the prepared dataset, stopping early with a clear error if the file is absent.
if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print("Loaded dataset:", df.shape)
else:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please regenerate step2.3_final_dataset.csv")

Loaded dataset: (1448, 28)


In [4]:
# Group the per-modality PCA feature columns by their "<NAME>_PC" column prefix.
# The tuple order below is the order in which modalities are stored and reported.
modalities = {
    name: [col for col in df.columns if col.startswith(f"{name}_PC")]
    for name in ("EEG", "EYE", "GSR", "TIVA", "IVT")
}

# Convenience aliases; each one is the same list object stored in `modalities`.
EEG_cols = modalities["EEG"]
EYE_cols = modalities["EYE"]
GSR_cols = modalities["GSR"]
IVT_cols = modalities["IVT"]
TIVA_cols = modalities["TIVA"]

print("Feature counts:", {name: len(cols) for name, cols in modalities.items()})

# Target vector, taken from the "Label" column.
y = df["Label"].values

def eval_model_on_modality(model, X, y):
    """Score a fitted classifier on one modality's feature matrix.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
    X : array-like
        Features handed to ``model.predict`` / ``model.predict_proba``.
    y : array-like
        True labels for the rows of ``X``.

    Returns
    -------
    tuple
        ``(acc, f1, roc)``: accuracy, weighted F1 and ROC-AUC. ``roc`` is
        ``None`` when the model has no ``predict_proba``, when ``y`` does not
        contain exactly two distinct values, or when the ROC-AUC computation
        raises.
    """
    preds = model.predict(X)
    acc = accuracy_score(y, preds)
    f1 = f1_score(y, preds, average="weighted")
    roc = None
    if hasattr(model, "predict_proba") and len(np.unique(y)) == 2:
        try:
            # Column 1 of predict_proba is used as the score for ROC-AUC.
            prob = model.predict_proba(X)[:, 1]
            roc = roc_auc_score(y, prob)
        except Exception as exc:
            # ROC-AUC is best-effort: keep the other metrics, but say why it is
            # missing instead of dropping the failure silently.
            print(f"ROC-AUC unavailable ({type(exc).__name__}: {exc})")
            roc = None
    return acc, f1, roc


Feature counts: {'EEG': 7, 'EYE': 3, 'GSR': 4, 'TIVA': 5, 'IVT': 6}


# Baseline results

In [5]:
# Baseline metrics: reuse the cached CSV when present, otherwise evaluate the
# saved teacher (EEG) and student (EYE/GSR/TIVA) models and cache the result.
baseline_df = safe_read_csv(BASELINE_CSV)
if baseline_df is None:
    rows = []
    # Candidate file names per modality, tried in order; the first one that
    # exists under MODEL_DIR is used. The EEG entry is the teacher model.
    baseline_candidates = {
        "EEG": ["teacher_eeg_xgb.pkl", "teacher_eeg.pkl", "eeg_teacher.pkl"],
        "EYE": ["student_eye_rf.pkl", "student_eye.pkl", "eye_student.pkl"],
        "GSR": ["student_gsr_rf.pkl", "student_gsr.pkl", "gsr_student.pkl"],
        "TIVA": ["student_tiva_rf.pkl", "student_tiva.pkl", "tiva_student.pkl"],
    }
    for mod, patlist in baseline_candidates.items():
        path = next(
            (os.path.join(MODEL_DIR, fname) for fname in patlist
             if os.path.exists(os.path.join(MODEL_DIR, fname))),
            None,
        )
        if path and len(modalities[mod]) > 0:
            # joblib.load unpickles the file: only load model files you trust.
            model = joblib.load(path)
            # NOTE(review): metrics are computed on every row of df; if the saved
            # model was fitted on part of these rows the scores are optimistic —
            # confirm against the training notebook.
            X = df[modalities[mod]].values
            acc, f1, roc = eval_model_on_modality(model, X, y)
            rows.append({"Modality": mod, "Acc": acc, "F1": f1, "ROC_AUC": roc})
        elif mod == "EEG":
            print("Teacher model not found, skipping teacher baseline.")
        else:
            print(f"No baseline model found for {mod} or missing features.")

    baseline_df = pd.DataFrame(rows)
    if rows:
        baseline_df.to_csv(BASELINE_CSV, index=False)
        print("Saved baseline_results.csv")
    else:
        # Saving a frame without columns would leave a header-less file that
        # pd.read_csv cannot parse on the next run, so nothing is written.
        print("No baseline results computed; nothing saved.")
else:
    print("Loaded baseline CSV")

print("\nBaseline Results:")
if not baseline_df.empty:
    for _, r in baseline_df.iterrows():
        print_row(r)


Loaded baseline CSV

Baseline Results:
EEG    | Acc: 0.859 | F1: 0.837 | ROC-AUC: 0.896
EYE    | Acc: 0.945 | F1: 0.943 | ROC-AUC: 0.958
GSR    | Acc: 0.950 | F1: 0.948 | ROC-AUC: 0.956
TIVA   | Acc: 0.947 | F1: 0.946 | ROC-AUC: 0.948


In [6]:
# 2) KD results + SHAP interpretability
# -------------------------
# Reuse the cached KD metrics when present; otherwise evaluate each saved KD
# student model and rank its features by mean absolute SHAP value.
kd_df = safe_read_csv(KD_CSV)
shap_summary = {}  # filled only when the KD models are (re-)evaluated below
if kd_df is None:
    rows = []
    # Candidate file names per modality, tried in order.
    kd_names = {
        "EYE": ["student_eye_kd.pkl"],
        "GSR": ["student_gsr_kd.pkl"],
        "TIVA": ["student_tiva_kd.pkl"]
    }

    for mod, patlist in kd_names.items():
        path = next((os.path.join(MODEL_DIR, p) for p in patlist if os.path.exists(os.path.join(MODEL_DIR, p))), None)
        if path and len(modalities[mod]) > 0:
            # joblib.load unpickles the file: only load model files you trust.
            model = joblib.load(path)
            X = df[modalities[mod]].values
            acc, f1, roc = eval_model_on_modality(model, X, y)
            rows.append({"Modality": mod, "Acc": acc, "F1": f1, "ROC_AUC": roc})

            # SHAP interpretability; best-effort, a failure is reported and skipped.
            try:
                explainer = shap.Explainer(model, X)
                shap_values = explainer(X)
                abs_vals = np.abs(np.asarray(shap_values.values))
                # Average over samples (axis 0) and over any trailing output axes,
                # always keeping the feature axis (axis 1). For a 2-D array this
                # reduces over samples only, so each feature keeps its own value.
                reduce_axes = (0,) + tuple(range(2, abs_vals.ndim))
                shap_vals = abs_vals.mean(axis=reduce_axes)
                shap_df = pd.DataFrame({
                    "Feature": modalities[mod],
                    "Mean_Abs_SHAP": shap_vals
                }).sort_values(by="Mean_Abs_SHAP", ascending=False)
                shap_summary[mod] = shap_df.head(10)
            except Exception as e:
                print(f"SHAP computation failed for {mod}: {e}")
        else:
            print(f"No KD model found for {mod} or missing features.")

    kd_df = pd.DataFrame(rows)
    if rows:
        kd_df.to_csv(KD_CSV, index=False)
        print("Saved kd_results.csv")
    else:
        # Saving a frame without columns would leave a header-less file that
        # pd.read_csv cannot parse on the next run, so nothing is written.
        print("No KD results computed; nothing saved.")
else:
    print("Loaded kd CSV")

print("\nKD Results:")
if not kd_df.empty:
    for _, r in kd_df.iterrows():
        print_row(r)

# Print SHAP top features
print("\n=== SHAP Top Features per KD Model ===")
if not shap_summary:
    # Happens when the metrics came from the cached CSV or every SHAP run failed.
    print("(no SHAP summaries available in this run)")
for mod, df_shap in shap_summary.items():
    print(f"\n{mod}:")
    print(df_shap)



Saved kd_results.csv

KD Results:
EYE    | Acc: 0.945 | F1: 0.943 | ROC-AUC: 0.958
GSR    | Acc: 0.950 | F1: 0.948 | ROC-AUC: 0.956
TIVA   | Acc: 0.947 | F1: 0.946 | ROC-AUC: 0.948

=== SHAP Top Features per KD Model ===

EYE:
   Feature  Mean_Abs_SHAP
2  EYE_PC3       0.081395
0  EYE_PC1       0.072252
1  EYE_PC2       0.068678

GSR:
   Feature  Mean_Abs_SHAP
1  GSR_PC2       0.079207
0  GSR_PC1       0.066688
2  GSR_PC3       0.050288
3  GSR_PC4       0.048604

TIVA:
    Feature  Mean_Abs_SHAP
0  TIVA_PC1       0.057369
3  TIVA_PC4       0.047802
1  TIVA_PC2       0.044521
2  TIVA_PC3       0.043569
4  TIVA_PC5       0.041678


# Contrastive results + domain adaptation

In [7]:
# Contrastive metrics: reuse the cached CSV when present; otherwise, for every
# EEG->modality pair whose artefacts exist, scale the modality features, fit a
# RandomForest on a stratified 80/20 split and report hold-out metrics.
contrast_df = safe_read_csv(CONTRAST_CSV)
if contrast_df is None:
    rows = []
    pairs = [("EYE","eeg2eye"), ("GSR","eeg2gsr"), ("TIVA","eeg2tiva"), ("IVT","eeg2ivt")]
    for mod, pair in pairs:
        enc_t_path = os.path.join(MODEL_DIR, f"{pair}_enc_t.pth")
        proj_t_path = os.path.join(MODEL_DIR, f"{pair}_proj_t.pth")
        scaler_t_path = os.path.join(MODEL_DIR, f"{pair}_scaler_t.pkl")
        artefacts_present = all(
            os.path.exists(artefact) for artefact in (enc_t_path, proj_t_path, scaler_t_path)
        )
        if artefacts_present and len(modalities[mod]) > 0:
            # Load the fitted scaler and transform the modality features.
            # joblib.load unpickles the file: only load files you trust.
            # NOTE: the encoder/projection checkpoints are only checked for
            # existence; their weights are not applied to the features here.
            scaler_t = joblib.load(scaler_t_path)
            Xt = df[modalities[mod]].fillna(0).values
            Xt_s = scaler_t.transform(Xt)

            # RandomForest evaluation on a stratified hold-out split, scored with
            # the same helper as the other result tables.
            clf = RandomForestClassifier(n_estimators=200, random_state=42)
            Xtr, Xte, ytr, yte = train_test_split(Xt_s, y, test_size=0.2, stratify=y, random_state=42)
            clf.fit(Xtr, ytr)
            acc, f1, roc = eval_model_on_modality(clf, Xte, yte)
            # Store NaN rather than None so the value can always be formatted.
            rows.append({"Modality": mod, "Acc": acc, "F1": f1,
                         "ROC_AUC": np.nan if roc is None else roc})

            # Domain adaptation metrics (best-effort): compare the mean scaled
            # feature vector of this modality with the mean EEG feature vector.
            try:
                Xt_tensor = torch.tensor(Xt_s.astype(np.float32))
                emb_mean = Xt_tensor.mean(dim=0).numpy()
                eeg_mean = df[EEG_cols].fillna(0).values.mean(axis=0)
                if emb_mean.shape != eeg_mean.shape:
                    # Cosine similarity and KL divergence need equal-length vectors.
                    raise ValueError(
                        f"feature-count mismatch: {emb_mean.shape[0]} {mod} features "
                        f"vs {eeg_mean.shape[0]} EEG features"
                    )
                cos_sim = 1 - cosine(emb_mean, eeg_mean)
                # NOTE(review): p and q are mean vectors divided by their sum; they
                # are not guaranteed to be non-negative, so the KL value may be
                # NaN/inf — confirm this is the intended metric.
                p = emb_mean / emb_mean.sum()
                q = eeg_mean / eeg_mean.sum()
                kl_div = entropy(p + 1e-10, q + 1e-10)
                print(f"{mod}: Cosine sim={cos_sim:.3f}, KL div={kl_div:.3f}")
            except Exception as e:
                print(f"Domain adaptation metric error for {mod}: {e}")
        else:
            print(f"Contrastive files missing/skipping for {mod}")

    contrast_df = pd.DataFrame(rows)
    if rows:
        contrast_df.to_csv(CONTRAST_CSV, index=False)
        print("Saved contrastive_results.csv")
    else:
        # Saving a frame without columns would leave a header-less file that
        # pd.read_csv cannot parse on the next run, so nothing is written.
        print("No contrastive results computed; nothing saved.")
else:
    print("Loaded contrastive CSV")

print("\nContrastive Results:")
if not contrast_df.empty:
    for _, r in contrast_df.iterrows():
        print_row(r)


Contrastive files missing/skipping for EYE
Contrastive files missing/skipping for GSR
Contrastive files missing/skipping for TIVA
Contrastive files missing/skipping for IVT
Saved contrastive_results.csv

Contrastive Results:


# Combined comparison & interpretation

In [8]:
# Tag every available, non-empty results table with the method that produced it.
dfs = [
    frame.assign(Method=label)
    for frame, label in (
        (baseline_df, "Baseline"),
        (kd_df, "KD"),
        (contrast_df, "Contrastive"),
    )
    if frame is not None and not frame.empty
]

if not dfs:
    print("⚠️ No results DataFrames available to combine.")
else:
    # Stack all methods into one long table and persist it.
    combined = pd.concat(dfs, ignore_index=True, sort=False)
    combined.to_csv(os.path.join(RESULTS_DIR,"combined_results_all.csv"), index=False)

    print("\n=== Combined Results ===")
    summary_table = combined.pivot_table(
        index="Modality", columns="Method", values=["Acc","F1"], aggfunc='first'
    )
    print(summary_table)

    # For each modality (in order of first appearance) report the row with the
    # highest F1; on ties the earliest row wins.
    print("\n=== Interpretation ===")
    for mod, sub in combined.groupby("Modality", sort=False):
        best = sub.loc[sub["F1"].idxmax()]
        print(f"{mod}: Best method = {best['Method']} (Acc={best['Acc']:.3f}, F1={best['F1']:.3f})")


=== Combined Results ===
               Acc                  F1          
Method    Baseline        KD  Baseline        KD
Modality                                        
EEG       0.859116       NaN  0.836767       NaN
EYE       0.944751  0.944751  0.943180  0.943180
GSR       0.949586  0.949586  0.948381  0.948381
TIVA      0.946823  0.946823  0.945621  0.945621

=== Interpretation ===
EEG: Best method = Baseline (Acc=0.859, F1=0.837)
EYE: Best method = Baseline (Acc=0.945, F1=0.943)
GSR: Best method = Baseline (Acc=0.950, F1=0.948)
TIVA: Best method = KD (Acc=0.947, F1=0.946)
