In [1]:
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root folder scanned by the main loop; it is expected to hold one sub-folder per
# student whose name consists only of digits.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_PATH = r"C:\Users\LENOVO\Desktop\ByteBuzz\Data\STData"
# File name of the processed feature table. It is a relative path, so the CSV is
# written to the current working directory. A later cell reassigns OUTPUT_FILE.
OUTPUT_FILE = "Processed_dataset.csv"

# Helpers

In [3]:
def safe_read(path):
    """Read a CSV file into a DataFrame without ever raising.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when ``path`` does not exist or the file
        cannot be read/parsed.
    """
    if not os.path.exists(path):
        # A missing modality file is an expected situation; stay quiet.
        return None
    try:
        return pd.read_csv(path)
    except Exception as exc:
        # Still best-effort (callers treat None as "no data"), but an existing
        # file that fails to load is reported instead of vanishing silently.
        print(f"[safe_read] skipping {path}: {type(exc).__name__}: {exc}")
        return None

def to_numeric_cols(df, cols):
    """Coerce selected columns of ``df`` to numeric values, in place.

    Names in ``cols`` that are not columns of ``df`` are ignored. Values that
    cannot be parsed become NaN (``errors="coerce"``). Nothing is returned;
    ``df`` itself is modified.
    """
    for name in (candidate for candidate in cols if candidate in df.columns):
        df[name] = pd.to_numeric(df[name], errors="coerce")

# Feature functions

In [4]:
def eeg_features(df):
    """Summarise EEG band columns of one trial.

    For each band name (Delta, Theta, Alpha, Beta, Gamma) every column whose
    name contains that band name is coerced to numeric in ``df`` itself, then
    the per-column means and variances are averaged across those columns.
    When the Alpha mean is available, Theta/Alpha and Beta/Alpha ratios of the
    band means are added.

    Returns a dict of features; it is empty when ``df`` is None or empty.
    """
    feats = {}
    if df is None or df.empty:
        return feats

    for band in ("Delta", "Theta", "Alpha", "Beta", "Gamma"):
        band_cols = [name for name in df.columns if band in name]
        if not band_cols:
            continue
        # Coerce in place; unparseable entries become NaN.
        for name in band_cols:
            df[name] = pd.to_numeric(df[name], errors="coerce")
        block = df[band_cols]
        feats[f"EEG_{band}_mean"] = block.mean().mean(skipna=True)
        feats[f"EEG_{band}_var"] = block.var().mean(skipna=True)

    if "EEG_Alpha_mean" in feats:
        # The small epsilon keeps the ratio finite when the Alpha mean is exactly 0.
        for band, ratio_key in (("Theta", "EEG_Theta_Alpha"), ("Beta", "EEG_Beta_Alpha")):
            numerator_key = f"EEG_{band}_mean"
            if numerator_key in feats:
                feats[ratio_key] = feats[numerator_key] / (feats["EEG_Alpha_mean"] + 1e-9)
    return feats

def gsr_features(df):
    """Summarise the "GSR Conductance CAL" column of one trial.

    Produces mean, standard deviation, an end-to-end slope (last minus first
    valid sample divided by the number of valid samples) and the number of
    peaks whose height is at least mean + std. Returns an empty dict when
    ``df`` is None/empty, the column is absent, or no value is numeric.
    """
    feats = {}
    if df is None or df.empty:
        return feats
    if "GSR Conductance CAL" not in df.columns:
        return feats

    conductance = pd.to_numeric(df["GSR Conductance CAL"], errors="coerce").dropna()
    n_samples = len(conductance)
    if n_samples <= 0:
        return feats

    feats["GSR_mean"] = conductance.mean()
    feats["GSR_std"] = conductance.std()
    first, last = conductance.iloc[0], conductance.iloc[-1]
    feats["GSR_slope"] = (last - first) / max(n_samples, 1)
    threshold = conductance.mean() + conductance.std()
    peak_positions, _ = find_peaks(conductance.values, height=threshold)
    feats["GSR_peaks"] = len(peak_positions)
    return feats

def eye_features(df):
    """Summarise pupil size and gaze spread for one trial.

    * Pupil: when both "ET_PupilLeft" and "ET_PupilRight" exist, the mean and
      standard deviation of their per-sample average.
    * Gaze: columns containing "ET_Gaze" are assigned to the X or Y axis by a
      case-insensitive search for the letter "x" / "y" in the column name;
      they are coerced to numeric in ``df`` itself and the per-column standard
      deviations are averaged.

    Returns a dict of features; it is empty when ``df`` is None or empty.
    """
    feats = {}
    if df is None or df.empty:
        return feats

    if all(name in df.columns for name in ("ET_PupilLeft", "ET_PupilRight")):
        left = pd.to_numeric(df["ET_PupilLeft"], errors="coerce")
        right = pd.to_numeric(df["ET_PupilRight"], errors="coerce")
        pupil = (left + right) / 2.0
        feats["Pupil_mean"] = pupil.mean(skipna=True)
        feats["Pupil_std"] = pupil.std(skipna=True)

    # NOTE(review): the axis is detected by a plain substring test, so a gaze
    # column whose name contains both letters would count for both axes.
    axis_groups = [
        ("GazeX_std", [c for c in df.columns if "ET_Gaze" in c and "x" in c.lower()]),
        ("GazeY_std", [c for c in df.columns if "ET_Gaze" in c and "y" in c.lower()]),
    ]
    for feature_name, axis_cols in axis_groups:
        if not axis_cols:
            continue
        for name in axis_cols:
            df[name] = pd.to_numeric(df[name], errors="coerce")
        feats[feature_name] = df[axis_cols].std().mean(skipna=True)
    return feats

def ivt_features(df):
    """Summarise fixation and saccade columns of one trial.

    Index columns contribute the number of distinct values; measurement
    columns are coerced to numeric and contribute a mean and, except for
    "Fixation Dispersion", a standard deviation. Columns missing from ``df``
    are skipped. Returns an empty dict when ``df`` is None or empty.
    """
    feats = {}
    if df is None or df.empty:
        return feats

    # (source column, count key, mean key, std key); None marks "not produced".
    specs = (
        ("Fixation Index", "fixation_count", None, None),
        ("Fixation Duration", None, "fixation_duration_mean", "fixation_duration_std"),
        ("Fixation Dispersion", None, "fixation_dispersion_mean", None),
        ("Saccade Index", "saccade_count", None, None),
        ("Saccade Duration", None, "saccade_duration_mean", "saccade_duration_std"),
        ("Saccade Amplitude", None, "saccade_amplitude_mean", "saccade_amplitude_std"),
        ("Saccade Peak Velocity", None, "saccade_velocity_mean", "saccade_velocity_std"),
    )
    for column, count_key, mean_key, std_key in specs:
        if column not in df.columns:
            continue
        if count_key is not None:
            feats[count_key] = df[column].nunique()
            continue
        numeric = pd.to_numeric(df[column], errors="coerce")
        feats[mean_key] = numeric.mean()
        if std_key is not None:
            feats[std_key] = numeric.std()
    return feats

def tiva_features(df):
    """Average each known TIVA column of one trial.

    For every listed column present in ``df`` the numeric values are averaged
    and stored as ``TIVA_<name>_mean``. Columns without any numeric value are
    left out. Returns an empty dict when ``df`` is None or empty.
    """
    feats = {}
    if df is None or df.empty:
        return feats

    tracked = ("Engagement", "Valence", "Attention", "Joy", "Sadness",
               "Anger", "Fear", "Surprise", "Neutral")
    for name in tracked:
        if name not in df.columns:
            continue
        numeric = pd.to_numeric(df[name], errors="coerce").dropna()
        if len(numeric) == 0:
            continue
        feats[f"TIVA_{name}_mean"] = numeric.mean()
    return feats

def psy_features(df):
    """Derive behavioural features and the binary label from PSY rows.

    * ``Difficulty_mean`` / ``ResponseTime_mean``: numeric mean of the
      "Difficulty" / "ResponseTime" column when present.
    * ``Label``: 1 when the number of "correct" verdicts is at least the number
      of "incorrect" ones (verdicts compared after str conversion, lower-casing
      and stripping), otherwise 0.

    Returns an empty dict when ``df`` is None or empty.
    """
    feats = {}
    if df is None or df.empty:
        return feats

    for column, feature_name in (("Difficulty", "Difficulty_mean"),
                                 ("ResponseTime", "ResponseTime_mean")):
        if column in df.columns:
            feats[feature_name] = pd.to_numeric(df[column], errors="coerce").mean()

    if "verdict" in df.columns:
        verdicts = df["verdict"].astype(str).str.lower().str.strip()
        tally = verdicts.value_counts()
        n_correct = tally.get("correct", 0)
        n_incorrect = tally.get("incorrect", 0)
        # NOTE(review): a tie resolves to 1, and that includes the case where
        # no verdict is recognised at all (0 >= 0) — confirm this is intended.
        feats["Label"] = int(n_correct >= n_incorrect)
    return feats


# Main loop

In [5]:
def _trial_subset(df, trial_key):
    """Return a copy of the rows of ``df`` whose "QuestionKey" equals ``trial_key``.

    Gives None when ``df`` is None, has no "QuestionKey" column, or contains
    no row for that key.
    """
    if df is None or "QuestionKey" not in df.columns:
        return None
    if trial_key not in df["QuestionKey"].values:
        return None
    return df[df["QuestionKey"] == trial_key].copy()


# One dict of features per (student, trial); turned into a DataFrame afterwards.
rows = []
# Student folders are the digit-only entries of BASE_PATH, visited in numeric order.
students = sorted((d for d in os.listdir(BASE_PATH) if d.isdigit()), key=int)

for student in students:
    folder = os.path.join(BASE_PATH, student)
    sid = int(student)

    # Each modality lives in "<student>_<TAG>.csv"; unreadable files come back as None.
    eeg, gsr, eye, ivt, tiva, psy = (
        safe_read(os.path.join(folder, f"{student}_{tag}.csv"))
        for tag in ("EEG", "GSR", "EYE", "IVT", "TIVA", "PSY")
    )

    # The PSY file defines the trials (column "Key"); without it the student is skipped.
    if psy is None or "Key" not in psy.columns:
        continue

    extractors = (
        (eeg_features, eeg),
        (gsr_features, gsr),
        (eye_features, eye),
        (ivt_features, ivt),
        (tiva_features, tiva),
    )

    for _, psy_row in psy.iterrows():
        trial_key = psy_row["Key"]

        feats = {"StudentID": sid, "TrialID": trial_key}
        for extract, frame in extractors:
            feats.update(extract(_trial_subset(frame, trial_key)))
        # The PSY features (including the label) come from this single trial row.
        feats.update(psy_features(pd.DataFrame([psy_row])))

        rows.append(feats)

# -----------------------------
# Build final DataFrame
# -----------------------------
df_all = pd.DataFrame(rows)

# Fixed output schema: identifiers, per-modality features, then the label.
expected_cols = [
    'StudentID','TrialID',
    'EEG_Delta_mean','EEG_Delta_var',
    'EEG_Theta_mean','EEG_Theta_var',
    'EEG_Alpha_mean','EEG_Alpha_var',
    'EEG_Beta_mean','EEG_Beta_var',
    'EEG_Gamma_mean','EEG_Gamma_var',
    'EEG_Theta_Alpha','EEG_Beta_Alpha',
    'GSR_mean','GSR_std','GSR_slope','GSR_peaks',
    'Pupil_mean','Pupil_std',
    'GazeX_std','GazeY_std',
    'fixation_count','fixation_duration_mean','fixation_duration_std','fixation_dispersion_mean',
    'saccade_count','saccade_duration_mean','saccade_duration_std',
    'saccade_amplitude_mean','saccade_amplitude_std',
    'saccade_velocity_mean','saccade_velocity_std',
    'TIVA_Engagement_mean','TIVA_Valence_mean','TIVA_Attention_mean',
    'TIVA_Joy_mean','TIVA_Sadness_mean','TIVA_Anger_mean',
    'TIVA_Fear_mean','TIVA_Surprise_mean','TIVA_Neutral_mean',
    'Difficulty_mean','ResponseTime_mean','Label'
]

# Features that no trial produced are added as constant 0 so the schema never varies.
missing_cols = [c for c in expected_cols if c not in df_all.columns]
df_all = df_all.assign(**{c: 0 for c in missing_cols})[expected_cols]

# clean infinities / NaNs
# Reassign instead of mutating in place: df_all was just derived by column
# selection, and in-place edits on such a frame are a chained-assignment hazard
# (hidden here because warnings are globally suppressed).
df_all = df_all.replace([np.inf, -np.inf], np.nan).fillna(0)

# scale numeric features
# NOTE(review): the scaler is fit on every row; if a train/test split is made
# downstream, this shares test-set statistics with training — confirm intent.
scaler = StandardScaler()
num_cols = [c for c in df_all.columns if c not in ["StudentID","TrialID","Label"]]
if len(df_all) > 0:  # fit_transform cannot be fit on zero rows
    df_all[num_cols] = scaler.fit_transform(df_all[num_cols])

# save
df_all.to_csv(OUTPUT_FILE, index=False)
print("✅ Saved", OUTPUT_FILE, "with shape", df_all.shape)


✅ Saved Processed_dataset.csv with shape (1448, 45)


In [6]:
# List the column labels of the processed feature table as a quick schema check.
column_name = df_all.columns
print(column_name)

Index(['StudentID', 'TrialID', 'EEG_Delta_mean', 'EEG_Delta_var',
       'EEG_Theta_mean', 'EEG_Theta_var', 'EEG_Alpha_mean', 'EEG_Alpha_var',
       'EEG_Beta_mean', 'EEG_Beta_var', 'EEG_Gamma_mean', 'EEG_Gamma_var',
       'EEG_Theta_Alpha', 'EEG_Beta_Alpha', 'GSR_mean', 'GSR_std', 'GSR_slope',
       'GSR_peaks', 'Pupil_mean', 'Pupil_std', 'GazeX_std', 'GazeY_std',
       'fixation_count', 'fixation_duration_mean', 'fixation_duration_std',
       'fixation_dispersion_mean', 'saccade_count', 'saccade_duration_mean',
       'saccade_duration_std', 'saccade_amplitude_mean',
       'saccade_amplitude_std', 'saccade_velocity_mean',
       'saccade_velocity_std', 'TIVA_Engagement_mean', 'TIVA_Valence_mean',
       'TIVA_Attention_mean', 'TIVA_Joy_mean', 'TIVA_Sadness_mean',
       'TIVA_Anger_mean', 'TIVA_Fear_mean', 'TIVA_Surprise_mean',
       'TIVA_Neutral_mean', 'Difficulty_mean', 'ResponseTime_mean', 'Label'],
      dtype='object')


In [7]:
from sklearn.decomposition import PCA

In [9]:
# Separate identifiers and labels from the feature matrix
id_label_cols = ["StudentID", "TrialID", "Label"]
X = df_all.drop(columns=id_label_cols)
y = df_all["Label"]

# Z-score normalization (this rebinds `scaler` to a new StandardScaler)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for Dimensionality Reduction: a float n_components asks for as many
# components as are needed to reach that share of the variance
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create PCA DataFrame: PC1..PCk first, then identifiers and label
n_components_kept = X_pca.shape[1]
pc_names = [f"PC{k}" for k in range(1, n_components_kept + 1)]
df_pca = pd.DataFrame(X_pca, columns=pc_names).assign(
    StudentID=df_all["StudentID"].values,
    TrialID=df_all["TrialID"].values,
    Label=y.values,
)

df_pca.to_csv("pca_dataset.csv", index=False)

explained_pct = round(sum(pca.explained_variance_ratio_)*100, 2)
print("Original features:", X.shape[1])
print("PCA features:", n_components_kept)
print("Variance explained:", explained_pct, "%")

Original features: 42
PCA features: 21
Variance explained: 95.21 %


## Label Encoding + Per-Modality PCA

In [11]:
# CSV loaded as the starting point for the per-modality PCA.
# NOTE(review): the feature-extraction cell writes "Processed_dataset.csv" relative
# to the working directory, while this is an absolute path — confirm both refer to
# the same file, otherwise a stale copy may be loaded.
INPUT_FILE = r"C:\Users\LENOVO\Desktop\ByteBuzz\Data\Processed_dataset.csv"
# Destination of the per-modality PCA table.
# NOTE(review): this rebinds OUTPUT_FILE, which the feature-extraction cell also
# uses as its save target; re-running that cell after this one would write to
# this path instead.
OUTPUT_FILE = r"C:\Users\LENOVO\Desktop\ByteBuzz\Data\final_dataset.csv"

df = pd.read_csv(INPUT_FILE)


## Split features by modality

In [12]:
# Separate metadata + label from the feature columns
meta_cols = ["StudentID", "TrialID", "Label"]
meta = df[meta_cols]
features = df.drop(columns=meta_cols)

# A feature belongs to a modality when its name contains one of the modality's tokens.
# NOTE(review): 'Difficulty_mean' and 'ResponseTime_mean' match no token, so they
# are not part of any modality below — confirm that leaving them out is intended.
modality_tokens = {
    "EEG": ("EEG",),
    "GSR": ("GSR",),
    "EYE": ("Pupil", "Gaze"),
    "IVT": ("fixation", "saccade"),
    "TIVA": ("TIVA",),
}
modalities = {
    label: [c for c in features.columns if any(token in c for token in tokens)]
    for label, tokens in modality_tokens.items()
}

print("Feature counts per modality:")
for label, members in modalities.items():
    print(f"{label}: {len(members)}")
    

# 2. PCA per modality

def pca_reduce(X, prefix, var_ratio=0.95):
    """Standardise ``X`` and project it onto its leading principal components.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix with one row per sample.
    prefix : str
        Prefix for the output column names (``<prefix>_PC1``, ``<prefix>_PC2``, ...).
    var_ratio : float, default 0.95
        Forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        The principal-component scores. When ``X`` is a pandas object the
        result carries the same row index, so a column-wise ``pd.concat`` with
        other frames derived from the same source stays row-aligned even if
        that index is not the default 0..n-1 range.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=var_ratio, random_state=42)
    X_pca = pca.fit_transform(X_scaled)
    cols = [f"{prefix}_PC{i+1}" for i in range(X_pca.shape[1])]
    # Plain arrays/lists have no row labels; fall back to the default index then.
    row_index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None
    return pd.DataFrame(X_pca, columns=cols, index=row_index)

# Reduce every non-empty modality separately; each part is prefixed with the modality name
df_pca_parts = [
    pca_reduce(features[member_cols], modality)
    for modality, member_cols in modalities.items()
    if len(member_cols) > 0
]

# Concatenate PCA outputs + meta (metadata columns first)
df_final = pd.concat([meta, *df_pca_parts], axis=1)


Feature counts per modality:
EEG: 12
GSR: 4
EYE: 4
IVT: 11
TIVA: 9


In [13]:
# Write the per-modality PCA table to whatever OUTPUT_FILE currently points to
# (the name is assigned more than once in this notebook).
df_final.to_csv(OUTPUT_FILE, index=False)

# Report the table size and the first 15 column labels as a sanity check.
print("Shape:", df_final.shape)
print("Columns:", df_final.columns[:15], "...")

Shape: (1448, 28)
Columns: Index(['StudentID', 'TrialID', 'Label', 'EEG_PC1', 'EEG_PC2', 'EEG_PC3',
       'EEG_PC4', 'EEG_PC5', 'EEG_PC6', 'EEG_PC7', 'GSR_PC1', 'GSR_PC2',
       'GSR_PC3', 'GSR_PC4', 'EYE_PC1'],
      dtype='object') ...
