# 01 — Preprocessing (merge, impute, scale)

In [None]:

# Input directory holding the raw CSV exports; the merged dataset is also
# written back into this directory.
# Update this if your data isn't under ./data
base_path = r"./data"  # change to r"D:\IITB\STData" on Windows if needed
# Output directories; both are created below if they do not exist yet.
save_models_to = r"./models"
save_fig_to = r"./notebooks/figures"  # the sanity plots are saved here

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Make sure every output directory exists before anything is written to it.
for out_dir in (save_models_to, save_fig_to):
    os.makedirs(out_dir, exist_ok=True)

def read_csv(name):
    """Read the CSV file called *name* from ``base_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(base_path, name))

# Echo the data directory so a wrong path is obvious in the cell output.
print(f"Using base_path: {base_path}")


In [None]:

# Load raw modalities
eye = read_csv("EYE.csv")

# IVT.csv is optional: leave ``ivt`` as None when the file is absent.
if os.path.exists(os.path.join(base_path, "IVT.csv")):
    ivt = read_csv("IVT.csv")
else:
    ivt = None

eeg = read_csv("EEG.csv")
gsr = read_csv("GSR.csv")
tiva = read_csv("TIVA.csv")

# Prefer PSY.csv and fall back to ENG.csv when it does not exist.
if os.path.exists(os.path.join(base_path, "PSY.csv")):
    psy = read_csv("PSY.csv")
else:
    psy = read_csv("ENG.csv")

# Build key time column
def tcol(df):
    """Return the name of the time column of *df*.

    The candidates are checked in a fixed priority order and the first one
    present in ``df.columns`` wins.

    Raises:
        KeyError: if none of the candidate names is a column of *df*.
    """
    candidates = ("UnixTime", "TimeStamp", "Timestamp", "routineStamp", "time", "Time")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        raise KeyError("No time column found")
    return found

# Engineer condensed features
eye_t = tcol(eye)
# Collapse the two pupil channels into one row-wise mean; NaNs are skipped, so
# a sample with only one valid eye still gets a value.
pupil_cols = ['ET_PupilLeft', 'ET_PupilRight']
eye['PupilDiameter'] = eye[pupil_cols].mean(axis=1, skipna=True)
eye_small = eye[[eye_t, 'PupilDiameter']]
eye_small = eye_small.rename(columns={eye_t: 'Time'})
eye_small = eye_small.sort_values('Time')

# Prefer the Beta_* band columns; if there are none, fall back to every
# Delta_/Theta_/Alpha_/Gamma_ column (grouped in that prefix order).
beta_cols = [c for c in eeg.columns if c.startswith("Beta_")]
if not beta_cols:
    fallback_prefixes = ["Delta_", "Theta_", "Alpha_", "Gamma_"]
    beta_cols = [c for p in fallback_prefixes for c in eeg.columns if c.startswith(p)]

eeg_t = tcol(eeg)
# BetaPower is the row-wise mean of the selected columns (NaNs skipped).
eeg_small = (
    eeg[[eeg_t] + beta_cols]
    .assign(BetaPower=lambda frame: frame[beta_cols].mean(axis=1, skipna=True))
    .rename(columns={eeg_t: 'Time'})
    .loc[:, ['Time', 'BetaPower']]
    .sort_values('Time')
)

# Candidate conductance columns; a column whose name contains "CAL" is preferred.
gsr_cols = [c for c in gsr.columns if "GSR" in c and "Conductance" in c]
cal_cols = [c for c in gsr_cols if "CAL" in c]
if cal_cols:
    use_gsr = cal_cols[0]
elif gsr_cols:
    use_gsr = gsr_cols[0]
else:
    use_gsr = None

# Without a usable conductance column the GSR modality is skipped (None).
if use_gsr:
    gsr_t = tcol(gsr)
    gsr_small = gsr[[gsr_t, use_gsr]]
    gsr_small = gsr_small.rename(columns={gsr_t: 'Time', use_gsr: 'GSR'})
    gsr_small = gsr_small.sort_values('Time')
else:
    gsr_small = None

# Condense the TIVA table to: Time, the optional valence / arousal / blink
# columns, at most eight emotion columns, and one EmotionAvg summary column.
cols = tiva.columns
emos = [c for c in cols if any(w in c.lower() for w in ["joy","anger","sad","fear","disgust","surprise","neutral"])]
val = next((c for c in cols if "valence" in c.lower()), None)
aro = next((c for c in cols if "arousal" in c.lower()), None)
blink = next((c for c in cols if "blinkrate" in c.lower() or c.lower()=="blink"), None)

tiva_t = tcol(tiva)
# Only the first eight emotion columns are carried forward. The average below
# must use this same subset: indexing with the full list raises KeyError as
# soon as the file has more than eight emotion columns.
emo_keep = emos[:8]
# dict.fromkeys drops duplicate names while preserving order, so a column
# matched by more than one rule is selected only once.
keep = list(dict.fromkeys([tiva_t] + [c for c in (val, aro, blink) if c] + emo_keep))
tiva_small = tiva[keep].rename(columns={tiva_t:'Time'}).sort_values('Time')

if emo_keep:
    tiva_small['EmotionAvg'] = tiva_small[emo_keep].mean(axis=1, skipna=True)
elif val and aro:
    # No emotion columns: average min-max scaled valence and arousal instead.
    # Series.ptp() was removed in pandas 1.0, so the range is computed as
    # max - min; the 1e-9 guards against division by zero on constant columns.
    v_raw = tiva_small[val]
    a_raw = tiva_small[aro]
    v = (v_raw - v_raw.min())/((v_raw.max() - v_raw.min())+1e-9)
    a = (a_raw - a_raw.min())/((a_raw.max() - a_raw.min())+1e-9)
    tiva_small['EmotionAvg'] = (v+a)/2

# Give the optional columns stable names for the downstream steps.
canonical = {val: 'Valence', aro: 'Arousal', blink: 'BlinkRate'}
tiva_small = tiva_small.rename(columns={src: dst for src, dst in canonical.items() if src})

# Merge (nearest on Time)
from pandas import merge_asof
def m_asof(a, b, tolerance=0.1):
    """Nearest-timestamp join of two frames on their ``Time`` column.

    Args:
        a: Left DataFrame with a ``Time`` column, or None.
        b: Right DataFrame with a ``Time`` column, or None.
        tolerance: Maximum allowed ``Time`` distance for a match, in the
            units of ``Time``. Left rows with no right row within this
            distance get NaN in the right-hand columns.

    Returns:
        The other argument unchanged when one of them is None; otherwise the
        left frame (sorted by ``Time``) with the columns of the nearest right
        row attached.
    """
    if a is None:
        return b
    if b is None:
        return a

    def _prepared(df):
        # merge_asof rejects null keys, keys whose dtypes differ between the
        # two frames, and a float tolerance on an integer key. Dropping rows
        # without a timestamp and casting the key to float64 avoids all three.
        valid = df[df['Time'].notna()]
        return valid.assign(Time=valid['Time'].astype('float64')).sort_values('Time')

    return merge_asof(_prepared(a), _prepared(b), on='Time',
                      tolerance=tolerance, direction='nearest')

# Fold the remaining modalities onto the eye-tracking frame one at a time.
# m_asof returns the other argument untouched when one side is None, so a
# missing modality (e.g. gsr_small) is simply skipped.
data = eye_small
for df in [eeg_small, gsr_small, tiva_small]:
    data = m_asof(data, df)

# Engagement proxy if missing
if 'Engagement' not in data.columns:
    if 'BlinkRate' in data.columns:
        # Proxy 1: negated z-score of blink rate (the 1e-6 avoids dividing by
        # zero when the blink rate is constant).
        br = data['BlinkRate']
        centered = br - br.mean()
        spread = br.std() + 1e-6
        data['Engagement'] = -centered / spread
    elif 'GSR' in data.columns:
        # Proxy 2: one minus the min-max scaled GSR signal.
        g = data['GSR']
        g_min = g.min()
        scaled = (g - g_min) / (g.max() - g_min + 1e-9)
        data['Engagement'] = 1 - scaled

# Save processed dataset
# The merged table is written next to the raw inputs, without the index column.
proc_path = os.path.join(base_path, "processed_merged.csv")
data.to_csv(proc_path, index=False)
print(f"Saved: {proc_path}")

# Quick sanity plots
# One line plot per signal against Time; each figure is saved, then shown.
for column, title, filename in (
    ('PupilDiameter', "Pupil", "01_pupil.png"),
    ('BetaPower', "BetaPower", "01_beta.png"),
):
    plt.figure(figsize=(10,4))
    plt.plot(data['Time'], data[column])
    plt.title(title)
    plt.savefig(os.path.join(save_fig_to, filename))
    plt.show()
