# 01 — Preprocessing (merge, impute, scale)

In [None]:
# Root folder that read_csv() joins file names onto. Update it if your data
# lives elsewhere (e.g. use "./data" if you copy the CSVs there).
base_path = r"D:\IITB\STData\1"

# Output folders, given relative to the working directory. The original note
# says the notebook runs from repo/notebooks/, in which case:
save_models_to = r"../models"        # goes to repo/models/
save_fig_to    = r"./figures"        # goes to repo/notebooks/figures/

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Create both output folders up front; exist_ok=True makes re-running the cell safe.
os.makedirs(save_models_to, exist_ok=True)
os.makedirs(save_fig_to,    exist_ok=True)

def read_csv(name):
    """Load the CSV file called *name* from the module-level ``base_path`` folder."""
    return pd.read_csv(os.path.join(base_path, name))

# Echo the configured data folder so a wrong path is visible before any file is read.
print("Using base_path:", base_path)


In [None]:
# =========================
# 01 — Preprocessing (FULL)
# =========================
# Paths (adjust ONLY if your files are elsewhere).
# NOTE(review): this cell re-declares base_path and uses its own output names
# (save_models / save_figures) instead of the save_models_to / save_fig_to
# defined in the previous cell; everything below reads the names set here.
base_path    = r"D:\IITB\STData\1"   # <- your subject-1 folder with 1_EYE.csv etc.
save_models  = r"../models"          # notebook is in repo/notebooks, so ../models is repo/models
save_figures = r"./figures"          # figures go to repo/notebooks/figures

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pandas import merge_asof
from scipy.signal import spectrogram

# Create both output folders up front; exist_ok=True makes re-running the cell safe.
os.makedirs(save_models,  exist_ok=True)
os.makedirs(save_figures, exist_ok=True)
print("Using base_path:", base_path)

# -------------------------
# 1) Load raw modalities
# -------------------------
def must_read(path):
    """Read the CSV at *path* into a DataFrame.

    Raises:
        FileNotFoundError: if *path* does not exist; the message names the
            missing file.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"Missing file: {path}")

# Required modalities: a missing file aborts the run via must_read().
eye = must_read(os.path.join(base_path, "1_EYE.csv"))
eeg = must_read(os.path.join(base_path, "1_EEG.csv"))
gsr = must_read(os.path.join(base_path, "1_GSR.csv"))
tiva = must_read(os.path.join(base_path, "1_TIVA.csv"))

# Optional modalities: left as None when the file is absent.
psy = None
if os.path.exists(os.path.join(base_path, "1_PSY.csv")):
    psy = pd.read_csv(os.path.join(base_path, "1_PSY.csv"))
ivt = None
if os.path.exists(os.path.join(base_path, "1_IVT.csv")):
    ivt = pd.read_csv(os.path.join(base_path, "1_IVT.csv"))

# Report the shape of every modality that was actually loaded.
print("Loaded shapes:", {
    label: frame.shape
    for label, frame in zip(
        ("EYE", "EEG", "GSR", "TIVA", "PSY", "IVT"),
        (eye, eeg, gsr, tiva, psy, ivt),
    )
    if frame is not None
})

# -------------------------
# 2) Helpers
# -------------------------
def tcol(df):
    """Return the name of the column to use as the time axis of *df*.

    Known time-column names are tried in a fixed priority order; if none is
    present, the first column with a numeric dtype is used instead.

    Raises:
        KeyError: if there is neither a known time column nor any numeric column.
    """
    not_found = object()  # sentinel, so any column label can be a valid result

    known_names = (
        "UnixTime", "TimeStamp", "Timestamp", "routineStamp",
        "time", "Time", "Unix Time", "Unix_Timestamp",
    )
    by_name = next((name for name in known_names if name in df.columns), not_found)
    if by_name is not not_found:
        return by_name

    # Fallback: first numeric column, in column order.
    first_numeric = next(
        (name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])),
        not_found,
    )
    if first_numeric is not_found:
        raise KeyError("No time-like column found")
    return first_numeric

def merge_nearest(a, b, tol=0.5):
    """Left-join *b* onto *a* by nearest "Time" value.

    Args:
        a: Left DataFrame with a "Time" column, or None.
        b: Right DataFrame with a "Time" column, or None.
        tol: Maximum allowed distance between matched "Time" values, in the
            same units as "Time" (seconds if the timestamps are in seconds).

    Returns:
        The other frame unchanged when one side is None; otherwise the result
        of ``merge_asof(..., direction="nearest")``. Rows of *a* without a
        match inside *tol* get NaN in the columns coming from *b*. Numeric
        "Time" keys are returned as float64, and rows whose "Time" is missing
        are dropped.
    """
    if a is None:
        return b
    if b is None:
        return a

    def _prepare(frame):
        # merge_asof rejects null keys, keys whose dtype differs between the
        # two sides, and a fractional tolerance on integer keys. Dropping null
        # keys and casting numeric keys to float64 avoids all three.
        frame = frame.dropna(subset=["Time"])
        if pd.api.types.is_numeric_dtype(frame["Time"]):
            frame = frame.astype({"Time": "float64"})
        return frame.sort_values("Time")

    return merge_asof(_prepare(a), _prepare(b),
                      on="Time", direction="nearest", tolerance=tol)

# -------------------------
# 3) Engineer condensed features
# -------------------------
# EYE -> PupilDiameter: row-wise mean of whichever pupil columns the file has.
eye_t = tcol(eye)
pupil_cols = [
    name
    for name in ["ET_PupilLeft", "ET_PupilRight", "PupilLeft", "PupilRight", "PupilDiameter"]
    if name in eye.columns
]
if len(pupil_cols) == 0:
    raise KeyError("No pupil columns found in 1_EYE.csv; update 'pupil_cols' list above.")
eye_small = eye[[eye_t] + pupil_cols].copy()
# skipna=True: a row still gets a value when only some pupil columns are missing.
eye_small["PupilDiameter"] = eye_small[pupil_cols].mean(axis=1, skipna=True)
eye_small = eye_small.rename(columns={eye_t: "Time"})
eye_small = eye_small[["Time", "PupilDiameter"]]

# EEG -> BetaPower: row-wise mean of the beta columns. When the file has no
# beta columns, the other band-power columns are averaged under the same name.
eeg_t = tcol(eeg)
beta_cols = [
    name for name in eeg.columns
    if name.lower() == "beta" or name.lower().startswith("beta_")
]
if len(beta_cols) == 0:
    alt_prefixes = ["delta_","theta_","alpha_","gamma_"]
    beta_cols = [name for name in eeg.columns if name.lower().startswith(tuple(alt_prefixes))]
# With an empty beta_cols this selects just the time column.
eeg_small = eeg[[eeg_t] + beta_cols].copy()
if beta_cols:
    eeg_small["BetaPower"] = eeg_small[beta_cols].mean(axis=1, skipna=True)
else:
    # No usable band-power column at all: keep the column, filled with NaN.
    eeg_small["BetaPower"] = np.nan
eeg_small = eeg_small.rename(columns={eeg_t: "Time"})
eeg_small = eeg_small[["Time", "BetaPower"]]

# GSR -> GSR conductance
gsr_t = tcol(gsr)
# Preferred: a column whose name mentions gsr/eda together with
# conduct/micro/skin. The time column is never a candidate, so the
# two-column selection below cannot pick the same column twice.
gsr_candidates = [c for c in gsr.columns
                  if c != gsr_t
                  and (("gsr" in c.lower()) or ("eda" in c.lower()))
                  and any(k in c.lower() for k in ["conduct", "micro", "skin"])]
if not gsr_candidates:
    # fallback: first numeric signal column
    gsr_candidates = [c for c in gsr.columns if pd.api.types.is_numeric_dtype(gsr[c]) and c != gsr_t]
if not gsr_candidates:
    # Fail with a descriptive error instead of an IndexError on [0] below.
    raise KeyError("No GSR signal column found in 1_GSR.csv; update the 'gsr_candidates' filter above.")
use_gsr = gsr_candidates[0]
gsr_small = gsr[[gsr_t, use_gsr]].rename(columns={gsr_t:"Time", use_gsr:"GSR"})

# TIVA -> EmotionAvg (+Valence/Arousal/Blink if present)
tiva_t = tcol(tiva)
cols = tiva.columns
# First column whose name matches each concept, or None when absent.
val   = next((c for c in cols if "valence" in c.lower() or c.lower()=="val"), None)
aro   = next((c for c in cols if "arousal" in c.lower() or c.lower()=="aro"), None)
blink = next((c for c in cols if "blink"   in c.lower()), None)
emo_words = ["joy","anger","sad","fear","disgust","surprise","neutral","happy","contempt"]
# Emotion channels, capped at 10. Columns already used as time/valence/
# arousal/blink are skipped so `keep` never holds the same label twice. The
# cap is applied here, once, so the EmotionAvg lookup below only touches
# columns that are really present in tiva_small.
emo_cols  = [c for c in cols
             if c not in (tiva_t, val, aro, blink)
             and any(w in c.lower() for w in emo_words)][:10]

keep = [tiva_t] + ([val] if val else []) + ([aro] if aro else []) + ([blink] if blink else []) + emo_cols
tiva_small = tiva[keep].rename(columns={tiva_t:"Time"})
if emo_cols:
    # Row-wise mean over the kept emotion channels.
    tiva_small["EmotionAvg"] = tiva_small[emo_cols].mean(axis=1, skipna=True)
elif val and aro:
    # No emotion channels: average min-max-scaled valence and arousal instead.
    v = tiva_small[val]; a = tiva_small[aro]
    v01 = (v - v.min())/(v.max()-v.min()+1e-9)  # 1e-9 guards against a zero range
    a01 = (a - a.min())/(a.max()-a.min()+1e-9)
    tiva_small["EmotionAvg"] = (v01 + a01)/2
# Give the optional columns fixed names for the code that follows.
if val:   tiva_small = tiva_small.rename(columns={val:"Valence"})
if aro:   tiva_small = tiva_small.rename(columns={aro:"Arousal"})
if blink: tiva_small = tiva_small.rename(columns={blink:"BlinkRate"})

# -------------------------
# 4) Merge all on Time
# -------------------------
# Fold the per-modality frames into one table, left to right. The first
# frame (eye) provides the rows; each later frame is matched onto it by
# nearest Time within the tolerance.
data = None
for df in [eye_small, eeg_small, gsr_small, tiva_small]:
    data = merge_nearest(data, df, tol=0.5)  # increase tol if needed (e.g., 1.0)

if data is None or data.empty:
    # The message previously contained a mis-encoded em dash.
    raise RuntimeError("Merged data is empty — check time columns or increase tolerance.")

# Engagement proxy if missing: negated z-score of BlinkRate when available,
# otherwise inverted min-max-scaled GSR, otherwise an all-NaN column.
if "Engagement" in data.columns:
    pass  # already provided by one of the merged frames; leave it untouched
elif "BlinkRate" in data.columns:
    br = data["BlinkRate"].astype(float)
    data["Engagement"] = -(br - np.nanmean(br)) / (np.nanstd(br) + 1e-6)
elif "GSR" in data.columns:
    g = data["GSR"].astype(float)
    data["Engagement"] = 1 - (g - np.nanmin(g)) / (np.nanmax(g) - np.nanmin(g) + 1e-9)
else:
    data["Engagement"] = np.nan

# -------------------------
# 5) Save processed dataset
# -------------------------
# The merged table is written next to the raw CSVs, inside base_path.
out_csv = os.path.join(base_path, "processed_merged.csv")
data.to_csv(out_csv, index=False)  # index=False: the row index is not written out
print("Saved:", out_csv)
# NOTE(review): `display` is neither defined nor imported in the visible code;
# presumably it is the helper IPython/Jupyter injects. Confirm before running
# this as a plain script.
display(data.head())

# -------------------------
# 6) Quick plots
# -------------------------
def plot_line(x, y, title, fname, ylabel=None):
    """Draw a single line plot, save it under ``save_figures`` and show it.

    Args:
        x: Values for the horizontal axis (labelled "Time").
        y: Values for the vertical axis.
        title: Figure title; also the y-axis label when *ylabel* is not given.
        fname: File name of the saved image, joined onto ``save_figures``.
        ylabel: Optional y-axis label.
    """
    fig = plt.figure(figsize=(10,4))
    plt.plot(x, y)
    plt.title(title); plt.xlabel("Time"); plt.ylabel(ylabel or title)
    plt.tight_layout()
    path = os.path.join(save_figures, fname)
    # Save before show(): some backends discard the figure once it is shown.
    plt.savefig(path); plt.show()
    # Release the figure explicitly so repeated calls do not pile up open
    # figures on backends where show() does not close them.
    plt.close(fig)
    print("Figure saved:", path)

# Shared x-axis for the line plots below.
t = data["Time"].astype(float).values
if "PupilDiameter" in data.columns:
    plot_line(t, data["PupilDiameter"].astype(float).values, "Pupil Diameter", "01_pupil.png", "Pupil")
if "BetaPower" in data.columns:
    plot_line(t, data["BetaPower"].astype(float).values, "Beta Power", "01_beta.png", "Beta Power")
    # Spectrogram (optional)
    # Sampling rate = 1 / median spacing of the finite Time values; falls back
    # to 128.0 when fewer than two finite samples exist or the spacing is not
    # a positive number.
    # NOTE(review): fs is only in Hz if Time is in seconds — confirm the units.
    valid = np.isfinite(t)
    dt = np.median(np.diff(t[valid])) if valid.sum()>1 else None
    fs = 1.0/dt if (dt and dt>0) else 128.0
    # Non-finite BetaPower samples are replaced by np.nan_to_num before the transform.
    f, ts, Sxx = spectrogram(np.nan_to_num(data["BetaPower"].astype(float).values), fs)
    plt.figure(figsize=(10,4))
    # The 1e-12 offset keeps log10 finite where the power is exactly zero.
    plt.pcolormesh(ts, f, 10*np.log10(Sxx + 1e-12), shading="gouraud")
    plt.ylabel("Frequency [Hz]"); plt.xlabel("Time [s]"); plt.title("EEG Beta Power Spectrogram")
    plt.colorbar(label="Power (dB)"); plt.tight_layout()
    sp = os.path.join(save_figures, "01_beta_spectrogram.png")
    plt.savefig(sp); plt.show(); print("Figure saved:", sp)
# Emotion heatmap: drawn only when at least 400 non-missing EmotionAvg values exist.
if "EmotionAvg" in data.columns and data["EmotionAvg"].notna().sum() >= 400:
    # First 400 non-missing values, laid out row by row on a 20x20 grid.
    seq = data["EmotionAvg"].dropna().values[:400]
    mat = seq.reshape(20, 20)
    plt.figure(figsize=(5, 4))
    plt.imshow(mat, aspect="auto")
    plt.title("Emotion Intensity Heatmap (Time Slices)")
    plt.colorbar()
    plt.tight_layout()
    eh = os.path.join(save_figures, "01_emotion_heatmap.png")
    plt.savefig(eh)
    plt.show()
    print("Figure saved:", eh)
# Correlation matrix of whichever summary signals made it into the merged table.
cols = [
    name
    for name in ["PupilDiameter","EmotionAvg","Engagement","Valence","BetaPower","GSR"]
    if name in data.columns
]
if len(cols) >= 2:
    C = data[cols].astype(float).corr()
    plt.figure(figsize=(5, 4))
    plt.imshow(C, cmap="viridis")
    plt.xticks(range(len(cols)), cols, rotation=45, ha='right')
    plt.yticks(range(len(cols)), cols)
    # Write each coefficient into its cell; imshow puts row i on the y-axis
    # and column j on the x-axis, hence text(j, i, ...).
    for i, j in np.ndindex(len(cols), len(cols)):
        plt.text(j, i, f"{C.iloc[i,j]:.2f}", ha='center', va='center', color='w')
    plt.title("Multimodal Signal Correlation")
    plt.tight_layout()
    ch = os.path.join(save_figures, "01_correlation.png")
    plt.savefig(ch)
    plt.show()
    print("Figure saved:", ch)

# Final status line; the check mark was previously emitted as mis-encoded bytes.
print("✅ Preprocessing completed.")
