# 02 — Feature Cleaning (low variance, correlation pruning)

In [None]:

# Folder the cells below read their input CSVs from. Point it at your own copy
# of the data; the default is an absolute Windows path to the sub-folder "1".
# To work from the dataset root instead, use r"D:\IITB\STData".
base_path = r"D:\IITB\STData\1"
# Output folders for models and figures (relative to the working directory).
save_models_to = r"./models"
save_fig_to = r"./notebooks/figures"

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Create the output folders if they do not exist yet.
os.makedirs(save_models_to, exist_ok=True)
os.makedirs(save_fig_to, exist_ok=True)

def read_csv(name):
    """Load the CSV file called *name* from ``base_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(base_path, name))

# Echo the active data folder so a wrong path is visible in the cell output.
print("Using base_path:", base_path)


In [None]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd, numpy as np, os

# Load the merged table; every column except the timestamp is treated as a feature.
df = pd.read_csv(os.path.join(base_path,"processed_merged.csv"))
features = [c for c in df.columns if c not in ['Time']]

# SimpleImputer discards columns that contain no observed value at all, which
# would leave the imputed matrix with fewer columns than `features` and break
# the name lookup after the variance filter. Drop such columns up front so the
# names and the matrix stay aligned.
feature_frame = df[features].dropna(axis=1, how="all")
features = list(feature_frame.columns)
X = feature_frame.values

# Fill remaining gaps with the column median, then standardise.
imp = SimpleImputer(strategy="median")
scaler = StandardScaler()
X_imp = imp.fit_transform(X)
X_std = scaler.fit_transform(X_imp)

# NOTE: after standardisation every non-constant column has unit variance, so
# this threshold effectively removes only constant columns.
vt = VarianceThreshold(threshold=1e-5)
X_lv = vt.fit_transform(X_std)
kept = np.array(features)[vt.get_support()]

# Correlation pruning: look only at the strict upper triangle so each pair is
# examined once; a column is dropped when it correlates above 0.95 (absolute)
# with any column that precedes it.
Xd = pd.DataFrame(X_lv, columns=kept)
corr = Xd.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.95)]
X_clean = Xd.drop(columns=to_drop)

# Persist the cleaned (standardised) feature matrix next to the input file.
clean_path = os.path.join(base_path, "processed_clean.csv")
X_clean.to_csv(clean_path, index=False)
print("Saved:", clean_path)


In [None]:
import os, re
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

BASE_DIR = r"D:\IITB\STData"   # root folder with one sub-folder per student (1, 2, 3, ... 38)

# ---------- helpers ----------
def read_csv_safe(path):
    """Load *path* as a DataFrame, or return None when no such file exists."""
    if not os.path.exists(path):
        return None
    return pd.read_csv(path, low_memory=False)

def find_time_col(df):
    """Return the name of the timestamp column in *df*, or None if none is found.

    A fixed list of well-known names is tried first, in priority order. If
    none is present, the first column whose name contains "time"
    (case-insensitive) is returned.
    """
    known_names = ("Time", "Timestamp", "TimeStamp", "UnixTime", "routineStamp", "time")
    exact = next((name for name in known_names if name in df.columns), None)
    if exact is not None:
        return exact
    # Last resort: any column that mentions "time" anywhere in its name.
    return next((col for col in df.columns if re.search("time", col, re.I)), None)

def first_col(df, candidates):
    """Pick the column of *df* that best matches one of *candidates*.

    Exact name matches win, tried in candidate order. Otherwise each candidate
    is used as a case-insensitive regular expression and the first column it
    matches is returned. Returns None when nothing matches.
    """
    available = list(df.columns)

    exact_hits = [name for name in candidates if name in available]
    if exact_hits:
        return exact_hits[0]

    # Candidate order takes priority over column order in the loose search.
    loose_hits = (
        col
        for pattern in candidates
        for col in available
        if re.search(pattern, col, re.I)
    )
    return next(loose_hits, None)

def pupil_series(EYE):
    """Return a cleaned pupil-diameter Series from one of several column layouts.

    Lookup order:
      1. a column named exactly "PupilDiameter", "Pupil" or "PupilSize";
      2. left/right pupil columns, averaged row-wise while ignoring NaN;
      3. any column loosely matching the names from step 1.

    The left/right lookup must run before the loose match: the loose pattern
    "Pupil" also matches names such as "ET_PupilLeft", so trying it first
    would silently select a single eye and never reach the averaging step.

    Values that cannot be parsed as numbers, values <= 0 and infinities are
    replaced by NaN. For left/right data this is done per eye *before*
    averaging, so an invalid sample from one eye does not distort the mean.

    Args:
        EYE: DataFrame holding the eye-tracker samples.

    Returns:
        A float Series aligned with ``EYE.index``, or None when no pupil
        column can be identified.
    """
    direct_names = ["PupilDiameter", "Pupil", "PupilSize"]

    def clean(col):
        # Coerce to numbers, then blank out physically impossible readings.
        values = pd.to_numeric(EYE[col], errors="coerce")
        values = values.mask(values <= 0)
        return values.replace([np.inf, -np.inf], np.nan)

    exact = next((c for c in direct_names if c in EYE.columns), None)
    if exact is not None:
        return clean(exact)

    left = first_col(EYE, ["ET_PupilLeft", "LeftPupil", "LeftPupilDiameter"])
    right = first_col(EYE, ["ET_PupilRight", "RightPupil", "RightPupilDiameter"])
    if left is not None or right is not None:
        absent = pd.Series(np.nan, index=EYE.index)
        per_eye = [
            clean(left) if left is not None else absent,
            clean(right) if right is not None else absent,
        ]
        return pd.concat(per_eye, axis=1).mean(axis=1, skipna=True)

    loose = first_col(EYE, direct_names)
    if loose is None:
        return None
    return clean(loose)

def per_student(sp):
    """Summarise one student's eye-tracking files as a flat feature dict.

    *sp* is the student's folder; its base name is used as the student id and
    as the prefix of the ``<id>_EYE.csv`` / ``<id>_IVT.csv`` files read here.

    Returns a dict with the student id plus mean/std of pupil size, and
    mean/std/count of fixation duration and saccade amplitude. Returns None
    when either file is absent or a required column cannot be located.
    """
    sid = os.path.basename(sp)

    eye = read_csv_safe(os.path.join(sp, f"{sid}_EYE.csv"))
    ivt = read_csv_safe(os.path.join(sp, f"{sid}_IVT.csv"))
    if eye is None or ivt is None:
        # Both recordings are required; skip students missing either one.
        return None

    pupil = pupil_series(eye)

    # Column names vary between exports, so locate them by candidate list.
    fix_col = first_col(ivt, ["FixationDuration","Fix_Dur","FixDuration","Duration"])
    sac_col = first_col(ivt, ["SaccadeAmplitude","Sacc_Amp","SaccadeAmp","Amplitude"])
    if any(item is None for item in (pupil, fix_col, sac_col)):
        return None

    # Clip to the ranges chosen by the original author (noted there as ms / deg).
    fix_values = pd.to_numeric(ivt[fix_col], errors="coerce").clip(lower=50, upper=1500)
    sac_values = pd.to_numeric(ivt[sac_col], errors="coerce").clip(lower=0, upper=30)

    def mean_and_std(values):
        # NaN-aware population statistics (np.nanstd uses ddof=0).
        return float(np.nanmean(values)), float(np.nanstd(values))

    def finite_count(values):
        return int(np.isfinite(values).sum())

    pupil_mean, pupil_std = mean_and_std(pupil)
    fix_mean, fix_std = mean_and_std(fix_values)
    sac_mean, sac_std = mean_and_std(sac_values)

    return {
        "student_id": sid,
        "pupil_mean": pupil_mean,
        "pupil_std":  pupil_std,
        "fix_mean":   fix_mean,
        "fix_std":    fix_std,
        "fix_count":  finite_count(fix_values),
        "sac_mean":   sac_mean,
        "sac_std":    sac_std,
        "sac_count":  finite_count(sac_values),
    }

def collect_all_students_features(base_dir):
    """Build a DataFrame with one feature row per usable student folder.

    Entries of *base_dir* are visited shortest-name-first and then
    alphabetically, so numeric folder names come out in numeric order.
    Non-directories and students for which ``per_student`` returns None are
    left out.
    """
    entry_names = sorted(os.listdir(base_dir), key=lambda x: (len(x), x))
    candidate_paths = (os.path.join(base_dir, name) for name in entry_names)
    student_rows = (per_student(path) for path in candidate_paths if os.path.isdir(path))
    return pd.DataFrame([row for row in student_rows if row is not None])

# ---------- 1) build feature table across all students ----------
all_features = collect_all_students_features(BASE_DIR)
print("Raw table shape:", all_features.shape)
display(all_features.head())

if all_features.empty:
    raise RuntimeError("No valid students found (missing columns). Check a couple of folders to confirm column names.")

# ---------- 2) clean (impute/scale/variance/correlation) ----------
id_col = "student_id"
# SimpleImputer discards columns with no observed value at all, which would
# leave the imputed matrix narrower than X.columns and break the name lookup
# after the variance filter. Drop such columns first so both stay aligned.
X = all_features.drop(columns=[id_col]).dropna(axis=1, how="all")

# Fill remaining gaps with the per-feature median.
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)

scaler = StandardScaler()
X_std = scaler.fit_transform(X_imp)

# NOTE: after standardisation every non-constant column has unit variance, so
# this threshold effectively removes only constant columns.
vt = VarianceThreshold(threshold=1e-5)
X_lv = vt.fit_transform(X_std)
kept = np.array(X.columns)[vt.get_support()]

# Correlation pruning over the strict upper triangle: a column is dropped when
# it correlates above 0.95 (absolute) with any column that precedes it.
Xd = pd.DataFrame(X_lv, columns=kept)
corr = Xd.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.95)]
X_clean = Xd.drop(columns=to_drop)

# Re-attach the ids; both sides are re-indexed so rows line up positionally.
df_clean = pd.concat([all_features[[id_col]].reset_index(drop=True),
                      X_clean.reset_index(drop=True)], axis=1)

# ---------- 3) save ----------
out_csv = os.path.join(BASE_DIR, "eye_features_all_students.csv")
df_clean.to_csv(out_csv, index=False)
print("✅ Saved:", out_csv, "| rows:", len(df_clean), "| cols:", df_clean.shape[1])


In [None]:
import os, time, pandas as pd

# Root folder that holds the per-student sub-folders; the combined CSV is
# written directly into it.
base_path = r"D:\IITB\victus-edtech-analysis\STData"
out_main  = os.path.join(base_path, "eye_features_all_students.csv")

# `all_features` must already exist when this cell runs (it is built by an
# earlier cell). To rebuild it from this folder instead, run:
# all_features = collect_all_students_features(base_path)

# Try the normal file name first. A PermissionError is taken to mean that the
# file is locked by another program, so the table is written under a
# timestamped name instead of failing.
try:
    all_features.to_csv(out_main, index=False)
    print("✅ Saved:", out_main)
except PermissionError:
    ts = time.strftime("%Y%m%d-%H%M%S")
    alt = os.path.join(base_path, f"eye_features_all_students_{ts}.csv")
    all_features.to_csv(alt, index=False)
    print("⚠️ File was locked (likely open in Excel). Saved to:", alt)


# =============================
# 02 — Feature Cleaning (low variance, correlation pruning)
# =============================

import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# --- Paths ---
# NOTE(review): base_path is the sub-folder "1" of the dataset root, yet the
# save step at the end of this file writes the all-students CSV into it.
# Confirm that this is the intended output location.
base_path = r"D:\IITB\STData\1"   # change if needed
# Output folders for models and figures (relative to the working directory).
save_models_to = r"../models"
save_fig_to = r"./figures"

# Create the output folders if they do not exist yet.
os.makedirs(save_models_to, exist_ok=True)
os.makedirs(save_fig_to, exist_ok=True)

print("Using base_path:", base_path)

# --- Utility to read CSV safely ---
def read_csv_safe(path):
    """Read a CSV file, returning None instead of raising when it cannot be loaded.

    A short diagnostic is printed so the caller's batch loop can continue
    while still showing why a file was skipped: "Missing:" when the file does
    not exist, "Unreadable:" for any other failure (empty file, parse error,
    permission problem, ...).

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or None on any failure.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError as e:
        print("Missing:", path, e)
    except Exception as e:
        # Deliberate best-effort boundary: one bad file must not stop the batch.
        print("Unreadable:", path, e)
    return None

# --- Collect features for all students ---
def collect_all_students_features(base_dir):
    """Build one row of mean eye-tracking features per student folder.

    Every sub-directory of *base_dir* is treated as a student whose id is the
    folder name. The files ``<id>_EYE.csv`` and ``<id>_IVT.csv`` are read from
    it, and the means of "PupilDiameter", "FixationDuration" and
    "SaccadeAmplitude" are recorded.

    Students are processed shortest-name-first and then alphabetically, so
    numeric ids come out in numeric order and the result does not depend on
    the file system's listing order. A student is skipped, with a message,
    when a file cannot be read or a required column is absent; values that are
    not numeric are ignored when averaging.

    Args:
        base_dir: Folder containing one sub-folder per student.

    Returns:
        DataFrame with columns student_id, pupil_mean, fixation_mean and
        saccade_mean (empty when no student could be processed).
    """
    student_ids = sorted(
        (d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))),
        key=lambda name: (len(name), name),
    )

    def numeric_mean(table, column):
        # Coerce first so stray text cells become NaN instead of raising.
        return pd.to_numeric(table[column], errors="coerce").mean(skipna=True)

    all_features = []
    for sid in student_ids:
        sp = os.path.join(base_dir, sid)
        print(f"Processing student {sid} ...")

        # Only the Eye and IVT recordings are used.
        eye = read_csv_safe(os.path.join(sp, f"{sid}_EYE.csv"))
        ivt = read_csv_safe(os.path.join(sp, f"{sid}_IVT.csv"))
        if eye is None or ivt is None:
            continue

        required = (
            (eye, "PupilDiameter"),
            (ivt, "FixationDuration"),
            (ivt, "SaccadeAmplitude"),
        )
        missing = [column for table, column in required if column not in table.columns]
        if missing:
            # One student with a different schema must not abort the whole run.
            print(f"Skipping student {sid}: missing column(s) {missing}")
            continue

        all_features.append({
            "student_id": sid,
            "pupil_mean": numeric_mean(eye, "PupilDiameter"),
            "fixation_mean": numeric_mean(ivt, "FixationDuration"),
            "saccade_mean": numeric_mean(ivt, "SaccadeAmplitude"),
        })

    return pd.DataFrame(all_features)

# --- Step 1: Build dataset across all students ---
all_features = collect_all_students_features(r"D:\IITB\STData")   # folder with 1,2,3,... students
print("Shape before cleaning:", all_features.shape)

# An empty table has no "student_id" column, so fail here with a clear message
# instead of a KeyError further down.
if all_features.empty:
    raise RuntimeError("No student produced a feature row. Check that each folder contains <id>_EYE.csv and <id>_IVT.csv.")

# SimpleImputer discards columns with no observed value at all, which would
# leave the imputed matrix narrower than the list of feature names used in
# Step 5. Drop such columns first so both stay aligned.
feature_frame = all_features.drop(columns=["student_id"]).dropna(axis=1, how="all")

# --- Step 2: Impute missing ---
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(feature_frame)

# --- Step 3: Scale ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imp)

# --- Step 4: Low variance filter ---
# NOTE: after standardisation every non-constant column has unit variance, so
# this threshold effectively removes only constant columns.
vt = VarianceThreshold(threshold=1e-5)
X_lv = vt.fit_transform(X_scaled)

# --- Step 5: Correlation pruning ---
# Strict upper triangle: a column is dropped when it correlates above 0.95
# (absolute) with any column that precedes it.
df_lv = pd.DataFrame(X_lv, columns=np.array(feature_frame.columns)[vt.get_support()])
corr = df_lv.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.95)]
df_clean = df_lv.drop(columns=to_drop)

# --- Step 6: Add back student_id ---
df_clean.insert(0, "student_id", all_features["student_id"].values)

# --- Step 7: Save cleaned dataset ---
# NOTE(review): base_path is a single student's folder in this file, so the
# all-students table is written there. Confirm the intended location.
clean_path = os.path.join(base_path, "eye_features_all_students.csv")
df_clean.to_csv(clean_path, index=False)
print("✅ Saved:", clean_path)
