## 1 - Data Loading

In [None]:
# Put the parent directory of the working directory on the import path
# so that project packages can be imported from this notebook.
import sys
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()

# Prepend only when missing, so re-running the cell does not add duplicates.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path[:0] = [str(PROJECT_ROOT)]


In [None]:
# Read the subject metadata spreadsheet stored in the MODMA data directory
from src.load_data import load_metadata

DATA_DIR = Path("../data/MODMA")

meta = load_metadata(
    DATA_DIR.joinpath("subjects_information_EEG_128channels_resting_lanzhou_2015.xlsx")
)

# Print the column index, then preview the first rows as the cell output
# (presumably `meta` is a pandas DataFrame; confirm in src.load_data)
print(meta.columns)
meta.head()


In [None]:
# Load one recording as a smoke test of the loader
from src.load_data import load_subject

sample_mat = DATA_DIR.joinpath("02030020_rest 20151230 1416.mat")
raw, label, info = load_subject(sample_mat, meta)

# Quick look at what came back: subject info, label, data array shape,
# and the sampling frequency stored on the raw object
print(info)
print(f"label: {label}")
print(f"shape: {raw.get_data().shape}")
print(f"sfreq: {raw.info['sfreq']}")


## 2 - Preprocessing, Epoching, and Artifact Rejection

In [None]:
# Run the project's preprocessing on the sample recording and inspect the result
from src.preprocess import epoch_raw, preprocess_raw

raw_clean = preprocess_raw(raw)

# Visual checks: a 10-unit-long window of 20 channels, then the power
# spectral density limited to fmax=50
raw_clean.plot(duration=10, n_channels=20, scalings="auto")
raw_clean.compute_psd(fmax=50).plot()


In [None]:
# Cut the cleaned recording into epochs (epoch_len=2.0 is passed to the project helper)
epochs = epoch_raw(raw_clean, epoch_len=2.0)

# Summary of the epochs object followed by the shape of its data array
print(epochs)
print(f"Epochs shape: {epochs.get_data().shape}")


In [None]:
# Amplitude-based artifact rejection on a copy, so `epochs` keeps every
# epoch for the before/after comparison below
epochs_clean = epochs.copy().drop_bad(
    reject={"eeg": 150e-6},  # conservative threshold
)

# Number of epochs before and after rejection
print(f"Before: {len(epochs)}")
print(f"After: {len(epochs_clean)}")


In [None]:
# sanity checks: peak-to-peak amplitude statistics of the epochs that
# survived artifact rejection
import numpy as np

data = epochs_clean.get_data()  # (n_epochs, n_ch, n_times)
ptp = np.ptp(data, axis=2)      # peak-to-peak per epoch/channel

print("epochs_clean:", data.shape)

if ptp.size == 0:
    # If rejection removed every epoch, the median and percentile of an empty
    # array are undefined (NaN with a warning, or an error, depending on the
    # NumPy version). Report the situation explicitly instead.
    print("No epochs left after artifact rejection; skipping peak-to-peak statistics.")
else:
    # Scaled by 1e6 for display (V -> uV, assuming the data are in volts; confirm)
    print("Median peak-to-peak (uV):", np.median(ptp) * 1e6)
    print("95th percentile peak-to-peak (uV):", np.percentile(ptp, 95) * 1e6)


## 3 - Feature Extraction

In [None]:
# Compute the feature mapping for the sample subject's cleaned epochs
import pandas as pd
from src.features import dict_to_row, featurize_subject

feats = featurize_subject(epochs_clean)

# Report how many features were produced and preview the first ten
# (name, value) pairs as the cell output
print(f"num features: {len(feats)}")
[(name, feats[name]) for name in list(feats)[:10]]

In [None]:
# Turn the feature mapping into one record tagged with subject id and label,
# then wrap it in a single-row dataframe
row = dict_to_row(
    feats,
    subject_id=info["subject_id"],
    label=label,
)
df_one = pd.DataFrame(data=[row])
df_one.head()

In [None]:
import numpy as np

# Count missing values across the whole single-row dataframe
print("NaNs:", df_one.isna().to_numpy().sum())

# Check that every feature value is finite (identifier and label columns excluded)
feat_only = df_one.drop(labels=["subject_id", "label"], axis="columns")
print("Finite:", np.all(np.isfinite(feat_only.to_numpy())))


## 4 - Dataset Construction

In [None]:
# Build the full feature table from the data directory and the metadata sheet
from src.dataset import build_dataset

DATA_DIR = Path("../data/MODMA")
META_PATH = DATA_DIR.joinpath(
    "subjects_information_EEG_128channels_resting_lanzhou_2015.xlsx"
)

df = build_dataset(DATA_DIR, META_PATH)

# Table dimensions, then a preview of the first rows as the cell output
print(df.shape)
df.head()


In [None]:
# Number of rows per label value, followed by the total count of missing cells
print(df.loc[:, "label"].value_counts())
print(df.isna().to_numpy().sum())


## 5 - Baseline Model Training

In [None]:
# Model inputs: y is the label column, X is everything except the
# subject identifier and the label
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

y = df["label"]
X = df.drop(labels=["subject_id", "label"], axis="columns")


In [None]:
# Baseline: standardize the features, then fit an L2-penalized logistic
# regression with balanced class weights. The scaler is a pipeline step, so
# cross_val_score refits it on each training fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(
                max_iter=1000,
                class_weight="balanced",
                solver="liblinear",
                penalty="l2",
            ),
        ),
    ]
)

# ROC AUC over the five stratified, shuffled folds defined above
scores = cross_val_score(estimator=pipe, X=X, y=y, scoring="roc_auc", cv=cv)

print("ROC AUC per fold:", scores)
print(f"Mean ROC AUC: {scores.mean()} +/- {scores.std()}")


## 6 - Feature Importance 

In [None]:
# Refit the baseline pipeline on all rows and rank the features by the
# absolute value of their logistic-regression coefficient
coefs = pipe.fit(X, y).named_steps["clf"].coef_[0]

importance = pd.Series(data=coefs, index=X.columns)
importance = importance.abs().sort_values(ascending=False)

# Fifteen largest absolute coefficients
importance.iloc[:15]


In [None]:
# Fifty largest absolute coefficients as a one-column table
importance.iloc[:50].to_frame(name="abs_coef")


In [None]:
# Random forest on the top univariate (ANOVA F-test) features, scored with
# the same CV splitter and metric as the baseline model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Never ask for more features than X has: a k larger than the number of
# columns is rejected by older scikit-learn releases and only warned about
# by newer ones.
n_selected = min(50, X.shape[1])

# Selection is a pipeline step, so cross_val_score refits it on each
# training fold together with the forest.
rf_pipe = Pipeline([
    ("select", SelectKBest(f_classif, k=n_selected)),
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        class_weight="balanced",
        random_state=42,
    )),
])

scores_rf_fs = cross_val_score(
    rf_pipe, X, y, cv=cv, scoring="roc_auc"
)

# Same report layout as the baseline cell so the two models compare directly
print("RF ROC AUC per fold:", scores_rf_fs)
print("RF mean ROC AUC:", scores_rf_fs.mean(), "+/-", scores_rf_fs.std())

