### Mount Drive, pull data from csv processed in stressid_split.ipynb and train baseline models


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Binary Stress

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import VarianceThreshold
from google.colab import drive

# 2. Mount Google Drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# 3. Define paths
TRAIN_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv"
TEST_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv"
EXTRACTED_PATH = "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv"

# 4. Load CSVs
print("Loading datasets...")
traindf = pd.read_csv(TRAIN_PATH)
testdf = pd.read_csv(TEST_PATH)
extracteddf = pd.read_csv(EXTRACTED_PATH)

print(f"Train split rows: {len(traindf)}")
print(f"Test split rows: {len(testdf)}")
print(f"Features rows: {len(extracteddf)}")

# 5. Merge Features with Splits
# Train/Test have 'subject/task', Features have 'basename'. We merge on these.
print("Merging datasets...")
train_merged = pd.merge(traindf, extracteddf, left_on='subject/task', right_on='basename', how='inner')
test_merged = pd.merge(testdf, extracteddf, left_on='subject/task', right_on='basename', how='inner')

print(f"Merged Train Shape: {train_merged.shape}")
print(f"Merged Test Shape: {test_merged.shape}")

# 6. Prepare X and y
# Identify target column (handle potential naming variations)
TARGET_COL = "binary-stress"
if TARGET_COL not in train_merged.columns:
    if "binary_stress" in train_merged.columns:
        TARGET_COL = "binary_stress"
    else:
        raise ValueError(f"Target column not found. Available columns: {train_merged.columns}")

y_train = train_merged[TARGET_COL].astype(int)
y_test = test_merged[TARGET_COL].astype(int)

# Identify feature columns: all numeric columns in merged df excluding metadata
metadata_cols = [
    'subject/task', 'binary-stress', 'binary_stress', 'affect3-class', 'subject', 'isuseful', 'path', 'continuouslabel',
    'file', 'basename', 'filename', 'audio_path', 'label', 'class', 'id', 'Unnamed: 0'
]

feature_cols = [c for c in train_merged.columns if c not in metadata_cols and pd.api.types.is_numeric_dtype(train_merged[c])]

X_train_raw = train_merged[feature_cols]
X_test_raw = test_merged[feature_cols]

# 7. Clean and Preprocess
# Handle missing values (impute with median from train)
imputer = X_train_raw.median(numeric_only=True)
X_train = X_train_raw.fillna(imputer).fillna(0) # fill remaining NaNs with 0
X_test = X_test_raw.fillna(imputer).fillna(0)

# Variance Threshold (remove constant features)
vt = VarianceThreshold(threshold=0.0)
X_train_v = vt.fit_transform(X_train)
X_test_v = vt.transform(X_test)

# Convert back to DataFrame
X_train = pd.DataFrame(X_train_v, columns=X_train.columns[vt.get_support()], index=y_train.index)
X_test = pd.DataFrame(X_test_v, columns=X_train.columns, index=y_test.index)

# 8. Setup variables for downstream compatibility (CV groups, etc.)
df = train_merged.copy()
train_idx = df.index # Since df is exactly the training set now

print(f"Final X_train shape: {X_train.shape}")
print(f"Final X_test shape: {X_test.shape}")
print(f"Class distribution in Train:\n{y_train.value_counts()}")
print(f"Class distribution in Test:\n{y_test.value_counts()}")

Loading datasets...
Train split rows: 440
Test split rows: 70
Features rows: 371
Merging datasets...
Merged Train Shape: (440, 46)
Merged Test Shape: (70, 46)
Final X_train shape: (440, 37)
Final X_test shape: (70, 37)
Class distribution in Train:
binary-stress
0    220
1    220
Name: count, dtype: int64
Class distribution in Test:
binary-stress
1    41
0    29
Name: count, dtype: int64


In [None]:
# === StressID (binary-stress) model training using pre-extracted openSMILE features ===
# Models: SVM (RBF, tuned), Gradient Boosting (tuned), SVC (baseline), Logistic Regression, Random Forest
# Assumes Google Drive is already mounted.

import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# -----------------------------
# 1) Paths (uses your Drive paths; includes fallback if you uploaded to runtime)
# -----------------------------
def pick_path(*candidates):
    for p in candidates:
        if p and os.path.exists(p):
            return p
    raise FileNotFoundError("None of the provided paths exist:\n" + "\n".join(candidates))

OPENSMILE_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv",
    "/mnt/data/extracted_opensmile_features.csv",
)

TRAIN_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv",
    "/mnt/data/stressidtrainbalanced2.csv",
)

TEST_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv",
    "/mnt/data/stressidtest2.csv",
)

print("Using:")
print(" openSMILE:", OPENSMILE_PATH)
print(" train:", TRAIN_SPLIT_PATH)
print(" test :", TEST_SPLIT_PATH)


# -----------------------------
# 2) Load CSVs + join on path
# -----------------------------
opensmile = pd.read_csv(OPENSMILE_PATH)
train_df = pd.read_csv(TRAIN_SPLIT_PATH)
test_df  = pd.read_csv(TEST_SPLIT_PATH)

def norm_path(series: pd.Series) -> pd.Series:
    return (series.astype(str)
            .str.replace("\\", "/", regex=False)
            .str.strip())

# Keys to merge on (paths should match between split files and openSMILE file)
opensmile["path_key"] = norm_path(opensmile["file"])
train_df["path_key"]  = norm_path(train_df["path"])
test_df["path_key"]   = norm_path(test_df["path"])

# Merge: keep split rows (train/test), attach features
opensmile_feats = opensmile.drop(columns=["file"])  # keep basename + feature columns + path_key
train_merged = train_df.merge(opensmile_feats, on="path_key", how="left")
test_merged  = test_df.merge(opensmile_feats, on="path_key", how="left")

# Sanity checks
missing_train = train_merged.isna().any(axis=1).sum()
missing_test  = test_merged.isna().any(axis=1).sum()
print(f"Rows with any missing values after merge -> train: {missing_train}, test: {missing_test}")

if train_merged.filter(like="F0").shape[1] == 0:
    print("WARNING: It looks like no openSMILE feature columns were merged. Check that 'path' matches openSMILE 'file'.")

# Features = numeric columns from openSMILE (exclude identifiers/labels)
exclude_cols = {
    "subject/task", "binary-stress", "affect3-class", "subject", "isuseful", "path", "continuouslabel",
    "basename", "path_key"
}
feature_cols = [c for c in train_merged.columns if c not in exclude_cols]
feature_cols = train_merged[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

LABEL_COL = "binary-stress"

X_train = train_merged[feature_cols]
y_train = train_merged[LABEL_COL].astype(int)

X_test = test_merged[feature_cols]
y_test = test_merged[LABEL_COL].astype(int)

# Group key to avoid leakage from duplicated/oversampled rows:
# (your balanced train file likely repeats the same path multiple times)
groups_train = train_merged["path_key"]

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train label distribution:\n", y_train.value_counts().to_string())
print("Test  label distribution:\n", y_test.value_counts().to_string())


# -----------------------------
# 3) Define models
# -----------------------------
# StratifiedGroupKFold prevents duplicates (same path_key) from landing in both train/val during tuning
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

models = {}

# SVC baseline (RBF defaults)
models["SVC (Baseline)"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", random_state=42)),  # probability=False (faster); use decision_function for AUC
])

# Logistic Regression
models["Logistic Regression"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=3000, solver="liblinear", random_state=42)),
])

# Random Forest
models["Random Forest"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        random_state=42,
        n_jobs=-1
    )),
])

# Tuned SVM (RBF)
svm_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", random_state=42)),
])
svm_grid = {
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", 0.01, 0.1],
}
models["SVM (RBF, Tuned)"] = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)

# Tuned Gradient Boosting
gb_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", GradientBoostingClassifier(random_state=42)),
])
gb_grid = {
    "clf__n_estimators": [100, 200],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3],
    "clf__subsample": [1.0, 0.8],
}
models["Gradient Boosting (Tuned)"] = GridSearchCV(
    estimator=gb_pipe,
    param_grid=gb_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)


# -----------------------------
# 4) Train + evaluate
# -----------------------------
def get_auc_scores(model, X):
    # Prefer proba if available, else decision_function (works for SVC)
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return None

rows = []

for name, model in models.items():
    # Fit (pass groups ONLY for GridSearchCV using StratifiedGroupKFold)
    if isinstance(model, GridSearchCV):
        model.fit(X_train, y_train, groups=groups_train)
        best_params = model.best_params_
        fitted = model.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_params = {}
        fitted = model

    y_pred = fitted.predict(X_test)
    y_score = get_auc_scores(fitted, X_test)

    row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "ROC_AUC": roc_auc_score(y_test, y_score) if y_score is not None else np.nan,
        "Best Params (if tuned)": best_params,
    }
    rows.append(row)

results = pd.DataFrame(rows).sort_values("F1", ascending=False)

# Pretty display
display(results.style.format({
    "Accuracy": "{:.4f}",
    "Precision": "{:.4f}",
    "Recall": "{:.4f}",
    "F1": "{:.4f}",
    "ROC_AUC": "{:.4f}",
}))


Using:
 openSMILE: /content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv
 train: /content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv
 test : /content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv
Rows with any missing values after merge -> train: 0, test: 0
Train shape: (440, 37)  Test shape: (70, 37)
Train label distribution:
 binary-stress
0    220
1    220
Test  label distribution:
 binary-stress
1    41
0    29


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC_AUC,Best Params (if tuned)
4,Gradient Boosting (Tuned),0.6571,0.6441,0.9268,0.76,0.6081,"{'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 100, 'clf__subsample': 0.8}"
2,Random Forest,0.6429,0.629,0.9512,0.7573,0.5673,{}
1,Logistic Regression,0.7,0.75,0.7317,0.7407,0.7586,{}
3,"SVM (RBF, Tuned)",0.5714,0.6038,0.7805,0.6809,0.6417,"{'clf__C': 10, 'clf__gamma': 0.01}"
0,SVC (Baseline),0.5571,0.5962,0.7561,0.6667,0.6165,{}


### Affect-3 Class


In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
)

# -----------------------------
# Paths (Drive + runtime fallback)
# -----------------------------
def pick_path(*candidates):
    for p in candidates:
        if p and os.path.exists(p):
            return p
    raise FileNotFoundError("None of these paths exist:\n" + "\n".join(candidates))

OPENSMILE_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv",
    "/mnt/data/extracted_opensmile_features.csv",
)
TRAIN_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv",
    "/mnt/data/stressidtrainbalanced2.csv",
)
TEST_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv",
    "/mnt/data/stressidtest2.csv",
)

# -----------------------------
# Load + merge on path
# -----------------------------
def norm_path(series: pd.Series) -> pd.Series:
    return (series.astype(str).str.replace("\\", "/", regex=False).str.strip())

opensmile = pd.read_csv(OPENSMILE_PATH)
train_df  = pd.read_csv(TRAIN_SPLIT_PATH)
test_df   = pd.read_csv(TEST_SPLIT_PATH)

opensmile["path_key"] = norm_path(opensmile["file"])
train_df["path_key"]  = norm_path(train_df["path"])
test_df["path_key"]   = norm_path(test_df["path"])

opensmile_feats = opensmile.drop(columns=["file"])
train_merged = train_df.merge(opensmile_feats, on="path_key", how="left")
test_merged  = test_df.merge(opensmile_feats, on="path_key", how="left")

# -----------------------------
# X/y for affect-3
# -----------------------------
LABEL_COL = "affect3-class"

exclude_cols = {
    "subject/task","binary-stress","affect3-class","subject","isuseful","path",
    "continuouslabel","basename","path_key"
}
feature_cols = [c for c in train_merged.columns if c not in exclude_cols]
feature_cols = train_merged[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

X_train = train_merged[feature_cols]
X_test  = test_merged[feature_cols]

le = LabelEncoder()
y_train = le.fit_transform(train_merged[LABEL_COL].astype(str))
y_test  = le.transform(test_merged[LABEL_COL].astype(str))

print("Classes (LabelEncoder):", list(le.classes_))
print("Test label counts:", pd.Series(y_test).value_counts().sort_index().to_dict())

# ---- class name mapping for your table (adjust if your dataset defines them differently)
# Assumption: 0=relaxed, 1=neutral, 2=stressed
class_name = {0: "relaxed", 1: "neutral", 2: "stressed"}

# -----------------------------
# CV setup (speaker-independent)
# -----------------------------
groups_train = train_merged["subject"].astype(str) if "subject" in train_merged.columns else train_merged["path_key"]
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# -----------------------------
# Helpers
# -----------------------------
def macro_auc_ovr_from_proba(y_true, proba):
    return roc_auc_score(y_true, proba, multi_class="ovr", average="macro")

def eval_multiclass(name, pipeline_str, repr_str, fitted_model, y_pred, y_proba_or_none):
    bal = balanced_accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro", zero_division=0)
    f1s = f1_score(y_test, y_pred, average=None, labels=[0,1,2], zero_division=0)
    auc = np.nan
    if y_proba_or_none is not None:
        auc = macro_auc_ovr_from_proba(y_test, y_proba_or_none)

    row = {
        "Pipeline": pipeline_str,
        "Representation": repr_str,
        "Model": name,
        "Bal Acc": bal,
        "F1 (macro)": f1m,
        "AUC (ROC-AUC)": auc,
        f"F1 ({class_name[0]})": f1s[0],
        f"F1 ({class_name[1]})": f1s[1],
        f"F1 ({class_name[2]})": f1s[2],
    }
    return row

# -----------------------------
# Baseline: always predict majority class (on TEST)
# -----------------------------
maj = int(pd.Series(y_test).value_counts().idxmax())
y_pred_base = np.full_like(y_test, maj)
baseline_row = eval_multiclass(
    name=f"Always predict class {maj}",
    pipeline_str="Baseline (Always predict majority class)",
    repr_str="-",
    fitted_model=None,
    y_pred=y_pred_base,
    y_proba_or_none=None
)

# -----------------------------
# Fit + evaluate the SAME model families as your original pipeline
# -----------------------------
rows = [baseline_row]

PIPELINE_NAME = "openSMILE (eGeMAPS) + best classical ML"
REPR_NAME     = "eGeMAPSv02 (88d)"  # keep as your table label (even though you used 37 prosodic dims)

# 1) SVC (Baseline)  --- use probability=True so we can compute multiclass AUC
svc_base = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", probability=True, random_state=42)),
])
svc_base.fit(X_train, y_train)
rows.append(eval_multiclass(
    "SVC (Baseline)", PIPELINE_NAME, REPR_NAME,
    svc_base,
    svc_base.predict(X_test),
    svc_base.predict_proba(X_test),
))

# 2) Logistic Regression
lr = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=4000, solver="lbfgs", random_state=42)),
])
lr.fit(X_train, y_train)
rows.append(eval_multiclass(
    "Logistic Reg.", PIPELINE_NAME, REPR_NAME,
    lr,
    lr.predict(X_test),
    lr.predict_proba(X_test),
))

# 3) Random Forest
rf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)),
])
rf.fit(X_train, y_train)
rows.append(eval_multiclass(
    "Random Forest", PIPELINE_NAME, REPR_NAME,
    rf,
    rf.predict(X_test),
    rf.predict_proba(X_test),
))

# 4) Gradient Boosting (Tuned)
gb_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", GradientBoostingClassifier(random_state=42)),
])
gb_grid = {
    "clf__n_estimators": [100, 200],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3],
    "clf__subsample": [1.0, 0.8],
}
gb_gs = GridSearchCV(gb_pipe, gb_grid, scoring="accuracy", cv=cv, n_jobs=-1, verbose=0)
gb_gs.fit(X_train, y_train, groups=groups_train)
gb_best = gb_gs.best_estimator_
rows.append(eval_multiclass(
    "GB (Tuned)", PIPELINE_NAME, REPR_NAME,
    gb_best,
    gb_best.predict(X_test),
    gb_best.predict_proba(X_test),
))

# 5) SVM (RBF, tuned)
# Tune with probability=False (faster), then refit same best params with probability=True for AUC.
svm_tune_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", probability=False, random_state=42)),
])
svm_grid = {
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", 0.01, 0.1],
}
svm_gs = GridSearchCV(svm_tune_pipe, svm_grid, scoring="accuracy", cv=cv, n_jobs=-1, verbose=0)
svm_gs.fit(X_train, y_train, groups=groups_train)
bestC = svm_gs.best_params_["clf__C"]
bestG = svm_gs.best_params_["clf__gamma"]

svm_best = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", C=bestC, gamma=bestG, probability=True, random_state=42)),
])
svm_best.fit(X_train, y_train)
rows.append(eval_multiclass(
    "SVM (RBF, tuned)", PIPELINE_NAME, REPR_NAME,
    svm_best,
    svm_best.predict(X_test),
    svm_best.predict_proba(X_test),
))

# -----------------------------
# Final table
# -----------------------------
out = pd.DataFrame(rows)

# Order columns to match your screenshot
ordered_cols = [
    "Pipeline","Representation","Model",
    "Bal Acc","F1 (macro)","AUC (ROC-AUC)",
    f"F1 ({class_name[0]})", f"F1 ({class_name[1]})", f"F1 ({class_name[2]})"
]
out = out[ordered_cols]

display(out.style.format({
    "Bal Acc": "{:.4f}",
    "F1 (macro)": "{:.4f}",
    "AUC (ROC-AUC)": "{:.4f}",
    f"F1 ({class_name[0]})": "{:.4f}",
    f"F1 ({class_name[1]})": "{:.4f}",
    f"F1 ({class_name[2]})": "{:.4f}",
}))


Classes (LabelEncoder): ['0', '1', '2']
Test label counts: {0: 11, 1: 32, 2: 27}


Unnamed: 0,Pipeline,Representation,Model,Bal Acc,F1 (macro),AUC (ROC-AUC),F1 (relaxed),F1 (neutral),F1 (stressed)
0,Baseline (Always predict majority class),-,Always predict class 1,0.3333,0.2092,,0.0,0.6275,0.0
1,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),SVC (Baseline),0.4058,0.3776,0.5928,0.3448,0.3396,0.4483
2,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),Logistic Reg.,0.4939,0.4588,0.626,0.3871,0.4074,0.5818
3,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),Random Forest,0.5472,0.5368,0.5823,0.6316,0.4,0.5789
4,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),GB (Tuned),0.496,0.4878,0.6121,0.5882,0.3182,0.557
5,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),"SVM (RBF, tuned)",0.4362,0.4015,0.5816,0.4348,0.2553,0.5143


In [None]:
# === StressID/RAVDESS affect-3 classification using pre-extracted openSMILE features ===
# Models (same as binary pipeline): SVM (RBF, tuned), GB (tuned), SVC (baseline), Logistic Regression, Random Forest
# Assumes Google Drive is mounted.

import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, roc_auc_score
)

# -----------------------------
# 1) Paths (Drive + fallback to uploaded runtime files)
# -----------------------------
def pick_path(*candidates):
    for p in candidates:
        if p and os.path.exists(p):
            return p
    raise FileNotFoundError("None of the provided paths exist:\n" + "\n".join(candidates))

OPENSMILE_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv",
    "/mnt/data/extracted_opensmile_features.csv",
)
TRAIN_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv",
    "/mnt/data/stressidtrainbalanced2.csv",
)
TEST_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv",
    "/mnt/data/stressidtest2.csv",
)

print("Using:")
print(" openSMILE:", OPENSMILE_PATH)
print(" train:", TRAIN_SPLIT_PATH)
print(" test :", TEST_SPLIT_PATH)

# -----------------------------
# 2) Load + merge on path
# -----------------------------
opensmile = pd.read_csv(OPENSMILE_PATH)
train_df = pd.read_csv(TRAIN_SPLIT_PATH)
test_df  = pd.read_csv(TEST_SPLIT_PATH)

def norm_path(series: pd.Series) -> pd.Series:
    return (series.astype(str)
            .str.replace("\\", "/", regex=False)
            .str.strip())

# Required columns (based on your earlier setup)
# openSMILE: "file" ; split files: "path"
opensmile["path_key"] = norm_path(opensmile["file"])
train_df["path_key"]  = norm_path(train_df["path"])
test_df["path_key"]   = norm_path(test_df["path"])

opensmile_feats = opensmile.drop(columns=["file"])
train_merged = train_df.merge(opensmile_feats, on="path_key", how="left")
test_merged  = test_df.merge(opensmile_feats, on="path_key", how="left")

print(f"Rows with any missing values after merge -> train: {train_merged.isna().any(axis=1).sum()}, "
      f"test: {test_merged.isna().any(axis=1).sum()}")

# -----------------------------
# 3) Build X/y for affect-3 class
# -----------------------------
LABEL_COL = "affect3-class"  # column just to the right of binary-stress in your sheet

exclude_cols = {
    "subject/task", "binary-stress", "affect3-class", "subject", "isuseful", "path",
    "continuouslabel", "basename", "path_key"
}
feature_cols = [c for c in train_merged.columns if c not in exclude_cols]
feature_cols = train_merged[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

if len(feature_cols) == 0:
    raise ValueError("No numeric feature columns found after merge. Check path matching and columns.")

X_train = train_merged[feature_cols]
X_test  = test_merged[feature_cols]

# Encode labels (works whether they're 0/1/2 or strings)
le = LabelEncoder()
y_train_raw = train_merged[LABEL_COL].astype(str)
y_test_raw  = test_merged[LABEL_COL].astype(str)

y_train = le.fit_transform(y_train_raw)
y_test  = le.transform(y_test_raw)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Classes:", list(le.classes_))
print("Train label distribution:", pd.Series(y_train).value_counts().sort_index().to_dict())
print("Test  label distribution:", pd.Series(y_test).value_counts().sort_index().to_dict())

# Group key for CV tuning (speaker-independent): prefer subject if present, else fallback
groups_train = train_merged["subject"].astype(str) if "subject" in train_merged.columns else train_merged["path_key"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# -----------------------------
# 4) Define models (same set as before)
# -----------------------------
models = {}

# SVC baseline (RBF default)
models["SVC (Baseline)"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", random_state=42)),
])

# Logistic Regression (multiclass)
models["Logistic Regression"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=4000,
        solver="lbfgs",
        multi_class="auto",
        random_state=42
    )),
])

# Random Forest
models["Random Forest"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        random_state=42,
        n_jobs=-1
    )),
])

# Tuned SVM (RBF)
svm_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", random_state=42)),
])
svm_grid = {
    "clf__C": [0.1, 1, 10],
    "clf__gamma": ["scale", 0.01, 0.1],
}
models["SVM (RBF, Tuned)"] = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)

# Tuned Gradient Boosting (multiclass supported)
gb_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", GradientBoostingClassifier(random_state=42)),
])
gb_grid = {
    "clf__n_estimators": [100, 200],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3],
    "clf__subsample": [1.0, 0.8],
}
models["Gradient Boosting (Tuned)"] = GridSearchCV(
    estimator=gb_pipe,
    param_grid=gb_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)

# -----------------------------
# 5) Train + evaluate (multiclass metrics)
# -----------------------------
def try_multiclass_auc(fitted, X, y_true):
    # Returns macro AUC (OVR) if possible, else NaN
    try:
        if hasattr(fitted, "predict_proba"):
            scores = fitted.predict_proba(X)  # (n, K)
            return roc_auc_score(y_true, scores, multi_class="ovr", average="macro")
        if hasattr(fitted, "decision_function"):
            scores = fitted.decision_function(X)  # (n, K) for multiclass SVC
            return roc_auc_score(y_true, scores, multi_class="ovr", average="macro")
    except Exception:
        return np.nan
    return np.nan

rows = []
for name, model in models.items():
    if isinstance(model, GridSearchCV):
        model.fit(X_train, y_train, groups=groups_train)
        best_params = model.best_params_
        fitted = model.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_params = {}
        fitted = model

    y_pred = fitted.predict(X_test)

    rows.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced Acc": balanced_accuracy_score(y_test, y_pred),
        "Macro Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Macro Recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
        "Macro F1": f1_score(y_test, y_pred, average="macro", zero_division=0),
        "Weighted F1": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        "Macro ROC_AUC (OVR)": try_multiclass_auc(fitted, X_test, y_test),
        "Best Params (if tuned)": best_params,
    })

results = pd.DataFrame(rows).sort_values("Accuracy", ascending=False)

display(results.style.format({
    "Accuracy": "{:.4f}",
    "Balanced Acc": "{:.4f}",
    "Macro Precision": "{:.4f}",
    "Macro Recall": "{:.4f}",
    "Macro F1": "{:.4f}",
    "Weighted F1": "{:.4f}",
    "Macro ROC_AUC (OVR)": "{:.4f}",
}))


Using:
 openSMILE: /content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv
 train: /content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv
 test : /content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv
Rows with any missing values after merge -> train: 0, test: 0
Train shape: (440, 37)  Test shape: (70, 37)
Classes: ['0', '1', '2']
Train label distribution: {0: 149, 1: 133, 2: 158}
Test  label distribution: {0: 11, 1: 32, 2: 27}




Unnamed: 0,Model,Accuracy,Balanced Acc,Macro Precision,Macro Recall,Macro F1,Weighted F1,Macro ROC_AUC (OVR),Best Params (if tuned)
2,Random Forest,0.5286,0.5472,0.6304,0.5472,0.5368,0.5054,0.5823,{}
4,Gradient Boosting (Tuned),0.4857,0.496,0.6132,0.496,0.4878,0.4527,0.6121,"{'clf__learning_rate': 0.1, 'clf__max_depth': 2, 'clf__n_estimators': 200, 'clf__subsample': 0.8}"
1,Logistic Regression,0.4714,0.4939,0.4571,0.4939,0.4588,0.4715,0.626,{}
3,"SVM (RBF, Tuned)",0.4143,0.4362,0.4118,0.4362,0.4015,0.3834,,"{'clf__C': 10, 'clf__gamma': 0.01}"
0,SVC (Baseline),0.3857,0.4058,0.3752,0.4058,0.3776,0.3823,,{}


### Regression

In [3]:
# === StressID Regression (continuous label 0–10) using pre-extracted openSMILE features ===
# Same "family" of models as before, but regression versions:
# - SVM (RBF, tuned) -> SVR (RBF, tuned)
# - GB (Tuned)       -> GradientBoostingRegressor (tuned)
# - SVC (Baseline)   -> SVR (Baseline)
# - Logistic Reg.    -> Ridge Regression (linear reg counterpart; logistic doesn't apply to regression)
# - Random Forest    -> RandomForestRegressor
#
# Metrics: MAE, RMSE, % within ±1 (|pred-true|<=1)
# Assumes Drive is mounted.

import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error


# -----------------------------
# 1) Paths (Drive + fallback to uploaded runtime files)
# -----------------------------
def pick_path(*candidates):
    for p in candidates:
        if p and os.path.exists(p):
            return p
    raise FileNotFoundError("None of the provided paths exist:\n" + "\n".join(candidates))

OPENSMILE_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv",
    "/mnt/data/extracted_opensmile_features.csv",
)
TRAIN_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv",
    "/mnt/data/stressidtrainbalanced2.csv",
)
TEST_SPLIT_PATH = pick_path(
    "/content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv",
    "/mnt/data/stressidtest2.csv",
)

print("Using:")
print(" openSMILE:", OPENSMILE_PATH)
print(" train:", TRAIN_SPLIT_PATH)
print(" test :", TEST_SPLIT_PATH)


# -----------------------------
# 2) Load + merge on path
# -----------------------------
opensmile = pd.read_csv(OPENSMILE_PATH)
train_df  = pd.read_csv(TRAIN_SPLIT_PATH)
test_df   = pd.read_csv(TEST_SPLIT_PATH)

def norm_path(series: pd.Series) -> pd.Series:
    return (series.astype(str)
            .str.replace("\\", "/", regex=False)
            .str.strip())

opensmile["path_key"] = norm_path(opensmile["file"])
train_df["path_key"]  = norm_path(train_df["path"])
test_df["path_key"]   = norm_path(test_df["path"])

opensmile_feats = opensmile.drop(columns=["file"])
train_merged = train_df.merge(opensmile_feats, on="path_key", how="left")
test_merged  = test_df.merge(opensmile_feats, on="path_key", how="left")

print(f"Rows with any missing values after merge -> train: {train_merged.isna().any(axis=1).sum()}, "
      f"test: {test_merged.isna().any(axis=1).sum()}")


# -----------------------------
# 3) Select continuous label (column G in split CSVs) robustly
# -----------------------------
def get_continuous_label(series_df: pd.DataFrame) -> pd.Series:
    # Prefer a named column if present
    for col in ["continuouslabel", "continuous_label", "continuous label", "continuousLabel", "ContinuousLabel"]:
        if col in series_df.columns:
            return pd.to_numeric(series_df[col], errors="coerce")
    # Fallback: "column G" = index 6 (0-based)
    return pd.to_numeric(series_df.iloc[:, 6], errors="coerce")

y_train = get_continuous_label(train_merged).astype(float)
y_test  = get_continuous_label(test_merged).astype(float)

# Drop rows with missing targets just in case
train_mask = y_train.notna()
test_mask  = y_test.notna()

train_merged = train_merged.loc[train_mask].reset_index(drop=True)
test_merged  = test_merged.loc[test_mask].reset_index(drop=True)

y_train = y_train.loc[train_mask].to_numpy()
y_test  = y_test.loc[test_mask].to_numpy()

# Features (same approach as before)
exclude_cols = {
    "subject/task", "binary-stress", "affect3-class", "subject", "isuseful", "path",
    "continuouslabel", "continuous_label", "continuous label", "continuousLabel", "ContinuousLabel",
    "basename", "path_key"
}
feature_cols = [c for c in train_merged.columns if c not in exclude_cols]
feature_cols = train_merged[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

X_train = train_merged[feature_cols]
X_test  = test_merged[feature_cols]

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("y_train range:", (np.nanmin(y_train), np.nanmax(y_train)), " y_test range:", (np.nanmin(y_test), np.nanmax(y_test)))

# Groups for speaker-independent CV tuning (prefer subject if available)
groups_train = train_merged["subject"].astype(str) if "subject" in train_merged.columns else train_merged["path_key"]
cv = GroupKFold(n_splits=5)


# -----------------------------
# 4) Models (regression versions)
# -----------------------------
models = {}

# SVR baseline
models["SVR (Baseline)"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("reg", SVR(kernel="rbf")),
])

# "Logistic Reg." counterpart for regression (linear baseline)
models["Ridge Regression"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("reg", Ridge(random_state=42)),
])

# Random Forest Regressor
models["Random Forest"] = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("reg", RandomForestRegressor(
        n_estimators=400,
        random_state=42,
        n_jobs=-1
    )),
])

# Tuned SVR (RBF)
svr_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("reg", SVR(kernel="rbf")),
])
svr_grid = {
    "reg__C": [0.1, 1, 10],
    "reg__gamma": ["scale", 0.01, 0.1],
    "reg__epsilon": [0.1, 0.5, 1.0],
}
models["SVM (RBF, Tuned)"] = GridSearchCV(
    estimator=svr_pipe,
    param_grid=svr_grid,
    scoring="neg_mean_absolute_error",  # optimize MAE
    cv=cv,
    n_jobs=-1,
    verbose=0
)

# Tuned Gradient Boosting Regressor
gbr_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("reg", GradientBoostingRegressor(random_state=42)),
])
gbr_grid = {
    "reg__n_estimators": [100, 200],
    "reg__learning_rate": [0.03, 0.1],
    "reg__max_depth": [2, 3],
    "reg__subsample": [1.0, 0.8],
}
models["GB (Tuned)"] = GridSearchCV(
    estimator=gbr_pipe,
    param_grid=gbr_grid,
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    verbose=0
)


# -----------------------------
# 5) Evaluation helpers
# -----------------------------
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def pct_within_1(y_true, y_pred):
    return (np.abs(y_pred - y_true) <= 1.0).mean() * 100.0


# -----------------------------
# 6) Baseline (always predict train mean)
# -----------------------------
train_mean = float(np.mean(y_train))
y_pred_base = np.full_like(y_test, train_mean, dtype=float)

rows = [{
    "Pipeline": "Baseline (Always predicts train mean)",
    "Representation": "-",
    "Model": "-",
    "MAE": mean_absolute_error(y_test, y_pred_base),
    "RMSE": rmse(y_test, y_pred_base),
    "% within ±1": pct_within_1(y_test, y_pred_base),
    "Best Params (if tuned)": {}
}]

# -----------------------------
# 7) Train + evaluate models
# -----------------------------
PIPELINE_NAME = "openSMILE (eGeMAPS) + best classical ML"
REPR_NAME     = "eGeMAPSv02 (88d)"

for name, model in models.items():
    if isinstance(model, GridSearchCV):
        model.fit(X_train, y_train, groups=groups_train)
        best_params = model.best_params_
        fitted = model.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_params = {}
        fitted = model

    y_pred = fitted.predict(X_test).astype(float)

    rows.append({
        "Pipeline": PIPELINE_NAME,
        "Representation": REPR_NAME,
        "Model": name,
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": rmse(y_test, y_pred),
        "% within ±1": pct_within_1(y_test, y_pred),
        "Best Params (if tuned)": best_params
    })

results_reg = pd.DataFrame(rows).sort_values("MAE", ascending=True)

display(results_reg.style.format({
    "MAE": "{:.4f}",
    "RMSE": "{:.4f}",
    "% within ±1": "{:.2f}",
}))


Using:
 openSMILE: /content/drive/MyDrive/WuHaoAllenCentad/extracted_opensmile_features.csv
 train: /content/drive/MyDrive/WuHaoAllenCentad/stressidtrainbalanced2.csv
 test : /content/drive/MyDrive/WuHaoAllenCentad/stressidtest2.csv
Rows with any missing values after merge -> train: 0, test: 0
Train shape: (440, 37)  Test shape: (70, 37)
y_train range: (np.float64(0.0), np.float64(10.0))  y_test range: (np.float64(1.0), np.float64(9.0))


Unnamed: 0,Pipeline,Representation,Model,MAE,RMSE,% within ±1,Best Params (if tuned)
3,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),Random Forest,1.3586,1.7339,44.29,{}
5,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),GB (Tuned),1.3877,1.7038,42.86,"{'reg__learning_rate': 0.03, 'reg__max_depth': 2, 'reg__n_estimators': 100, 'reg__subsample': 0.8}"
1,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),SVR (Baseline),1.5339,1.8728,38.57,{}
4,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),"SVM (RBF, Tuned)",1.6706,1.9216,28.57,"{'reg__C': 0.1, 'reg__epsilon': 0.5, 'reg__gamma': 0.1}"
2,openSMILE (eGeMAPS) + best classical ML,eGeMAPSv02 (88d),Ridge Regression,1.6804,2.1008,32.86,{}
0,Baseline (Always predicts train mean),-,-,1.7188,1.9843,28.57,{}
