In [5]:
import pandas as pd, numpy as np, re, pathlib, warnings, xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.metrics import f1_score
warnings.filterwarnings("ignore")

CSV_IN  = "data/processed/var_filtered_features.csv"
CSV_OUT = "data/processed/globem_causal_input.csv"
OUTCOME = "dep_bin"
TEST_SZ = 0.2
MAX_EVALS = 100
QUOTAS = dict(call=2, screen=2, bluetooth=2, wifi=2,
              location=2, sleep=5, steps=5)
DECORR_TH = 0.5
VALID_PARTICIPANTS_FILE = "accepted_pids_filtered.csv"

# ---------- helpers ---------------------------------------------------
def fam(col):
    if not col.startswith("f_"): return "other"
    p = col.split(":")[0][2:].lower()
    if p.startswith("blue"): return "bluetooth"
    if p.startswith("wifi"): return "wifi"
    if p.startswith(("slp","sleep")): return "sleep"
    if p.startswith("loc"): return "location"
    if p.startswith(("steps","step","stp")): return "steps"
    if p.startswith("call"): return "call"
    if p.startswith("screen"): return "screen"
    return p
base_key = lambda c: re.sub(r"__(mean|std)$", "", c)

def f1_eval(preds, dmat):
    y = dmat.get_label()
    yhat = (preds >= 0.5)
    tp = ((yhat == 1) & (y == 1)).sum()
    fp = ((yhat == 1) & (y == 0)).sum()
    fn = ((yhat == 0) & (y == 1)).sum()
    f1 = 0 if tp == 0 else 2 * tp / (2 * tp + fp + fn)
    return 'f1', f1

def metrics(bst, X, y):
    prob = bst.predict(xgb.DMatrix(X))
    pred = (prob >= 0.5)
    return roc_auc_score(y, prob), accuracy_score(y, pred), f1_score(y, pred)

# ---------- load & split ---------------------------------------------
df = pd.read_csv(CSV_IN, index_col="pid")
# Load the valid participants
valid_pids = pd.read_csv(VALID_PARTICIPANTS_FILE, header=None).iloc[:, 0].values

# Filter the dataframe to include only valid participants
df = df[df.index.isin(valid_pids)]

y  = df[OUTCOME].values
X  = df.drop(columns=[c for c in ["dep_bin","dep","BDI2"] if c in df], errors="ignore")

X_tr, X_tmp, y_tr, y_tmp = train_test_split(X, y, test_size=2 * TEST_SZ, stratify=y, random_state=42)
X_val, X_te, y_val, y_te = train_test_split(X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42)

dtr  = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)

# ---------- Hyperopt space & objective (F1 maximization, no feval) ----

space = {
    "max_depth":        hp.quniform("max_depth", 2, 7, 1),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "subsample":        hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "gamma":            hp.uniform("gamma", 5, 20),
    "lambda":           hp.uniform("lambda", 5, 20),
    "eta":              hp.loguniform("eta", np.log(0.01), np.log(0.2)),
    "n_estimators":     hp.quniform("n_estimators", 200, 1200, 200),
}

def objective(p):
    # cast integer parameters
    p = {k: int(v) if k in ["max_depth", "min_child_weight", "n_estimators"]
         else float(v) for k, v in p.items()}

    params = dict(objective="binary:logistic",
                  eval_metric="logloss",  # any builtin metric is fine
                  seed=0, **p)

    bst = xgb.train(
        params,
        dtr,
        num_boost_round=p["n_estimators"],
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    val_prob = bst.predict(dval)
    val_f1   = f1_score(y_val, (val_prob >= 0.5))
    return {
        "loss": -val_f1,          # Hyperopt maximizes F1
        "status": STATUS_OK,
        "bst": bst,
        "params": params,
        "best_iter": bst.best_iteration
    }

trials = Trials()
fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.default_rng(0)
)

best_trial = max(trials, key=lambda t: -t["result"]["loss"])
best_bst   = best_trial["result"]["bst"]
best_params= best_trial["result"]["params"]
best_iter  = best_trial["result"]["best_iter"]
print(f"\nBest val‑F1={-best_trial['result']['loss']:.4f}")
print("Best params:", best_params)

# ---------- evaluate full‑feature model ------------------------------
rows = [
    ["train", *metrics(best_bst, X_tr, y_tr)],
    ["val  ", *metrics(best_bst, X_val, y_val)],
    ["test ", *metrics(best_bst, X_te, y_te)]
]
perf_full = pd.DataFrame(rows, columns=["split", "AUROC", "Acc", "F1"])
print("\nPerformance on FULL feature set:")
print(perf_full.to_string(index=False))

# ---------- (optional) 25‑feature quota subset -----------------------
corr_tr = X_tr.corr().abs()
imp_rank = pd.Series(best_bst.get_score(importance_type="gain"))\
             .reindex(X.columns, fill_value=0).sort_values(ascending=False)

sel, bases, counts = [], set(), {k: 0 for k in QUOTAS}
for f in imp_rank.index:
    fm, bk = fam(f), base_key(f)
    if fm not in QUOTAS or counts[fm] >= QUOTAS[fm] or bk in bases: continue
    if sel and corr_tr.loc[f, sel].max() > DECORR_TH: continue
    sel.append(f); bases.add(bk); counts[fm] += 1
    if sum(counts.values()) == sum(QUOTAS.values()): break
for fm, need in QUOTAS.items():
    while counts[fm] < need:
        cands = [f for f in imp_rank.index if fam(f) == fm and base_key(f) not in bases]
        if not cands: break
        f = cands[0]; sel.append(f); bases.add(base_key(f)); counts[fm] += 1
print("Sensor quota counts (50 features):", counts)

survey_cols = [c for c in X.columns if fam(c) == "other"]
final_feats = survey_cols + sel
df[final_feats + [OUTCOME]].to_csv(CSV_OUT, index_label="pid")
print("\n✅ Saved design matrix (50 sensor feats + surveys) →", CSV_OUT)


100%|██████████| 100/100 [01:55<00:00,  1.16s/trial, best loss: -0.6835443037974683]

Best val‑F1=0.6835
Best params: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 0, 'colsample_bytree': 0.9969724256476371, 'eta': 0.10823531835351333, 'gamma': 12.111115558719876, 'lambda': 5.004066557986126, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 600, 'subsample': 0.7553302694589472}

Performance on FULL feature set:
split    AUROC      Acc       F1
train 0.847048 0.759420 0.640693
val   0.777778 0.782609 0.683544
test  0.833176 0.741379 0.615385
Sensor quota counts (50 features): {'call': 2, 'screen': 2, 'bluetooth': 2, 'wifi': 2, 'location': 2, 'sleep': 5, 'steps': 5}

✅ Saved design matrix (50 sensor feats + surveys) → data/processed/globem_causal_input.csv
