In [1]:
import os
import sys
import json
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple
import traceback

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import (
    fbeta_score, roc_auc_score, average_precision_score,
    precision_score, recall_score
)

import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

warnings.filterwarnings("ignore")

In [2]:

# ---------------------------------------------------------------------------
# Global configuration: seed, paths, modelling horizon and variable catalogues
# ---------------------------------------------------------------------------
SEED = 42
np.random.seed(SEED)

# Paths are resolved against the notebook's working directory.
DATA_PATH  = Path.cwd() / "data" / "extracted" / "randhrs1992_2022v1.parquet"
OUTPUT_DIR = Path.cwd() / "output_hybrid"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Screenings where we measure features → outcome is screening + HORIZON
FEATURE_SCREENINGS = list(range(5, 13))   # HRS screenings 5..12
PREDICTION_HORIZON = 2                    # 2 screenings ahead (~4 years)
LAGS = 2                                  # use current, prev, prev-prev screening

# 9 disease experts — screening-specific self-report codes (NOT cumulative "E" vars).
# Each value is the suffix of the per-screening column, i.e. column "R{screening}{code}".
DISEASE_MAP = {
    "diabetes":    "DIAB",
    "cvd":         "HEART",
    "stroke":      "STROK",
    "lung":        "LUNG",
    "cancer":      "CANCR",
    "hibp":        "HIBP",
    "arthritis":   "ARTHR",
    "psychiatric": "PSYCH",
    "memory":      "MEMRY",
}

# Per-screening predictors: feature name → column suffix ("R{screening}{code}").
# Entries tagged "author note: missing" were flagged as missing in the original
# notebook; a column that is absent from the extract becomes an all-NaN feature.
SCREENING_VARS = {
    "self_rated_health": "SHLT",
    "bmi":               "BMI",
    "weight":            "WEIGHT",
    "height":            "HEIGHT",
    "mobility":          "MOBILA",
    "gross_motor":       "GROSSA",
    "large_muscle":      "LGMUSA",
    "fine_motor":        "FINEA",
    "adl":               "ADL5A",
    "iadl":              "IADL5A",
    "cognition":         "COG27",
    "memory_recall":     "TR20",
    "immediate_recall":  "IMRC",   # imputed cognition: immediate word recall (author note: missing)
    "delayed_recall":    "DLRC",   # delayed word recall (author note: missing)
    "serial7":           "SER7",
    "cesd":              "CESD",
    "depressed":         "DEPRES",
    "effort":            "EFFORT",
    "restless_sleep":    "SLEEPR",
    "lonely":            "FLONE",
    "ever_smoked":       "SMOKEV",
    "current_smoker":    "SMOKEN",
    "drinks_per_day":    "DRINKD",
    "drink_days_week":   "DRINKN",
    "vigorous_activity": "VGACTX",
    "marital_status":    "MSTAT",  # author note: missing
    "condition_count":   "CONDE",  # sum of "doctor ever told respondent" disease indicators (author note: missing)
    "self_health_comp":  "SHLTC",
    "out_of_pocket":     "OOPMD",  # out-of-pocket spending
    "working":           "WORK",
}

# HRS screening → approximate interview year (fallback when interview date missing).
# The RAND HRS 1992–2022 file holds 16 biennial waves; the early AHEAD-cohort
# interviews (1993, 1995) are part of waves 2 and 3 rather than separate waves,
# so wave w maps to 1992 + 2 * (w - 1).
SCREENING_YEARS = {
    1: 1992,  2: 1994,  3: 1996,  4: 1998,
    5: 2000,  6: 2002,  7: 2004,  8: 2006,
    9: 2008, 10: 2010, 11: 2012, 12: 2014,
    13: 2016, 14: 2018, 15: 2020, 16: 2022,
}

def banner(msg):
    """Print ``msg`` (indented two spaces) between two 65-character ``=`` rules,
    preceded by a blank line."""
    rule = "=" * 65
    print("\n".join(["", rule, f"  {msg}", rule]))

In [3]:

def extract_screening_features(df_raw: pd.DataFrame, scr: int) -> pd.DataFrame:
    """
    Build the wide, one-row-per-person feature table for screening ``scr``.

    Extracted, in column order:
      - ``person_id`` (from ``HHIDPN``)
      - screening date parts and the gap to the previous screening
      - static demographics (birth year, age, sex flag, ethnicity, education)
      - every ``SCREENING_VARS`` entry for the current screening and for each
        of the ``LAGS`` previous screenings (suffix ``_lag{k}``)

    Parameters
    ----------
    df_raw : wide frame; must contain ``HHIDPN``, ``RABYEAR``, ``RAGENDER``,
        ``RARACEM`` and ``RAHISPAN`` (a missing one raises ``KeyError``).
        Education columns and all ``R{w}{code}`` columns are optional.
    scr : screening number whose values become the "current" features.

    Returns
    -------
    DataFrame with one row per row of ``df_raw``. Optional columns that are
    absent, and lags that would reach before screening 1, are emitted as
    all-NaN columns so the schema is identical for every screening.
    """
    n_rows = len(df_raw)

    def optional_numeric(col: str) -> np.ndarray:
        """Numeric values of ``col`` (unparseable → NaN); all-NaN if the column is absent."""
        if col in df_raw.columns:
            return pd.to_numeric(df_raw[col], errors="coerce").values
        return np.full(n_rows, np.nan)

    data = {"person_id": df_raw["HHIDPN"].values}

    # Screening date (interview midpoint: SAS date = days since 1960-01-01)
    sas_epoch = pd.Timestamp("1960-01-01")
    iw_col = f"R{scr}IWMID"
    has_iw = iw_col in df_raw.columns
    if has_iw:
        iw_cur = pd.to_numeric(df_raw[iw_col], errors="coerce")
        screening_date = sas_epoch + pd.to_timedelta(iw_cur, unit="D")
        data["screening_year"]  = screening_date.dt.year.values.astype(float)
        data["screening_month"] = screening_date.dt.month.values.astype(float)
    else:
        # No interview-date column at all: fall back to the nominal wave year.
        data["screening_year"]  = np.full(n_rows, float(SCREENING_YEARS[scr]))
        data["screening_month"] = np.full(n_rows, np.nan)

    # Time gap to previous screening (in years)
    iw_prev_col = f"R{scr - 1}IWMID" if scr > 1 else None
    if has_iw and iw_prev_col is not None and iw_prev_col in df_raw.columns:
        iw_prev = pd.to_numeric(df_raw[iw_prev_col], errors="coerce")
        data["years_since_last_screening"] = ((iw_cur - iw_prev) / 365.25).values
    else:
        data["years_since_last_screening"] = np.full(n_rows, np.nan)

    # Static demographics — age is (screening year - birth year)
    birth_year = pd.to_numeric(df_raw["RABYEAR"], errors="coerce").values
    age = data["screening_year"] - birth_year
    data["birth_year"] = birth_year
    data["age"] = age
    data["age_squared"] = age ** 2

    # 1 when RAGENDER == 2; anything else (including missing) maps to 0.
    data["female"] = (pd.to_numeric(df_raw["RAGENDER"], errors="coerce") == 2).astype(int).values

    # Ethnicity: Hispanic takes precedence over race; unknown/other race → "Other".
    race = pd.to_numeric(df_raw["RARACEM"], errors="coerce").values
    hisp = pd.to_numeric(df_raw["RAHISPAN"], errors="coerce").values
    labels = np.select(
        [hisp == 1, race == 1, race == 2],
        ["Hispanic", "White", "Black"],
        default="Other",
    )
    data["ethnicity"] = pd.Categorical(
        labels, categories=["White", "Black", "Hispanic", "Other"]
    )

    # Education columns are optional (the original used df.get, which crashed
    # with AttributeError when a column was actually absent).
    data["education"] = optional_numeric("RAEDYRS")
    data["edu_cat"]   = optional_numeric("RAEDUC")
    data["degree"]    = optional_numeric("RAEDEGRM")

    # Screening / lag extraction
    for lag in range(LAGS + 1):
        wave = scr - lag
        suffix = f"_lag{lag}" if lag > 0 else ""
        for name, code in SCREENING_VARS.items():
            # wave < 1 has no data; keep the column (all NaN) for a stable schema.
            if wave >= 1:
                data[f"{name}{suffix}"] = optional_numeric(f"R{wave}{code}")
            else:
                data[f"{name}{suffix}"] = np.full(n_rows, np.nan)

    return pd.DataFrame(data)

In [4]:

def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append derived change / threshold / interaction features to ``df``.

    Families of features added (all from the current and ``_lag1`` / ``_lag2``
    columns already present in ``df``):
      - BMI and weight deltas, acceleration and obesity flags
      - self-rated-health, mobility / ADL / IADL worsening flags
      - cognition, memory and CES-D change and threshold flags
      - smoking / drinking summaries, pairwise interactions, composite scores

    Every 0/1 flag is built from a comparison, so a missing (NaN) input yields
    0 rather than NaN. Input columns that are absent from ``df`` are treated as
    all-NaN (the original ``df.get(name, scalar)`` form raised AttributeError
    in that case because a scalar has no ``.astype``).

    Returns a new DataFrame: ``df`` with the derived columns concatenated on
    the right; ``df`` itself is not modified.
    """
    def col(name: str) -> pd.Series:
        """Column ``name`` of ``df``, or an all-NaN Series on df's index if absent."""
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index, dtype=float)

    age = col("age")
    bmi, bmi_l1, bmi_l2 = col("bmi"), col("bmi_lag1"), col("bmi_lag2")
    new = {}

    # --- Body mass and weight -------------------------------------------------
    new["bmi_delta_lag1"]    = bmi - bmi_l1
    new["bmi_delta_lag2"]    = bmi - bmi_l2
    # Second difference: latest change minus the change one step earlier.
    new["bmi_accel"]         = new["bmi_delta_lag1"] - (bmi_l1 - bmi_l2)
    weight_l1 = col("weight_lag1")
    new["weight_change_kg"]  = col("weight") - weight_l1
    # Percentage change is undefined (NaN) when the previous weight is exactly 0.
    new["weight_change_pct"] = np.where(
        weight_l1 != 0, new["weight_change_kg"] / weight_l1 * 100, np.nan
    )
    new["obese"]             = (bmi >= 30).astype(int)
    new["overweight"]        = ((bmi >= 25) & (bmi < 30)).astype(int)
    new["rapid_weight_gain"] = (new["bmi_delta_lag1"] > 1).astype(int)
    new["rapid_weight_loss"] = (new["bmi_delta_lag1"] < -1).astype(int)

    # --- Self-rated health (positive delta is flagged as worsening) ------------
    srh = col("self_rated_health")
    new["health_decline_lag1"] = srh - col("self_rated_health_lag1")
    new["health_decline_lag2"] = srh - col("self_rated_health_lag2")
    new["health_worsening"]    = (new["health_decline_lag1"] > 0).astype(int)
    new["health_crash"]        = (new["health_decline_lag1"] >= 2).astype(int)

    # --- Functional limitation counts ------------------------------------------
    for base in ["mobility", "adl", "iadl"]:
        cur = col(base)
        lag1 = col(f"{base}_lag1")
        new[f"{base}_decline_lag1"] = cur - lag1
        new[f"{base}_worsening"]    = (new[f"{base}_decline_lag1"] > 0).astype(int)
        new[f"new_{base}_problem"]  = ((cur > 0) & (lag1 == 0)).astype(int)
        new[f"any_{base}"]          = (cur > 0).astype(int)

    # --- Cognition and memory (decline = earlier score minus current score) ----
    cog = col("cognition")
    new["cog_decline_lag1"]    = col("cognition_lag1") - cog
    new["cog_decline_lag2"]    = col("cognition_lag2") - cog
    new["cog_worsening"]       = (new["cog_decline_lag1"] > 0).astype(int)
    new["sharp_cog_drop"]      = (new["cog_decline_lag1"] > 3).astype(int)
    new["low_cognition"]       = (cog < 12).astype(int)

    new["memory_decline_lag1"] = col("memory_recall_lag1") - col("memory_recall")
    new["memory_worsening"]    = (new["memory_decline_lag1"] > 0).astype(int)

    # --- Depressive symptoms -----------------------------------------------------
    cesd    = col("cesd")
    cesd_l1 = col("cesd_lag1")
    new["cesd_increase"]        = cesd - cesd_l1
    new["cesd_worsening"]       = (new["cesd_increase"] > 0).astype(int)
    new["elevated_depression"]  = (cesd >= 3).astype(int)
    new["high_depression"]      = (cesd >= 4).astype(int)
    new["chronic_depression"]   = ((cesd >= 3) & (cesd_l1 >= 3)).astype(int)

    # --- Smoking and alcohol ---------------------------------------------------------
    smoker = col("current_smoker")
    new["former_smoker"]  = ((col("ever_smoked") == 1) & (smoker == 0)).astype(int)
    new["quit_smoking"]   = ((smoker == 0) & (col("current_smoker_lag1") == 1)).astype(int)
    new["drinks_per_week"]  = col("drinks_per_day") * col("drink_days_week")
    new["heavy_drinking"]   = (new["drinks_per_week"] > 14).astype(int)

    # --- Interactions ----------------------------------------------------------------
    mobility = col("mobility")
    new["age_x_bmi"]             = age * bmi
    new["age_x_cesd"]            = age * cesd
    new["depression_x_mobility"] = cesd * mobility
    new["cog_decline_x_age"]     = new["cog_decline_lag1"] * age
    new["bmi_x_smoking"]         = bmi * smoker

    # --- Composite scores ------------------------------------------------------------
    new["metabolic_risk"] = new["obese"] * 2 + (age >= 65).astype(int)
    new["frailty_score"] = (
        (mobility > 0).astype(int)
        + (cesd >= 3).astype(int)
        + (new["weight_change_pct"] < -5).astype(int)
    )

    # Single concat to prevent fragmentation
    return pd.concat([df, pd.DataFrame(new, index=df.index)], axis=1)


In [5]:

def create_targets(df_raw: pd.DataFrame, feature_screening: int, outcome_screening: int) -> pd.DataFrame:
    """
    Derive per-disease onset labels between two screenings.

    For every entry of ``DISEASE_MAP`` whose baseline column
    ``R{feature_screening}{code}`` and outcome column
    ``R{outcome_screening}{code}`` both exist in ``df_raw``:
      - ``target_{disease}``   : 1.0 when baseline is 0 and outcome is 1,
                                 NaN when the outcome is not a clean 0/1,
                                 0.0 otherwise
      - ``eligible_{disease}`` : 1 when baseline is exactly 0, else 0

    Only the values 0 and 1 count as clean answers; any other code (the
    original notes list 3 = disputes, 4 = don't know) and NaN are treated
    as missing. Diseases lacking either column are skipped entirely.
    The returned frame always carries ``person_id`` (from ``HHIDPN``).
    """
    def clean_binary(column: str) -> pd.Series:
        # Coerce to numbers, then blank out everything that is not 0 or 1.
        numeric = pd.to_numeric(df_raw[column], errors="coerce")
        return numeric.where(numeric.isin([0, 1]))

    columns = {"person_id": df_raw["HHIDPN"].values}

    for disease_name, code in DISEASE_MAP.items():
        baseline_col = f"R{feature_screening}{code}"
        outcome_col  = f"R{outcome_screening}{code}"
        if baseline_col not in df_raw.columns or outcome_col not in df_raw.columns:
            continue

        at_baseline = clean_binary(baseline_col)
        at_outcome  = clean_binary(outcome_col)

        disease_free = at_baseline.eq(0)
        onset = disease_free & at_outcome.eq(1)

        # Unknown outcome → label is unknown, regardless of baseline status.
        label = onset.astype(float).mask(at_outcome.isna())

        columns[f"target_{disease_name}"]   = label.values
        columns[f"eligible_{disease_name}"] = disease_free.astype(int).values

    return pd.DataFrame(columns)


In [6]:

def build_master_features(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Stack one feature+target frame per feature screening into a long table.

    For each screening in ``FEATURE_SCREENINGS`` whose outcome screening
    (``screening + PREDICTION_HORIZON``) does not exceed 18, features are
    extracted and enriched, targets are created, both are inner-joined on
    ``person_id`` and tagged with ``_screening_id``. A progress line is
    printed per screening and the total row count at the end.
    """
    banner("PHASE 1-3: BUILDING FEATURES + TARGETS")

    per_screening = []
    for scr in FEATURE_SCREENINGS:
        outcome_scr = scr + PREDICTION_HORIZON
        if outcome_scr <= 18:
            # Current + lagged features, then the derived temporal features
            feats = add_temporal_features(extract_screening_features(df_raw, scr))
            # Onset labels between this screening and the outcome screening
            targs = create_targets(df_raw, scr, outcome_scr)

            # _screening_id is an internal grouping key, not a model feature
            combined = feats.merge(targs, on="person_id", how="inner").assign(
                _screening_id=scr
            )
            per_screening.append(combined)

            print(f"  Screening {scr} → {outcome_scr}: {len(combined):,} rows, "
                  f"{feats.shape[1]} features")

    master = pd.concat(per_screening, ignore_index=True)
    print(f"\n  TOTAL: {len(master):,} rows")
    return master

def get_disease_dataset(master: pd.DataFrame, disease: str):
    """
    Assemble the train/test data for one disease expert.

    Rows are kept when the person is eligible (``eligible_{disease} == 1``),
    the target is known, and neither ``age`` nor ``bmi`` is missing. The split
    is 80/20 by ``person_id`` group (seeded with ``SEED``), so a person's rows
    never straddle train and test.

    Returns ``(feature_cols, X_train, y_train, X_test, y_test, demo_test)``
    where ``demo_test`` holds ``person_id``, ``female``, ``ethnicity`` and
    ``age`` for the test rows.
    """
    eligible_col = f"eligible_{disease}"
    target_col   = f"target_{disease}"

    # Eligible at baseline AND outcome observed, then require the key features
    usable = (master[eligible_col] == 1) & master[target_col].notna()
    sub = master[usable].dropna(subset=["age", "bmi"])

    y = sub[target_col].astype(int)

    # Everything is a feature except targets, eligibility flags and metadata.
    # Demographics (female, ethnicity) stay in as features; they are also
    # returned separately below for the fairness audit.
    non_feature_prefixes = ("target_", "eligible_")
    non_feature_names = {"person_id", "_screening_id"}
    feature_cols = sorted(
        name for name in sub.columns
        if name not in non_feature_names and not name.startswith(non_feature_prefixes)
    )

    X = sub[feature_cols]

    # Group-aware split: same person never in both train and test
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    train_idx, test_idx = next(gss.split(X, y, groups=sub["person_id"]))

    X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
    X_test,  y_test  = X.iloc[test_idx].copy(),  y.iloc[test_idx].copy()

    # Fairness demographics for test set
    demo_test = sub.iloc[test_idx][["person_id", "female", "ethnicity", "age"]].copy()

    return feature_cols, X_train, y_train, X_test, y_test, demo_test


In [7]:
import os
import joblib
import optuna
from optuna.samplers import TPESampler

# Trained ensembles are written under OUTPUT_DIR/saved_models; create it up front
MODEL_DIR = OUTPUT_DIR.joinpath("saved_models")
os.makedirs(MODEL_DIR, exist_ok=True)

def train_disease_expert(
    disease: str,
    feature_cols: List[str],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> Tuple[Dict, Dict, np.ndarray]:
    """
    Train, persist and evaluate the three-model ensemble for one disease.

    Steps:
      1. Derive ``scale_pos_weight`` (negatives / positives) from ``y_train``.
      2. Tune CatBoost (learning rate, depth, L2) with Optuna — at most
         20 trials or 180 seconds — maximising ROC-AUC.
      3. Refit CatBoost with the best parameters, then fit LightGBM
         (GPU first, CPU on any failure) and XGBoost (``device="cuda"``).
      4. Average the three positive-class probabilities.
      5. Dump the three models to ``MODEL_DIR / ensemble_{disease}.pkl``
         and score every prediction vector with ``evaluate_expert``.

    Returns
    -------
    (models, results, avg_pred) where ``models`` maps ``"cat"``/``"lgb"``/
    ``"xgb"`` to the fitted estimators, ``results`` is the dict returned by
    ``evaluate_expert``, and ``avg_pred`` is the averaged probability vector
    for ``X_test``.

    NOTE(review): ``(X_test, y_test)`` is used as the evaluation set for the
    Optuna objective and for early stopping of all three models, and the same
    data is then used to report metrics. The reported scores are therefore
    optimistically biased; a separate validation split would be needed for
    unbiased numbers.
    NOTE(review): CatBoost and XGBoost are hard-wired to a GPU with no CPU
    fallback (only LightGBM has one).
    """
    
    banner(f"EXPERT TRAIN & SAVE: {disease.upper()}")
    
    # 1. Calculate Class Weight (ratio of negatives to positives; max() guards
    #    against division by zero when the training split has no positives)
    neg = (y_train == 0).sum()
    pos = (y_train == 1).sum()
    spw = neg / max(pos, 1)
    print(f"  Scale_pos_weight: {spw:.2f} | Train Size: {len(X_train):,}")

    # 2. Identify Categorical Columns (pandas "category" dtype) and their
    #    positions within feature_cols, which is what CatBoost is given below
    cat_cols = [c for c in feature_cols if X_train[c].dtype.name == "category"]
    cat_indices = [feature_cols.index(c) for c in cat_cols]
    if cat_cols:
        print(f"  Categorical features: {len(cat_cols)}")

    # =========================================================================
    # STAGE 1 — Optuna search over CatBoost hyper-parameters
    # =========================================================================
    print(f"  [Optuna] Tuning CatBoost for {disease} (Max 3 mins)...")
    
    def objective(trial):
        # One Optuna trial: fit a shorter CatBoost run and return its ROC-AUC.
        params = {
            "iterations": 1000, # Faster for tuning
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "depth": trial.suggest_int("depth", 4, 8),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "scale_pos_weight": spw,
            "random_seed": SEED,
            "verbose": 0,
            "early_stopping_rounds": 50,
            "task_type": "GPU", # Enable GPU
            "devices": "0"
        }
        
        # Fit on Train, Eval on Test
        # NOTE(review): the score returned here is measured on the test split.
        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=0,
                  cat_features=cat_indices if cat_indices else None)
        
        preds = model.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, preds)

    # Run Optimization
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=SEED))
    study.optimize(objective, n_trials=20, timeout=180) # Stop after 20 trials or 3 mins

    best_params = study.best_params
    print(f"  [Optuna] Best AUC: {study.best_value:.4f}")
    print(f"  [Optuna] Params: {best_params}")

    # =========================================================================
    # STAGE 2 — Fit the three final models
    # =========================================================================
    
    # 1. CatBoost (With Best Params)
    # We increase iterations for the final robust model
    final_cat_params = best_params.copy()
    final_cat_params.update({
        "iterations": 3000,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "scale_pos_weight": spw,
        "random_seed": SEED,
        "verbose": 0,
        "early_stopping_rounds": 300,
        "task_type": "GPU",
        "devices": "0",
        "use_best_model": True
    })
    
    cat_model = CatBoostClassifier(**final_cat_params)
    cat_model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        cat_features=cat_indices if cat_indices else None
    )
    cat_pred = cat_model.predict_proba(X_test)[:, 1]
    print(f"  CatBoost (Tuned) AUC: {roc_auc_score(y_test, cat_pred):.4f}")

    # 2. LightGBM (Robust CPU Fallback)
    # The GPU attempt is wrapped in a broad except: ANY failure (not only a
    # missing GPU) triggers a silent retrain on CPU with the same settings.
    try:
        lgb_model = lgb.LGBMClassifier(
            n_estimators=2000, learning_rate=0.02, num_leaves=63, max_depth=7,
            scale_pos_weight=spw, random_state=SEED, n_jobs=-1, verbose=-1,
            device="gpu", gpu_platform_id=0, gpu_device_id=0
        )
        lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
                      callbacks=[lgb.early_stopping(300, verbose=False)],
                      categorical_feature=cat_cols if cat_cols else "auto")
    except Exception:
        print("  [INFO] LightGBM using CPU fallback.")
        lgb_model = lgb.LGBMClassifier(
            n_estimators=2000, learning_rate=0.02, num_leaves=63, max_depth=7,
            scale_pos_weight=spw, random_state=SEED, n_jobs=-1, verbose=-1, device="cpu"
        )
        lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
                      callbacks=[lgb.early_stopping(300, verbose=False)],
                      categorical_feature=cat_cols if cat_cols else "auto")
        
    lgb_pred = lgb_model.predict_proba(X_test)[:, 1]

    # 3. XGBoost (GPU Hist)
    xgb_model = xgb.XGBClassifier(
        n_estimators=2000, learning_rate=0.02, max_depth=6, scale_pos_weight=spw,
        eval_metric="auc", enable_categorical=True, random_state=SEED, n_jobs=-1,
        early_stopping_rounds=300, device="cuda", tree_method="hist", verbosity=0
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    xgb_pred = xgb_model.predict_proba(X_test)[:, 1]

    # 4. Average Ensemble (unweighted mean of the three probability vectors)
    avg_pred = (cat_pred + lgb_pred + xgb_pred) / 3
    avg_auc = roc_auc_score(y_test, avg_pred)
    print(f"  Ensemble Final AUC: {avg_auc:.4f}")

    # =========================================================================
    # STAGE 3 — Persist the fitted models
    # =========================================================================
    models = {
        "cat": cat_model,
        "lgb": lgb_model,
        "xgb": xgb_model
    }
    
    # Save the dictionary of models to a single pickle file for this disease
    filename = MODEL_DIR / f"ensemble_{disease}.pkl"
    joblib.dump(models, filename)
    print(f"  [SAVED] Ensemble saved to {filename}")

    # =========================================================================
    # STAGE 4 — Evaluate each model and the average on the test split
    # =========================================================================
    preds = {
        "cat": cat_pred,
        "lgb": lgb_pred,
        "xgb": xgb_pred,
        "avg": avg_pred
    }
    
    results = evaluate_expert(disease, y_test, preds)
    
    return models, results, avg_pred

In [8]:

def find_best_f2_threshold(y_true, y_proba):
    """
    Grid-search the probability cut-off that maximises the F2 score.

    200 evenly spaced thresholds in [0.02, 0.6] are tried; the lowest
    threshold achieving the highest F2 wins. When no threshold gives a
    positive F2 the defaults ``(0.5, 0)`` are returned.

    Returns ``(best_threshold, best_f2)``.
    """
    candidates = np.linspace(0.02, 0.6, 200)
    scores = [
        fbeta_score(y_true, (y_proba >= cut).astype(int), beta=2, zero_division=0)
        for cut in candidates
    ]
    # max() keeps the first of several equal maxima, i.e. the lowest threshold.
    winner = max(range(len(scores)), key=scores.__getitem__)
    if scores[winner] > 0:
        return candidates[winner], scores[winner]
    return 0.5, 0

def evaluate_expert(disease: str, y_test, preds: dict) -> dict:
    """
    Score every probability vector in ``preds`` against ``y_test``.

    For each ``name → probabilities`` entry this computes ROC-AUC, PR-AUC,
    the F2-optimal threshold (via ``find_best_f2_threshold``) and the
    precision / recall at that threshold, prints a one-line summary and
    stores the metrics rounded to 4 decimals.

    Returns ``{name: {"roc_auc", "pr_auc", "f2", "precision", "recall",
    "threshold"}}``.
    """
    summary = {}
    for model_name, scores in preds.items():
        roc_auc = roc_auc_score(y_test, scores)
        pr_auc = average_precision_score(y_test, scores)
        cutoff, best_f2 = find_best_f2_threshold(y_test, scores)

        # Hard labels at the F2-optimal cut-off
        hard_labels = (scores >= cutoff).astype(int)
        precision = precision_score(y_test, hard_labels, zero_division=0)
        recall = recall_score(y_test, hard_labels, zero_division=0)

        raw_metrics = {
            "roc_auc": roc_auc,
            "pr_auc": pr_auc,
            "f2": best_f2,
            "precision": precision,
            "recall": recall,
            "threshold": cutoff,
        }
        summary[model_name] = {key: round(value, 4) for key, value in raw_metrics.items()}

        print(f"  {disease}/{model_name}: ROC={roc_auc:.4f} PR={pr_auc:.4f} F2={best_f2:.4f} "
              f"P={precision:.4f} R={recall:.4f} t={cutoff:.3f}")
    return summary


In [9]:

def build_any_onset_from_experts(
    master: pd.DataFrame,
    all_models: Dict[str, Dict],
    all_feature_cols: Dict[str, List[str]],
):
    """
    Combine the per-disease experts into one "any disease onset" score.

    For every (person, screening) pair the probability of at least one onset
    is ``P(any) = 1 - ∏(1 - P_d)`` over the diseases the person is eligible
    for, where ``P_d`` is the mean of the LightGBM, CatBoost and XGBoost
    probabilities. The label is 1 when any of those diseases has target 1.

    Parameters
    ----------
    master : long feature+target table; needs ``person_id``, ``_screening_id``,
        ``age``, ``bmi`` and the ``eligible_*`` / ``target_*`` columns.
    all_models : ``{disease: {"lgb": ..., "cat": ..., "xgb": ...}}``.
    all_feature_cols : ``{disease: [feature names]}`` used to fit each expert.

    Returns
    -------
    dict with ``roc_auc``, ``pr_auc``, ``f2`` and ``threshold``.

    Raises
    ------
    ValueError : when no eligible row exists for any disease.

    NOTE(review): every eligible row of ``master`` is scored, i.e. rows the
    experts were trained on as well as held-out rows, so these metrics are not
    a clean test-set estimate. The per-disease train/test splits are not
    available here, so the function cannot restrict itself to held-out rows.
    """
    banner("META-ENSEMBLE: ANY-DISEASE ONSET")

    # (person_id, _screening_id) → [running ∏(1 - P_d), any-onset label].
    # Dict insertion order fixes the order of the evaluated samples.
    combined = {}

    for disease, models in all_models.items():
        eligible_col = f"eligible_{disease}"
        target_col   = f"target_{disease}"

        # Same row filter as the one used when the expert was trained
        sub = master[(master[eligible_col] == 1) & (master[target_col].notna())]
        sub = sub.dropna(subset=["age", "bmi"])
        if sub.empty:
            # Nothing to score; predict_proba on an empty frame would fail.
            continue

        X = sub[all_feature_cols[disease]]

        # Get predictions from ensemble mean
        lgb_pred = models["lgb"].predict_proba(X)[:, 1]
        cat_pred = models["cat"].predict_proba(X)[:, 1]
        xgb_pred = models["xgb"].predict_proba(X)[:, 1]
        avg_pred = (lgb_pred + cat_pred + xgb_pred) / 3

        # Pull plain arrays once instead of materialising a row per iteration.
        person_ids = sub["person_id"].values
        screenings = sub["_screening_id"].values
        developed  = sub[target_col].values == 1

        for pid, scr, prob, hit in zip(person_ids, screenings, avg_pred, developed):
            entry = combined.setdefault((pid, scr), [1.0, 0])
            entry[0] *= (1 - prob)
            if hit:
                entry[1] = 1

    if not combined:
        raise ValueError("No eligible person-screening rows to evaluate.")

    # P(any onset) = 1 - prod(1 - p_d)
    y_proba = np.array([1 - p_no_onset for p_no_onset, _ in combined.values()])
    y_true  = np.array([label for _, label in combined.values()])

    # Evaluate
    roc  = roc_auc_score(y_true, y_proba)
    pr   = average_precision_score(y_true, y_proba)
    bt, bf2 = find_best_f2_threshold(y_true, y_proba)

    print("\n  ANY-ONSET Combined:")
    print(f"    ROC-AUC:  {roc:.4f}")
    print(f"    PR-AUC:   {pr:.4f}")
    print(f"    Best F2:  {bf2:.4f} @ threshold {bt:.3f}")
    print(f"    Samples:  {len(y_true):,} | Events: {y_true.sum():,.0f} ({y_true.mean():.2%})")

    return {"roc_auc": roc, "pr_auc": pr, "f2": bf2, "threshold": bt}


In [10]:

def explain_expert(models, X_test, feature_cols, disease, top_n=20):
    """
    Save a SHAP summary plot for the LightGBM model of one disease expert.

    The plot (top ``top_n`` features) is written to
    ``OUTPUT_DIR / shap_{disease}.png``. Any failure — including a missing
    ``shap`` or ``matplotlib`` install — is reported with a printed message
    instead of being raised. ``feature_cols`` is accepted but not used.
    """
    try:
        import shap
        import matplotlib
        matplotlib.use("Agg")  # switch to the non-interactive Agg backend before pyplot is imported
        import matplotlib.pyplot as plt

        contributions = shap.TreeExplainer(models["lgb"]).shap_values(X_test)

        # When shap hands back a list, element 1 is used
        # (presumably the positive class — confirm for the installed shap version).
        if isinstance(contributions, list):
            contributions = contributions[1]

        # Summary plot
        plt.subplots(figsize=(10, 8))
        shap.summary_plot(contributions, X_test, max_display=top_n, show=False)
        plt.title(f"SHAP — {disease.upper()} Expert")
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / f"shap_{disease}.png", dpi=150)
        plt.close()
        print(f"  Saved SHAP plot: shap_{disease}.png")
    except Exception as err:
        print(f"  SHAP failed for {disease}: {err}")



In [11]:

def export_importance(all_models, all_feature_cols):
    """
    Export per-disease feature importances to CSV and print each top-10.

    Writes ``OUTPUT_DIR / feature_importance_experts.csv`` with one row per
    (disease, feature) and the columns:
      - ``lgb``, ``cat``, ``xgb`` : raw importances as reported by each model
      - ``avg`` : mean of the three importances AFTER each has been rescaled
        to a percentage share of its own total. The raw values are not on a
        common scale, so averaging them directly lets whichever model reports
        the largest numbers dominate the ranking.

    Parameters
    ----------
    all_models : ``{disease: {"lgb": ..., "cat": ..., "xgb": ...}}``.
    all_feature_cols : ``{disease: [feature names]}`` in model column order.

    Raises
    ------
    ValueError : when a model reports a different number of importances than
        there are feature names for that disease.
    """
    columns = ["disease", "feature", "lgb", "cat", "xgb", "avg"]

    def as_share(values: np.ndarray) -> np.ndarray:
        """Rescale importances to percentages of their sum (zeros if the sum is not positive)."""
        values = np.asarray(values, dtype=float)
        total = values.sum()
        if total > 0:
            return values / total * 100
        return np.zeros_like(values)

    frames = []
    for disease, models in all_models.items():
        feat_cols = list(all_feature_cols[disease])
        raw = {
            "lgb": np.asarray(models["lgb"].feature_importances_),
            "cat": np.asarray(models["cat"].get_feature_importance()),
            "xgb": np.asarray(models["xgb"].feature_importances_),
        }
        for model_name, values in raw.items():
            if len(values) != len(feat_cols):
                raise ValueError(
                    f"{disease}/{model_name}: {len(values)} importances "
                    f"for {len(feat_cols)} features"
                )

        shares = [as_share(values) for values in raw.values()]
        frames.append(pd.DataFrame({
            "disease": disease,
            "feature": feat_cols,
            "lgb": raw["lgb"],
            "cat": raw["cat"],
            "xgb": raw["xgb"],
            "avg": sum(shares) / len(shares),
        }, columns=columns))

    # An explicit schema keeps the CSV header (and the lookups below) valid
    # even when no expert was trained.
    df_imp = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)
    df_imp.to_csv(OUTPUT_DIR / "feature_importance_experts.csv", index=False)
    print(f"\n  Saved feature_importance_experts.csv ({len(df_imp)} rows)")

    # Print top-10 per disease (only for diseases that actually have a model)
    for disease in all_models:
        top = df_imp[df_imp["disease"] == disease].nlargest(10, "avg")
        print(f"\n  TOP-10 {disease.upper()}:")
        for _, row in top.iterrows():
            print(f"    {row['feature']:<30} avg={row['avg']:.1f}")


In [12]:

def fairness_audit(disease, y_test, y_proba, threshold, demo_test):
    """Report F2 / precision / recall of the thresholded predictions per demographic subgroup.

    Probabilities in ``y_proba`` are turned into 0/1 predictions with
    ``y_proba >= threshold``. Metrics are then printed for each gender value
    of ``demo_test["female"]`` (0 -> Male, 1 -> Female) and for each listed
    value of ``demo_test["ethnicity"]``. Subgroups with fewer than 50 rows
    are skipped. Nothing is returned.
    """
    predicted = (y_proba >= threshold).astype(int)

    print(f"\n  Fairness — {disease.upper()}")

    # (demographic column, value to match, label to print), in report order.
    subgroups = [("female", 0, "Male"), ("female", 1, "Female")]
    subgroups.extend(
        ("ethnicity", name, name)
        for name in ("White", "Black", "Hispanic", "Other")
    )

    for column, value, label in subgroups:
        members = demo_test[column].values == value
        if members.sum() < 50:
            # Too few rows for this subgroup — skip it.
            continue

        truth = y_test.values[members]
        guess = predicted[members]
        recall = recall_score(truth, guess, zero_division=0)
        precision = precision_score(truth, guess, zero_division=0)
        f2 = fbeta_score(truth, guess, beta=2, zero_division=0)
        print(f"    {label:<12}: F2={f2:.4f}  P={precision:.4f}  R={recall:.4f} (n={members.sum():,})")



In [13]:

# ---------------------------------------------------------------------------
# Driver cell: load the data, train one expert ensemble per disease, combine
# the experts into an any-onset predictor, export importances, save the
# metrics and print a summary table.
# Relies on helpers defined in other cells (banner, build_master_features,
# get_disease_dataset, train_disease_expert, explain_expert,
# build_any_onset_from_experts) and on the configuration constants.
# ---------------------------------------------------------------------------
banner("HYBRID DISEASE-EXPERT ENSEMBLE")
print(f"  Data:    {DATA_PATH}")
print(f"  Output:  {OUTPUT_DIR}")
# The printed year estimate assumes ~2 years between consecutive screenings.
print(f"  Horizon: {PREDICTION_HORIZON} screenings (~{PREDICTION_HORIZON*2} years)")
print(f"  Diseases: {list(DISEASE_MAP.keys())}")

# Load raw
print("\n  Loading parquet data...")
df_raw = pd.read_parquet(str(DATA_PATH))
print(f"  Shape: {df_raw.shape}")

# Build master feature+target dataset
master = build_master_features(df_raw)

# Train one expert per disease
# All three dicts are keyed by disease name.
all_models = {}
all_results = {}
all_feature_cols = {}

for disease in DISEASE_MAP:
    # Each disease is handled independently: any exception is reported with a
    # traceback and the loop moves on to the next disease.
    try:
        feature_cols, X_tr, y_tr, X_te, y_te, demo = get_disease_dataset(master, disease)
        # NOTE: recorded before training, so a disease whose training fails can
        # still appear in all_feature_cols without an entry in all_models.
        all_feature_cols[disease] = feature_cols

        models, results, avg_pred = train_disease_expert(
            disease, feature_cols, X_tr, y_tr, X_te, y_te
        )
        all_models[disease] = models
        all_results[disease] = results

        # Best threshold from ensemble average
        bt = results["avg"]["threshold"]
        fairness_audit(disease, y_te, avg_pred, bt, demo)
        explain_expert(models, X_te, feature_cols, disease)

    except Exception as e:
        print(f"\n  ERROR for {disease}: {e}")
        traceback.print_exc()

# Meta-ensemble: any-onset
# Skipped when no expert trained successfully. The result is stored under a
# key with a leading underscore so the summary loop below can tell it apart
# from per-disease entries.
if all_models:
    any_onset_results = build_any_onset_from_experts(
        master, all_models, all_feature_cols
    )
    all_results["_any_onset_combined"] = any_onset_results

# Export
if all_models:
    export_importance(all_models, all_feature_cols)

# Save results
# default=str makes json fall back to str() for any value it cannot
# serialize natively, so such values end up as strings in the file.
with open(OUTPUT_DIR / "results.json", "w") as f:
    json.dump(all_results, f, indent=2, default=str)
print("\n  Saved results.json")

banner("FINAL SUMMARY")
print(f"{'Disease':<14} {'ROC-AUC':>8} {'PR-AUC':>8} {'F2':>8} {'Recall':>8} {'Prec':>8}")
print("-" * 62)
# Missing metrics are printed as 0 rather than raising.
for disease, res in all_results.items():
    if disease.startswith("_"):
        # Meta-ensemble
        # Metrics are read directly from the top level of the entry; only
        # ROC-AUC, PR-AUC and F2 are printed for this row.
        r = res
        print(f"{'ANY-ONSET':<14} {r.get('roc_auc',0):>8.4f} {r.get('pr_auc',0):>8.4f} "
                f"{r.get('f2',0):>8.4f}")
    else:
        # Per-disease row: report the averaged-ensemble ("avg") metrics.
        r = res.get("avg", {})
        print(f"{disease:<14} {r.get('roc_auc',0):>8.4f} {r.get('pr_auc',0):>8.4f} "
                f"{r.get('f2',0):>8.4f} {r.get('recall',0):>8.4f} {r.get('precision',0):>8.4f}")


  HYBRID DISEASE-EXPERT ENSEMBLE
  Data:    /teamspace/studios/this_studio/data/extracted/randhrs1992_2022v1.parquet
  Output:  /teamspace/studios/this_studio/output_hybrid
  Horizon: 2 screenings (~4 years)
  Diseases: ['diabetes', 'cvd', 'stroke', 'lung', 'cancer', 'hibp', 'arthritis', 'psychiatric', 'memory']

  Loading parquet data...


  Shape: (45234, 19880)

  PHASE 1-3: BUILDING FEATURES + TARGETS
  Screening 5 → 7: 45,234 rows, 150 features
  Screening 6 → 8: 45,234 rows, 150 features
  Screening 7 → 9: 45,234 rows, 150 features
  Screening 8 → 10: 45,234 rows, 150 features
  Screening 9 → 11: 45,234 rows, 150 features
  Screening 10 → 12: 45,234 rows, 150 features
  Screening 11 → 13: 45,234 rows, 150 features
  Screening 12 → 14: 45,234 rows, 150 features

  TOTAL: 361,872 rows


[32m[I 2026-02-15 10:48:10,181][0m A new study created in memory with name: no-name-15003216-7650-4340-ba8c-96a8124b21c0[0m



  EXPERT TRAIN & SAVE: DIABETES
  Scale_pos_weight: 13.14 | Train Size: 76,088
  Categorical features: 1
  [Optuna] Tuning CatBoost for diabetes (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:48:16,771][0m Trial 0 finished with value: 0.7022947139127821 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.7022947139127821.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:48:19,948][0m Trial 1 finished with value: 0.7029537533963465 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.7029537533963465.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:48:29,413][0m Trial 2 finished with value: 0.7015598231102005 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.7029537533963465.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.7042
  [Optuna] Params: {'learning_rate': 0.06798962421591129, 'depth': 5, 'l2_leaf_reg': 1.5199348301309807}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.7042
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.7034
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_diabetes.pkl
  diabetes/cat: ROC=0.7042 PR=0.1383 F2=0.3565 P=0.1225 R=0.6819 t=0.498
  diabetes/lgb: ROC=0.6840 PR=0.1272 F2=0.3396 P=0.1183 R=0.6377 t=0.090
  diabetes/xgb: ROC=0.6981 PR=0.1398 F2=0.3551 P=0.1187 R=0.7073 t=0.460
  diabetes/avg: ROC=0.7034 PR=0.1405 F2=0.3593 P=0.1257 R=0.6707 t=0.361

  Fairness — DIABETES
    Male        : F2=0.3573  P=0.1231  R=0.6813 (n=7,670)
    Female      : F2=0.3608  P=0.1278  R=0.6628 (n=11,537)
    White       : F2=0.3229  P=0.1159  R=0.5837 (n=14,355)
    Black       : F2=0.3817  P=0.1242  R=0.7930 (n=2,604)
    Hispanic    : F2=0.4325  P=0.1481  R=0.8318 (n=1,789)
    Other       : F2=0.4952  P=0.2071  R=0.7593 (n=459)
  Saved SHAP plot: shap_diabetes.png


[32m[I 2026-02-15 10:50:07,121][0m A new study created in memory with name: no-name-cf4eacd4-a3f0-432d-99d1-9b5bb4e78ace[0m



  EXPERT TRAIN & SAVE: CVD
  Scale_pos_weight: 9.96 | Train Size: 73,925
  Categorical features: 1
  [Optuna] Tuning CatBoost for cvd (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:50:11,143][0m Trial 0 finished with value: 0.6868345707640341 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.6868345707640341.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:50:13,504][0m Trial 1 finished with value: 0.6887310770675531 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.6887310770675531.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:50:19,977][0m Trial 2 finished with value: 0.6883868488832483 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.6887310770675531.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.6906
  [Optuna] Params: {'learning_rate': 0.06205043932050714, 'depth': 5, 'l2_leaf_reg': 5.96648250185325}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.6906
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.6902
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_cvd.pkl
  cvd/cat: ROC=0.6906 PR=0.1810 F2=0.4057 P=0.1351 R=0.8127 t=0.422
  cvd/lgb: ROC=0.6735 PR=0.1689 F2=0.3931 P=0.1367 R=0.7400 t=0.113
  cvd/xgb: ROC=0.6856 PR=0.1821 F2=0.3998 P=0.1328 R=0.8041 t=0.422
  cvd/avg: ROC=0.6902 PR=0.1843 F2=0.4052 P=0.1384 R=0.7824 t=0.329

  Fairness — CVD
    Male        : F2=0.4147  P=0.1426  R=0.7928 (n=6,887)
    Female      : F2=0.3984  P=0.1353  R=0.7750 (n=11,682)
    White       : F2=0.4253  P=0.1466  R=0.8108 (n=12,884)
    Black       : F2=0.3466  P=0.1119  R=0.7288 (n=2,988)
    Hispanic    : F2=0.3379  P=0.1155  R=0.6519 (n=2,229)
    Other       : F2=0.3834  P=0.1420  R=0.6667 (n=468)
  Saved SHAP plot: shap_cvd.png


[32m[I 2026-02-15 10:51:30,893][0m A new study created in memory with name: no-name-b90887ed-6cce-411f-ab1f-a27024a1d01d[0m



  EXPERT TRAIN & SAVE: STROKE
  Scale_pos_weight: 31.15 | Train Size: 89,527
  Categorical features: 1
  [Optuna] Tuning CatBoost for stroke (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:51:32,938][0m Trial 0 finished with value: 0.6902566051340514 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.6902566051340514.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:51:35,275][0m Trial 1 finished with value: 0.6931445158454469 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.6931445158454469.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:51:38,775][0m Trial 2 finished with value: 0.6885337649873086 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.6931445158454469.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.6935
  [Optuna] Params: {'learning_rate': 0.06512883388592991, 'depth': 6, 'l2_leaf_reg': 1.4897720904073855}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.6935
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.6922
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_stroke.pkl
  stroke/cat: ROC=0.6935 PR=0.0658 F2=0.2072 P=0.0634 R=0.4781 t=0.583
  stroke/lgb: ROC=0.6642 PR=0.0539 F2=0.1985 P=0.0534 R=0.6170 t=0.043
  stroke/xgb: ROC=0.6841 PR=0.0670 F2=0.1981 P=0.0514 R=0.6915 t=0.437
  stroke/avg: ROC=0.6922 PR=0.0677 F2=0.2041 P=0.0566 R=0.5863 t=0.355

  Fairness — STROKE
    Male        : F2=0.1924  P=0.0534  R=0.5524 (n=9,490)
    Female      : F2=0.2140  P=0.0593  R=0.6152 (n=12,800)
    White       : F2=0.2089  P=0.0588  R=0.5769 (n=15,656)
    Black       : F2=0.2184  P=0.0594  R=0.6583 (n=3,562)
    Hispanic    : F2=0.1493  P=0.0388  R=0.5167 (n=2,540)
    Other       : F2=0.1923  P=0.0517  R=0.6000 (n=532)
  Saved SHAP plot: shap_stroke.png


[32m[I 2026-02-15 10:52:44,446][0m A new study created in memory with name: no-name-42b06524-d164-439a-866a-cf7c72e2b2b3[0m



  EXPERT TRAIN & SAVE: LUNG
  Scale_pos_weight: 25.88 | Train Size: 86,700
  Categorical features: 1
  [Optuna] Tuning CatBoost for lung (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:52:49,379][0m Trial 0 finished with value: 0.7288797737422679 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.7288797737422679.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:52:53,936][0m Trial 1 finished with value: 0.7304645856430719 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.7304645856430719.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:53:00,309][0m Trial 2 finished with value: 0.7311534755408182 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 2 with value: 0.7311534755408182.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.7350
  [Optuna] Params: {'learning_rate': 0.028580510658069373, 'depth': 7, 'l2_leaf_reg': 1.5837031559118748}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.7350
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.7365
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_lung.pkl
  lung/cat: ROC=0.7350 PR=0.1032 F2=0.2643 P=0.0913 R=0.5019 t=0.574
  lung/lgb: ROC=0.7022 PR=0.0871 F2=0.2451 P=0.0717 R=0.6207 t=0.049
  lung/xgb: ROC=0.7325 PR=0.1008 F2=0.2586 P=0.0989 R=0.4336 t=0.591
  lung/avg: ROC=0.7365 PR=0.1023 F2=0.2644 P=0.1104 R=0.4058 t=0.425

  Fairness — LUNG
    Male        : F2=0.2713  P=0.1120  R=0.4209 (n=8,611)
    Female      : F2=0.2602  P=0.1094  R=0.3968 (n=12,954)
    White       : F2=0.2868  P=0.1241  R=0.4266 (n=15,191)
    Black       : F2=0.2264  P=0.0830  R=0.3983 (n=3,502)
    Hispanic    : F2=0.1397  P=0.0528  R=0.2373 (n=2,359)
    Other       : F2=0.2890  P=0.1639  R=0.3571 (n=513)
  Saved SHAP plot: shap_lung.png


[32m[I 2026-02-15 10:54:45,815][0m A new study created in memory with name: no-name-51890378-e81b-4ade-8b05-09545a639965[0m



  EXPERT TRAIN & SAVE: CANCER
  Scale_pos_weight: 19.02 | Train Size: 83,170
  Categorical features: 1
  [Optuna] Tuning CatBoost for cancer (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:54:48,094][0m Trial 0 finished with value: 0.5969541828481567 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.5969541828481567.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:54:51,192][0m Trial 1 finished with value: 0.6015576312917037 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.6015576312917037.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:54:54,013][0m Trial 2 finished with value: 0.5975657346926408 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.6015576312917037.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.6024
  [Optuna] Params: {'learning_rate': 0.032676417657817626, 'depth': 6, 'l2_leaf_reg': 1.1128853174905728}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.6024
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.5946
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_cancer.pkl
  cancer/cat: ROC=0.6024 PR=0.0732 F2=0.2334 P=0.0625 R=0.7388 t=0.466
  cancer/lgb: ROC=0.5795 PR=0.0663 F2=0.2245 P=0.0601 R=0.7119 t=0.058
  cancer/xgb: ROC=0.5865 PR=0.0761 F2=0.2286 P=0.0623 R=0.6877 t=0.425
  cancer/avg: ROC=0.5946 PR=0.0768 F2=0.2309 P=0.0628 R=0.6989 t=0.317

  Fairness — CANCER
    Male        : F2=0.2695  P=0.0725  R=0.8406 (n=8,638)
    Female      : F2=0.1917  P=0.0527  R=0.5628 (n=12,314)
    White       : F2=0.2395  P=0.0642  R=0.7542 (n=14,606)
    Black       : F2=0.2186  P=0.0610  R=0.6190 (n=3,387)
    Hispanic    : F2=0.1328  P=0.0401  R=0.3151 (n=2,321)
    Other       : F2=0.2217  P=0.0783  R=0.4091 (n=638)
  Saved SHAP plot: shap_cancer.png


[32m[I 2026-02-15 10:56:00,253][0m A new study created in memory with name: no-name-750ef435-16c1-42ef-8a0f-24f15af04cd5[0m



  EXPERT TRAIN & SAVE: HIBP
  Scale_pos_weight: 4.04 | Train Size: 41,274
  Categorical features: 1
  [Optuna] Tuning CatBoost for hibp (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:56:11,655][0m Trial 0 finished with value: 0.6162841696904284 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.6162841696904284.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:56:16,939][0m Trial 1 finished with value: 0.6171682205606326 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.6171682205606326.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 10:57:15,695][0m Trial 2 finished with value: 0.6194850374094351 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 2 with value: 0.6194850374094351.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.6196
  [Optuna] Params: {'learning_rate': 0.01048407718678713, 'depth': 8, 'l2_leaf_reg': 7.136283648418459}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.6195
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.6180
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_hibp.pkl
  hibp/cat: ROC=0.6195 PR=0.2716 F2=0.5634 P=0.2177 R=0.9343 t=0.346
  hibp/lgb: ROC=0.5984 PR=0.2503 F2=0.5588 P=0.2050 R=0.9829 t=0.201
  hibp/xgb: ROC=0.6135 PR=0.2686 F2=0.5620 P=0.2164 R=0.9353 t=0.323
  hibp/avg: ROC=0.6180 PR=0.2713 F2=0.5639 P=0.2164 R=0.9422 t=0.291

  Fairness — HIBP
    Male        : F2=0.5711  P=0.2194  R=0.9529 (n=4,265)
    Female      : F2=0.5583  P=0.2140  R=0.9339 (n=6,056)
    White       : F2=0.5433  P=0.2036  R=0.9324 (n=7,818)
    Black       : F2=0.6357  P=0.2637  R=0.9821 (n=1,078)
    Hispanic    : F2=0.6100  P=0.2456  R=0.9695 (n=1,134)
    Other       : F2=0.5677  P=0.2355  R=0.8769 (n=291)
  Saved SHAP plot: shap_hibp.png


[32m[I 2026-02-15 11:01:50,748][0m A new study created in memory with name: no-name-bc780ac1-8084-4129-b3b8-5ec563b2f8c6[0m



  EXPERT TRAIN & SAVE: ARTHRITIS
  Scale_pos_weight: 3.93 | Train Size: 39,069
  Categorical features: 1
  [Optuna] Tuning CatBoost for arthritis (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:02:24,344][0m Trial 0 finished with value: 0.6471673636622691 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.6471673636622691.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:02:34,441][0m Trial 1 finished with value: 0.6469619463505927 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 0 with value: 0.6471673636622691.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:03:31,269][0m Trial 2 finished with value: 0.6473925587055136 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 2 with value: 0.6473925587055136.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.6478
  [Optuna] Params: {'learning_rate': 0.06798962421591129, 'depth': 5, 'l2_leaf_reg': 1.5199348301309807}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.6476
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.6461
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_arthritis.pkl
  arthritis/cat: ROC=0.6476 PR=0.3055 F2=0.5648 P=0.2125 R=0.9649 t=0.326
  arthritis/lgb: ROC=0.6258 PR=0.2822 F2=0.5586 P=0.2020 R=1.0000 t=0.195
  arthritis/xgb: ROC=0.6393 PR=0.3009 F2=0.5612 P=0.2054 R=0.9901 t=0.297
  arthritis/avg: ROC=0.6461 PR=0.3058 F2=0.5629 P=0.2077 R=0.9832 t=0.276

  Fairness — ARTHRITIS
    Male        : F2=0.5157  P=0.1785  R=0.9771 (n=4,825)
    Female      : F2=0.6007  P=0.2340  R=0.9875 (n=5,215)
    White       : F2=0.5622  P=0.2070  R=0.9843 (n=6,952)
    Black       : F2=0.5947  P=0.2288  R=0.9906 (n=1,431)
    Hispanic    : F2=0.5480  P=0.1997  R=0.9720 (n=1,311)
    Other       : F2=0.4826  P=0.1613  R=0.9615 (n=346)
  Saved SHAP plot: shap_arthritis.png


[32m[I 2026-02-15 11:05:21,114][0m A new study created in memory with name: no-name-1df674e7-49e3-4843-85c2-8054ad10ec02[0m



  EXPERT TRAIN & SAVE: PSYCHIATRIC
  Scale_pos_weight: 18.96 | Train Size: 78,618
  Categorical features: 1
  [Optuna] Tuning CatBoost for psychiatric (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:05:25,950][0m Trial 0 finished with value: 0.7273952403622614 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.7273952403622614.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:05:30,794][0m Trial 1 finished with value: 0.7291497246360437 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.7291497246360437.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:05:40,356][0m Trial 2 finished with value: 0.7262806554273468 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.7291497246360437.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.7309
  [Optuna] Params: {'learning_rate': 0.013787764619353767, 'depth': 5, 'l2_leaf_reg': 2.324672848950434}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.7309
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.7315
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_psychiatric.pkl
  psychiatric/cat: ROC=0.7309 PR=0.1273 F2=0.3068 P=0.1073 R=0.5738 t=0.539
  psychiatric/lgb: ROC=0.6792 PR=0.0942 F2=0.2729 P=0.0890 R=0.5651 t=0.061
  psychiatric/xgb: ROC=0.7243 PR=0.1219 F2=0.3007 P=0.1072 R=0.5477 t=0.515
  psychiatric/avg: ROC=0.7315 PR=0.1267 F2=0.3070 P=0.1070 R=0.5759 t=0.367

  Fairness — PSYCHIATRIC
    Male        : F2=0.2615  P=0.0969  R=0.4545 (n=8,941)
    Female      : F2=0.3272  P=0.1111  R=0.6368 (n=10,717)
    White       : F2=0.3227  P=0.1171  R=0.5754 (n=13,663)
    Black       : F2=0.2687  P=0.0824  R=0.6179 (n=3,289)
    Hispanic    : F2=0.2991  P=0.0994  R=0.6016 (n=2,184)
    Other       : F2=0.1826  P=0.0748  R=0.2857 (n=522)
  Saved SHAP plot: shap_psychiatric.png


[32m[I 2026-02-15 11:08:20,226][0m A new study created in memory with name: no-name-adb4dc06-001a-4958-8ddb-0c9ec5b80571[0m



  EXPERT TRAIN & SAVE: MEMORY
  Scale_pos_weight: 35.15 | Train Size: 35,753
  Categorical features: 1
  [Optuna] Tuning CatBoost for memory (Max 3 mins)...


Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:08:44,737][0m Trial 0 finished with value: 0.8292955560235391 and parameters: {'learning_rate': 0.023688639503640783, 'depth': 8, 'l2_leaf_reg': 5.395030966670228}. Best is trial 0 with value: 0.8292955560235391.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:08:49,036][0m Trial 1 finished with value: 0.8310044319087968 and parameters: {'learning_rate': 0.03968793330444373, 'depth': 4, 'l2_leaf_reg': 1.4321698289111515}. Best is trial 1 with value: 0.8310044319087968.[0m
Default metric period is 5 because AUC is/are not implemented for GPU
[32m[I 2026-02-15 11:09:24,917][0m Trial 2 finished with value: 0.8281668744986597 and parameters: {'learning_rate': 0.011430983876313222, 'depth': 8, 'l2_leaf_reg': 3.9913058785616786}. Best is trial 1 with value: 0.8310044319087968.[0m
Default metric period is 5 because AUC is/are not implemented for GPU

  [Optuna] Best AUC: 0.8335
  [Optuna] Params: {'learning_rate': 0.010086482454219638, 'depth': 4, 'l2_leaf_reg': 8.82693076112686}


Default metric period is 5 because AUC is/are not implemented for GPU


  CatBoost (Tuned) AUC: 0.8363
  [INFO] LightGBM using CPU fallback.
  Ensemble Final AUC: 0.8348
  [SAVED] Ensemble saved to /teamspace/studios/this_studio/output_hybrid/saved_models/ensemble_memory.pkl
  memory/cat: ROC=0.8363 PR=0.1567 F2=0.3230 P=0.1093 R=0.6316 t=0.597
  memory/lgb: ROC=0.8007 PR=0.1354 F2=0.3349 P=0.1542 R=0.4737 t=0.064
  memory/xgb: ROC=0.8223 PR=0.1926 F2=0.3291 P=0.1372 R=0.5061 t=0.600
  memory/avg: ROC=0.8348 PR=0.1844 F2=0.3453 P=0.1441 R=0.5304 t=0.446

  Fairness — MEMORY
    Male        : F2=0.3210  P=0.1320  R=0.5000 (n=3,702)
    Female      : F2=0.3605  P=0.1519  R=0.5490 (n=5,100)
    White       : F2=0.3333  P=0.1411  R=0.5056 (n=6,854)
    Black       : F2=0.4204  P=0.1739  R=0.6512 (n=1,168)
    Hispanic    : F2=0.3005  P=0.1209  R=0.4783 (n=628)
    Other       : F2=0.3226  P=0.1053  R=0.6667 (n=152)
  Saved SHAP plot: shap_memory.png

  META-ENSEMBLE: ANY-DISEASE ONSET

  ANY-ONSET Combined:
    ROC-AUC:  0.6478
    PR-AUC:   0.4899
    Best F2