# Segmented modeling (mixture‑of‑experts) — LightGBM

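We test a simple mixture: one global model plus one model per segment value. For each row, the final prediction is a weighted blend, `(1 − alpha) · global + alpha · segment`; rows for which no segment model was trained (segment too small, or segment value missing) fall back to the global prediction alone.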

Segments tested:
- `age_group`
- `health_worker`
- `doctor_recc_any` (derived)

We evaluate each segmenting scheme separately (no combinations).

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Project root is the parent of the current working directory; the data
# files live in its "data" subdirectory.
ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"

# Features and labels are both indexed by respondent_id.
X = pd.read_csv(DATA_DIR / "training_set_features.csv", index_col="respondent_id")
y = pd.read_csv(DATA_DIR / "training_set_labels.csv", index_col="respondent_id")

# Joint stratification key: combines both targets into one code
# (2 * h1n1 + seasonal) so CV folds are stratified on the label pair.
strat = (
    y["h1n1_vaccine"].astype(int).mul(2).add(y["seasonal_vaccine"].astype(int))
).to_numpy()

# Object-dtype columns are treated as categorical; record their names and
# their positional indices within X.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
cat_indices = list(map(X.columns.get_loc, cat_cols))


def lgbm_model():
    """Return a new, unfitted LightGBM binary classifier.

    Every call builds an independent estimator with the same fixed
    hyperparameters, so the global and segment models are configured
    identically.
    """
    hyperparams = {
        "n_estimators": 800,
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_depth": -1,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "objective": "binary",
        "n_jobs": 4,
    }
    return lgb.LGBMClassifier(**hyperparams)


def prep_cats(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``cat_cols`` columns as pandas categoricals.

    The input frame is left unmodified. Columns are taken from the
    module-level ``cat_cols`` list; all other columns keep their dtype.
    """
    return df.astype({name: "category" for name in cat_cols})


def eval_segmented(
    segment_col: str,
    alpha: float = 0.5,
    seed: int = 42,
    min_segment_rows: int = 50,
):
    """Cross-validate a global + per-segment LightGBM blend and report AUCs.

    For each of 5 stratified folds and each target column of ``y``, a global
    model is fit on the whole training fold and one extra model is fit per
    distinct non-missing value of ``segment_col``. Validation rows covered by
    a segment model get ``(1 - alpha) * global + alpha * segment``; all other
    rows (missing segment value, segment too small, or segment with a single
    class in training) keep the global prediction.

    Args:
        segment_col: Column of ``X`` whose values define the segments.
        alpha: Weight of the segment model in the blend (0 = global only).
        seed: Random state for the fold shuffling.
        min_segment_rows: Minimum number of training rows a segment needs
            before a dedicated model is fit for it.

    Returns:
        Dict mapping each target column to its out-of-fold ROC AUC, plus
        ``"mean_auc"`` with the average over targets.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = pd.DataFrame(0.0, index=y.index, columns=y.columns)

    for tr_idx, va_idx in skf.split(X, strat):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        X_tr_c = prep_cats(X_tr)
        X_va_c = prep_cats(X_va)

        # Segment membership depends only on the fold, not on the target,
        # so compute the positional masks once per fold.
        seg_masks = []
        for seg_val in X_tr[segment_col].dropna().unique():
            tr_mask = (X_tr[segment_col] == seg_val).to_numpy()
            va_mask = (X_va[segment_col] == seg_val).to_numpy()
            if not va_mask.any() or tr_mask.sum() < min_segment_rows:
                continue
            seg_masks.append((tr_mask, va_mask))

        for target in y.columns:
            y_tr = y[target].iloc[tr_idx]

            # global model
            global_model = lgbm_model()
            global_model.fit(X_tr_c, y_tr, categorical_feature=cat_indices)
            global_pred = global_model.predict_proba(X_va_c)[:, 1]

            # segment-specific models
            seg_preds = np.zeros(len(X_va_c), dtype=float)
            # Explicit record of which validation rows received a segment
            # prediction; the prediction value itself is not a safe sentinel
            # because a model may legitimately output 0.0.
            has_seg_pred = np.zeros(len(X_va_c), dtype=bool)
            for tr_mask, va_mask in seg_masks:
                y_seg = y_tr[tr_mask]
                # A single-class segment cannot yield a usable positive-class
                # probability column; leave those rows to the global model.
                if y_seg.nunique() < 2:
                    continue
                seg_model = lgbm_model()
                seg_model.fit(X_tr_c[tr_mask], y_seg, categorical_feature=cat_indices)
                seg_preds[va_mask] = seg_model.predict_proba(X_va_c[va_mask])[:, 1]
                has_seg_pred[va_mask] = True

            # blend global + segment, falling back to global where no
            # segment model produced a prediction
            blended = np.where(
                has_seg_pred,
                (1 - alpha) * global_pred + alpha * seg_preds,
                global_pred,
            )

            oof.loc[X_va.index, target] = blended

    scores = {col: roc_auc_score(y[col], oof[col]) for col in y.columns}
    # Computed before the key is inserted, so the mean covers targets only.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Derived segment: 1.0 when either doctor-recommendation column equals 1,
# otherwise 0.0 (a missing value does not compare equal to 1, so it counts
# as "no recommendation" here).
X["doctor_recc_any"] = (
    X["doctor_recc_h1n1"].eq(1) | X["doctor_recc_seasonal"].eq(1)
).astype(float)

# Evaluate each segmenting scheme on its own, in the same order as listed.
scores_age, scores_hw, scores_dr = (
    eval_segmented(segment_name, alpha=0.5)
    for segment_name in ("age_group", "health_worker", "doctor_recc_any")
)

scores_age, scores_hw, scores_dr