# Feature-engineered + isotonic-calibrated blend

Goal: add survey-structure features, train LightGBM + CatBoost, then isotonic-calibrate
each model's predictions and blend them with the same weights as the base blend.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# ROOT is the parent of the current working directory; input data and run
# outputs live in its "data" and "runs" sub-directories.
ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"
RUNS_DIR = ROOT / "runs"

# Features, labels and test features are all indexed by respondent_id.
X_train = pd.read_csv(DATA_DIR / "training_set_features.csv", index_col="respondent_id")
y = pd.read_csv(DATA_DIR / "training_set_labels.csv", index_col="respondent_id")
X_test = pd.read_csv(DATA_DIR / "test_set_features.csv", index_col="respondent_id")

# Joint stratification key combining both labels as 2 * h1n1 + seasonal, so
# CV folds are stratified on the label pair rather than on a single target.
# NOTE(review): assumes both labels are 0/1 (giving keys 0-3) - confirm.
strat = (2 * y["h1n1_vaccine"].astype(int) + y["seasonal_vaccine"].astype(int)).values


def auc_scores(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> dict:
    """Compute the ROC AUC of every column of ``y_true`` plus their mean.

    Each column of ``y_true`` is scored against the same-named column of
    ``y_pred``. The unweighted average of those scores is stored under the
    additional key ``"mean_auc"``.
    """
    per_target = {}
    for column in y_true.columns:
        per_target[column] = roc_auc_score(y_true[column], y_pred[column])
    average = np.mean(list(per_target.values()))
    per_target["mean_auc"] = float(average)
    return per_target


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with survey-structure features.

    Added columns, in this order:

    * ``behavioral_sum`` - row-wise sum of the seven ``behavioral_*``
      columns; NaN only when every one of them is missing.
    * ``opinion_effective_gap``, ``opinion_risk_gap``, ``opinion_sick_gap`` -
      the H1N1 opinion minus the matching seasonal opinion.
    * ``doctor_recc_any`` - 1.0 when either doctor recommendation equals 1,
      otherwise 0.0 (missing recommendations therefore count as 0.0).

    The input frame itself is left untouched.
    """
    behavior_flags = [
        "behavioral_antiviral_meds",
        "behavioral_avoidance",
        "behavioral_face_mask",
        "behavioral_wash_hands",
        "behavioral_large_gatherings",
        "behavioral_outside_home",
        "behavioral_touch_face",
    ]
    # new column -> (H1N1 opinion column, seasonal opinion column)
    gap_sources = {
        "opinion_effective_gap": (
            "opinion_h1n1_vacc_effective",
            "opinion_seas_vacc_effective",
        ),
        "opinion_risk_gap": ("opinion_h1n1_risk", "opinion_seas_risk"),
        "opinion_sick_gap": (
            "opinion_h1n1_sick_from_vacc",
            "opinion_seas_sick_from_vacc",
        ),
    }
    recommendation_cols = ["doctor_recc_h1n1", "doctor_recc_seasonal"]

    features = df.copy()
    features["behavioral_sum"] = features[behavior_flags].sum(axis=1, min_count=1)

    for gap_name, (h1n1_col, seasonal_col) in gap_sources.items():
        features[gap_name] = features[h1n1_col] - features[seasonal_col]

    any_recommended = features[recommendation_cols].eq(1).any(axis=1)
    features["doctor_recc_any"] = any_recommended.astype(float)
    return features


# Apply the same feature engineering to the training and test features.
X_train_fe = build_features(X_train)
X_test_fe = build_features(X_test)


## Train LightGBM + CatBoost (OOF + test)

In [None]:
def prepare_lgbm(X: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Return a copy of ``X`` with object columns cast to ``category``.

    The second element of the result lists the names of the converted
    columns. The input frame is not modified.
    """
    object_cols = X.select_dtypes(include=["object"]).columns.tolist()
    # astype with a per-column mapping returns a new frame and leaves every
    # column that is not listed unchanged.
    converted = X.astype({name: "category" for name in object_cols})
    return converted, object_cols


def prepare_catboost(X: pd.DataFrame) -> tuple[pd.DataFrame, list[int]]:
    """Return a copy of ``X`` with object columns converted to strings.

    Missing values in object columns are replaced by the literal
    ``"__MISSING__"`` before the string cast. The second element of the
    result holds the positional indices of those columns. The input frame is
    not modified.
    """
    encoded = X.copy()
    object_cols = list(X.select_dtypes(include=["object"]).columns)
    for name in object_cols:
        filled = encoded[name].fillna("__MISSING__")
        encoded[name] = filled.astype(str)
    positions = list(map(encoded.columns.get_loc, object_cols))
    return encoded, positions


def train_oof_lgbm(X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Train one LightGBM classifier per target and fold.

    Args:
        X: Training features; object columns are cast to ``category`` via
            ``prepare_lgbm``.
        y: Training labels, one column per target, row-aligned with ``X``.
        X_test: Test features, prepared the same way as ``X``.

    Returns:
        ``(oof, test_preds)``. ``oof`` holds, for every training row, the
        positive-class probability from the model that did not train on that
        row. ``test_preds`` holds the positive-class probability on
        ``X_test`` averaged over the folds.

    Note:
        Folds are stratified on the module-level ``strat`` array, not on
        ``y``, so ``X`` must be row-aligned with that array.
    """
    Xc, cat_cols = prepare_lgbm(X)
    X_test_c, _ = prepare_lgbm(X_test)
    oof = pd.DataFrame(index=X.index, columns=y.columns, dtype=float)
    # The accumulator must start at 0.0: a frame built without data is all
    # NaN, and NaN + value stays NaN, which would wipe out every test
    # prediction added in the fold loop below.
    test_preds = pd.DataFrame(0.0, index=X_test.index, columns=y.columns)

    params = dict(
        objective="binary",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=63,
        min_data_in_leaf=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        random_state=42,
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    n_splits = skf.get_n_splits()
    for tr_idx, va_idx in skf.split(Xc, strat):
        X_tr, X_va = Xc.iloc[tr_idx], Xc.iloc[va_idx]
        for target in y.columns:
            y_tr, y_va = y.iloc[tr_idx][target], y.iloc[va_idx][target]
            model = lgb.LGBMClassifier(**params)
            model.fit(
                X_tr,
                y_tr,
                eval_set=[(X_va, y_va)],
                eval_metric="auc",
                categorical_feature=cat_cols,
                # Stop once validation AUC has not improved for 50 rounds.
                callbacks=[lgb.early_stopping(50, verbose=False)],
            )
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]
            # Each fold contributes an equal share to the test average.
            test_preds[target] += model.predict_proba(X_test_c)[:, 1] / n_splits

    return oof, test_preds


def train_oof_catboost(X: pd.DataFrame, y: pd.DataFrame, X_test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Train one CatBoost classifier per target and fold.

    Args:
        X: Training features; object columns are stringified via
            ``prepare_catboost`` and passed to CatBoost as categorical.
        y: Training labels, one column per target, row-aligned with ``X``.
        X_test: Test features, prepared the same way as ``X``.

    Returns:
        ``(oof, test_preds)``. ``oof`` holds, for every training row, the
        positive-class probability from the model that did not train on that
        row. ``test_preds`` holds the positive-class probability on
        ``X_test`` averaged over the folds.

    Note:
        Folds are stratified on the module-level ``strat`` array, not on
        ``y``, so ``X`` must be row-aligned with that array.
    """
    Xc, cat_idx = prepare_catboost(X)
    X_test_c, _ = prepare_catboost(X_test)
    oof = pd.DataFrame(index=X.index, columns=y.columns, dtype=float)
    # The accumulator must start at 0.0: a frame built without data is all
    # NaN, and NaN + value stays NaN, which would wipe out every test
    # prediction added in the fold loop below.
    test_preds = pd.DataFrame(0.0, index=X_test.index, columns=y.columns)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    n_splits = skf.get_n_splits()
    for tr_idx, va_idx in skf.split(Xc, strat):
        X_tr, X_va = Xc.iloc[tr_idx], Xc.iloc[va_idx]
        for target in y.columns:
            y_tr, y_va = y.iloc[tr_idx][target], y.iloc[va_idx][target]
            model = CatBoostClassifier(
                iterations=600,
                learning_rate=0.05,
                depth=6,
                loss_function="Logloss",
                eval_metric="AUC",
                verbose=False,
            )
            model.fit(X_tr, y_tr, eval_set=(X_va, y_va), cat_features=cat_idx)
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]
            # Each fold contributes an equal share to the test average.
            test_preds[target] += model.predict_proba(X_test_c)[:, 1] / n_splits

    return oof, test_preds


# Train both model families on the engineered features; each call returns
# the out-of-fold predictions and the test-set predictions.
lgbm_oof_fe, lgbm_test_fe = train_oof_lgbm(X_train_fe, y, X_test_fe)
cat_oof_fe, cat_test_fe = train_oof_catboost(X_train_fe, y, X_test_fe)

# Per-target and mean AUC of the out-of-fold predictions.
lgbm_scores = auc_scores(y, lgbm_oof_fe)
cat_scores = auc_scores(y, cat_oof_fe)
# Bare expression: displayed as the cell output when run in a notebook.
lgbm_scores, cat_scores

## Isotonic calibration and blend

In [None]:
def _make_calibrator(base_clf, method: str, cv):
    """Build a ``CalibratedClassifierCV`` wrapping ``base_clf``.

    The wrapped model is passed as ``estimator`` first. If the constructor
    rejects that keyword with ``TypeError``, the call is retried with
    ``base_estimator`` (presumably the keyword of older scikit-learn
    releases).
    """
    shared_kwargs = {"method": method, "cv": cv}
    try:
        calibrator = CalibratedClassifierCV(estimator=base_clf, **shared_kwargs)
    except TypeError:
        calibrator = CalibratedClassifierCV(base_estimator=base_clf, **shared_kwargs)
    return calibrator


def calibrate_oof(oof: pd.DataFrame, y: pd.DataFrame, method: str = "isotonic") -> pd.DataFrame:
    """Return out-of-fold calibrated versions of the predictions in ``oof``.

    For every target the rows are split into five stratified folds. A
    calibrator (logistic regression on the raw score, wrapped in
    ``CalibratedClassifierCV``) is fitted on four folds and used to predict
    the held-out fold, so no row is scored by a calibrator that saw its
    label. Fitting on all rows and predicting those same rows would produce
    in-sample values and optimistic downstream scores.

    Args:
        oof: Uncalibrated predictions, one column per target.
        y: Labels with the same columns and row order as ``oof``.
        method: Calibration method passed to ``CalibratedClassifierCV``.

    Returns:
        A frame shaped like ``oof`` holding the calibrated positive-class
        probabilities.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    out = pd.DataFrame(index=oof.index, columns=oof.columns, dtype=float)
    for target in y.columns:
        base = oof[[target]].values
        y_target = y[target].values
        calibrated = np.full(len(base), np.nan)
        for fit_idx, hold_idx in skf.split(base, y_target):
            base_clf = LogisticRegression(solver="lbfgs", max_iter=1000)
            # Inner cv=5 matches how the test-time calibrator is fitted.
            calib = _make_calibrator(base_clf, method=method, cv=5)
            calib.fit(base[fit_idx], y_target[fit_idx])
            calibrated[hold_idx] = calib.predict_proba(base[hold_idx])[:, 1]
        out[target] = calibrated
    return out


def apply_calibrator_to_test(oof: pd.DataFrame, test: pd.DataFrame, y: pd.DataFrame, method: str = "isotonic") -> pd.DataFrame:
    """Calibrate test predictions with calibrators fitted on OOF predictions.

    For each target in ``y`` a calibrator (logistic regression on the raw
    score, wrapped in ``CalibratedClassifierCV`` with ``cv=5``) is fitted on
    the OOF column and its labels, then applied to the matching column of
    ``test``.

    Returns:
        A frame with the index and columns of ``test`` holding the
        calibrated positive-class probabilities.
    """
    calibrated = pd.DataFrame(index=test.index, columns=test.columns, dtype=float)
    for name in y.columns:
        train_scores = oof[[name]].to_numpy()
        test_scores = test[[name]].to_numpy()
        labels = y[name].to_numpy()

        calibrator = _make_calibrator(
            LogisticRegression(solver="lbfgs", max_iter=1000),
            method=method,
            cv=5,
        )
        calibrator.fit(train_scores, labels)
        positive_proba = calibrator.predict_proba(test_scores)[:, 1]
        calibrated[name] = positive_proba
    return calibrated


# Calibrate each model's out-of-fold predictions (isotonic by default).
lgbm_iso = calibrate_oof(lgbm_oof_fe, y)
cat_iso = calibrate_oof(cat_oof_fe, y)

# Weight given to the LightGBM predictions in the blend; CatBoost receives
# 1 - w_lgbm.
# NOTE(review): presumably the weight found for the base blend - confirm.
w_lgbm = 0.3885570715220429
def blend(a: pd.DataFrame, b: pd.DataFrame, w_a: float) -> pd.DataFrame:
    """Return the weighted combination ``a * w_a + b * (1 - w_a)``."""
    weighted_a = a * w_a
    weighted_b = b * (1.0 - w_a)
    return weighted_a + weighted_b

# Blend the calibrated predictions of both models and score the result
# against the training labels.
blend_iso = blend(lgbm_iso, cat_iso, w_lgbm)
blend_scores = auc_scores(y, blend_iso)
# Bare expression: displayed as the cell output when run in a notebook.
blend_scores

## Save outputs and submission

In [None]:
# Everything produced by this run is written under
# runs/calibration_features, which is created if it does not exist.
out_dir = RUNS_DIR / "calibration_features"
out_dir.mkdir(parents=True, exist_ok=True)

# Persist the uncalibrated out-of-fold and test predictions of both models.
lgbm_oof_fe.to_csv(out_dir / "oof_lgbm_fe.csv", index_label="respondent_id")
cat_oof_fe.to_csv(out_dir / "oof_cat_fe.csv", index_label="respondent_id")
lgbm_test_fe.to_csv(out_dir / "test_lgbm_fe.csv", index_label="respondent_id")
cat_test_fe.to_csv(out_dir / "test_cat_fe.csv", index_label="respondent_id")

# Calibrate the test predictions with calibrators fitted on the
# out-of-fold predictions and the training labels.
lgbm_test_iso = apply_calibrator_to_test(lgbm_oof_fe, lgbm_test_fe, y)
cat_test_iso = apply_calibrator_to_test(cat_oof_fe, cat_test_fe, y)

# Fill the submission template with the blended calibrated test
# predictions (LightGBM weighted by w_lgbm) and write it out.
submission = pd.read_csv(DATA_DIR / "submission_format.csv", index_col="respondent_id")
submission[["h1n1_vaccine", "seasonal_vaccine"]] = blend(lgbm_test_iso, cat_test_iso, w_lgbm)
out_path = out_dir / "submission_isotonic_blend_features.csv"
submission.to_csv(out_path, index_label="respondent_id")

# Bare expression: shows the written path as the cell output in a notebook.
out_path