# Calibration-aware blending

Goal: calibrate each base model's out-of-fold (OOF) predictions and re-evaluate AUC.
We test Platt scaling (sigmoid/logistic) and isotonic calibration, fitted separately for each target and each model.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# ROOT is the parent of the current working directory, resolved to an
# absolute path; data and run artifacts are looked up beneath it.
ROOT = Path("..").resolve()
DATA_DIR = ROOT.joinpath("data")
RUNS_DIR = ROOT.joinpath("runs")

# Training labels, indexed by the respondent_id column.
y = pd.read_csv(
    DATA_DIR.joinpath("training_set_labels.csv"),
    index_col="respondent_id",
)

def load_preds(run_dir: Path, name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the OOF and test prediction CSVs written for one model.

    Args:
        run_dir: Directory containing ``oof_<name>.csv`` and ``test_<name>.csv``.
        name: Model name used in the two file names.

    Returns:
        ``(oof, test)`` data frames, each indexed by ``respondent_id``.
    """
    oof_path = run_dir / f"oof_{name}.csv"
    test_path = run_dir / f"test_{name}.csv"
    oof_frame, test_frame = (
        pd.read_csv(csv_path, index_col="respondent_id")
        for csv_path in (oof_path, test_path)
    )
    return oof_frame, test_frame

# Load OOF and test predictions for both GBDT models from the bake-off run.
lgbm_oof, lgbm_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "lightgbm")
cat_oof, cat_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "catboost")

# The calibration and scoring code below pairs predictions with labels by
# position, so the OOF rows must be in exactly the same order as the labels.
# Explicit raises (rather than assert) keep the check active under `python -O`.
if not lgbm_oof.index.equals(y.index):
    raise ValueError("lightgbm OOF index does not match the label index")
if not cat_oof.index.equals(y.index):
    raise ValueError("catboost OOF index does not match the label index")

def auc_scores(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> dict:
    """Compute ROC AUC for every label column plus their unweighted mean.

    Args:
        y_true: Ground-truth labels, one column per target.
        y_pred: Predicted scores; must contain every column of ``y_true``.

    Returns:
        Mapping of each column name in ``y_true`` to its ROC AUC, with an
        extra ``"mean_auc"`` entry holding the mean over those columns.
    """
    scores = {}
    for column in y_true.columns:
        scores[column] = roc_auc_score(y_true[column], y_pred[column])
    # Take the mean before inserting it so it is not averaged into itself.
    per_target = list(scores.values())
    scores["mean_auc"] = float(np.mean(per_target))
    return scores


## Helper: cross-validated calibration on OOF

We calibrate each model's OOF predictions using a secondary CV to avoid leakage.

In [None]:
def calibrate_oof(
    oof: pd.DataFrame,
    y: pd.DataFrame,
    method: str = "sigmoid",
    seed: int = 42,
) -> pd.DataFrame:
    """Return out-of-fold calibrated versions of a model's OOF predictions.

    For every target the rows are split into 5 stratified folds. A calibrated
    logistic regression on the raw score is fitted on four folds and used to
    predict the held-out fold, so no row is scored by a calibrator that saw
    its label. Fitting on all rows and predicting those same rows back would
    leak the labels into the evaluation.

    Args:
        oof: OOF predictions, one column per target. Rows must be in the same
            order as ``y`` (values are paired by position, not by index).
        y: Binary labels, one column per target.
        method: Calibration method passed to ``CalibratedClassifierCV``
            (``"sigmoid"`` or ``"isotonic"``).
        seed: Random state for the shuffled fold splits.

    Returns:
        Data frame shaped like ``oof`` with calibrated probabilities of the
        positive class in each column named in ``y``.
    """
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    out = pd.DataFrame(index=oof.index, columns=oof.columns, dtype=float)
    for target in y.columns:
        scores = oof[[target]].to_numpy()
        labels = y[target].to_numpy()
        calibrated = np.empty(len(labels), dtype=float)
        for fit_idx, holdout_idx in outer_cv.split(scores, labels):
            # The estimator is passed positionally: its keyword was renamed
            # from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old name was removed in 1.4.
            calib = CalibratedClassifierCV(
                LogisticRegression(solver="lbfgs", max_iter=1000),
                method=method,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed),
            )
            calib.fit(scores[fit_idx], labels[fit_idx])
            calibrated[holdout_idx] = calib.predict_proba(scores[holdout_idx])[:, 1]
        out[target] = calibrated
    return out


def apply_calibrator_to_test(
    oof: pd.DataFrame,
    test: pd.DataFrame,
    y: pd.DataFrame,
    method: str = "sigmoid",
    seed: int = 42,
) -> pd.DataFrame:
    """Fit a calibrator on the full OOF predictions and apply it to test.

    Args:
        oof: OOF predictions, one column per target. Rows must be in the same
            order as ``y`` (values are paired by position, not by index).
        test: Test-set predictions with the same target columns.
        y: Binary labels, one column per target.
        method: Calibration method passed to ``CalibratedClassifierCV``
            (``"sigmoid"`` or ``"isotonic"``).
        seed: Random state for the shuffled internal fold split.

    Returns:
        Data frame shaped like ``test`` with calibrated probabilities of the
        positive class in each column named in ``y``.
    """
    # Seeded, shuffled folds so that `seed` actually controls the result.
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    out = pd.DataFrame(index=test.index, columns=test.columns, dtype=float)
    for target in y.columns:
        # The estimator is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
        # was removed in 1.4.
        calib = CalibratedClassifierCV(
            LogisticRegression(solver="lbfgs", max_iter=1000),
            method=method,
            cv=inner_cv,
        )
        calib.fit(oof[[target]].to_numpy(), y[target].to_numpy())
        out[target] = calib.predict_proba(test[[target]].to_numpy())[:, 1]
    return out


## Evaluate calibration variants

In [None]:
# AUC of the uncalibrated OOF predictions, used as the reference point.
baseline_lgbm = auc_scores(y, lgbm_oof)
baseline_cat = auc_scores(y, cat_oof)

# Sigmoid-calibrated OOF predictions.
lgbm_sig = calibrate_oof(lgbm_oof, y, method="sigmoid")
cat_sig = calibrate_oof(cat_oof, y, method="sigmoid")

# Isotonic-calibrated OOF predictions.
lgbm_iso = calibrate_oof(lgbm_oof, y, method="isotonic")
cat_iso = calibrate_oof(cat_oof, y, method="isotonic")

# Collect the baselines first, then score each calibrated variant.
scores = {"lgbm_base": baseline_lgbm, "cat_base": baseline_cat}
scores.update(
    (label, auc_scores(y, frame))
    for label, frame in (
        ("lgbm_sigmoid", lgbm_sig),
        ("cat_sigmoid", cat_sig),
        ("lgbm_isotonic", lgbm_iso),
        ("cat_isotonic", cat_iso),
    )
)

scores

## Blend after calibration

We test a simple 2-model blend, reusing the best weights found in the
previous blend (LGBM 0.3886, CatBoost 0.6114).

In [None]:
# LightGBM share of the blend; this is the weight passed to blend() below.
w_lgbm = 0.3885570715220429
# CatBoost share of the blend.
# NOTE(review): not referenced in the visible cells -- blend() derives the
# second weight as 1.0 - w_a; presumably kept for reference.
w_cat = 0.6114429284779572

def blend(a: pd.DataFrame, b: pd.DataFrame, w_a: float) -> pd.DataFrame:
    """Return the convex combination ``w_a * a + (1 - w_a) * b``.

    Args:
        a: First set of predictions.
        b: Second set of predictions.
        w_a: Weight given to ``a``; ``b`` receives the remainder ``1.0 - w_a``.
    """
    w_b = 1.0 - w_a
    weighted_a = a * w_a
    weighted_b = b * w_b
    return weighted_a + weighted_b

# Blend the raw, sigmoid-calibrated and isotonic-calibrated OOF predictions
# with the same LightGBM weight.
blend_base = blend(lgbm_oof, cat_oof, w_lgbm)
blend_sig = blend(lgbm_sig, cat_sig, w_lgbm)
blend_iso = blend(lgbm_iso, cat_iso, w_lgbm)

# Score each blended variant against the labels.
blend_scores = {
    label: auc_scores(y, frame)
    for label, frame in (
        ("blend_base", blend_base),
        ("blend_sigmoid", blend_sig),
        ("blend_isotonic", blend_iso),
    )
}

blend_scores