# Stacking experiments (CatBoost/LGBM/baseline)

This notebook compares three stacking variants:
1) Meta-only (OOF predictions only)
2) Meta + one-hot encoded raw features
3) Meta + raw numeric features + leave-one-out (LOO) target encoding for categorical features

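Each variant reports the 5-fold cross-validated ROC AUC for every target, plus the mean across targets. Optionally, a submission file can be written for the best variant (the helper provided below covers the meta-only variant).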

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# ROOT is the parent of the current working directory; input tables are read
# from ROOT/data and base-model run artefacts from ROOT/runs.
ROOT = Path("..").resolve()
DATA_DIR, RUNS_DIR = ROOT / "data", ROOT / "runs"

# Training features, training labels and test features, each indexed by
# respondent_id so rows can be matched across tables.
X, y, X_test = (
    pd.read_csv(DATA_DIR / csv_name, index_col="respondent_id")
    for csv_name in (
        "training_set_features.csv",
        "training_set_labels.csv",
        "test_set_features.csv",
    )
)


def load_preds(run_dir: Path, name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load one base model's saved predictions from ``run_dir``.

    Args:
        run_dir: Directory containing ``oof_<name>.csv`` and ``test_<name>.csv``.
        name: Model name used as the suffix of both file names.

    Returns:
        ``(oof, test)``: the out-of-fold and test prediction frames, each
        indexed by the ``respondent_id`` column.
    """

    def _read(csv_path: Path) -> pd.DataFrame:
        return pd.read_csv(csv_path, index_col="respondent_id")

    return _read(run_dir / f"oof_{name}.csv"), _read(run_dir / f"test_{name}.csv")

# Saved predictions of the three base models (out-of-fold on train, plus test).
baseline_oof, baseline_test = load_preds(RUNS_DIR / "baseline_logreg", "baseline")
lgbm_oof, lgbm_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "lightgbm")
cat_oof, cat_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "catboost")

# The stacking code below pairs predictions with features and labels by row
# position, so every prediction frame must list exactly the same respondents
# in the same order as its feature table. Raise explicitly rather than using
# `assert`: assertions are skipped under `python -O` and would not say which
# frame is misaligned.
for label, df, expected_index in [
    ("baseline OOF", baseline_oof, X.index),
    ("lightgbm OOF", lgbm_oof, X.index),
    ("catboost OOF", cat_oof, X.index),
    ("baseline test", baseline_test, X_test.index),
    ("lightgbm test", lgbm_test, X_test.index),
    ("catboost test", cat_test, X_test.index),
]:
    if not df.index.equals(expected_index):
        raise ValueError(
            f"{label} predictions are not index-aligned with the corresponding feature table"
        )

# Side-by-side meta-feature matrices; the prefix keeps each model's columns
# distinguishable after concatenation.
meta_oof = pd.concat(
    [
        baseline_oof.add_prefix("baseline_"),
        lgbm_oof.add_prefix("lgbm_"),
        cat_oof.add_prefix("cat_"),
    ],
    axis=1,
)
meta_test = pd.concat(
    [
        baseline_test.add_prefix("baseline_"),
        lgbm_test.add_prefix("lgbm_"),
        cat_test.add_prefix("cat_"),
    ],
    axis=1,
)

# Joint stratification label: one integer per (h1n1_vaccine, seasonal_vaccine)
# combination (0..3, assuming both labels are 0/1), so CV folds preserve the
# joint distribution of the two targets.
strat = (2 * y["h1n1_vaccine"].astype(int) + y["seasonal_vaccine"].astype(int)).values

In [None]:
def eval_meta_only(meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    """Cross-validate a logistic-regression stacker on base-model predictions only.

    One model per column of ``y`` is fitted in each of 5 shuffled stratified
    folds; the held-out probabilities are collected and scored with ROC AUC.

    Note:
        Folds are stratified on the module-level ``strat`` array, not on an
        argument, so ``meta_features`` and ``y`` must be row-aligned with it.

    Args:
        meta_features: Base-model predictions, one row per training sample.
        y: Target frame; every column is treated as a separate binary target.
        seed: Random state for the fold shuffling.

    Returns:
        Dict mapping each target name to its out-of-fold AUC, plus
        ``"mean_auc"`` with the average over targets.
    """
    targets = list(y.columns)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # Zero-filled holder for the held-out probabilities of every target.
    oof = pd.DataFrame(0.0, index=y.index, columns=y.columns)

    for train_pos, valid_pos in splitter.split(meta_features, strat):
        train_block = meta_features.iloc[train_pos]
        valid_block = meta_features.iloc[valid_pos]
        for target in targets:
            stacker = LogisticRegression(solver="liblinear", max_iter=300)
            stacker.fit(train_block, y[target].iloc[train_pos])
            # Column 1 of predict_proba is the probability of the positive class.
            positive_proba = stacker.predict_proba(valid_block)[:, 1]
            oof.loc[valid_block.index, target] = positive_proba

    scores = {}
    for target in targets:
        scores[target] = roc_auc_score(y[target], oof[target])
    # Average taken before the "mean_auc" key itself is added.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Variant 1: stack on the base-model OOF predictions alone. The bare name on
# the last line shows the score dict as the notebook cell's output.
meta_only_scores = eval_meta_only(meta_oof, y)
meta_only_scores

In [None]:
def eval_onehot_stack(X_raw: pd.DataFrame, meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    """Cross-validate a stacker on raw features plus base-model predictions.

    Object-dtype columns are imputed with the most frequent value and one-hot
    encoded; all other columns (including the meta features) are median
    imputed. One logistic regression per column of ``y`` is fitted in each of
    5 shuffled stratified folds and scored with ROC AUC on held-out rows.

    Note:
        Folds are stratified on the module-level ``strat`` array, not on an
        argument, so the inputs must be row-aligned with it.

    Args:
        X_raw: Raw training features (not modified; a copy is extended).
        meta_features: Base-model predictions added as extra columns.
        y: Target frame; every column is treated as a separate binary target.
        seed: Random state for the fold shuffling.

    Returns:
        Dict mapping each target name to its out-of-fold AUC, plus
        ``"mean_auc"`` with the average over targets.
    """
    # Widen a copy of the raw features with one column per base-model prediction.
    stacked = X_raw.copy()
    for meta_name in meta_features.columns:
        stacked[meta_name] = meta_features[meta_name]

    categorical = list(stacked.select_dtypes(include=["object"]).columns)
    numeric = [name for name in stacked.columns if name not in categorical]

    categorical_steps = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), numeric),
            ("cat", categorical_steps, categorical),
        ]
    )

    targets = list(y.columns)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # Zero-filled holder for the held-out probabilities of every target.
    oof = pd.DataFrame(0.0, index=y.index, columns=y.columns)

    for train_pos, valid_pos in splitter.split(stacked, strat):
        train_block = stacked.iloc[train_pos]
        valid_block = stacked.iloc[valid_pos]
        for target in targets:
            # The preprocessor instance is shared and refitted by every fit();
            # each model is used for prediction before the next fit happens.
            stacker = Pipeline([
                ("prep", preprocessor),
                ("clf", LogisticRegression(solver="liblinear", max_iter=300)),
            ])
            stacker.fit(train_block, y[target].iloc[train_pos])
            positive_proba = stacker.predict_proba(valid_block)[:, 1]
            oof.loc[valid_block.index, target] = positive_proba

    scores = {}
    for target in targets:
        scores[target] = roc_auc_score(y[target], oof[target])
    # Average taken before the "mean_auc" key itself is added.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Variant 2: raw features (one-hot encoded categoricals) plus meta features.
# The bare name on the last line shows the score dict as the cell's output.
onehot_scores = eval_onehot_stack(X, meta_oof, y)
onehot_scores

In [None]:
def loo_encode_train(X_train: pd.DataFrame, y_train: pd.Series, cat_cols: list[str]) -> pd.DataFrame:
    """Leave-one-out target-encode categorical columns of the training rows.

    Each value is replaced by the mean of ``y_train`` over the *other*
    training rows that share its category, so a row never sees its own
    target. Missing values form their own ``"__MISSING__"`` category. Rows
    whose category occurs only once fall back to the overall mean of
    ``y_train``.

    Args:
        X_train: Training features containing ``cat_cols``.
        y_train: Target values for the same rows.
        cat_cols: Names of the columns to encode.

    Returns:
        Frame indexed like ``X_train`` with one ``"<col>_loo"`` column per
        entry of ``cat_cols``.
    """
    prior = y_train.mean()
    encoded = pd.DataFrame(index=X_train.index)
    for name in cat_cols:
        keys = X_train[name].fillna("__MISSING__").astype(str)
        by_category = y_train.groupby(keys)
        category_sum = keys.map(by_category.sum())
        category_count = keys.map(by_category.count())
        # Remove the row's own target from its category total before averaging.
        others_mean = (category_sum - y_train) / (category_count - 1)
        # Singleton categories have no other rows (0/0 above): use the prior.
        others_mean = others_mean.where(category_count > 1, prior)
        encoded[f"{name}_loo"] = others_mean.fillna(prior)
    return encoded


def target_encode_apply(
    X_apply: pd.DataFrame, X_train: pd.DataFrame, y_train: pd.Series, cat_cols: list[str]
) -> pd.DataFrame:
    """Target-encode ``X_apply`` using per-category means learned on the training rows.

    Each category is mapped to the mean of ``y_train`` over the training rows
    in that category (no leave-one-out, since the applied rows are not part of
    the statistics). Missing values are treated as the ``"__MISSING__"``
    category; categories absent from the training rows get the overall mean
    of ``y_train``.

    Args:
        X_apply: Rows to encode (e.g. a validation fold).
        X_train: Training features the statistics are computed from.
        y_train: Target values for ``X_train``.
        cat_cols: Names of the columns to encode.

    Returns:
        Frame indexed like ``X_apply`` with one ``"<col>_loo"`` column per
        entry of ``cat_cols`` (same names as the training-side encoder emits).
    """
    prior = y_train.mean()
    encoded = pd.DataFrame(index=X_apply.index)
    for name in cat_cols:
        train_keys = X_train[name].fillna("__MISSING__").astype(str)
        by_category = y_train.groupby(train_keys)
        category_mean = by_category.sum() / by_category.count()
        apply_keys = X_apply[name].fillna("__MISSING__").astype(str)
        # Unseen categories map to NaN and are replaced by the prior.
        encoded[f"{name}_loo"] = apply_keys.map(category_mean).fillna(prior)
    return encoded


def eval_loo_stack(X_raw: pd.DataFrame, meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    """Cross-validate a stacker on numeric, target-encoded and meta features.

    For every fold and every column of ``y`` the design matrix is the
    concatenation of (a) non-object columns imputed with the training-fold
    median, (b) object columns target-encoded against that target
    (leave-one-out on the training rows, plain category means on the held-out
    rows) and (c) the base-model predictions. A logistic regression is fitted
    per target and scored with ROC AUC on the out-of-fold probabilities.

    Note:
        Folds are stratified on the module-level ``strat`` array, not on an
        argument, so the inputs must be row-aligned with it.

    Args:
        X_raw: Raw training features.
        meta_features: Base-model predictions, row-aligned with ``X_raw``.
        y: Target frame; every column is treated as a separate binary target.
        seed: Random state for the fold shuffling.

    Returns:
        Dict mapping each target name to its out-of-fold AUC, plus
        ``"mean_auc"`` with the average over targets.
    """
    cat_cols = X_raw.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in X_raw.columns if c not in cat_cols]

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # Zero-filled holder for the held-out probabilities of every target.
    oof = pd.DataFrame(0.0, index=y.index, columns=y.columns)

    for tr_idx, va_idx in skf.split(X_raw, strat):
        X_tr, X_va = X_raw.iloc[tr_idx], X_raw.iloc[va_idx]

        # Target-independent pieces are built once per fold rather than once
        # per (fold, target) pair. Medians come from the training rows only,
        # so nothing leaks from the held-out rows.
        num_med = X_tr[num_cols].median()
        X_tr_num = X_tr[num_cols].fillna(num_med)
        X_va_num = X_va[num_cols].fillna(num_med)
        X_tr_meta = meta_features.iloc[tr_idx]
        X_va_meta = meta_features.iloc[va_idx]

        for target in y.columns:
            y_tr = y[target].iloc[tr_idx]
            # The encoding depends on the target, so it stays inside this loop.
            X_tr_loo = loo_encode_train(X_tr, y_tr, cat_cols)
            X_va_loo = target_encode_apply(X_va, X_tr, y_tr, cat_cols)

            X_tr_final = pd.concat([X_tr_num, X_tr_loo, X_tr_meta], axis=1)
            X_va_final = pd.concat([X_va_num, X_va_loo, X_va_meta], axis=1)

            model = LogisticRegression(solver="liblinear", max_iter=300)
            model.fit(X_tr_final, y_tr)
            # Column 1 of predict_proba is the probability of the positive class.
            oof.loc[X_va.index, target] = model.predict_proba(X_va_final)[:, 1]

    scores = {
        col: roc_auc_score(y[col], oof[col]) for col in y.columns
    }
    # Average taken before the "mean_auc" key itself is added.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Variant 3: numeric raw features, target-encoded categoricals and meta
# features. The bare name on the last line shows the score dict as the cell's
# output.
loo_scores = eval_loo_stack(X, meta_oof, y)
loo_scores

In [None]:
def fit_and_submit_meta_only(meta_train: pd.DataFrame, meta_test: pd.DataFrame, y: pd.DataFrame, out_name: str):
    """Fit the meta-only stacker on all training rows and write a submission CSV.

    One logistic regression per column of ``y`` is trained on ``meta_train``
    and used to predict positive-class probabilities for ``meta_test``. The
    predictions are written into the frame read from
    ``DATA_DIR / "submission_format.csv"`` and saved under
    ``RUNS_DIR / "stacking"``.

    Args:
        meta_train: Base-model OOF predictions for the training rows.
        meta_test: Base-model predictions for the test rows.
        y: Target frame; every column is treated as a separate binary target.
        out_name: Suffix of the output file, ``submission_<out_name>.csv``.

    Returns:
        Path of the written submission file.
    """

    def _predict_positive(target: str) -> np.ndarray:
        stacker = LogisticRegression(solver="liblinear", max_iter=300)
        stacker.fit(meta_train, y[target])
        # Column 1 of predict_proba is the probability of the positive class.
        return stacker.predict_proba(meta_test)[:, 1]

    preds = pd.DataFrame(
        {target: _predict_positive(target) for target in y.columns},
        index=meta_test.index,
    )

    submission = pd.read_csv(DATA_DIR / "submission_format.csv", index_col="respondent_id")
    # Column-wise assignment; values are matched to the template rows by index.
    submission[preds.columns] = preds

    out_path = RUNS_DIR / "stacking" / f"submission_{out_name}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path, index_label="respondent_id")
    return out_path

# Example:
# fit_and_submit_meta_only(meta_oof, meta_test, y, "meta_only")