# Stacking experiments (CatBoost/LGBM/baseline)

This notebook compares three stacking variants:
1) Meta-only (OOF predictions only)
2) Meta + one-hot encoded raw features
3) Meta + LOO target encoding for categorical features

Outputs are CV AUCs per target and mean. Optionally write submissions for the best variant.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"
RUNS_DIR = ROOT / "runs"

X = pd.read_csv(DATA_DIR / "training_set_features.csv", index_col="respondent_id")
y = pd.read_csv(DATA_DIR / "training_set_labels.csv", index_col="respondent_id")
X_test = pd.read_csv(DATA_DIR / "test_set_features.csv", index_col="respondent_id")


def load_preds(run_dir: Path, name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    oof = pd.read_csv(run_dir / f"oof_{name}.csv", index_col="respondent_id")
    test = pd.read_csv(run_dir / f"test_{name}.csv", index_col="respondent_id")
    return oof, test

baseline_oof, baseline_test = load_preds(RUNS_DIR / "baseline_logreg", "baseline")
lgbm_oof, lgbm_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "lightgbm")
cat_oof, cat_test = load_preds(RUNS_DIR / "gbdt_bakeoff", "catboost")

for df in [baseline_oof, lgbm_oof, cat_oof]:
    assert df.index.equals(X.index)
for df in [baseline_test, lgbm_test, cat_test]:
    assert df.index.equals(X_test.index)

meta_oof = pd.concat(
    [
        baseline_oof.add_prefix("baseline_"),
        lgbm_oof.add_prefix("lgbm_"),
        cat_oof.add_prefix("cat_"),
    ],
    axis=1,
)
meta_test = pd.concat(
    [
        baseline_test.add_prefix("baseline_"),
        lgbm_test.add_prefix("lgbm_"),
        cat_test.add_prefix("cat_"),
    ],
    axis=1,
)

strat = (2 * y["h1n1_vaccine"].astype(int) + y["seasonal_vaccine"].astype(int)).values

In [2]:
def eval_meta_only(meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = pd.DataFrame(index=y.index, columns=y.columns, dtype=float)
    for target in y.columns:
        oof[target] = 0.0

    for tr_idx, va_idx in skf.split(meta_features, strat):
        X_tr, X_va = meta_features.iloc[tr_idx], meta_features.iloc[va_idx]
        for target in y.columns:
            model = LogisticRegression(solver="liblinear", max_iter=300)
            model.fit(X_tr, y[target].iloc[tr_idx])
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]

    scores = {
        col: roc_auc_score(y[col], oof[col]) for col in y.columns
    }
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

meta_only_scores = eval_meta_only(meta_oof, y)
meta_only_scores

{'h1n1_vaccine': 0.8703009392176427,
 'seasonal_vaccine': 0.8656755430931427,
 'mean_auc': 0.8679882411553927}

In [3]:
def eval_onehot_stack(X_raw: pd.DataFrame, meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    X_ext = X_raw.copy()
    for col in meta_features.columns:
        X_ext[col] = meta_features[col]

    cat_cols = X_ext.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in X_ext.columns if c not in cat_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ]), cat_cols),
        ]
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = pd.DataFrame(index=y.index, columns=y.columns, dtype=float)
    for target in y.columns:
        oof[target] = 0.0

    for tr_idx, va_idx in skf.split(X_ext, strat):
        X_tr, X_va = X_ext.iloc[tr_idx], X_ext.iloc[va_idx]
        for target in y.columns:
            model = Pipeline([
                ("prep", preprocessor),
                ("clf", LogisticRegression(solver="liblinear", max_iter=300)),
            ])
            model.fit(X_tr, y[target].iloc[tr_idx])
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]

    scores = {
        col: roc_auc_score(y[col], oof[col]) for col in y.columns
    }
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

onehot_scores = eval_onehot_stack(X, meta_oof, y)
onehot_scores

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X_ext.select_dtypes(include=["object"]).columns.tolist()


{'h1n1_vaccine': 0.8680876892499577,
 'seasonal_vaccine': 0.8647895063297758,
 'mean_auc': 0.8664385977898668}

In [4]:
# LOO experiment removed (no measurable gain).

In [5]:
def fit_and_submit_meta_only(meta_train: pd.DataFrame, meta_test: pd.DataFrame, y: pd.DataFrame, out_name: str):
    preds = pd.DataFrame(index=meta_test.index, columns=y.columns, dtype=float)
    for target in y.columns:
        model = LogisticRegression(solver="liblinear", max_iter=300)
        model.fit(meta_train, y[target])
        preds[target] = model.predict_proba(meta_test)[:, 1]
    submission = pd.read_csv(DATA_DIR / "submission_format.csv", index_col="respondent_id")
    submission[preds.columns] = preds
    out_path = RUNS_DIR / "stacking" / f"submission_{out_name}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path, index_label="respondent_id")
    return out_path

# Example:
# fit_and_submit_meta_only(meta_oof, meta_test, y, "meta_only")

In [6]:
from catboost import CatBoostClassifier


def eval_catboost_meta(meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = pd.DataFrame(index=y.index, columns=y.columns, dtype=float)
    for target in y.columns:
        oof[target] = 0.0

    for tr_idx, va_idx in skf.split(meta_features, strat):
        X_tr, X_va = meta_features.iloc[tr_idx], meta_features.iloc[va_idx]
        for target in y.columns:
            model = CatBoostClassifier(
                iterations=500,
                learning_rate=0.05,
                depth=4,
                loss_function="Logloss",
                eval_metric="AUC",
                verbose=False,
            )
            model.fit(X_tr, y[target].iloc[tr_idx])
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]

    scores = {col: roc_auc_score(y[col], oof[col]) for col in y.columns}
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

catboost_meta_scores = eval_catboost_meta(meta_oof, y)
catboost_meta_scores

{'h1n1_vaccine': 0.8702394684312067,
 'seasonal_vaccine': 0.8642460412981585,
 'mean_auc': 0.8672427548646826}

In [7]:
def rank_transform(df: pd.DataFrame) -> pd.DataFrame:
    ranked = df.rank(method="average")
    return ranked / (len(df) + 1)


def eval_rank_meta_only(meta_features: pd.DataFrame, y: pd.DataFrame, seed: int = 42):
    meta_rank = rank_transform(meta_features)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = pd.DataFrame(index=y.index, columns=y.columns, dtype=float)
    for target in y.columns:
        oof[target] = 0.0

    for tr_idx, va_idx in skf.split(meta_rank, strat):
        X_tr, X_va = meta_rank.iloc[tr_idx], meta_rank.iloc[va_idx]
        for target in y.columns:
            model = LogisticRegression(solver="liblinear", max_iter=300)
            model.fit(X_tr, y[target].iloc[tr_idx])
            oof.loc[X_va.index, target] = model.predict_proba(X_va)[:, 1]

    scores = {col: roc_auc_score(y[col], oof[col]) for col in y.columns}
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

rank_meta_scores = eval_rank_meta_only(meta_oof, y)
rank_meta_scores

{'h1n1_vaccine': 0.8715532389046194,
 'seasonal_vaccine': 0.8657355918939923,
 'mean_auc': 0.8686444153993058}