# Pseudo‑labeling experiments (LightGBM)

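We simulate pseudo‑labeling with 5‑fold cross‑validation: each held‑out fold is predicted as if it were the unlabeled "test" set, its high‑confidence predictions are added to the training fold as pseudo labels, and the model is retrained. The held‑out fold's true labels are used only for scoring, never for training, so the estimated impact of pseudo‑labeling is free of label leakage.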

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Paths are resolved relative to the current working directory: the data
# folder is expected one level up, next to the notebook's parent directory.
ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"

# Features and labels share the ``respondent_id`` index.
X = pd.read_csv(
    DATA_DIR / "training_set_features.csv",
    index_col="respondent_id",
)
y = pd.read_csv(
    DATA_DIR / "training_set_labels.csv",
    index_col="respondent_id",
)

# Joint stratification key: one integer per (h1n1, seasonal) label combination
# so that CV folds preserve the joint distribution of both targets.
strat = (
    y["h1n1_vaccine"].astype(int) * 2 + y["seasonal_vaccine"].astype(int)
).to_numpy()

# Object-typed columns are handled as categorical features; LightGBM is given
# their positional indices.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
cat_indices = list(map(X.columns.get_loc, cat_cols))


def lgbm_model():
    """Return a fresh, unfitted LightGBM binary classifier.

    Every call builds a new estimator with the same fixed hyperparameters, so
    each fold/target combination is trained from scratch on an identical
    configuration.
    """
    boosting = {
        "objective": "binary",
        "n_estimators": 800,
        "learning_rate": 0.05,
    }
    tree_shape = {
        "num_leaves": 64,
        "max_depth": -1,
        "min_data_in_leaf": 50,
    }
    subsampling = {
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
    }
    return lgb.LGBMClassifier(**boosting, **tree_shape, **subsampling, n_jobs=4)

In [None]:
def eval_baseline_holdout(X: pd.DataFrame, y: pd.DataFrame, seed: int = 42) -> dict:
    """Score the plain LightGBM baseline with 5-fold out-of-fold ROC AUC.

    For every fold, one model per target column of ``y`` is trained on the
    training part and predicts the held-out part. The collected out-of-fold
    probabilities are then scored against the true labels.

    Args:
        X: Feature frame. Columns of dtype ``object`` are treated as
            categorical features.
        y: Label frame aligned row-for-row with ``X``; one binary column per
            target.
        seed: Random state of the shuffled ``StratifiedKFold`` split.

    Returns:
        Dict mapping each target column name to its out-of-fold ROC AUC, plus
        the key ``"mean_auc"`` with the unweighted mean over all targets.
    """
    # Everything is derived from the arguments rather than from notebook-level
    # globals, so the function stays correct when called on a subset or on a
    # different frame than the one loaded at the top of the notebook.
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    cat_indices = [X.columns.get_loc(c) for c in cat_cols]
    targets = list(y.columns)

    # One stratum per distinct combination of target values, so folds preserve
    # the joint label distribution.
    strata = y.groupby(targets, sort=False).ngroup().to_numpy()

    # Positional buffer: rows follow y's row order, columns follow `targets`.
    # Writing by position (not by index label) is safe with any index.
    oof = np.zeros((len(y), len(targets)), dtype=float)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for tr_idx, va_idx in skf.split(X, strata):
        X_tr, X_va = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
        for c in cat_cols:
            X_tr[c] = X_tr[c].astype("category")
            # Reuse the training categories so both frames encode levels the
            # same way; levels unseen in training become missing values.
            X_va[c] = X_va[c].astype(X_tr[c].dtype)

        for j, target in enumerate(targets):
            model = lgbm_model()
            model.fit(X_tr, y[target].iloc[tr_idx], categorical_feature=cat_indices)
            oof[va_idx, j] = model.predict_proba(X_va)[:, 1]

    scores = {target: roc_auc_score(y[target], oof[:, j]) for j, target in enumerate(targets)}
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Run the baseline cross-validation once with the default seed and keep the
# per-target / mean AUC dict; the trailing bare name shows it as cell output.
baseline_scores = eval_baseline_holdout(X, y)
baseline_scores

In [None]:
def pseudo_label_holdout(X: pd.DataFrame, y: pd.DataFrame, threshold: float = 0.98, seed: int = 42) -> dict:
    """Estimate the effect of pseudo-labeling with 5-fold out-of-fold ROC AUC.

    For every fold and target, a first model is trained on the training part
    with true labels and predicts the held-out part. Held-out rows whose
    predicted probability is ``>= threshold`` or ``<= 1 - threshold`` are
    added to the training part with the predicted class (cut at 0.5) as
    label, a second model is trained on this augmented set, and its
    predictions for the held-out part are stored. True held-out labels are
    used only for the final scoring.

    Args:
        X: Feature frame. Columns of dtype ``object`` are treated as
            categorical features.
        y: Label frame aligned row-for-row with ``X``; one binary column per
            target.
        threshold: Confidence needed for a held-out row to be pseudo-labeled.
        seed: Random state of the shuffled ``StratifiedKFold`` split.

    Returns:
        Dict mapping each target column name to its out-of-fold ROC AUC, plus
        the key ``"mean_auc"`` with the unweighted mean over all targets.
    """
    # Everything is derived from the arguments rather than from notebook-level
    # globals, so the function stays correct when called on a subset or on a
    # different frame than the one loaded at the top of the notebook.
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    cat_indices = [X.columns.get_loc(c) for c in cat_cols]
    targets = list(y.columns)

    # One stratum per distinct combination of target values, so folds preserve
    # the joint label distribution.
    strata = y.groupby(targets, sort=False).ngroup().to_numpy()

    # Positional buffer: rows follow y's row order, columns follow `targets`.
    # Writing by position (not by index label) is safe with any index.
    oof = np.zeros((len(y), len(targets)), dtype=float)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for tr_idx, va_idx in skf.split(X, strata):
        X_tr, X_va = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
        for c in cat_cols:
            X_tr[c] = X_tr[c].astype("category")
            # Share the training categories. Independently inferred
            # categories can differ between folds, and concatenating such
            # columns falls back to object dtype, which cannot be fed to the
            # second fit. Levels unseen in training become missing values.
            X_va[c] = X_va[c].astype(X_tr[c].dtype)

        for j, target in enumerate(targets):
            y_tr = y[target].iloc[tr_idx]

            # Step 1: train on true labels
            model = lgbm_model()
            model.fit(X_tr, y_tr, categorical_feature=cat_indices)
            va_proba = model.predict_proba(X_va)[:, 1]

            # Step 2: build pseudo labels for high-confidence va
            pseudo_mask = (va_proba >= threshold) | (va_proba <= (1 - threshold))
            if not pseudo_mask.any():
                # No row is confident enough: the augmented set would equal
                # the training fold, so keep the first model's predictions
                # instead of refitting on identical data.
                oof[va_idx, j] = va_proba
                continue
            X_pseudo = X_va[pseudo_mask]
            y_pseudo = pd.Series(
                (va_proba[pseudo_mask] >= 0.5).astype(int),
                index=X_pseudo.index,
                name=target,
            )

            # Step 3: retrain with pseudo-labeled va points
            X_aug = pd.concat([X_tr, X_pseudo], axis=0)
            y_aug = pd.concat([y_tr, y_pseudo])
            model2 = lgbm_model()
            model2.fit(X_aug, y_aug, categorical_feature=cat_indices)
            oof[va_idx, j] = model2.predict_proba(X_va)[:, 1]

    scores = {target: roc_auc_score(y[target], oof[:, j]) for j, target in enumerate(targets)}
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Run the pseudo-labeling experiment at a strict 0.98 confidence threshold and
# keep the per-target / mean AUC dict; the trailing bare name shows it as
# cell output.
pseudo_scores = pseudo_label_holdout(X, y, threshold=0.98)
pseudo_scores

In [None]:
# Optional: try a looser threshold if the 0.98 mask is too small.
# pseudo_scores_095 = pseudo_label_holdout(X, y, threshold=0.95)
# pseudo_scores_09 = pseudo_label_holdout(X, y, threshold=0.90)
# pseudo_scores_095, pseudo_scores_09