# Distribution shift / leakage checks

Goal: detect train vs test drift via a classifier, and identify which features
shift most (missingness and distribution).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"

X_train = pd.read_csv(DATA_DIR / "training_set_features.csv", index_col="respondent_id")
X_test = pd.read_csv(DATA_DIR / "test_set_features.csv", index_col="respondent_id")

X_train = X_train.copy()
X_test = X_test.copy()
X_train["__is_train__"] = 1
X_test["__is_train__"] = 0

X_full = pd.concat([X_train, X_test], axis=0)
y_drift = X_full.pop("__is_train__")

cat_cols = X_full.select_dtypes(include=["object"]).columns.tolist()
for c in cat_cols:
    X_full[c] = X_full[c].astype("category")

params = dict(
    objective="binary",
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = pd.Series(index=X_full.index, dtype=float)

for tr_idx, va_idx in skf.split(X_full, y_drift):
    X_tr, X_va = X_full.iloc[tr_idx], X_full.iloc[va_idx]
    y_tr, y_va = y_drift.iloc[tr_idx], y_drift.iloc[va_idx]
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        categorical_feature=cat_cols,
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    oof.iloc[va_idx] = model.predict_proba(X_va)[:, 1]

drift_auc = roc_auc_score(y_drift, oof)
print("Drift AUC (train vs test):", drift_auc)


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X_full.select_dtypes(include=["object"]).columns.tolist()


[LightGBM] [Info] Number of positive: 21365, number of negative: 21367
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 194
[LightGBM] [Info] Number of data points in the train set: 42732, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499977 -> initscore=-0.000094
[LightGBM] [Info] Start training from score -0.000094
[LightGBM] [Info] Number of positive: 21365, number of negative: 21367
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 194
[LightGBM] [Info] Number of data points in the train set: 42732, number of used features: 35
[LightGBM] [Info] [b

## Feature shift ranking (missingness + numeric mean shift + categorical top share shift)

In [2]:
def shift_score_numeric(a: pd.Series, b: pd.Series) -> float:
    a = a.astype(float)
    b = b.astype(float)
    diff = abs(a.mean() - b.mean())
    scale = a.std() + b.std() + 1e-9
    return float(diff / scale)


def shift_score_categorical(a: pd.Series, b: pd.Series) -> float:
    top = pd.concat([a, b]).value_counts(dropna=True).idxmax()
    a_share = float((a == top).mean())
    b_share = float((b == top).mean())
    return abs(a_share - b_share)


rows = []
for col in X_train.columns:
    a = X_train[col]
    b = X_test[col]
    miss = abs(a.isna().mean() - b.isna().mean())
    if col in cat_cols:
        score = shift_score_categorical(a, b)
        ftype = "categorical"
    else:
        score = shift_score_numeric(a, b)
        ftype = "numeric"
    rows.append({"feature": col, "type": ftype, "shift_score": score, "missing_delta": miss})

shift_df = pd.DataFrame(rows)
shift_df = shift_df.sort_values(["shift_score", "missing_delta"], ascending=False)
shift_df.head(15)

Unnamed: 0,feature,type,shift_score,missing_delta
35,__is_train__,numeric,1000000000.0,0.0
14,health_insurance,numeric,0.01278792,0.001739538
30,census_msa,categorical,0.01050487,0.0
20,opinion_seas_sick_from_vacc,numeric,0.009460677,0.0005998243
6,behavioral_large_gatherings,numeric,0.007442953,0.0005617514
8,behavioral_touch_face,numeric,0.006952068,1.7945e-07
12,child_under_6_months,numeric,0.006690212,0.0002632434
16,opinion_h1n1_risk,numeric,0.006140875,0.0003000797
31,household_adults,numeric,0.005180328,0.0008989562
32,household_children,numeric,0.004917228,0.0008989562
