# Rule-augmented post-processing

Goal: test whether simple, interpretable rules can improve AUC when applied
on top of strong base predictions (the blend's out-of-fold, "OOF", predictions).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Project directories, resolved from the parent of the current working directory.
ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"
RUNS_DIR = ROOT / "runs"

# Features and labels, both indexed by respondent_id.
X = pd.read_csv(DATA_DIR / "training_set_features.csv", index_col="respondent_id")
y = pd.read_csv(DATA_DIR / "training_set_labels.csv", index_col="respondent_id")

# Base predictions produced by the blend run, indexed by respondent_id.
oof = pd.read_csv(RUNS_DIR / "blend" / "oof_blend.csv", index_col="respondent_id")

# roc_auc_score pairs labels and predictions by position, and the rule masks
# built from X are applied to the predictions by index, so all three frames
# must share exactly the same row order.
assert oof.index.equals(X.index)
assert y.index.equals(X.index)

def auc_scores(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> dict:
    """Score every label column with ROC AUC and add their average.

    Args:
        y_true: Ground-truth labels, one column per target.
        y_pred: Predicted scores; must contain every column of ``y_true``.
            Rows are handed to ``roc_auc_score`` as-is, so they are expected
            to be in the same order as ``y_true``.

    Returns:
        A dict mapping each column of ``y_true`` to its AUC, plus the key
        ``"mean_auc"`` holding the unweighted mean of those AUCs.
    """
    per_target = {}
    for target in y_true.columns:
        per_target[target] = roc_auc_score(y_true[target], y_pred[target])

    # Average before inserting the summary key so it is not averaged itself.
    mean_auc = float(np.mean(list(per_target.values())))
    per_target["mean_auc"] = mean_auc
    return per_target

# Score the unmodified blend predictions; the bare name on the last line
# displays the resulting dict as the cell output.
base_scores = auc_scores(y, oof)
base_scores

{'h1n1_vaccine': 0.8721904704159187,
 'seasonal_vaccine': 0.8659562572912779,
 'mean_auc': 0.8690733638535983}

## Rule set: doctor recommendation (additive shifts)

We add a small delta to the predicted probability of each respondent whose
doctor recommended the corresponding vaccine. We test a grid of deltas to see
whether any of them improves AUC. Shifted values are clipped to [0, 1].

In [2]:
def apply_doctor_rules(preds: pd.DataFrame, X: pd.DataFrame, delta_h1n1: float, delta_seas: float) -> pd.DataFrame:
    """Shift predictions upward-or-downward for doctor-recommended respondents.

    For each target, rows where the matching ``doctor_recc_*`` feature equals 1
    get ``delta`` added to their prediction, and the shifted values are clipped
    to [0, 1]. Rows that are not shifted are left exactly as they were.

    Args:
        preds: Predictions with ``h1n1_vaccine`` and ``seasonal_vaccine`` columns.
        X: Feature frame sharing ``preds``' index. A rule is skipped when its
            ``doctor_recc_*`` column is absent.
        delta_h1n1: Additive shift for ``h1n1_vaccine``.
        delta_seas: Additive shift for ``seasonal_vaccine``.

    Returns:
        An adjusted copy of ``preds``; the input frame is not modified.
    """
    shifted = preds.copy()

    # (feature flag column, prediction column, additive shift)
    rules = (
        ("doctor_recc_h1n1", "h1n1_vaccine", delta_h1n1),
        ("doctor_recc_seasonal", "seasonal_vaccine", delta_seas),
    )
    for flag_col, target_col, delta in rules:
        if flag_col not in X.columns:
            continue
        # Missing (NaN) recommendations compare unequal to 1 and are not shifted.
        recommended = X[flag_col].eq(1)
        bumped = shifted.loc[recommended, target_col] + delta
        shifted.loc[recommended, target_col] = bumped.clip(lower=0.0, upper=1.0)

    return shifted


# Candidate additive shifts; 0.0 means "add nothing" for that target.
grid = [0.0, 0.01, 0.02, 0.03, 0.05, 0.08]
# Best record seen so far: (mean_auc, delta_h1n1, delta_seas, full score dict).
best = None

# Exhaustive search over every (h1n1, seasonal) pair of shifts.
for d_h1n1 in grid:
    for d_seas in grid:
        adj = apply_doctor_rules(oof, X, d_h1n1, d_seas)
        scores = auc_scores(y, adj)
        record = (scores["mean_auc"], d_h1n1, d_seas, scores)
        # Strict ">" keeps the earliest combination when mean AUCs tie.
        if best is None or record[0] > best[0]:
            best = record

# Display the winning record as the cell output.
best

(0.86907516541018,
 0.01,
 0.0,
 {'h1n1_vaccine': 0.8721940735290823,
  'seasonal_vaccine': 0.8659562572912779,
  'mean_auc': 0.86907516541018})

## Rule set: doctor recommendation + health worker

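On top of the doctor-recommendation shifts, adds one additive bonus for health
workers, applied to both targets. The bonus is searched over the same grid as
the doctor deltas.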

In [3]:
def apply_doctor_worker_rules(
    preds: pd.DataFrame,
    X: pd.DataFrame,
    delta_h1n1: float,
    delta_seas: float,
    delta_worker: float,
) -> pd.DataFrame:
    """Apply the doctor-recommendation shifts, then a health-worker bonus.

    The doctor rules are applied first via ``apply_doctor_rules``. Afterwards,
    rows where ``X["health_worker"]`` equals 1 get ``delta_worker`` added to
    both targets, and those shifted values are clipped to [0, 1] again.

    Args:
        preds: Predictions with ``h1n1_vaccine`` and ``seasonal_vaccine`` columns.
        X: Feature frame sharing ``preds``' index. The health-worker step is
            skipped when the ``health_worker`` column is absent.
        delta_h1n1: Doctor-recommendation shift for ``h1n1_vaccine``.
        delta_seas: Doctor-recommendation shift for ``seasonal_vaccine``.
        delta_worker: Shift added to both targets for health workers.

    Returns:
        The adjusted frame returned by ``apply_doctor_rules``, with the
        health-worker bonus applied on top.
    """
    shifted = apply_doctor_rules(preds, X, delta_h1n1, delta_seas)
    if "health_worker" not in X.columns:
        return shifted

    is_worker = X["health_worker"].eq(1)
    # Same bonus for both targets, applied in the original column order.
    for target_col in ("h1n1_vaccine", "seasonal_vaccine"):
        bumped = shifted.loc[is_worker, target_col] + delta_worker
        shifted.loc[is_worker, target_col] = bumped.clip(lower=0.0, upper=1.0)
    return shifted


# Candidate additive shifts, shared by all three deltas; 0.0 means "add nothing".
grid = [0.0, 0.01, 0.02, 0.03, 0.05]
# Best record seen so far:
# (mean_auc, delta_h1n1, delta_seas, delta_worker, full score dict).
best = None

# Exhaustive search over every (h1n1, seasonal, health-worker) triple of shifts.
for d_h1n1 in grid:
    for d_seas in grid:
        for d_hw in grid:
            adj = apply_doctor_worker_rules(oof, X, d_h1n1, d_seas, d_hw)
            scores = auc_scores(y, adj)
            record = (scores["mean_auc"], d_h1n1, d_seas, d_hw, scores)
            # Strict ">" keeps the earliest combination when mean AUCs tie.
            if best is None or record[0] > best[0]:
                best = record

# Display the winning record as the cell output.
best

(0.869085797215863,
 0.01,
 0.0,
 0.01,
 {'h1n1_vaccine': 0.8722199237711973,
  'seasonal_vaccine': 0.8659516706605288,
  'mean_auc': 0.869085797215863})