# Feature engineering experiments

Goal: test simple survey-structure features (no combinations with other methods yet).

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

# Resolve the data folder relative to the parent of the working directory.
ROOT = Path("..").resolve()
DATA_DIR = ROOT.joinpath("data")

# Features and labels are both indexed by respondent_id.
X = pd.read_csv(
    DATA_DIR.joinpath("training_set_features.csv"),
    index_col="respondent_id",
)
y = pd.read_csv(
    DATA_DIR.joinpath("training_set_labels.csv"),
    index_col="respondent_id",
)

# Single stratification label combining both targets
# (for 0/1 labels this takes the values 0..3).
strat = (
    y["seasonal_vaccine"].astype(int) + 2 * y["h1n1_vaccine"].astype(int)
).values


def build_features(df: pd.DataFrame, add_engineered: bool) -> pd.DataFrame:
    """Return a copy of ``df``, optionally extended with engineered columns.

    When ``add_engineered`` is true, five columns are appended in this order:
    ``behavioral_sum``, ``opinion_effective_gap``, ``opinion_risk_gap``,
    ``opinion_sick_gap`` and ``doctor_recc_any``. The input frame is never
    modified.

    Args:
        df: Survey features; must contain the behavioral, opinion and
            doctor-recommendation columns referenced below when
            ``add_engineered`` is true.
        add_engineered: Whether to append the engineered columns.

    Returns:
        A new DataFrame (a plain copy when ``add_engineered`` is false).
    """
    out = df.copy()
    if add_engineered:
        # Row-wise total of the behavior flags; missing answers are skipped,
        # so a row with no answers at all sums to 0.
        behavior_flags = (
            "behavioral_antiviral_meds",
            "behavioral_avoidance",
            "behavioral_face_mask",
            "behavioral_wash_hands",
            "behavioral_large_gatherings",
            "behavioral_outside_home",
            "behavioral_touch_face",
        )
        out["behavioral_sum"] = out[list(behavior_flags)].sum(axis=1, skipna=True)

        # H1N1 opinion minus the matching seasonal opinion; NaN on either
        # side propagates into the gap.
        gap_specs = (
            (
                "opinion_effective_gap",
                "opinion_h1n1_vacc_effective",
                "opinion_seas_vacc_effective",
            ),
            ("opinion_risk_gap", "opinion_h1n1_risk", "opinion_seas_risk"),
            (
                "opinion_sick_gap",
                "opinion_h1n1_sick_from_vacc",
                "opinion_seas_sick_from_vacc",
            ),
        )
        for gap_name, h1n1_col, seasonal_col in gap_specs:
            out[gap_name] = out[h1n1_col] - out[seasonal_col]

        # 1.0 when either recommendation equals 1; missing values compare
        # unequal and therefore count as "no recommendation".
        recommended = out[["doctor_recc_h1n1", "doctor_recc_seasonal"]].eq(1)
        out["doctor_recc_any"] = recommended.any(axis=1).astype(float)

    return out


def eval_catboost(X_raw: pd.DataFrame, y: pd.DataFrame, seed: int = 42) -> dict:
    """Cross-validate one CatBoost classifier per target and score it by AUC.

    Runs 5-fold stratified cross-validation, fitting a separate
    ``CatBoostClassifier`` for every column of ``y`` in every fold, and
    computes ROC AUC on the out-of-fold predicted probabilities.

    Args:
        X_raw: Feature matrix. Object-dtype columns are passed to CatBoost as
            categorical features; missing values in them are replaced by the
            string ``"__MISSING__"``. The frame itself is not modified.
        y: Target frame with one column per label. If its index differs from
            ``X_raw.index`` it is re-ordered to match (a ``KeyError`` is
            raised when labels are missing for some rows).
        seed: ``random_state`` for the shuffled fold split. It is not
            forwarded to the CatBoost models.

    Returns:
        A dict mapping each target column name to its out-of-fold ROC AUC,
        plus ``"mean_auc"``, the mean over all targets.
    """
    # Targets are consumed positionally below, so make sure row i of `y`
    # really belongs to row i of `X_raw`.
    if not y.index.equals(X_raw.index):
        y = y.loc[X_raw.index]

    # Stratify on the joint combination of all targets, derived from the `y`
    # that was actually passed in rather than from a module-level array.
    # For the two vaccine targets this groups rows exactly like
    # 2 * h1n1_vaccine + seasonal_vaccine does.
    strat_labels = y.groupby(list(y.columns), sort=False).ngroup().to_numpy()

    cat_cols = X_raw.select_dtypes(include=["object"]).columns.tolist()
    cat_indices = [X_raw.columns.get_loc(c) for c in cat_cols]

    # The categorical fill uses a constant, not fold statistics, so it is
    # done once up front instead of once per fold.
    X_all = X_raw.copy()
    for c in cat_cols:
        X_all[c] = X_all[c].fillna("__MISSING__").astype(str)

    targets = list(y.columns)
    # Out-of-fold probabilities, one column per target; written by row
    # position so the result does not depend on index labels being unique.
    oof = np.zeros((len(y), len(targets)), dtype=float)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for tr_idx, va_idx in skf.split(X_all, strat_labels):
        X_tr, X_va = X_all.iloc[tr_idx], X_all.iloc[va_idx]

        for j, target in enumerate(targets):
            model = CatBoostClassifier(
                iterations=800,
                learning_rate=0.05,
                depth=6,
                loss_function="Logloss",
                eval_metric="AUC",
                verbose=False,
            )
            model.fit(X_tr, y[target].iloc[tr_idx], cat_features=cat_indices)
            oof[va_idx, j] = model.predict_proba(X_va)[:, 1]

    scores = {col: roc_auc_score(y[col], oof[:, j]) for j, col in enumerate(targets)}
    # Averaged before the key is inserted, so only per-target AUCs contribute.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores

# Score the raw features first, then the engineered set, with the same
# evaluation routine.
base_scores, engineered_scores = (
    eval_catboost(build_features(X, add_engineered=use_engineered), y)
    for use_engineered in (False, True)
)

# Last expression of the cell: shows both score dicts side by side.
base_scores, engineered_scores