# Row/block feature experiments (index patterns)

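Goal: test whether features derived from the row index (block ID and position
within a block) add predictive signal. We report both standard stratified CV
(optimistic if there is leakage through row order, because neighbouring rows
end up on both sides of each split) and GroupKFold grouped by block ID (more
conservative, because each block is held out as a whole).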

## Block-pattern discovery (labels ignored)

We first scan the row order for patterns without using labels:
- Missingness runs and block-level missingness shifts
- Block-level distribution shifts (numeric means, categorical modes)


In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold


DATA_DIR = Path("../data")


def load_training(data_dir: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the training features and labels, both indexed by ``respondent_id``.

    Args:
        data_dir: Directory holding ``training_set_features.csv`` and
            ``training_set_labels.csv``.

    Returns:
        A ``(features, labels)`` pair of data frames.

    Raises:
        ValueError: If the two frames do not share an equal index (same
            ids in the same order).
    """
    features, labels = (
        pd.read_csv(data_dir / filename, index_col="respondent_id")
        for filename in ("training_set_features.csv", "training_set_labels.csv")
    )
    # Everything downstream pairs rows by position, so the indexes must match.
    if features.index.equals(labels.index):
        return features, labels
    raise ValueError("Training features and labels are misaligned by respondent_id.")


def stratify_labels(y: pd.DataFrame) -> np.ndarray:
    """Fold the two targets into a single key usable for stratified splitting.

    The key is ``2 * h1n1_vaccine + seasonal_vaccine``; when both targets are
    0/1 this gives one distinct value in ``{0, 1, 2, 3}`` per label pair.

    Args:
        y: Label frame with ``h1n1_vaccine`` and ``seasonal_vaccine`` columns.

    Returns:
        One integer key per row, as a NumPy array.
    """
    h1n1 = y["h1n1_vaccine"].astype(int)
    seasonal = y["seasonal_vaccine"].astype(int)
    joint_key = 2 * h1n1 + seasonal
    return joint_key.values


def add_block_features(X: pd.DataFrame, block_sizes: list[int]) -> pd.DataFrame:
    """Return a copy of ``X`` with features derived from each row's position.

    Adds ``row_index`` (0-based position in the frame) and, for every ``size``
    in ``block_sizes``, ``row_block_{size}`` (``row_index // size``) and
    ``row_pos_{size}`` (``row_index % size``). ``X`` itself is not modified.

    Args:
        X: Input frame; only its length and current row order are used.
        block_sizes: Block widths, in rows. Each must be positive.

    Returns:
        A new frame with the original columns followed by the added ones.

    Raises:
        ValueError: If any block size is zero or negative. NumPy integer
            division by zero only warns and yields all-zero columns, which
            would otherwise go unnoticed as a constant feature.
    """
    bad_sizes = [size for size in block_sizes if size <= 0]
    if bad_sizes:
        raise ValueError(f"block_sizes must all be positive, got {bad_sizes}")

    Xb = X.copy()
    row_index = np.arange(len(Xb))
    Xb["row_index"] = row_index
    for size in block_sizes:
        Xb[f"row_block_{size}"] = (row_index // size).astype(int)
        Xb[f"row_pos_{size}"] = (row_index % size).astype(int)
    return Xb


def make_cat_cols(X: pd.DataFrame) -> list[str]:
    """List the names of the ``object``-dtype columns of ``X``, in frame order.

    These are the columns the rest of this file treats as categorical.
    """
    object_frame = X.select_dtypes(include=["object"])
    return list(object_frame.columns)


def set_categoricals(X: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``X`` with the given columns cast to ``category`` dtype.

    Args:
        X: Input frame; left unmodified.
        cat_cols: Names of the columns to convert. Other columns are copied
            through unchanged.

    Returns:
        A new frame with the same columns, in the same order.
    """
    dtype_by_column = {name: "category" for name in cat_cols}
    return X.astype(dtype_by_column)


def eval_cv(X: pd.DataFrame, y: pd.DataFrame, folds) -> dict[str, float]:
    """Cross-validate one LightGBM classifier per target and score OOF AUC.

    For each ``(train, validation)`` split a fresh ``LGBMClassifier`` is fit
    per column of ``y``, and its positive-class probabilities on the
    validation rows are stored as out-of-fold (OOF) predictions. AUC is then
    computed once per target over all rows.

    Args:
        X: Feature frame. Object-dtype columns are converted to ``category``
            and passed to LightGBM as categorical features.
        y: Label frame, row-aligned with ``X`` by position; one binary target
            per column.
        folds: Iterable of ``(train_idx, valid_idx)`` positional index pairs,
            e.g. the result of a scikit-learn splitter's ``split``. It is
            consumed once. Every row must appear in some validation fold,
            otherwise the OOF buffer keeps NaNs and scoring fails.

    Returns:
        Mapping from each target name to its OOF ROC AUC, plus ``"mean_auc"``
        holding the unweighted mean over targets.
    """
    params = dict(
        objective="binary",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=63,
        min_data_in_leaf=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        random_state=42,
    )

    cat_cols = make_cat_cols(X)
    X_cat = set_categoricals(X, cat_cols)

    targets = list(y.columns)
    # Folds are positional, so predictions are stored positionally as well.
    # Writing through index labels (``.loc``) breaks when the index contains
    # duplicates: the label lookup then selects more rows than were predicted.
    oof = np.full((len(X), len(targets)), np.nan, dtype=float)

    for tr_idx, va_idx in folds:
        X_tr, X_va = X_cat.iloc[tr_idx], X_cat.iloc[va_idx]
        for col_pos, target in enumerate(targets):
            y_tr, y_va = y.iloc[tr_idx][target], y.iloc[va_idx][target]
            model = lgb.LGBMClassifier(**params)
            # NOTE(review): early stopping monitors the same fold that is
            # scored below, so the reported AUC is slightly optimistic.
            model.fit(
                X_tr,
                y_tr,
                eval_set=[(X_va, y_va)],
                eval_metric="auc",
                categorical_feature=cat_cols,
                callbacks=[lgb.early_stopping(50, verbose=False)],
            )
            oof[va_idx, col_pos] = model.predict_proba(X_va)[:, 1]

    scores = {
        target: roc_auc_score(y[target], oof[:, col_pos])
        for col_pos, target in enumerate(targets)
    }
    # Averaged before the key is inserted, so it covers the targets only.
    scores["mean_auc"] = float(np.mean(list(scores.values())))
    return scores


# --- Experiment setup -------------------------------------------------------
X_raw, y = load_training(DATA_DIR)
block_sizes = [100, 200, 500, 1000]

# Joint (h1n1, seasonal) key; shared by both stratified runs below.
strat_labels = stratify_labels(y)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gkf = GroupKFold(n_splits=5)

# --- 1. Baseline: original features only, stratified CV ----------------------
baseline_scores = eval_cv(X_raw, y, skf.split(X_raw, strat_labels))
print("Baseline (no blocks)", baseline_scores)

# --- 2. Block features, stratified CV ----------------------------------------
# Shuffled folds put rows from the same block on both sides of each split, so
# this estimate is optimistic if row order carries label information.
X_blocks = add_block_features(X_raw, block_sizes)
block_scores_strat = eval_cv(X_blocks, y, skf.split(X_blocks, strat_labels))
print("With blocks (stratified)", block_scores_strat)

# --- 3. Block features, GroupKFold -------------------------------------------
# Conservative: each 500-row block falls entirely in train or in validation.
groups = X_blocks["row_block_500"].values
block_scores_group = eval_cv(X_blocks, y, gkf.split(X_blocks, y, groups))
print("With blocks (GroupKFold on row_block_500)", block_scores_group)
