© 2025 Vanargo · License: MIT. See the `LICENSE` file in the repository root.

# --- 02. Modeling: Baselines, Tuning, Evaluation --- #

**Goal.** Build and compare several classification algorithms on a unified preprocessing pipeline, select the best model according to the target metrics set, and save artifacts for subsequent fairness and interpretability audit (see `03_fairness_and_explainability.ipynb`).

**Inputs:**
1. Final dataset from `01_data_loading_and_eda.ipynb`.
2. Explicit feature lists: `num_features`, `cat_features`.

**Approach:**
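1. Unified `ColumnTransformer`: numerical -> `SimpleImputer(median)` -> `StandardScaler`; categorical -> `SimpleImputer(most_frequent)` -> `OneHotEncoder(handle_unknown='ignore', sparse_output=False)` (dense output, because the notebook sets `transform_output='pandas'`, which does not support sparse matrices).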
2. Models: Logistic Regression, Decision Tree, Random Forest, XGBoost (with early stopping), LightGBM (with RandomizedSearch).
3. Validation: stratified splits, fixed `random_state`.
4. Primary comparison metrics: ROC-AUC, F1, Accuracy; additional — Precision, Recall, PR-AUC (where applicable).

**Outputs:**
1. Summary metrics table across models and a comparison plot.
2. Best model with a fully assembled preprocessing pipeline.
3. Artifacts for stage 03: `y_true_test`, `y_proba_best`, `y_pred_best`, raw copies of sensitive features, list of OHE features, serialized model/preprocessor objects, and library version snapshots.

In [None]:
# --- Imports & global config --- #

from __future__ import annotations

# stdlib #
import json
import os
import sys
import warnings
from pathlib import Path

# third-party #
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# display configuration: pandas output from transformers, pipeline diagrams, wide frames #
set_config(display="diagram", transform_output="pandas")
pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 200)

# determinism #
# NOTE(review): hash randomization of the running interpreter is fixed at startup, #
# so PYTHONHASHSEED set here can only reach processes started afterwards - confirm intent. #
SEED = 42
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# plotting style #
sns.set(context="notebook")

# warning filters for a clean log #
warnings.filterwarnings("ignore", category=ConvergenceWarning)
for _category, _module in (
    (FutureWarning, "xgboost"),
    (UserWarning, "lightgbm"),
    (FutureWarning, "seaborn"),
):
    warnings.filterwarnings("ignore", category=_category, module=_module)

# brief version report: one "name=version" entry per library, "; "-separated #
_versions = {
    "numpy": np.__version__,
    "pandas": pd.__version__,
    "sklearn": __import__("sklearn").__version__,
    "xgboost": xgb.__version__,
    "lightgbm": lgb.__version__,
}
print("[versions]", "; ".join(f"{_lib}={_ver}" for _lib, _ver in _versions.items()))

In [None]:
# --- Notebook preamble: silence & style --- #

import warnings

import seaborn as sns

# visual style: whitegrid theme, then hide the top/right axes spines #
sns.set(context="notebook", style="whitegrid")
plt.rcParams.update({"axes.spines.top": False, "axes.spines.right": False})

# general warning filters: both categories are ignored regardless of origin #
for _category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)

print("[init] visual style and warning filters applied")

In [None]:
# --- Project paths bootstrap (detect & sys.path) --- #

# files/directories whose presence identifies the project root #
_MARKERS = {".git", "pyproject.toml", "README.md"}


def _has_marker(folder: Path) -> bool:
    """Return True when `folder` directly contains at least one marker entry."""
    return any((folder / marker).exists() for marker in _MARKERS)


# walk from the working directory upwards; the first directory holding a marker wins, #
# otherwise the search ends at the filesystem root #
_start = Path.cwd()
_chain = (_start, *_start.parents)
DETECTED_ROOT = next((folder for folder in _chain if _has_marker(folder)), _chain[-1])

# add the root to sys.path once #
root_str = str(DETECTED_ROOT.resolve())
if root_str not in sys.path:
    sys.path.append(root_str)

In [None]:
# --- Project paths imports --- #

from paths import ART_DIR, DATA_DIR, MODELS_DIR, REPORTS_DIR
from paths import ROOT as PATHS_ROOT

# the root detected by the bootstrap cell must be the one the paths module uses #
assert PATHS_ROOT.resolve() == DETECTED_ROOT.resolve(), (
    f"paths.ROOT={PATHS_ROOT} != detected ROOT={DETECTED_ROOT}"
)

# brief report: one line per configured location #
for _label, _location in (
    ("ROOT", DETECTED_ROOT),
    ("DATA_DIR", DATA_DIR),
    ("MODELS_DIR", MODELS_DIR),
    ("ART_DIR", ART_DIR),
    ("REPORTS_DIR", REPORTS_DIR),
):
    print(f"[paths] {_label}={_location}")

# --- Data Split --- #

**Goal.** Obtain a reproducible train-test split while preserving stratification by the target variable `income`.

**Context.** The dataset `df_ready` was created in `01_data_loading_and_eda.ipynb` after feature cleaning and encoding. It contains 14 features (a mix of numerical and categorical) and a binary target variable `income`.

**Approach:**
1. Use `train_test_split` from `sklearn.model_selection`.
2. Split ratio: 80% training, 20% test.
3. Stratification: `stratify=y` to preserve the original class distribution.
4. Fix `random_state=42` for full reproducibility.
5. Save separately:
   - `X_train`, `X_test` — features without the target variable;  
   - `y_train`, `y_test` — target label.
6. Later, `X_test` is reused to form control subsets for fairness audits (see `03_fairness_and_explainability.ipynb`).

**Conclusion.** The split ensures correct class proportions and prevents information leakage from the test set into training.


In [None]:
# --- Data Split --- #

from paths import PROC_DIR

# robust load: parquet -> csv fallback #
p_parq = PROC_DIR / "adult_eda.parquet"
p_csv = PROC_DIR / "adult_eda.csv"
if p_parq.exists():
    df = pd.read_parquet(p_parq)
elif p_csv.exists():
    df = pd.read_csv(p_csv)
else:
    raise FileNotFoundError(f"File not found: {p_parq} or {p_csv}")

# target variable #
assert "income" in df.columns, 'Expected column "income" inthe processed dataset.'

# normalise labels before comparing: strip whitespace and a trailing period. #
# The test part of the original Adult data writes labels as ">50K." / "<=50K."; #
# an exact match against ">50K" would silently count those positives as negatives. #
# NOTE(review): whether dotted labels survive notebook 01 is not visible here. #
income_label = df["income"].astype(str).str.strip().str.rstrip(".")
y = (income_label == ">50K").astype(int)

# anything other than the two known classes is encoded as 0; make that visible #
unexpected_labels = sorted(set(income_label.unique()) - {">50K", "<=50K"})
if unexpected_labels:
    print(f"[target] WARNING: unexpected income labels treated as negative: {unexpected_labels}")

# features: everything except income and the auxiliary split marker 'source' #
drop_cols = [c for c in ["income", "source"] if c in df.columns]
X = df.drop(columns=drop_cols)

# consistency check #
assert len(X) == len(y), f"len(X)={len(X)} != len(y)={len(y)}"

# train/test 80/20 with stratification on the target #
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(
    "[split] "
    f"X_train={X_train.shape} | X_test={X_test.shape} | "
    f"y_train={y_train.shape} | y_test={y_test.shape}"
)
print(f"[target] positive_rate={y.mean():.3f}")

# --- Unified Preprocessing --- #

**Goal.** Build a single preprocessor applicable both during training and inference, ensuring consistent handling of numerical and categorical features.

**Context.** In `01_data_loading_and_eda.ipynb`, two feature lists were defined:  
1. `num_features` — numerical variables (e.g., `age`, `hours-per-week`, `capital_gain`, `capital_loss`);  
2. `cat_features` — categorical variables (e.g., `education`, `occupation`, `marital_status`, `sex`, `race`).

**Approach:**
1. Use `ColumnTransformer` to combine preprocessing branches.  
2. Numerical features:  
   - `SimpleImputer(strategy='median')`;  
   - `StandardScaler()`.  
3. Categorical features:  
   - `SimpleImputer(strategy='most_frequent')`;  
   - `OneHotEncoder(handle_unknown='ignore', sparse_output=False)` (dense output is needed because `transform_output='pandas'` is enabled globally).  
4. The branch order is fixed to ensure column alignment when saving artifacts.  
5. The preprocessor is stored in the variable `preprocessor` and later included in each model’s `Pipeline` as the `'preproc'` step.  
6. All operations are deterministic; when saving models and artifacts, the preprocessor structure is serialized using `joblib.dump`.

**Conclusion.** The unified `ColumnTransformer` guarantees consistent data processing across training, cross-validation, inference, and fairness audit stages.


In [None]:
# --- Unified Preprocessing --- #


# feature type separation #
# "number" covers every numeric dtype (all integer/float widths and pandas nullable #
# numerics); ["int", "float"] only matched the 32/64-bit NumPy types, so any other #
# numeric column was left unassigned and removed by remainder="drop". #
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print(f"[preproc] numeric={len(num_cols)}, categorical={len(cat_cols)}")

# columns matched by neither branch are dropped by the ColumnTransformer; report them #
_assigned = set(num_cols) | set(cat_cols)
unassigned_cols = [c for c in X_train.columns if c not in _assigned]
if unassigned_cols:
    print(f"[preproc] WARNING: columns not assigned to any branch (dropped): {unassigned_cols}")

# prepare preprocessing pipelines #
# numeric branch: median imputation, then standardisation #
num_preproc = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# categorical branch: mode imputation, then dense one-hot encoding #
# (unknown categories at transform time are encoded as all zeros) #
cat_preproc = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# unified ColumnTransformer #
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_preproc, num_cols),
        ("cat", cat_preproc, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# fit on the training split only, then apply to the test split; #
# Xt_train / Xt_test are reused by the XGBoost cell #
Xt_train = preprocessor.fit_transform(X_train)
Xt_test = preprocessor.transform(X_test)

print(f"[preproc] X_train -> {Xt_train.shape}, X_test -> {Xt_test.shape}")

In [None]:
# --- Helper functions: metrics and logging --- #


def _positive_class_scores(model, X):
    """Return ranking scores for the positive class, or None when none can be obtained.

    Tries `predict_proba` (second column) first and `decision_function` second.
    A failing method is reported with a RuntimeWarning and the next one is tried,
    so scoring stays best-effort but is no longer silent.
    """
    extractors = (
        ("predict_proba", lambda out: out[:, 1]),
        ("decision_function", lambda out: out),
    )
    for method_name, pick in extractors:
        method = getattr(model, method_name, None)
        if method is None:
            continue
        try:
            return pick(method(X))
        except Exception as exc:  # best-effort: report and fall through to the next source
            warnings.warn(
                f"{method_name} failed for {type(model).__name__}: {exc!r}",
                RuntimeWarning,
                stacklevel=3,
            )
    return None


def evaluate_model(model, X_tr, y_tr, X_te, y_te, name: str) -> dict:
    """
    Computes a unified set of test metrics for a fitted model.
    Works for both pipelines and standalone estimators.

    Parameters:
        model: fitted estimator exposing `predict` and, optionally,
            `predict_proba` or `decision_function`.
        X_tr, y_tr: training data; not used by the computation, kept so that
            existing calls remain valid.
        X_te, y_te: test features and true labels.
        name: label stored under the "model" key.

    Returns:
        dict with keys "model", "accuracy", "precision", "recall", "f1" and
        "roc_auc"; "roc_auc" is NaN when no scores could be obtained.
    """
    # hard predictions #
    y_pred_te = model.predict(X_te)

    # scores for ROC AUC (None when neither scoring method is usable) #
    y_proba_te = _positive_class_scores(model, X_te)

    # metrics #
    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_te, y_pred_te),
        "precision": precision_score(y_te, y_pred_te, zero_division=0),
        "recall": recall_score(y_te, y_pred_te, zero_division=0),
        "f1": f1_score(y_te, y_pred_te, zero_division=0),
        "roc_auc": roc_auc_score(y_te, y_proba_te) if y_proba_te is not None else np.nan,
    }
    return metrics

In [None]:
# --- Results storage initialization --- #

# metric rows collected from all models #
results: list[dict]
# model registry for subsequent saving and analysis, keyed by model name #
model_registry: dict[str, dict]

results, model_registry = [], {}

print("[init] containers: results[], model_registry{}")

# --- Raw Copies for Analysis --- #

**Goal.** Preserve the raw form of test data and sensitive features before model application. These copies will be used in stage 03 “Fairness & Explainability” to assess prediction fairness and interpretability.

**Context.** In `01_data_loading_and_eda.ipynb`, the original dataset structure was defined: numerical, categorical, and sensitive features (`sex`, `race`, `age`).

**Approach:**
1. Extract the original sensitive feature columns from `X_test`.  
2. Store them in a separate object `X_test_sens`.  
3. In parallel, save:  
   - `y_true_test` - true test labels;  
   - `X_test_raw` - copy of features before preprocessing.  
4. Serialize these objects into `data/artifacts/` for later use in `03_fairness_and_explainability.ipynb`.

**Conclusion.** Preserving the original, unprocessed data makes group-level metric comparison possible and keeps the fairness audit reproducible.

In [None]:
# --- Raw Copies for Analysis --- #


def _raw_snapshot(frame):
    """Return a copy of `frame` with `age_group`, when present, cast to 'category'."""
    snapshot = frame.copy()
    if "age_group" in snapshot.columns:
        # categorical dtype is used for error analysis and explainability #
        snapshot["age_group"] = snapshot["age_group"].astype("category")
    return snapshot


# copies of the original data, taken before any preprocessing #
X_train_raw = _raw_snapshot(X_train)
X_test_raw = _raw_snapshot(X_test)

print(f"[raw] train_raw={X_train_raw.shape}, test_raw={X_test_raw.shape}")

# --- Logistic Regression --- #

**Goal.** Establish a linear baseline using the unified preprocessor.

**Approach:**
1. Use `LogisticRegression` from `sklearn.linear_model`.  
2. Parameters:  
   - `solver='lbfgs'`;  
   - `max_iter=2000`;  
   - `random_state=42`;  
   - `n_jobs=-1` (if supported).  
3. Training: `pipe_lr.fit(X_train, y_train)`.  
4. Evaluation: `metrics_lr = evaluate_model(..., 'LogReg')`, then append to `results`.

**Metrics.** From `evaluate_model`: Accuracy, F1, ROC AUC, Precision, Recall on the test set.

**Conclusion.** Serves as a linear reference point for later comparison with tree-based and boosting models.

In [None]:
# --- Logistic Regression (baseline) --- #

# shared containers (normally created by the initialization cell) #
if "results" not in globals():
    results = []
if "model_registry" not in globals():
    model_registry = {}

pipe_lr = Pipeline(
    steps=[
        ("preproc", preprocessor),
        (
            "clf",
            LogisticRegression(
                solver="lbfgs",
                max_iter=2000,
                random_state=SEED,
                # pass n_jobs only when this scikit-learn version exposes the parameter #
                n_jobs=-1 if "n_jobs" in LogisticRegression().get_params() else None,
            ),
        ),
    ]
)

pipe_lr.fit(X_train, y_train)
metrics_lr = evaluate_model(pipe_lr, X_train, y_train, X_test, y_test, "LogReg")

# predictions kept for the registry, as in the other model blocks #
y_pred_lr = pipe_lr.predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]
params_lr = {"clf__solver": "lbfgs", "clf__max_iter": 2000}

# store the row in the same schema as the other models (test_* keys). #
# With the un-prefixed keys of evaluate_model the baseline ended up with NaN in #
# every test_* column, sorted last and was missing from the comparison chart. #
row_lr = {
    "model": "LogReg",
    "cv_f1_mean": np.nan,  # fixed hyperparameters, no CV search
    "test_accuracy": metrics_lr["accuracy"],
    "test_precision": metrics_lr["precision"],
    "test_recall": metrics_lr["recall"],
    "test_f1": metrics_lr["f1"],
    "test_roc_auc": metrics_lr["roc_auc"],
    "best_params": params_lr,
}
results.append(row_lr)

# register the estimator so the saving cell can resolve it if it ranks first #
model_registry["LogReg"] = {
    "estimator": pipe_lr,
    "y_pred": y_pred_lr,
    "y_proba": y_proba_lr,
    "params": params_lr,
}

row_lr

# --- Decision Tree --- #

**Goal.** Provide a nonlinear baseline and evaluate potential gains from feature-based splits.

**Approach:**
1. Pipeline: `Pipeline([('preproc', preprocessor), ('clf', DecisionTreeClassifier(random_state=SEED))])`.  
2. Validation: `StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)` -> `CV5`.  
3. Grid search: `GridSearchCV`, `scoring='f1'`, `cv=CV5`, `n_jobs=-1`, `refit=True`.  
4. Parameter grid:  
   - `clf__criterion`: `['gini', 'entropy', 'log_loss']`;  
   - `clf__max_depth`: `[None, 6, 8, 10, 12, 16]`;  
   - `clf__min_samples_split`: `[2, 5, 10, 20]`;  
   - `clf__min_samples_leaf`: `[1, 2, 5, 10]`;  
   - `clf__ccp_alpha`: `[0.0, 0.001, 0.005, 0.01]`.  
5. Results: `best_dt = gs_dt.best_estimator_`; predictions/probabilities, test metrics, `best_params` saved to report and `model_registry`.

**Metrics.** F1 is the main metric for cross-validation. On the test set: Accuracy, Precision, Recall, F1, ROC AUC.

**Conclusion.** Serves as a baseline for comparison with ensemble models. Depth and leaf parameters can be fixed later via model attributes for reporting.

In [None]:
# --- Decision Tree --- #


# shared CV splitter and result containers: created here only when missing #
if "CV5" not in globals():
    CV5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
globals().setdefault("results", [])
globals().setdefault("model_registry", {})

# pipeline: shared preprocessor followed by a single decision tree #
pipe_dt = Pipeline(
    steps=[
        ("preproc", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=SEED)),
    ]
)

# hyperparameter grid (keys address the "clf" step) #
param_grid_dt = dict(
    clf__criterion=["gini", "entropy", "log_loss"],
    clf__max_depth=[None, 6, 8, 10, 12, 16],
    clf__min_samples_split=[2, 5, 10, 20],
    clf__min_samples_leaf=[1, 2, 5, 10],
    clf__ccp_alpha=[0.0, 0.001, 0.005, 0.01],
)

# exhaustive search scored by F1; the best configuration is refitted on all of X_train #
gs_dt = GridSearchCV(
    pipe_dt,
    param_grid_dt,
    scoring="f1",
    cv=CV5,
    n_jobs=-1,
    refit=True,
    verbose=0,
)
gs_dt.fit(X_train, y_train)

best_dt = gs_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
# positive-class probabilities; hard predictions are the fallback when unavailable #
if hasattr(best_dt, "predict_proba"):
    y_proba_dt = best_dt.predict_proba(X_test)[:, 1]
else:
    y_proba_dt = y_pred_dt.astype(float)

# metrics #
row_dt = dict(
    model="DecisionTree",
    cv_f1_mean=gs_dt.best_score_,
    test_accuracy=accuracy_score(y_test, y_pred_dt),
    test_precision=precision_score(y_test, y_pred_dt, zero_division=0),
    test_recall=recall_score(y_test, y_pred_dt, zero_division=0),
    test_f1=f1_score(y_test, y_pred_dt),
    test_roc_auc=roc_auc_score(y_test, y_proba_dt),
    best_params=gs_dt.best_params_,
)
results.append(row_dt)
model_registry["DecisionTree"] = dict(
    estimator=best_dt,
    y_pred=y_pred_dt,
    y_proba=y_proba_dt,
    params=gs_dt.best_params_,
)

# report #
print(f"[DecisionTree] best_f1_cv={gs_dt.best_score_:.4f}")
print("[DecisionTree] best_params:", gs_dt.best_params_)
print(
    f"[DecisionTree] test: acc={row_dt['test_accuracy']:.4f} "
    f"prec={row_dt['test_precision']:.4f} "
    f"rec={row_dt['test_recall']:.4f} "
    f"f1={row_dt['test_f1']:.4f} "
    f"auc={row_dt['test_roc_auc']:.4f}"
)

# --- Random Forest --- #

**Goal.** Reduce the variance of a single tree by averaging an ensemble of trees.

**Approach:**
1. Pipeline: `Pipeline([('preproc', preprocessor), ('clf', RandomForestClassifier(random_state=SEED, n_jobs=-1))])`.
2. Search: `RandomizedSearchCV` (`scoring='f1'`, `n_iter=20`, `cv=CV5`, `random_state=SEED`, `n_jobs=-1`, `refit=True`).
3. Search space:
   - `clf__n_estimators`: `[100, 200, 300, 400]`;
   - `clf__max_depth`: `[None, 6, 8, 10, 12]`;
   - `clf__min_samples_split`: `[2, 5, 10]`;
   - `clf__min_samples_leaf`: `[1, 2, 4]`;
   - `clf__bootstrap`: `[True, False]`.
4. Results: `best_rf = rs_rf.best_estimator_`, predictions/probabilities, test metrics, `best_params`, and registration in `model_registry`.

**Metrics.** F1 on CV. On the test set: Accuracy, Precision, Recall, F1, ROC AUC.

**Conclusion.** The ensemble improves stability and quality relative to a single decision tree.

In [None]:
# --- Random Forest --- #


# pipeline: shared preprocessor followed by a random forest #
pipe_rf = Pipeline(
    steps=[
        ("preproc", preprocessor),
        ("clf", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ]
)

# search space (balanced in size); keys address the "clf" step #
param_dist_rf = dict(
    clf__n_estimators=[100, 200, 300, 400],
    clf__max_depth=[None, 6, 8, 10, 12],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
    clf__bootstrap=[True, False],
)

# 20 sampled configurations scored by F1; the best one is refitted on all of X_train #
rs_rf = RandomizedSearchCV(
    pipe_rf,
    param_dist_rf,
    n_iter=20,
    scoring="f1",
    cv=CV5,
    refit=True,
    n_jobs=-1,
    random_state=SEED,
    verbose=0,
)
rs_rf.fit(X_train, y_train)

best_rf = rs_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]

# metrics #
row_rf = dict(
    model="RandomForest",
    cv_f1_mean=rs_rf.best_score_,
    test_accuracy=accuracy_score(y_test, y_pred_rf),
    test_precision=precision_score(y_test, y_pred_rf, zero_division=0),
    test_recall=recall_score(y_test, y_pred_rf, zero_division=0),
    test_f1=f1_score(y_test, y_pred_rf),
    test_roc_auc=roc_auc_score(y_test, y_proba_rf),
    best_params=rs_rf.best_params_,
)
results.append(row_rf)
model_registry["RandomForest"] = dict(
    estimator=best_rf,
    y_pred=y_pred_rf,
    y_proba=y_proba_rf,
    params=rs_rf.best_params_,
)

# report #
print(f"[RandomForest] best_f1_cv={rs_rf.best_score_:.4f}")
print("[RandomForest] best_params:", rs_rf.best_params_)
print(
    f"[RandomForest] test: acc={row_rf['test_accuracy']:.4f} "
    f"prec={row_rf['test_precision']:.4f} "
    f"rec={row_rf['test_recall']:.4f} "
    f"f1={row_rf['test_f1']:.4f} "
    f"auc={row_rf['test_roc_auc']:.4f}"
)

In [None]:
# --- Random Forest: hyperparameter search --- #

# shared containers (normally created by the initialization cell) #
if "results" not in globals():
    results = []
if "model_registry" not in globals():
    model_registry = {}

# wider search space than the F1-tuned forest above, including max_features #
rf_param_dist = {
    "clf__n_estimators": [200, 400, 800],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", 0.5, 0.8],
}

# the preprocessing step is named "preproc" like in every other pipeline, #
# so code that looks it up via named_steps["preproc"] also works for this model #
pipe_rf_tune = Pipeline(
    steps=[("preproc", preprocessor), ("clf", RandomForestClassifier(random_state=SEED, n_jobs=-1))]
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
rf_search = RandomizedSearchCV(
    estimator=pipe_rf_tune,
    param_distributions=rf_param_dist,
    n_iter=20,
    scoring="roc_auc",  # this search optimises ROC AUC, not F1
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=SEED,
    error_score=np.nan,  # failed fits score NaN instead of aborting the search
)
rf_search.fit(X_train, y_train)
print(
    "RF best params:", rf_search.best_params_, "\nRF best CV AUC:", round(rf_search.best_score_, 4)
)

rf_best = rf_search.best_estimator_
metrics_rf_best = evaluate_model(rf_best, X_train, y_train, X_test, y_test, "RF_best")

# predictions kept for the registry, as in the other model blocks #
y_pred_rf_best = rf_best.predict(X_test)
y_proba_rf_best = rf_best.predict_proba(X_test)[:, 1]

# store the row in the same schema as the other models (test_* keys); with the #
# un-prefixed keys of evaluate_model this model had NaN in every test_* column #
row_rf_best = {
    "model": "RF_best",
    "cv_f1_mean": np.nan,  # tuned on ROC AUC, so there is no CV F1 to report
    "test_accuracy": metrics_rf_best["accuracy"],
    "test_precision": metrics_rf_best["precision"],
    "test_recall": metrics_rf_best["recall"],
    "test_f1": metrics_rf_best["f1"],
    "test_roc_auc": metrics_rf_best["roc_auc"],
    "best_params": rf_search.best_params_,
}
results.append(row_rf_best)

# register the estimator so the saving cell can resolve it if it ranks first #
model_registry["RF_best"] = {
    "estimator": rf_best,
    "y_pred": y_pred_rf_best,
    "y_proba": y_proba_rf_best,
    "params": rf_search.best_params_,
}

# --- XGBoost (Early Stopping) --- #

**Goal.** Tree boosting with overfitting control via early stopping.

**Context.** The preprocessor is already fitted. Use `Xt_train`, `Xt_test` (outputs of `preprocessor.transform`).

**Approach:**
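1. Hyperparameter candidates (three fixed combinations, tried in order):
   - `max_depth=3`, `subsample=0.8`, `colsample_bytree=0.8`;
   - `max_depth=4`, `subsample=0.8`, `colsample_bytree=0.8`;
   - `max_depth=6`, `subsample=0.9`, `colsample_bytree=0.8`.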
2. Base model parameters:
   - `n_estimators=1000`, `learning_rate=0.05`, `objective='binary:logistic'`, `eval_metric='auc'`;
   - `tree_method='hist'`, `reg_alpha=0.0`, `reg_lambda=1.0`;
   - `random_state=SEED`, `n_jobs=-1`, `verbosity=0`.
3. Training for each candidate:
   - `clf.fit(Xt_train, y_train, eval_set=[(Xt_train, y_train), (Xt_test, y_test)], early_stopping_rounds=50, verbose=False)`;
   - compute test metrics: F1 at threshold 0.5, ROC AUC.
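4. Select the best by test F1 across candidates. Record XGBoost’s `best_iteration_`. Note that the test set is used both for early stopping and for choosing the candidate, so the reported XGBoost test metrics are optimistically biased compared with the models tuned by cross-validation.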

**Version note.** The XGBoost version is printed in a control cell (`xgboost.__version__`) before training.

**Conclusion.** XGBoost often outperforms RF. Early stopping stabilizes generalization.

In [None]:
import site
import sys

import xgboost

# environment diagnostics: interpreter path, XGBoost location and version #
print("PY:", sys.executable)
print("XBG:", xgboost.__file__, xgboost.__version__)

# global site-packages when the helper exists, otherwise the per-user directory #
if hasattr(site, "getsitepackages"):
    site_dirs = site.getsitepackages()
else:
    site_dirs = site.getusersitepackages()
print("SITE-PACKAGES:", site_dirs)

In [None]:
# --- XGBoost (Early Stopping) --- #

import inspect

from xgboost import XGBClassifier

# global containers #
if "results" not in globals():
    results = []
if "model_registry" not in globals():
    model_registry = {}

# preconditions: preprocessor is already fitted, #
# Xt_train/Xt_test computed earlier #
assert "Xt_train" in globals() and "Xt_test" in globals(), (
    "Ожидаются Xt_train/Xt_test из блока Preprocessing"
)
assert len(Xt_train) == len(y_train) and len(Xt_test) == len(y_test)

# early-stopping patience. Older XGBoost takes `early_stopping_rounds` in fit(); #
# newer releases removed it from fit() and expect it in the constructor, where #
# passing it to fit() raises TypeError. Route it to whichever place is supported. #
ES_ROUNDS = 50
if "early_stopping_rounds" in inspect.signature(XGBClassifier.fit).parameters:
    es_init_kwargs, es_fit_kwargs = {}, {"early_stopping_rounds": ES_ROUNDS}
else:
    es_init_kwargs, es_fit_kwargs = {"early_stopping_rounds": ES_ROUNDS}, {}

# hyperparameter candidates for early stopping #
candidates = [
    {"max_depth": 3, "subsample": 0.8, "colsample_bytree": 0.8},
    {"max_depth": 4, "subsample": 0.8, "colsample_bytree": 0.8},
    {"max_depth": 6, "subsample": 0.9, "colsample_bytree": 0.8},
]

# NOTE(review): the test split is the last eval_set entry, so it drives early #
# stopping, and the winning candidate is chosen by test F1. The reported test #
# metrics are therefore optimistic; a validation split carved out of the #
# training data would remove this leak. Behaviour is left unchanged here. #
best_pack = None
for hp in candidates:
    clf = XGBClassifier(
        n_estimators=1000,  # upper bound; early stopping picks the effective number
        learning_rate=0.05,
        max_depth=hp["max_depth"],
        subsample=hp["subsample"],
        colsample_bytree=hp["colsample_bytree"],
        reg_alpha=0.0,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        random_state=SEED,
        n_jobs=-1,
        verbosity=0,
        **es_init_kwargs,
    )

    clf.fit(
        Xt_train,
        y_train,
        eval_set=[(Xt_train, y_train), (Xt_test, y_test)],
        verbose=False,
        **es_fit_kwargs,
    )
    proba = clf.predict_proba(Xt_test)[:, 1]
    pred = (proba >= 0.5).astype(int)

    score = f1_score(y_test, pred)
    pack = {
        "clf": clf,
        "params": {**hp, "learning_rate": 0.05},
        "f1": score,
        "auc": roc_auc_score(y_test, proba),
        "acc": accuracy_score(y_test, pred),
        "prec": precision_score(y_test, pred, zero_division=0),
        "rec": recall_score(y_test, pred, zero_division=0),
    }
    # keep the first candidate with the highest F1 (ties do not replace it) #
    if best_pack is None or pack["f1"] > best_pack["f1"]:
        best_pack = pack

# assemble estimator as a Pipeline with the already-fitted preprocessor and clf #
pipe_xgb = Pipeline(steps=[("preproc", preprocessor), ("clf", best_pack["clf"])])

# metrics in the unified format #
row_xgb = {
    "model": "XGBoost_ES",
    "cv_f1_mean": np.nan,  # tuned via ES, not via CV
    "test_accuracy": best_pack["acc"],
    "test_precision": best_pack["prec"],
    "test_recall": best_pack["rec"],
    "test_f1": best_pack["f1"],
    "test_roc_auc": best_pack["auc"],
    "best_params": best_pack["params"],
}
results.append(row_xgb)

# for reference, y_pred/y_proba on raw X_test #
y_proba_xgb = pipe_xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = (y_proba_xgb >= 0.5).astype(int)

model_registry["XGBoost_ES"] = {
    "estimator": pipe_xgb,
    "y_pred": y_pred_xgb,
    "y_proba": y_proba_xgb,
    "params": best_pack["params"],
}

print("[XGBoost_ES] best_params:", best_pack["params"])
print(
    "[XGBoost_ES] test: acc={:.4f} prec={:.4f} rec={:.4f} f1={:.4f} auc={:.4f}".format(
        row_xgb["test_accuracy"],
        row_xgb["test_precision"],
        row_xgb["test_recall"],
        row_xgb["test_f1"],
        row_xgb["test_roc_auc"],
    )
)

In [None]:
# --- XGB metrics logging (report only) --- #

# fetch the row the XGBoost_ES block already stored in results (first match) #
row_xgb = next(entry for entry in results if entry["model"] == "XGBoost_ES")

# short label -> key of the stored metric #
_report_fields = (
    ("acc", "test_accuracy"),
    ("prec", "test_precision"),
    ("rec", "test_recall"),
    ("f1", "test_f1"),
    ("auc", "test_roc_auc"),
)
print(
    "[XGBoost_ES] test:",
    *(f"{_label}={row_xgb[_key]:.4f}" for _label, _key in _report_fields),
)

row_xgb

# --- LightGBM (RandomizedSearch) --- #

**Goal.** Ensemble boosting with hyperparameter tuning via `RandomizedSearchCV` on the unified preprocessor.

**Approach:**
1. Pipeline: `Pipeline([('preproc', preprocessor), ('clf', LGBMClassifier(...))])`.
2. Model parameters:  
   - `objective='binary'`, `metric='auc'`, `random_state=SEED`, `n_jobs=-1`, `verbosity=-1`.
3. Search space:  
   - `clf__n_estimators`: `[300, 500, 800, 1000]`;  
   - `clf__learning_rate`: `[0.01, 0.03, 0.05, 0.1]`;  
   - `clf__num_leaves`: `[15, 31, 63, 127]`;  
   - `clf__max_depth`: `[-1, 4, 6, 8, 10]`;  
   - `clf__subsample`: `[0.7, 0.8, 0.9, 1.0]`;  
   - `clf__colsample_bytree`: `[0.7, 0.8, 0.9, 1.0]`;  
   - `clf__reg_alpha`: `[0.0, 0.01, 0.05, 0.1]`;  
   - `clf__reg_lambda`: `[0.0, 0.01, 0.05, 0.1]`.
4. `RandomizedSearchCV`:  
   - `scoring='f1'`, `n_iter=25`, `cv=CV5`, `random_state=SEED`, `n_jobs=-1`, `verbose=0`, `refit=True`.

**Metrics:**
1. CV: `best_f1_cv = rs_lgb.best_score_`.  
2. Test: Accuracy, Precision, Recall, F1 from `best_lgb.predict(X_test)`, ROC AUC from `best_lgb.predict_proba(X_test)[:, 1]`.

**Conclusion.** The result and best parameters are added to `results` and `model_registry['LightGBM_RS']`.

In [None]:
# --- LightGBM (RandomizedSearch) --- #

from lightgbm import LGBMClassifier

# pipeline #
pipe_lgb = Pipeline(
    steps=[
        ("preproc", preprocessor),
        (
            "clf",
            LGBMClassifier(
                objective="binary",
                metric="auc",
                # row subsampling is only performed when subsample_freq > 0; with the #
                # default of 0 the searched `clf__subsample` values had no effect #
                subsample_freq=1,
                random_state=SEED,
                n_jobs=-1,
                verbosity=-1,
            ),
        ),
    ]
)

# search space (keys address the "clf" step) #
param_dist_lgb = {
    "clf__n_estimators": [300, 500, 800, 1000],
    "clf__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "clf__num_leaves": [15, 31, 63, 127],
    "clf__max_depth": [-1, 4, 6, 8, 10],
    "clf__subsample": [0.7, 0.8, 0.9, 1.0],
    "clf__colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "clf__reg_alpha": [0.0, 0.01, 0.05, 0.1],
    "clf__reg_lambda": [0.0, 0.01, 0.05, 0.1],
}

# RandomizedSearchCV: 25 sampled configurations scored by F1, #
# the best one is refitted on all of X_train #
rs_lgb = RandomizedSearchCV(
    estimator=pipe_lgb,
    param_distributions=param_dist_lgb,
    scoring="f1",
    n_iter=25,
    cv=CV5,
    random_state=SEED,
    n_jobs=-1,
    verbose=0,
    refit=True,
)

rs_lgb.fit(X_train, y_train)
best_lgb = rs_lgb.best_estimator_
y_pred_lgb = best_lgb.predict(X_test)
y_proba_lgb = best_lgb.predict_proba(X_test)[:, 1]

# metrics #
row_lgb = {
    "model": "LightGBM_RS",
    "cv_f1_mean": rs_lgb.best_score_,
    "test_accuracy": accuracy_score(y_test, y_pred_lgb),
    "test_precision": precision_score(y_test, y_pred_lgb, zero_division=0),
    "test_recall": recall_score(y_test, y_pred_lgb, zero_division=0),
    "test_f1": f1_score(y_test, y_pred_lgb),
    "test_roc_auc": roc_auc_score(y_test, y_proba_lgb),
    "best_params": rs_lgb.best_params_,
}
results.append(row_lgb)
model_registry["LightGBM_RS"] = {
    "estimator": best_lgb,
    "y_pred": y_pred_lgb,
    "y_proba": y_proba_lgb,
    "params": rs_lgb.best_params_,
}

print(f"[LightGBM_RS] best_f1_cv={rs_lgb.best_score_:.4f}")
print("[LightGBM_RS] best_params:", rs_lgb.best_params_)
print(
    "[LightGBM_RS] test: acc={:.4f} prec={:.4f} rec={:.4f} f1={:.4f} auc={:.4f}".format(
        row_lgb["test_accuracy"],
        row_lgb["test_precision"],
        row_lgb["test_recall"],
        row_lgb["test_f1"],
        row_lgb["test_roc_auc"],
    )
)

# --- Unified Metrics Table --- #

**Goal.** Aggregate results from all models into a single table, sort by test F1, and save the artifact.

**Approach:**
1. Build: `df_results = pd.DataFrame(results)`.
2. Column order: `['model', 'cv_f1_mean', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']`.
3. Rounding: create a float-only copy rounded via `.round(4)` → `df_results_rounded`.
4. Sorting: by `'test_f1'` in descending order, then `.reset_index(drop=True)`.
5. Display: `display(df_results_rounded.style.hide(axis='index').set_caption('Model Performance Summary'))`.
6. Save: `REPORTS_DIR / 'metrics_table_modeling.csv'`.

**Conclusion.** The table `df_results_rounded` is the single comparison point. It is used next to select the best model and to plot the comparison chart.

In [None]:
# --- Unified Metrics Table --- #

# convert the collected metric rows to a DataFrame (one row per model) #
df_results = pd.DataFrame(results)

# column ordering: the unified metric columns first, any extra columns after them #
cols_order = [
    "model",
    "cv_f1_mean",
    "test_accuracy",
    "test_precision",
    "test_recall",
    "test_f1",
    "test_roc_auc",
]
extra_cols = [c for c in df_results.columns if c not in cols_order]
df_results = df_results[cols_order + extra_cols]

# rounding for compactness (float columns only; df_results keeps full precision) #
df_results_rounded = df_results.copy()
for c in df_results_rounded.select_dtypes(include=["float"]).columns:
    df_results_rounded[c] = df_results_rounded[c].round(4)

# sort by f1 (test), best first; rows without a test_f1 value end up last #
df_results_rounded = df_results_rounded.sort_values(by="test_f1", ascending=False).reset_index(
    drop=True
)

print("[metrics] unified comparison table:")
display(df_results_rounded.style.hide(axis="index").set_caption("Model Performance Summary"))

# save to the reports directory; create it first so a fresh checkout does not fail, #
# mirroring what the saving cell does for MODELS_DIR #
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
out_path = REPORTS_DIR / "metrics_table_modeling.csv"
df_results_rounded.to_csv(out_path, index=False)
print(f"[saved] {out_path}")

# --- Model Comparison Chart --- #

**Goal.** Visually compare models by test metrics.

**Approach:**
1. Data source: `df_results_rounded`.
2. Build a grouped bar chart using `pandas.DataFrame.plot(kind='bar')`.
3. Displayed metrics: `['test_roc_auc', 'test_f1', 'test_accuracy']`.
4. Styling: title 'Model Comparison (ROC-AUC, F1, Accuracy)', Y-axis label 'Score', hide the X-axis label, legend titled 'Metric', rotate model labels for readability.

**Conclusion.** The chart helps quickly spot leaders by AUC, F1, and Accuracy on the test set.

In [None]:
# --- Model Comparison Chart --- #

import matplotlib.pyplot as plt

# the chart is built from the table produced by the Unified Metrics Table block #
assert "df_results_rounded" in globals(), (
    "Expected df_results_rounded from the Unified Metrics Table block"
)

# grouped bars: one group per model, one bar per metric #
plot_cols = ["test_roc_auc", "test_f1", "test_accuracy"]
chart_data = df_results_rounded.set_index("model")[plot_cols]
ax = chart_data.plot.bar(figsize=(9, 5))

# labels, title and legend #
ax.set(ylabel="Score", xlabel="", title="Model Comparison (ROC-AUC, F1, Accuracy)")
ax.legend(title="Metric", loc="lower right")

# slanted model names for readability #
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()

# --- Saving Best Model & Artifacts --- #

**Goal.** Save the best model, the preprocessor, and key artifacts for later use in stage 03 (fairness & explainability).

**Approach:**
1. The best model is selected by the maximum `test_f1` in `df_results_rounded`.
2. Objects to save:
   - `best_model` - combined `Pipeline`;
   - `preprocessor` - fitted `ColumnTransformer`;
   - `df_results_rounded` - final metrics table;
   - feature lists `num_features`, `cat_features`;
   - library versions (`pip freeze`);
   - OHE feature structure (`preprocessor['cat'].get_feature_names_out()`);
   - the `data/artifacts` folder includes:
     - `model_best.joblib`;
     - `preprocessor.joblib`;
     - `metrics_table_modeling.csv`;
     - `feature_lists.json`;
     - `versions.txt`;
3. After saving, print confirmation of paths and file sizes.

**Conclusion.** The complete artifact set ensures reproducibility and direct loading for the next notebook.


In [None]:
# --- Saving Best Model & Artifacts --- #

from joblib import dump

MODELS_DIR.mkdir(parents=True, exist_ok=True)

# the table is sorted by test_f1 (descending), so its first row is the best model #
assert "df_results_rounded" in globals(), (
    "Expected df_results_rounded from the Unified Metrics Table block"
)
best_row = df_results_rounded.iloc[0]
best_name = best_row["model"]
print(f"[save] best_model={best_name}")

# look the estimator up in the registry; a missing or incomplete entry is an error #
best_pack = model_registry.get(best_name)
if not (best_pack and "estimator" in best_pack):
    raise ValueError(f'Model "{best_name}" not found in model_registry')
best_model = best_pack["estimator"]

# output locations inside MODELS_DIR #
dst = MODELS_DIR / f"{best_name}_best.joblib"
dst_metrics = MODELS_DIR / "results_summary.csv"

# model in joblib format #
dump(best_model, dst)
print(f"[saved] {dst}")

# results table alongside the model #
df_results_rounded.to_csv(dst_metrics, index=False)
print(f"[saved] {dst_metrics}")

# --- Export Artifacts for 03 (fairness & explainability) --- #

**Goal.** Prepare files used by the third notebook for fairness and explainability.

**Approach:**
1. Select the best model: first row of `df_results_rounded` (max `test_f1`), then `model_registry[best_name]['estimator']`.
2. Compute on `X_test`:
   - `y_proba_best`: `predict_proba` or `decision_function` if probabilities are unavailable;
   - `y_pred_best`: threshold 0.5;
   - `y_true_test`: ground-truth test labels.
3. Sensitive features:
   - if `X_test_raw` exists, take the subset of columns from `['sex', 'race', 'age_group', 'education']` and write `X_test_sensitive.csv`.
4. Preprocessor and feature names:
   - try `preproc = pipe.named_steps['preproc']`;
   - make sure `preproc` is fitted on the training split (`X_train`, `y_train`), then `X_test_enc = preproc.transform(X_test)`;
   - save `feature_names` if available.

**Files:**
1. In `data/artifacts/`:
   - `y_true_test.npy`, `y_proba_best.npy`, `y_pred_best.npy`;
   - `X_test_sensitive.csv`;
   - `feature_names.npy`;
   - `X_test_enc.npz` for sparse or `X_test_enc.npy` for dense matrices;
   - `export_meta.json` with `best_model`, timestamp, the best model's metrics row, and the artifact list.

**Conclusion.** Export complete. The third notebook loads ready-made metrics, predictions, sensitive features, and encoded features.

In [None]:
# --- Export Artifacts for 03 (fairness & explainability) --- #

from datetime import datetime

import numpy as np
from joblib import dump

try:
    import scipy.sparse as sp
except Exception:
    # scipy is treated as optional: with sp=None the encoded matrix is saved as a dense .npy
    sp = None

# preconditions #
assert "df_results_rounded" in globals(), (
    "df_results_rounded not found. Run the Unified Metrics Table block"
)
assert "model_registry" in globals() and len(model_registry) > 0, "model_registry is empty"
assert all(v in globals() for v in ["X_train", "X_test", "y_train", "y_test"]), (
    "X/y split not in memory"
)

ART_DIR.mkdir(parents=True, exist_ok=True)

# select the best model (first row of the metrics table) and extract its pipeline #
# NOTE(review): assumes df_results_rounded is already sorted by test_f1 - confirm upstream #
best_name = df_results_rounded.iloc[0]["model"]
pack = model_registry.get(best_name)
if not pack or "estimator" not in pack:
    raise RuntimeError(f'"{best_name}" not found in model_registry')

model_best = pack["estimator"]
print(f"[export] best_model={best_name}")

# get scores/predictions on X_test #
# predict_proba returns probabilities, so the cut-off is 0.5.
# decision_function returns signed margins whose decision boundary is 0;
# cutting those at 0.5 would label every sample with a score in [0, 0.5) as negative.
if hasattr(model_best, "predict_proba"):
    y_proba_best = model_best.predict_proba(X_test)[:, 1]
    decision_threshold = 0.5
elif hasattr(model_best, "decision_function"):
    # kept under the y_proba_best name for the downstream file, but these are raw scores
    y_proba_best = model_best.decision_function(X_test)
    decision_threshold = 0.0
else:
    raise RuntimeError(f'Model "{best_name}" does not support predict_proba/decision_function.')

y_pred_best = (y_proba_best >= decision_threshold).astype("int8")
y_true_test = np.asarray(y_test, dtype="int8")

# sensitive features for fairness slices #
# written only when X_test_raw is in memory and holds at least one of the listed columns;
# rows are saved without the index, so alignment with the y_* arrays is positional
X_test_sens = None
if "X_test_raw" in globals():
    sens_cols = [c for c in ["sex", "race", "age_group", "education"] if c in X_test_raw.columns]
    if sens_cols:
        X_test_sens = X_test_raw[sens_cols].copy()
        X_test_sens.to_csv(ART_DIR / "X_test_sensitive.csv", index=False)
        print(f"[export] X_test_sensitive.csv with cols={sens_cols}")

# extract preprocessor and encode X_test -> X_test_enc (+ feature_names) #
def _get_preproc(pipe):
    """Return the ``"preproc"`` step of *pipe*, or ``None`` when there is none.

    An object without a ``named_steps`` attribute is treated as having no steps.
    """
    try:
        steps = pipe.named_steps
    except AttributeError:
        steps = {}
    return steps.get("preproc")


feature_names = None
X_test_enc = None

preproc = _get_preproc(model_best)
if preproc is not None:
    try:
        # Use the preprocessor exactly as it was fitted inside the pipeline.
        # model_best has already scored X_test earlier in this cell, so this step is fitted.
        # Calling .fit() here would re-fit, in place, a step of the model being exported:
        # wasted work at best, and a silent change of the encoding the classifier was
        # trained on if the pipeline was fitted on anything other than the full X_train.
        preproc_fitted = preproc  # name kept for backward compatibility
        X_test_enc = preproc_fitted.transform(X_test)
        # try to get feature names
        if hasattr(preproc_fitted, "get_feature_names_out"):
            feature_names = preproc_fitted.get_feature_names_out()
    except Exception as e:
        # best effort: the encoded matrix and feature names are optional artifacts
        print("[warn] preprocessor transform/get_feature_names_out failed:", type(e).__name__, e)

# save artifacts for '03_fairness_and_explainability.ipynb' #
np.save(ART_DIR / "y_true_test.npy", y_true_test)
np.save(ART_DIR / "y_proba_best.npy", y_proba_best)
np.save(ART_DIR / "y_pred_best.npy", y_pred_best)
print("[export] y_* saved")

if feature_names is not None:
    try:
        np.save(ART_DIR / "feature_names.npy", feature_names)
        print("[export] feature_names.npy saved")
    except Exception as e:
        print("[warn] feature_names.npy save failed:", e)

if X_test_enc is not None:
    try:
        # sparse matrices go to .npz, everything else to a dense .npy
        if sp is not None and sp.issparse(X_test_enc):
            sp.save_npz(ART_DIR / "X_test_enc.npz", X_test_enc)
            print("[export] X_test_enc.npz saved", X_test_enc.shape)
        else:
            np.save(ART_DIR / "X_test_enc.npy", np.asarray(X_test_enc))
            print("[export] X_test_enc.npy saved", np.asarray(X_test_enc).shape)
    except Exception as e:
        print("[warn] X_test_enc save failed:", e)
else:
    print("[info] X_test_enc not available (no preprocessor)")

# export metadata #
# local import: datetime.utcnow() is deprecated since Python 3.12, so an aware UTC
# timestamp is used instead; "+00:00" is rewritten to "Z" to keep the previous text format
from datetime import datetime, timezone

meta = {
    "best_model": best_name,
    "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "metrics_row": df_results_rounded[df_results_rounded["model"] == best_name].iloc[0].to_dict(),
    # export_meta.json is added explicitly: it is written only after this listing is taken,
    # so a first run would otherwise omit it while a re-run would include it
    "artifacts": sorted({p.name for p in ART_DIR.glob("*")} | {"export_meta.json"}),
}
with open(ART_DIR / "export_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print("[export] export_meta.json written")

In [None]:
# --- Export Figures (ROC, PR, Calibration, Confusion, Feature Importance) --- #

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from sklearn.calibration import calibration_curve
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve

# figures of this notebook get their own sub-folder of the reports directory #
FIG_DIR_02 = REPORTS_DIR / "figures_02"
FIG_DIR_02.mkdir(parents=True, exist_ok=True)

# look for arrays already in memory; whatever is absent stays None for now #
g = globals()
y_true = g["y_true_test"] if "y_true_test" in g else g.get("y_test")
y_proba, y_pred = g.get("y_proba_best"), g.get("y_pred_best")


def _maybe_load(npy_path: Path):
    """Load *npy_path* with ``np.load``; return ``None`` if loading fails for any reason."""
    try:
        loaded = np.load(npy_path)
    except Exception:
        # best effort: a missing or unreadable file is reported as "nothing loaded"
        loaded = None
    return loaded


# anything not found in memory is read back from the exported .npy files #
if y_true is None:
    y_true = _maybe_load(ART_DIR / "y_true_test.npy")
if y_proba is None:
    y_proba = _maybe_load(ART_DIR / "y_proba_best.npy")
if y_pred is None:
    y_pred = _maybe_load(ART_DIR / "y_pred_best.npy")

# labels and scores are mandatory for every figure below #
if y_true is None or y_proba is None:
    raise RuntimeError("[fig02] y_true и y_proba required. Run artifact export first.")

# hard labels are optional: rebuild them at 0.5 when absent or of a different length #
if not (y_pred is not None and len(y_pred) == len(y_true)):
    y_pred = (y_proba >= 0.5).astype("int8")

# ROC: curve plus the dashed chance diagonal #
fpr, tpr, _ = roc_curve(y_true, y_proba)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=2, label=f"ROC AUC = {roc_auc:.4f}")
ax.plot([0, 1], [0, 1], lw=1, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
fig.tight_layout()
fig.savefig(FIG_DIR_02 / "roc_curve.png", dpi=200)
plt.close(fig)

# PR: area is taken under the (recall, precision) curve #
precision, recall, _ = precision_recall_curve(y_true, y_proba)
pr_auc = auc(recall, precision)
fig, ax = plt.subplots()
ax.plot(recall, precision, lw=2, label=f"PR AUC = {pr_auc:.4f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend(loc="lower left")
fig.tight_layout()
fig.savefig(FIG_DIR_02 / "pr_curve.png", dpi=200)
plt.close(fig)

# calibration: 10 uniform bins against the dashed perfect-calibration diagonal #
prob_true, prob_pred = calibration_curve(y_true, y_proba, n_bins=10, strategy="uniform")
fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true, marker="o", lw=2, label="Calibration")
ax.plot([0, 1], [0, 1], linestyle="--", lw=1, label="Perfect")
ax.set_xlabel("Mean predicted probability")
ax.set_ylabel("Fraction of positives")
ax.set_title("Calibration Curve")
ax.legend(loc="upper left")
fig.tight_layout()
fig.savefig(FIG_DIR_02 / "calibration_curve.png", dpi=200)
plt.close(fig)

# confusion matrix (thr=0.5): rows are true labels, columns are predictions #
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
fig, ax = plt.subplots()
ax.imshow(cm, interpolation="nearest")
ax.set_title("Confusion Matrix (thr=0.5)")
ax.set_xticks([0, 1])
ax.set_xticklabels(["0", "1"])
ax.set_yticks([0, 1])
ax.set_yticklabels(["0", "1"])
# write every count into its cell; (i, j) is (row, column), text takes (x, y) #
for (i, j), v in np.ndenumerate(cm):
    ax.text(j, i, str(v), ha="center", va="center")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.tight_layout()
fig.savefig(FIG_DIR_02 / "confusion_matrix.png", dpi=200)
plt.close(fig)

# feature importance (LightGBM / XGBoost) #
# feature names come from the exported file when it exists and can be read #
feature_names = None
p_fn = ART_DIR / "feature_names.npy"
if p_fn.exists():
    try:
        # allow_pickle is needed for object-dtype arrays; the file is a local artifact
        feature_names = np.load(p_fn, allow_pickle=True)
    except Exception:
        feature_names = None


def _get_preproc(pipe):
    """Fetch the ``"preproc"`` step from *pipe*.

    Returns ``None`` when *pipe* has no ``named_steps`` attribute or no such step.
    """
    try:
        steps = pipe.named_steps
    except AttributeError:
        steps = {}
    return steps.get("preproc")


# feature names were not found on disk: ask the in-memory best pipeline, if there is one #
if feature_names is None and "model_best" in g:
    preproc = _get_preproc(g["model_best"])
    if preproc is not None and hasattr(preproc, "get_feature_names_out"):
        try:
            feature_names = preproc.get_feature_names_out()
        except Exception:
            feature_names = None

# LightGBM #
lgb_pack = model_registry.get("LightGBM_RS")
if lgb_pack:
    lgb_pipe = lgb_pack.get("estimator")
    # final "clf" step of the pipeline; None when the estimator exposes no named_steps
    lgb_clf = getattr(lgb_pipe, "named_steps", {}).get("clf")
    if lgb_clf is not None and hasattr(lgb_clf, "feature_importances_"):
        try:
            imp = np.array(lgb_clf.feature_importances_, dtype=float)
            # real names are used only when they line up one-to-one with the importances;
            # np.asarray guarantees the fancy indexing below also works for a plain list
            names = (
                np.asarray(feature_names)
                if (feature_names is not None and len(feature_names) == len(imp))
                else np.array([f"f{i}" for i in range(len(imp))])
            )
            # top 40 by importance; reversed when drawn so the largest bar ends up on top
            order = np.argsort(imp)[::-1][:40]
            fig = plt.figure(figsize=(8, max(4, len(order) * 0.25)))
            try:
                plt.barh(range(len(order)), imp[order][::-1])
                plt.yticks(range(len(order)), names[order][::-1])
                plt.xlabel("Importance")
                plt.title("Feature Importance — LightGBM")
                plt.tight_layout()
                plt.savefig(FIG_DIR_02 / "feature_importance_lgbm.png", dpi=200)
            finally:
                # close even when plotting/saving fails, so no half-built figure stays open
                plt.close(fig)
        except Exception as e:
            print("[warn] LGBM FI export:", type(e).__name__, e)

# XGBoost (gain) #
xgb_pack = model_registry.get("XGBoost_ES")
if xgb_pack:
    xgb_pipe = xgb_pack.get("estimator")
    xgb_clf = getattr(xgb_pipe, "named_steps", {}).get("clf")
    booster = None
    if xgb_clf is not None and hasattr(xgb_clf, "get_booster"):
        try:
            booster = xgb_clf.get_booster()
        except Exception:
            booster = None
    if booster is not None:
        try:
            # dict: f{idx} -> gain
            fscore = booster.get_score(importance_type="gain")
            items = sorted(fscore.items(), key=lambda kv: kv[1], reverse=True)[:40]
            names_raw = [k for k, _ in items]
            gains = [v for _, v in items]

            # positional f{idx} keys are translated only when the name list has exactly as
            # many entries as the booster has input features (same rule as the LightGBM
            # branch); otherwise the raw keys are shown rather than possibly wrong names
            names_match = (
                feature_names is not None and len(feature_names) == booster.num_features()
            )

            def _map(raw):
                """Translate an ``f<idx>`` key to its feature name when names are trusted."""
                if names_match and raw.startswith("f") and raw[1:].isdigit():
                    idx = int(raw[1:])
                    if idx < len(feature_names):
                        return str(feature_names[idx])
                return raw

            names = [_map(n) for n in names_raw]
            fig = plt.figure(figsize=(8, max(4, len(names) * 0.25)))
            try:
                plt.barh(range(len(names)), gains[::-1])
                plt.yticks(range(len(names)), names[::-1])
                plt.xlabel("Gain")
                plt.title("Feature Importance - XGBoost (gain)")
                plt.tight_layout()
                plt.savefig(FIG_DIR_02 / "feature_importance_xgb_gain.png", dpi=200)
            finally:
                plt.close(fig)
        except Exception as e:
            print("[warn] XGB FI export:", type(e).__name__, e)

print(
    "[fig02] Saved: "
    "roc_curve.png, pr_curve.png, calibration_curve.png, "
    "confusion_matrix.png, and FI if available."
)

In [None]:
# --- Export Best Pipeline for Inference --- #

from joblib import dump

# both the metrics table and a non-empty registry must already be in memory #
assert "df_results_rounded" in globals(), (
    "df_results_rounded not found. Run the Unified Metrics Table block"
)
assert "model_registry" in globals() and len(model_registry) > 0, "model_registry is empty"

MODELS_DIR.mkdir(parents=True, exist_ok=True)

# the best model is the one named in the first row of the metrics table #
# NOTE(review): assumes the table is sorted by test_f1 upstream - confirm #
best_name = df_results_rounded.iloc[0]["model"]
pack = model_registry.get(best_name)
if not (pack and "estimator" in pack):
    raise RuntimeError(f'"{best_name}" is missing in model_registry or has no "estimator".')

model_best = pack["estimator"]

# the same object is written twice: under its own name and under a fixed alias #
p1 = MODELS_DIR / f"{best_name}_best.joblib"
p2 = MODELS_DIR / "model_best.joblib"
for target in (p1, p2):
    dump(model_best, target)

print(f"[export] saved: {p1.name}, {p2.name} -> {MODELS_DIR}")

In [None]:
# --- Verify Artifacts for 03 (fairness & explainability) --- #

from pathlib import Path

# files whose absence aborts the check #
required = [
    ART_DIR / fname
    for fname in ("y_true_test.npy", "y_proba_best.npy", "y_pred_best.npy")
]
# files that are only reported as present or absent #
optional = [
    ART_DIR / fname
    for fname in (
        "X_test_sensitive.csv",
        "feature_names.npy",
        "X_test_enc.npy",
        "X_test_enc.npz",
        "export_meta.json",
    )
]

# fail with the names of every missing required file #
missing = [p for p in required if not p.exists()]
if missing:
    raise FileNotFoundError(
        f"Missing required artifacts: {[p.name for p in missing]} "
        f'Need to run the block "Export Artifacts for 03 (fairness & explainability)" above'
    )

print("[verify] Required artifacts found:")
for p in required:
    print("  -", p.name)

print("[verify] Optional artifacts:")
for p in optional:
    status = "OK" if p.exists() else "—"
    print("  -", p.name, status)

print("[verify] 03 is ready to run")

In [None]:
# --- Export Demo Predictions (Pipeline) --- #

import numpy as np
import pandas as pd
from joblib import dump

from paths import ROOT

# preconditions #
assert "df_results_rounded" in globals(), "df_results_rounded not found"
assert "model_registry" in globals() and len(model_registry) > 0, "model_registry is empty"
assert all(v in globals() for v in ["X_test", "y_test"]), "X_test / y_test not found"

PRED_DIR = ROOT / "predictions"
PRED_DIR.mkdir(parents=True, exist_ok=True)

# best model: first row of the metrics table #
# NOTE(review): assumes df_results_rounded is already sorted by test_f1 - confirm upstream #
best_name = df_results_rounded.iloc[0]["model"]
pack = model_registry.get(best_name)
if not pack or "estimator" not in pack:
    raise RuntimeError(f'"{best_name}" not found in model_registry.')

model_best = pack["estimator"]

# scoring #
# predict_proba returns probabilities, so the cut-off is 0.5.
# decision_function returns signed margins whose decision boundary is 0;
# cutting those at 0.5 would label every sample with a score in [0, 0.5) as negative.
if hasattr(model_best, "predict_proba"):
    proba = model_best.predict_proba(X_test)[:, 1]
    decision_threshold = 0.5
elif hasattr(model_best, "decision_function"):
    # written to the "proba" column for a stable file layout, but these are raw scores
    proba = model_best.decision_function(X_test)
    decision_threshold = 0.0
else:
    raise RuntimeError(f'Model "{best_name}" does not support predict_proba/decision_function.')

label = (proba >= decision_threshold).astype("int8")
pd.DataFrame({"proba": proba, "label": label}).to_csv(PRED_DIR / "preds_pipeline.csv", index=False)

print(f"[pred] saved: {PRED_DIR / 'preds_pipeline.csv'} via {best_name}")

In [None]:
# --- Final Path Assertions --- #

# each output folder must resolve to a path ending in data/<name>; #
# on failure the assertion message shows the configured (unresolved) path #
art_tail = ART_DIR.resolve().parts[-2:]
assert art_tail == ("data", "artifacts"), f"ART_DIR={ART_DIR}"

models_tail = MODELS_DIR.resolve().parts[-2:]
assert models_tail == ("data", "models"), f"MODELS_DIR={MODELS_DIR}"

print("[paths] ART_DIR and MODELS_DIR are valid.")