# Binned Training Notebook (Training-Only)

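Train one model per cluster_Et bin. Input files are configured per bin: by default the same inclusive signal/background files are reused for every bin and events are selected by cluster_Et, but separate files per bin can be configured instead. After training, compute the per-bin validation AUC and the correlation between isoET (the `recoisoET` column) and the model BDT score (predicted probability).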

Bins: [5, 15, 25, 40] (GeV) — models trained on [5–15), [15–25), [25–40).

Outputs per bin:
- Trained model artifact (joblib)
- Metadata JSON
- Validation AUC
- isoET vs BDT score correlation (Pearson/Spearman)
- Plots: AUC by bin, isoET–BDT hexbin/scatter, optional ROC curves

In [None]:
# Section 1: Import Libraries and Set Seed
import os
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import HistGradientBoostingClassifier

# Optional: XGBoost (preferred if available)
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except Exception:
    xgb = None
    XGB_AVAILABLE = False

from scipy import stats
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

# Reproducibility: a single seed, also applied to NumPy's global RNG
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

# Plot styling shared by every figure created after this cell
plt.rcParams.update({
    "figure.figsize": (7.5, 5),
    "axes.grid": True,
})
sns.set_context("talk")

In [None]:
# Section 2: Configure cluster_Et Bins and Per-Bin Input Files
# Bin edges; each label is "<low>_<high>" for one pair of consecutive edges
BIN_EDGES = [5, 15, 25, 40]
BIN_LABELS = [f"{lo}_{hi}" for lo, hi in zip(BIN_EDGES, BIN_EDGES[1:])]

# --- Input File Configuration ---
# Mode A (active): every bin reads the same inclusive files and the data
# loader keeps only the rows whose cluster_Et falls inside the bin.
USE_SINGLE_FILE_SET = True

SINGLE_FILE_SET = dict(
    signal="shapes_photon20.txt",
    background="shapes_jet30.txt",
)

# Mode B: one signal/background pair per bin, for data that is already split.
# Set USE_SINGLE_FILE_SET = False and replace the example paths below.
PER_BIN_FILE_SETS = {
    "5_15": dict(
        signal="shapes_photon_5_15.txt",      # Example path
        background="shapes_jet_5_15.txt",     # Example path
    ),
    "15_25": dict(
        signal="shapes_photon_15_25.txt",     # Example path
        background="shapes_jet_15_25.txt",    # Example path
    ),
    "25_40": dict(
        signal="shapes_photon_25_40.txt",     # Example path
        background="shapes_jet_25_40.txt",    # Example path
    ),
}

# Resolve the active mapping: bin label -> {"signal": path, "background": path}.
# In mode A every label points at the very same SINGLE_FILE_SET dict.
PER_BIN_FILES = (
    dict.fromkeys(BIN_LABELS, SINGLE_FILE_SET)
    if USE_SINGLE_FILE_SET
    else PER_BIN_FILE_SETS
)


# Column names from model_training.ipynb, in file order (one per line)
COLUMNS = [
    # cluster kinematics and vertex
    "cluster_Et",
    "cluster_Eta",
    "cluster_Phi",
    "vertexz",
    # energy ratios
    "e11_over_e33",
    "e32_over_e35",
    "e11_over_e22",
    "e11_over_e13",
    "e11_over_e15",
    "e11_over_e17",
    "e11_over_e31",
    "e11_over_e51",
    "e11_over_e71",
    "e22_over_e33",
    "e22_over_e35",
    "e22_over_e37",
    "e22_over_e53",
    # cluster probability and widths
    "cluster_prob",
    "cluster_weta_cogx",
    "cluster_wphi_cogx",
    "cluster_et1",
    "cluster_et2",
    "cluster_et3",
    "cluster_et4",
    "cluster_w32",
    "cluster_w52",
    "cluster_w72",
    # isolation, tight flag and particle id
    "recoisoET",
    "is_tight",
    "pid",
]

# Key column names (ISO_COL was updated from "isoET"; LABEL_COL is added by the loader)
PT_COL, ISO_COL, LABEL_COL = "cluster_Et", "recoisoET", "label"

# Feature columns to train on from model_training.ipynb.
# Keep one name per line, each followed by a comma: two adjacent string
# literals without a comma are silently concatenated into a single name,
# which then fails the required-column check in the data loader.
FEATURES = [
    "vertexz",
    "cluster_Et",
    "e11_over_e33",
    "cluster_et1",
    "cluster_et2",
    "cluster_et3",
    "cluster_et4",
]

# Train/validation split settings: fraction used for training, the
# complementary validation fraction, and whether to stratify on the label
TRAIN_SIZE = 0.8
VAL_SIZE = 1 - TRAIN_SIZE
STRATIFY = True

# Directory that receives the models and metrics; created on first run
OUT_DIR = Path("binned_models")
OUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Section 3: Load and Concatenate Data for Each Bin
try:
    import uproot  # for ROOT files
except Exception:
    uproot = None


def load_single(path: str, names: list) -> pd.DataFrame:
    """Read one input file into a DataFrame, dispatching on its extension.

    Args:
        path: File to read; the lower-cased extension selects the reader.
        names: Column names applied to ``.csv``/``.txt`` input. Ignored for
            parquet and ROOT files, which keep their own column names.

    Returns:
        The file contents as a DataFrame.

    Raises:
        ImportError: A ``.root`` file was given but uproot is not available.
        ValueError: The extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()

    if ext in (".csv", ".txt"):
        # Whitespace-delimited text; header=0 together with names= replaces
        # the file's own first (header) row with the supplied names.
        return pd.read_csv(path, sep=r"\s+", header=0, names=names)

    if ext in (".pq", ".parquet"):
        return pd.read_parquet(path)

    if ext != ".root":
        raise ValueError(f"Unsupported file type: {ext}")

    if uproot is None:
        raise ImportError("uproot is required to read ROOT files. pip install uproot")

    # Heuristic: take the first TTree in the file and read all its branches
    with uproot.open(path) as root_file:
        tree_keys = [
            k for k in root_file.keys()
            if isinstance(root_file[k], uproot.behaviors.TTree.TTree)
        ]
        if not tree_keys:
            # Fall back to any member that exposes an ``arrays`` method
            tree_keys = [k for k, obj in root_file.items() if hasattr(obj, "arrays")]
        chosen = tree_keys[0] if tree_keys else list(root_file.keys())[0]
        return root_file[chosen].arrays(library="pd")


def load_bin_data(bin_label: str, et_min: float, et_max: float) -> pd.DataFrame:
    """Load, label and select the events belonging to one cluster_Et bin.

    Every file configured for ``bin_label`` in ``PER_BIN_FILES`` is read with
    ``load_single``. Rows from a "signal" file get label 1 and are kept only
    if ``pid`` is 1 or 2; rows from a "background" file get label 0 and are
    kept only if ``pid`` is neither. The frames are concatenated, the key
    columns are coerced to numeric, rows with a missing key value are dropped
    and finally the half-open window ``et_min <= cluster_Et < et_max`` is
    applied.

    Args:
        bin_label: Key into ``PER_BIN_FILES``.
        et_min: Inclusive lower edge of the bin.
        et_max: Exclusive upper edge of the bin.

    Returns:
        An independent DataFrame (safe to modify) with the selected rows. The
        index is the one assigned by the concatenation, so it is unique.

    Raises:
        FileNotFoundError: No files are configured for the bin, or a
            configured path does not exist.
        KeyError: A required column (key columns plus ``FEATURES``) is absent.
    """
    file_map = PER_BIN_FILES.get(bin_label, {})
    if not file_map:
        raise FileNotFoundError(f"No input files configured for bin {bin_label}")

    dfs = []
    for file_type, path in file_map.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"Configured file not found: {path}")
        df_i = load_single(path, names=COLUMNS)
        if file_type == "signal":
            df_i[LABEL_COL] = 1
            # Original author's note: "photon only"
            df_i = df_i[df_i["pid"].isin([1, 2])]
        elif file_type == "background":
            df_i[LABEL_COL] = 0
            # Original author's note: "reject electrons".
            # NOTE(review): the same pid values (1, 2) define signal above —
            # confirm the pid encoding against the ntuple producer.
            df_i = df_i[~df_i["pid"].isin([1, 2])]
        dfs.append(df_i)
    df = pd.concat(dfs, ignore_index=True)

    # Validate required columns
    required = {PT_COL, ISO_COL, LABEL_COL}.union(FEATURES or ())
    missing = sorted(required - set(df.columns))
    if missing:
        raise KeyError(f"Missing required columns for bin {bin_label}: {missing}")

    # Coerce the key columns BEFORE any comparison: a column that was parsed
    # as text cannot be compared with the numeric bin edges. `df` is the fresh
    # result of concat here, so assigning into it is not a chained assignment.
    key_cols = [PT_COL, ISO_COL, LABEL_COL]
    for col in key_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Rows with a missing/unparseable key value (including unlabeled rows)
    df = df.dropna(subset=key_cols)

    # Half-open Et window [et_min, et_max); copy so callers own the result
    in_bin = (df[PT_COL] >= et_min) & (df[PT_COL] < et_max)
    return df.loc[in_bin].copy()

In [None]:
# Section 4: Train/Validation Split per Bin

def split_train_val(df: pd.DataFrame):
    """Split ``df`` into train/validation parts using the global FEATURES.

    The feature matrix is a copy of ``df[FEATURES]`` and the target is
    ``df[LABEL_COL]`` cast to int. The split uses ``TRAIN_SIZE`` and
    ``GLOBAL_SEED`` and is stratified on the target when ``STRATIFY`` is set.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.
    """
    features = df[FEATURES].copy()
    target = df[LABEL_COL].astype(int).values
    parts = train_test_split(
        features,
        target,
        train_size=TRAIN_SIZE,
        random_state=GLOBAL_SEED,
        stratify=target if STRATIFY else None,
    )
    return tuple(parts)

In [None]:
# Section 5: Build Preprocessing and Model Pipeline

MODEL_CONFIG = dict(
    # "xgb" when the optional XGBoost import succeeded, otherwise "hgb"
    classifier="xgb" if globals().get("XGB_AVAILABLE", False) else "hgb",
    # Written for XGBClassifier; the HGB branch of build_pipeline reads only
    # max_depth, learning_rate, n_estimators and reg_lambda from this dict.
    params=dict(
        n_estimators=400,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=GLOBAL_SEED,
        n_jobs=4,
        tree_method="hist",
    ),
    use_scaler=False,
)


def build_pipeline():
    """Assemble a fresh, unfitted pipeline as described by MODEL_CONFIG.

    Steps, in order: median imputation, an optional standard scaler (when
    ``use_scaler`` is true) and the classifier under the step name ``"clf"``.

    Raises:
        ValueError: ``MODEL_CONFIG["classifier"]`` is neither ``"hgb"`` nor an
            ``"xgb"`` request with XGBoost available.
    """
    params = MODEL_CONFIG["params"]
    choice = MODEL_CONFIG["classifier"]

    if choice == "xgb" and XGB_AVAILABLE:
        clf = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            use_label_encoder=False,
            **params,
        )
    elif choice == "hgb":
        # Translate the XGBoost-style settings into their HGB counterparts;
        # a non-positive depth means "no depth limit".
        depth = params.get("max_depth", 0)
        clf = HistGradientBoostingClassifier(
            max_depth=None if depth <= 0 else depth,
            learning_rate=params.get("learning_rate", 0.1),
            max_iter=params.get("n_estimators", 300),
            l2_regularization=params.get("reg_lambda", 0.0),
            min_samples_leaf=20,
            random_state=GLOBAL_SEED,
        )
    else:
        raise ValueError("Unsupported classifier in MODEL_CONFIG")

    steps = [("imputer", SimpleImputer(strategy="median"))]
    if MODEL_CONFIG.get("use_scaler", False):
        steps.append(("scaler", StandardScaler()))
    steps.append(("clf", clf))
    return Pipeline(steps)

In [None]:
# Section 6: Train One Model per Bin
#
# For every bin: load its events, split them, fit a fresh pipeline and record
# the fitted pipeline, its validation split and basic training metrics.

trained_pipelines = {}
train_metrics = {}

# Pair each label with its (low, high) edges instead of indexing by position
for bin_label, et_min, et_max in zip(BIN_LABELS, BIN_EDGES[:-1], BIN_EDGES[1:]):
    print(f"\n=== Training bin {bin_label} [{et_min}, {et_max}) GeV ===")

    df_bin = load_bin_data(bin_label, et_min, et_max)

    if FEATURES:
        # Default path: split_train_val (Section 4) already splits on the
        # global FEATURES and is defined above this cell, so the notebook
        # runs top-to-bottom without needing any later cell.
        feats = FEATURES
        X_train, X_val, y_train, y_val = split_train_val(df_bin)
    else:
        # Fallback path: both helpers live in the "Section 3.1" cell at the
        # end of the notebook, which must be executed before this one.
        feats = autodetect_features(df_bin)
        if not feats:
            raise ValueError(f"No FEATURES found for bin {bin_label}. Please define FEATURES explicitly.")
        X_train, X_val, y_train, y_val = split_train_val_with_features(df_bin, feats)

    pipe = build_pipeline()
    t0 = time.time()
    pipe.fit(X_train, y_train)
    t1 = time.time()

    # Basic training AUC on train split
    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        y_train_proba = pipe.predict_proba(X_train)[:, 1]
    else:
        y_train_proba = pipe.decision_function(X_train)
    # AUC is undefined when the training labels contain a single class
    auc_train = roc_auc_score(y_train, y_train_proba) if len(np.unique(y_train)) > 1 else np.nan

    trained_pipelines[bin_label] = {
        "pipeline": pipe,
        "features": feats,
        "X_val": X_val,
        "y_val": y_val,
        "val_index": X_val.index,  # store for alignment
        "df_bin": df_bin,  # keep for ISO correlation reference
    }
    train_metrics[bin_label] = {
        "train_time_sec": round(t1 - t0, 3),
        "n_train": int(len(y_train)),
        "n_val": int(len(y_val)),
        "auc_train": float(auc_train),
        "et_min": float(et_min),
        "et_max": float(et_max),
    }

print("\nTraining complete for all bins.")

In [None]:
# Section 7: Persist Trained Pipelines and Metadata
#
# Writes model_<bin>.joblib and metadata_<bin>.json into OUT_DIR for every
# bin that has a trained pipeline.
# NOTE(review): json.dump writes non-finite floats as the bare token NaN, so
# a NaN auc_train ends up as non-standard JSON — confirm consumers accept it.

for bin_label in BIN_LABELS:
    entry = trained_pipelines.get(bin_label)
    if entry is None:
        continue
    pipe, feats = entry["pipeline"], entry["features"]

    model_path = OUT_DIR / f"model_{bin_label}.joblib"
    joblib.dump(pipe, model_path)

    meta = dict(
        bin_label=bin_label,
        bin_edges=BIN_EDGES,
        pt_col=PT_COL,
        iso_col=ISO_COL,
        label_col=LABEL_COL,
        features=feats,
        train_size=TRAIN_SIZE,
        random_seed=GLOBAL_SEED,
        model_config=MODEL_CONFIG,
        train_metrics=train_metrics.get(bin_label, {}),
        input_files=PER_BIN_FILES.get(bin_label, []),
    )
    meta_path = OUT_DIR / f"metadata_{bin_label}.json"
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

print(f"Saved models and metadata to: {OUT_DIR}")

In [None]:
# Section 8: Evaluate AUC per pT/cluster_Et Bin
#
# Scores each bin's validation split with its own pipeline and stores the AUC
# and ROC points. Both are left empty (NaN / None) when the validation labels
# contain a single class.

val_results = {}

for bin_label in BIN_LABELS:
    entry = trained_pipelines.get(bin_label)
    if entry is None:
        continue
    pipe = entry["pipeline"]
    X_val = entry["X_val"]
    y_val = entry["y_val"]

    # Positive-class probability when available, raw decision score otherwise
    use_proba = hasattr(pipe.named_steps["clf"], "predict_proba")
    y_proba = pipe.predict_proba(X_val)[:, 1] if use_proba else pipe.decision_function(X_val)

    if len(np.unique(y_val)) > 1:
        auc = roc_auc_score(y_val, y_proba)
        fpr, tpr, thr = roc_curve(y_val, y_proba)
    else:
        auc = np.nan
        fpr = tpr = thr = None

    val_results[bin_label] = {
        "auc_val": float(auc),
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thr,
        "n_val": int(len(y_val)),
    }

val_results

In [None]:
# Section 9: Compute isoET vs BDT Score Correlation per Bin
#
# For each trained bin, correlates the ISO_COL values of the validation rows
# with the model score on those rows (Pearson and Spearman).

correlation_results = {}

for bin_label in BIN_LABELS:
    entry = trained_pipelines.get(bin_label)
    if entry is None:
        continue
    pipe = entry["pipeline"]
    X_val = entry["X_val"]

    # Look the isolation values up by the validation index kept at split time
    iso_val = entry["df_bin"].loc[entry["val_index"], ISO_COL].to_numpy()

    # Positive-class probability when available, raw decision score otherwise
    use_proba = hasattr(pipe.named_steps["clf"], "predict_proba")
    y_proba = pipe.predict_proba(X_val)[:, 1] if use_proba else pipe.decision_function(X_val)

    # Keep only the pairs where both values are finite
    mask = np.isfinite(iso_val) & np.isfinite(y_proba)
    iso_c, proba_c = iso_val[mask], y_proba[mask]

    if len(iso_c) > 1:
        pearson_r, pearson_p = stats.pearsonr(iso_c, proba_c)
        spearman_rho, spearman_p = stats.spearmanr(iso_c, proba_c)
    else:
        pearson_r = pearson_p = spearman_rho = spearman_p = np.nan

    raw = {
        "pearson_r": pearson_r,
        "pearson_p": pearson_p,
        "spearman_rho": spearman_rho,
        "spearman_p": spearman_p,
    }
    correlation_results[bin_label] = {
        # Any non-finite statistic is stored as NaN
        **{key: float(val) if np.isfinite(val) else np.nan for key, val in raw.items()},
        "n_pairs": int(mask.sum()),
    }

correlation_results

In [None]:
# Section 10: Visualize AUC and isoET–BDT Relationships

# AUC bar plot: one bar per evaluated bin, annotated with its value
scored_bins = [b for b in BIN_LABELS if b in val_results]
if scored_bins:
    auc_values = [val_results[b]["auc_val"] for b in scored_bins]
    plt.figure()
    sns.barplot(x=scored_bins, y=auc_values, color="steelblue")
    plt.xlabel("cluster_Et bin [GeV]")
    plt.ylabel("Validation AUC")
    plt.title("AUC by pT bin")
    plt.ylim(0.5, 1.0)
    for pos, value in enumerate(auc_values):
        plt.text(pos, value + 0.01, f"{value:.3f}", ha="center")
    plt.show()

# isoET vs BDT score per bin: hexbin above 5000 points, scatter otherwise
for bin_label in BIN_LABELS:
    entry = trained_pipelines.get(bin_label)
    if entry is None:
        continue
    pipe = entry["pipeline"]
    X_val = entry["X_val"]

    iso_val = entry["df_bin"].loc[entry["val_index"], ISO_COL].to_numpy()

    # Positive-class probability when available, raw decision score otherwise
    use_proba = hasattr(pipe.named_steps["clf"], "predict_proba")
    y_proba = pipe.predict_proba(X_val)[:, 1] if use_proba else pipe.decision_function(X_val)

    # Plot only the pairs where both values are finite
    mask = np.isfinite(iso_val) & np.isfinite(y_proba)
    iso_c = iso_val[mask]
    proba_c = y_proba[mask]

    plt.figure()
    if len(iso_c) <= 5000:
        plt.scatter(iso_c, proba_c, s=8, alpha=0.4)
    else:
        plt.hexbin(iso_c, proba_c, gridsize=40, cmap="viridis", mincnt=1)
        plt.colorbar().set_label("count")
    plt.xlabel("isoET")
    plt.ylabel("BDT score (prob)")
    plt.title(f"isoET vs BDT score — bin {bin_label}")
    plt.show()

# Optional: one ROC figure per bin that has ROC points stored
for bin_label in BIN_LABELS:
    r = val_results.get(bin_label)
    if r is None or r["fpr"] is None:
        continue
    plt.figure()
    plt.plot(r["fpr"], r["tpr"], label=f"{bin_label} (AUC={r['auc_val']:.3f})")
    # Diagonal reference line
    plt.plot([0,1],[0,1],"k--", alpha=0.5)
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC per bin")
    plt.legend()
    plt.show()

In [None]:
# Section 11: Summarize and Save Metrics Table
#
# One row per configured bin, combining training metrics, validation AUC and
# the isoET/score correlations, written to OUT_DIR / "per_bin_metrics.csv".

# Placeholder correlation entries for bins that were never evaluated
no_correlation = {
    "pearson_r": np.nan,
    "pearson_p": np.nan,
    "spearman_rho": np.nan,
    "spearman_p": np.nan,
    "n_pairs": 0,
}

rows = []
for bin_label in BIN_LABELS:
    row = {"bin": bin_label}
    # Training metrics
    row.update(train_metrics.get(bin_label, {}))
    # Validation AUC (overrides the n_val recorded at training time)
    val = val_results.get(bin_label)
    row["auc_val"] = np.nan if val is None else val["auc_val"]
    row["n_val"] = 0 if val is None else val["n_val"]
    # Correlations
    row.update(correlation_results.get(bin_label, no_correlation))
    rows.append(row)

column_order = [
    "bin", "et_min", "et_max", "n_train", "n_val", "auc_train", "auc_val",
    "pearson_r", "pearson_p", "spearman_rho", "spearman_p", "n_pairs", "train_time_sec",
]
# reindex (rather than [[...]]) so a column absent from every row — e.g. when
# no bin was trained — becomes NaN instead of raising KeyError.
# Rows are deliberately NOT sorted by "bin": the labels are strings, so sorting
# would give "15_25" < "25_40" < "5_15"; BIN_LABELS order is already
# ascending in Et.
metrics_df = pd.DataFrame(rows).reindex(columns=column_order)

metrics_path = OUT_DIR / "per_bin_metrics.csv"
metrics_df.to_csv(metrics_path, index=False)

print("Metrics saved:", metrics_path)
metrics_df

In [None]:
# Section 3.1: Auto-detect FEATURES if empty (numeric columns excluding PT/ISO/LABEL)

def autodetect_features(df: pd.DataFrame, extra_exclude=("pid",)) -> list:
    """Return the numeric columns of ``df`` that are usable as features.

    ``PT_COL``, ``ISO_COL`` and ``LABEL_COL`` are always excluded. ``pid`` is
    excluded by default because the data loader derives the label from it
    (signal keeps pid 1/2, background keeps everything else), so training on
    it would leak the label.

    Args:
        df: Frame whose numeric columns are inspected.
        extra_exclude: Additional column names to leave out.
            NOTE(review): ``is_tight`` is still auto-selected — confirm
            whether it should be excluded as well.

    Returns:
        Column names in the order they appear in ``df``.
    """
    exclude = {PT_COL, ISO_COL, LABEL_COL, *extra_exclude}
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    return [c for c in numeric_cols if c not in exclude]


def split_train_val_with_features(df: pd.DataFrame, features: list):
    """Split ``df`` into train/validation parts using an explicit feature list.

    The feature matrix is a copy of ``df[features]`` and the target is
    ``df[LABEL_COL]`` cast to int. The split uses ``TRAIN_SIZE`` and
    ``GLOBAL_SEED`` and is stratified on the target when ``STRATIFY`` is set.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.
    """
    matrix = df[features].copy()
    target = df[LABEL_COL].astype(int).values
    if STRATIFY:
        strat = target
    else:
        strat = None
    parts = train_test_split(
        matrix,
        target,
        train_size=TRAIN_SIZE,
        random_state=GLOBAL_SEED,
        stratify=strat,
    )
    return tuple(parts)