# Imports and setup 

In [None]:
import uproot
import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, HTML

# Cap rendered PNG/JPEG/SVG outputs at 500px tall and let taller ones scroll.
_output_css = "<style>.output_png, .output_jpeg, .output_svg {height: 500px; overflow-y: scroll;}</style>"
display(HTML(_output_css))

# Branches for muon pixel tracks.
# `main_branch` is the key looked up in every input file; the two lists below
# name the branches read from it.
main_branch = "Events"

# Every reconstructed-track branch shares the same prefix, so only the
# distinguishing suffixes are spelled out.
_TK_PREFIX = "muon_pixel_tracks_"
tk_branches = [
    _TK_PREFIX + suffix
    for suffix in (
        # kinematics and their uncertainties
        "p",
        "pt",
        "ptErr",
        "eta",
        "etaErr",
        "phi",
        "phiErr",
        # fit quality and hit counts
        "chi2",
        "normalizedChi2",
        "nPixelHits",
        "nTrkLays",
        "nFoundHits",
        "nLostHits",
        # impact parameters and remaining track-parameter uncertainties
        "dsz",
        "dszErr",
        "dxy",
        "dxyErr",
        "dz",
        "dzErr",
        "qoverp",
        "qoverpErr",
        "lambdaErr",
        # truth-matching flags and matched-particle ("tp") quantities
        "matched",
        "duplicate",
        "tpPdgId",
        "tpPt",
        "tpEta",
        "tpPhi",
    )
]

# Generator-level particle branches.
_GEN_PREFIX = "GenPart_"
gen_branches = [
    _GEN_PREFIX + suffix
    for suffix in (
        "pt",
        "eta",
        "phi",
        "mass",
        "pdgId",
        "statusFlags",  # added to select last-copy muons
    )
]

# Sample-selection switches:
#   legacy   -> swap "CAExtension" for "Legacy" in every selected file name
#   allPixel -> read the "AllPixel" file list instead of the "Full" one
legacy = False
allPixel = False

filesSelector = [
    "data/ntuples_TTbarCAExtensionFull.root",
    "data/ntuples_ZMMCAExtensionFull.root",
    "data/ntuples_WprimeCAExtensionFull.root",
]

filesAllPixel = [
    "data/ntuples_TTbarCAExtensionAllPixel.root",
    "data/ntuples_ZMMCAExtensionAllPixel.root",
    "data/ntuples_WprimeCAExtensionAllPixel.root",
]

# Work on a copy: `files` must not alias the source list, otherwise the legacy
# rewrite below would silently edit filesSelector / filesAllPixel as well.
files = list(filesAllPixel if allPixel else filesSelector)

if legacy:
    files = [path.replace("CAExtension", "Legacy") for path in files]
print(files)
# ntuples selection
# Read the requested branches from every input file, then merge them with a
# single concatenate call. Concatenating inside the loop would recopy all
# previously loaded events once per file.
branches_to_read = tk_branches + gen_branches
per_file_arrays = []
for f in files:
    # The context manager closes the ROOT file as soon as its arrays are read.
    with uproot.open(f) as file:
        per_file_arrays.append(file[main_branch].arrays(branches_to_read))

# With no input files there is nothing to concatenate; fall back to an empty
# array so the event count below still reports 0.
arrays = ak.concatenate(per_file_arrays, axis=0) if per_file_arrays else ak.Array([])
print(f"Loaded {len(arrays)} events")

# Data preparation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Prepare flattened feature matrix and labels for model (matched=1, fake=0)
def prepare_training_data(
    selection=None,
    train_fraction=0.7,
    shuffle=True,
    standardize=True,
    drop_duplicates=False,
    return_mask=False,
    random_state=None,
):
    """
    Build a train/test split from the per-track awkward arrays in ``arrays``.

    Reads the module-level ``arrays`` record array, flattens the selected
    tracks into a dense feature matrix, drops rows containing NaN/inf and
    splits the remainder with ``train_test_split``.

    Parameters
    ----------
    selection : awkward boolean mask or None
        Mask with the same jagged structure as the track branches. ``None``
        selects every track with ``muon_pixel_tracks_pt >= 0``.
    train_fraction : float
        Passed to ``train_test_split`` as ``train_size``.
    shuffle : bool
        Whether to shuffle before splitting. The split is stratified on the
        labels only when shuffling, because ``train_test_split`` rejects
        ``stratify`` together with ``shuffle=False``.
    standardize : bool
        Apply ``StandardScaler`` (fit on train, transform both splits).
    drop_duplicates : bool
        Also require ``muon_pixel_tracks_duplicate == 0``. Ignored when that
        field is not present in ``arrays``.
    return_mask : bool
        If True, add ``"global_mask_flat"`` to the result (see Returns).
    random_state : int, RandomState instance or None
        Forwarded to ``train_test_split``; set it for a reproducible split.

    Returns
    -------
    dict
        ``X`` / ``y``: full feature matrix and labels after selection and
        finite-value cleaning (never standardized); ``X_train``, ``X_test``,
        ``y_train``, ``y_test``: the split (standardized if requested);
        ``feature_names``: column names of ``X``; ``scaler``: the fitted
        ``StandardScaler`` or ``None``. With ``return_mask=True``,
        ``global_mask_flat`` is a boolean array with one entry per *selected*
        track (i.e. after ``selection``/``drop_duplicates``) that is True for
        the rows kept by the finite-value filter.
    """
    # Default selection: keep all tracks
    if selection is None:
        selection = arrays.muon_pixel_tracks_pt >= 0  # shape: events, nTracks

    mask = selection
    if drop_duplicates and "muon_pixel_tracks_duplicate" in arrays.fields:
        mask = mask & (arrays.muon_pixel_tracks_duplicate == 0)

    # Suffixes of the "muon_pixel_tracks_*" branches used as model inputs, in
    # column order. Currently unused candidates: p, dxy, dz, chi2, nFoundHits,
    # nLostHits, dsz.
    feature_names = [
        "pt",
        "eta",
        "phi",
        "qoverp",
        "qoverpErr",
        "dzErr",
        "etaErr",
        "lambdaErr",
        "dxyErr",
        "phiErr",
        "normalizedChi2",
        "nPixelHits",
        "nTrkLays",
        "dszErr",
    ]

    # One flat 1D numpy column per feature, restricted to the selected tracks.
    columns = [
        ak.to_numpy(ak.flatten(arrays["muon_pixel_tracks_" + name][mask]))
        for name in feature_names
    ]

    # Stack into (n_tracks, n_features)
    X = np.vstack(columns).T

    # Labels: 1 matched, 0 fake
    y = ak.to_numpy(ak.flatten(arrays.muon_pixel_tracks_matched[mask])).astype(
        np.int8
    )

    # Clean: keep only rows where every feature is finite
    finite_mask = np.isfinite(X).all(axis=1)
    if not finite_mask.all():
        X = X[finite_mask]
        y = y[finite_mask]

    # Stratify only when both classes are present AND we are shuffling:
    # train_test_split raises ValueError for stratify with shuffle=False.
    n_matched = int(y.sum())
    has_both_classes = 0 < n_matched < len(y)
    stratify = y if (shuffle and has_both_classes) else None

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_fraction,
        shuffle=shuffle,
        stratify=stratify,
        random_state=random_state,
    )

    scaler = None
    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    result = {
        "X": X,  # full feature matrix (after selection, cleaning)
        "y": y,  # full labels
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "feature_names": feature_names,
        "scaler": scaler,
    }
    if return_mask:
        # Finite-row filter over the selected tracks (not over all tracks).
        result["global_mask_flat"] = finite_mask
    return result


from collections import Counter

# IMPORTANT: avoid double scaling (set standardize=False here; Pipeline will scale).
data_sets = prepare_training_data(selection=None, train_fraction=0.7, standardize=False)

# Unpack the returned dict into the names used by the cells below.
X, y = data_sets["X"], data_sets["y"]
X_train, X_test = data_sets["X_train"], data_sets["X_test"]
y_train, y_test = data_sets["y_train"], data_sets["y_test"]
feature_names = data_sets["feature_names"]
scaler = data_sets["scaler"]  # should be None now
print("Prepared training data (raw features, no pre-standardization).")

# Weight the matched class (1) by the fake/matched ratio of the training
# split; the max(..., 1) guards against a split with no matched tracks.
counter = Counter(y_train)
n_fake, n_matched = counter[0], counter[1]
ratio = n_fake / max(n_matched, 1)
class_weight = {0: 1.0, 1: ratio}
print("Class weights:", class_weight, "Class balance train:", counter)

# BDT implementation

In [None]:
# Optional hyper-parameter scan for the decision tree; off by default.
do_selection = False

if do_selection:
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    # Every combination of these values is evaluated with 5-fold CV.
    param_grid = {
        "max_leaf_nodes": list(range(2, 20)),
        "min_samples_split": list(range(2, 20)),
        "min_samples_leaf": [20, 50, 75, 100],
        "max_depth": list(range(5, 20)),
    }

    dt = DecisionTreeClassifier(class_weight=class_weight)
    searchParam = GridSearchCV(dt, param_grid=param_grid, cv=5, n_jobs=50)
    print(searchParam)

    searchParam.fit(X_train, y_train)
    print("Best params:", searchParam.best_params_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single class-weighted decision tree on the raw (unscaled) training
# features and report its accuracy on both splits.
dt = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=6,
    min_samples_split=2,
    class_weight=class_weight,
)
dt.fit(X_train, y_train)

dt_acc_train = dt.score(X_train, y_train)
dt_acc_test = dt.score(X_test, y_test)
print(f"Plain DT acc train={dt_acc_train:.3f} test={dt_acc_test:.3f}")

In [None]:
# Optionally export the fitted tree `dt` to a Graphviz DOT file and render it
# inline; off by default.
drawTree = False

if drawTree:
    from graphviz import Source
    from sklearn.tree import export_graphviz

    dot_path = "./bdt_classifier.dot"
    export_graphviz(
        dt,
        out_file=dot_path,
        feature_names=feature_names,
        class_names=["fake", "matched"],
        rounded=True,
        filled=True,
    )

    # A bare expression nested inside `if` is not auto-rendered by the
    # notebook (only the last top-level expression of a cell is), so the
    # graph has to be passed to display() explicitly.
    display(Source.from_file(dot_path))

In [None]:
# cross_val_predict is no longer used in this cell; the import is kept so the
# name stays available to any other cell that relies on it.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

import pandas as pd
import seaborn as sn

# Evaluate the tree that was fitted on the training split. Running
# cross_val_predict on (X_test, y_test) would instead refit fresh clones on
# folds of the test set, so the resulting metrics would not describe `dt`.
model_prediction = dt.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 in (fake, matched) order even if one
# class is absent. Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, model_prediction, labels=[0, 1])

# Tool to visualise the confusion matrix
cm_class_names = ["fake", "matched"]
df_cm = pd.DataFrame(cm, index=cm_class_names, columns=cm_class_names)
plt.figure(figsize=(10, 8))
cm_ax = sn.heatmap(df_cm, annot=True, cmap="YlOrRd", fmt="d")
cm_ax.set_xlabel("Predicted class")
cm_ax.set_ylabel("True class")

In [None]:
# Compute precision, recall and F1 score
from sklearn.metrics import f1_score, precision_score, recall_score

# All three compare the test labels against `model_prediction`.
ps = precision_score(y_test, model_prediction)
rc = recall_score(y_test, model_prediction)
f1 = f1_score(y_test, model_prediction)

for metric_name, metric_value in (("Precision", ps), ("Recall", rc), ("F1 score", f1)):
    print(f"{metric_name}:", metric_value)

In [None]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a threshold over a continuous score. Feeding it hard 0/1
# predictions collapses the curve to a single operating point, so use the
# tree's predicted probability for the matched class (column 1) instead.
roc_scores = dt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, roc_scores)


def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    fpr, tpr: false/true positive rates to plot on the x/y axes.
    label: optional legend entry for the curve. When given, the legend is
        drawn so the label is actually visible.

    Also draws the dashed diagonal from (0, 0) to (1, 1), fixes both axes to
    [0, 1] and turns the grid on. Does not create or show a figure.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")  # dashed diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate (Fall-Out)", fontsize=16)
    plt.ylabel("True Positive Rate (Recall)", fontsize=16)
    plt.grid(True)
    if label is not None:
        # Without a legend the label passed above is never rendered.
        plt.legend(loc="lower right")


# Draw the ROC curve from `fpr`/`tpr` on its own 8x6 inch figure.
roc_figsize = (8, 6)
plt.figure(figsize=roc_figsize)
plot_roc_curve(fpr=fpr, tpr=tpr)
plt.show()

In [None]:
# Pipeline (single scaling) + diagnostics
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Pipeline fits its steps in place, so embedding `dt` directly would refit
# that very object on standardized features and break any later use of `dt`
# on raw features. clone() gives an unfitted copy with the same
# hyper-parameters.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", clone(dt)),
    ]
)
pipeline.fit(X_train, y_train)

# Probability of the matched class (column 1) for every test track.
proba_test = pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba_test)
print(
    f"Pipeline: acc_train={pipeline.score(X_train, y_train):.3f} acc_test={pipeline.score(X_test, y_test):.3f} AUC={auc:.3f}"
)
# Hard class predictions of the pipeline on the test split.
pred_test = pipeline.predict(X_test)

# How many test tracks were assigned to each predicted class.
unique_pred, counts_pred = np.unique(pred_test, return_counts=True)
print("Predicted class distribution on test:", dict(zip(unique_pred, counts_pred)))

# Summary statistics of the predicted matched-class probability.
print(
    f"Probability stats test: min={proba_test.min():.4f} max={proba_test.max():.4f} "
    f"mean={proba_test.mean():.4f} var={proba_test.var():.2e}"
)

# Overlay the probability distributions of true matched and true fake tracks.
plt.figure(figsize=(5, 4))
for true_class, class_label in ((1, "matched"), (0, "fake")):
    plt.hist(
        proba_test[y_test == true_class],
        bins=40,
        histtype="step",
        label=class_label,
    )
plt.xlabel("Prob(matched)")
plt.ylabel("Tracks")
plt.legend()
plt.tight_layout()
plt.savefig("pipeline_prob_dist_fixed.png", dpi=140)

# Save artifact
import joblib

# Store the fitted pipeline together with the feature column order it was
# trained on.
artifact_path = "bdt_pipeline.pkl"
artifact = dict(pipeline=pipeline, feature_names=feature_names)
joblib.dump(artifact, artifact_path)
print(f"Saved {artifact_path}")

In [None]:
# Additional full-data diagnostic to ensure non-degenerate probabilities
# Score every cleaned track (train + test) and check that the predicted
# matched-class probability actually varies.
full_proba = pipeline.predict_proba(X)[:, 1]
uniq_vals = np.unique(full_proba.round(5))
proba_high, proba_low = full_proba.max(), full_proba.min()
spread = proba_high - proba_low
n_unique = uniq_vals.size
print(f"Full prob spread={spread:.5f}, unique rounded values={n_unique}")

# Flag a model whose output is (almost) the same for every track.
nearly_constant = spread < 1e-3 or n_unique == 1
if nearly_constant:
    print(
        "WARNING: probabilities nearly constant -> model not separating; consider deeper tree or different algorithm."
    )