In [13]:
import pandas as pd
import torch
import numpy as np
import vector
from pathlib import Path
import os
import matplotlib.pyplot as plt
from downstreams.plotting.kinematic_comparison import plot_kinematics_comparison

In [14]:
def process_batch(batch):
    """Flatten the predicted and target neutrino tensors of one batch into a DataFrame.

    Parameters
    ----------
    batch : mapping
        Must provide the entries ``'predict'`` and ``'target'``; each is a
        mapping from a key to a torch tensor that is indexed as
        ``tensor[:, i]`` (one column per ``i`` along dimension 1).

    Returns
    -------
    pandas.DataFrame
        One column per tensor column, labelled by the two-level tuple
        ``(label, f"{key without 'log_'}_{i}")`` where ``label`` is
        ``'predict'`` or ``'target'``.

    Notes
    -----
    Only keys containing ``'log_pt'`` are exponentiated. Any other key that
    contains ``'log_'`` is renamed (the ``'log_'`` part is stripped) but its
    values are passed through unchanged.
    """
    columns = {}
    for label in ('predict', 'target'):
        for key, tensor in batch[label].items():
            # ``Tensor.numpy()`` refuses tensors that require grad or live on a
            # non-CPU device, so detach and move to CPU once per tensor.
            values = tensor.detach().cpu().numpy()
            is_log_pt = 'log_pt' in key
            base_name = key.replace('log_', '')

            for i in range(values.shape[1]):
                column = values[:, i]
                columns[(label, f"{base_name}_{i}")] = np.exp(column) if is_log_pt else column

    # Create MultiIndex DataFrame
    df = pd.DataFrame(columns)
    df.columns = pd.MultiIndex.from_tuples(df.columns)
    return df


def extract_neutrinos(df, label):
    """Collect the two neutrinos stored under ``label`` into one vector array.

    Reads the columns ``(label, "pt_i")``, ``(label, "eta_i")`` and
    ``(label, "phi_i")`` for ``i`` in 0 and 1, stacks each pair along a new
    second axis and hands them to ``vector.array`` together with an all-zero
    ``mass`` field of the same shape.
    """

    def _stack_pair(component):
        # One 1-D column per neutrino, joined side by side -> (num_rows, 2)
        per_neutrino = [df[(label, f"{component}_{idx}")].values for idx in (0, 1)]
        return np.stack(per_neutrino, axis=1)

    pt = _stack_pair("pt")
    eta = _stack_pair("eta")
    phi = _stack_pair("phi")

    fields = {
        "pt": pt,
        "eta": eta,
        "phi": phi,
        "mass": np.zeros_like(pt),
    }
    return vector.array(fields)


def extract_particles(df, prefix1, prefix2):
    """
    Combine two particles, identified by column prefix, into one vector array.

    For each prefix the columns ``{prefix}/pt``, ``{prefix}/eta`` and
    ``{prefix}/phi`` are read together with ``{prefix}/mass`` or, when no mass
    column exists, ``{prefix}/energy``. The two particles are stacked along a
    new second axis and passed to ``vector.arr``.

    Raises ``ValueError`` when a prefix has neither a mass nor an energy
    column, or when the two prefixes do not use the same fourth component.
    """

    def _read(prefix):
        # Mass takes precedence over energy when both columns are present.
        for fourth in ("mass", "energy"):
            if f"{prefix}/{fourth}" in df.columns:
                break
        else:
            raise ValueError(f"Missing mass or energy columns for prefix: {prefix}")

        arrays = [df[f"{prefix}/{name}"].values for name in ("pt", "eta", "phi", fourth)]
        return arrays, fourth

    first, type1 = _read(prefix1)
    second, type2 = _read(prefix2)

    if type1 != type2:
        raise ValueError(f"Inconsistent 4-momentum components: {prefix1} uses {type1}, {prefix2} uses {type2}")

    field_names = ("pt", "eta", "phi", type1)
    stacked = {
        name: np.stack([a, b], axis=1)
        for name, a, b in zip(field_names, first, second)
    }
    return vector.arr(stacked)


def process_data(data, baseline_selections):
    """Turn an iterable of prediction batches into a dict of particle arrays.

    Each batch contributes the flattened ``batch['neutrinos']`` table plus
    every entry whose key contains ``'EXTRA/'`` (stored with that marker
    removed). All batches are concatenated, filtered with
    ``DataFrame.query(baseline_selections)`` and converted to vector arrays.

    Returns a dict with the keys ``predict``, ``target``, ``b``, ``lepton``,
    ``truth_top``, ``truth_W``, ``truth_lepton``, ``W``, ``top``,
    ``plot_truth_W`` and ``plot_truth_top``.
    """
    frames = []
    for batch in data:
        neutrino_frame = process_batch(batch['neutrinos'])
        extra_frame = pd.DataFrame({
            name.replace('EXTRA/', ''): batch[name]
            for name in batch.keys()
            if 'EXTRA/' in name
        })
        frames.append(pd.concat([neutrino_frame, extra_frame], axis=1))

    selected = pd.concat(frames, ignore_index=True).query(baseline_selections)

    particles = {
        "predict": extract_neutrinos(selected, "predict"),
        "target": extract_neutrinos(selected, "target"),
    }

    # Objects that come as a (first, second) pair of column prefixes
    paired_prefixes = {
        "b": ("t1/b", "t2/b"),
        "lepton": ("t1/l", "t2/l"),
        "truth_top": ("truth_t1/t", "truth_t2/t"),
        "truth_W": ("truth_t1/W", "truth_t2/W"),
        "truth_lepton": ("truth_t1/l", "truth_t2/l"),
    }
    for name, (first, second) in paired_prefixes.items():
        particles[name] = extract_particles(selected, first, second)

    lepton, b_quark = particles['lepton'], particles['b']

    # Reconstructed W and top built with the predicted neutrinos
    particles['W'] = lepton + particles['predict']
    particles['top'] = b_quark + particles['W']
    # Same composites, but built with the target neutrinos instead
    particles['plot_truth_W'] = lepton + particles['target']
    particles['plot_truth_top'] = b_quark + lepton + particles['target']

    return particles

In [15]:
def preprocess(baseline_nu):
    """Build one table of observables for the truth and the two reco scenarios.

    The same six observables (``m_tt``, ``pt_tt``, ``y_tt``, ``pt_t1``,
    ``pt_t2``, ``dphi_ll``) are computed three times and returned side by side
    with the column suffixes ``_truth``, ``_reco_truthnu`` and
    ``_reco_prednu``.
    """

    def build_observables(top: vector.MomentumNumpy4D, lepton: vector.MomentumNumpy4D):
        # Sum the two tops of every event to get the top-pair system
        ttbar = top.sum(axis=1)
        first_top, second_top = top[:, 0], top[:, 1]
        first_lepton, second_lepton = lepton[:, 0], lepton[:, 1]

        observables = {
            "m_tt": ttbar.mass,
            "pt_tt": ttbar.pt,
            "y_tt": ttbar.rapidity,
            "pt_t1": first_top.pt,
            "pt_t2": second_top.pt,
            # Azimuthal separation of the two leptons in units of pi
            "dphi_ll": first_lepton.deltaphi(second_lepton) / np.pi,
        }
        return pd.DataFrame(observables)

    # (column suffix, key of the top array, key of the lepton array)
    scenarios = (
        ("_truth", "truth_top", "truth_lepton"),
        ("_reco_truthnu", "plot_truth_top", "lepton"),
        ("_reco_prednu", "top", "lepton"),
    )

    # Suffix each table so the columns stay unique once placed side by side
    tables = [
        build_observables(baseline_nu[top_key], baseline_nu[lepton_key]).add_suffix(suffix)
        for suffix, top_key, lepton_key in scenarios
    ]

    return pd.concat(tables, axis=1)

In [16]:
# Output directory for every figure written by this notebook
p_dir = Path(os.getcwd()) / "aux"

# Directory holding the prediction files. Defaults to the original local
# checkout; set EVENET_NU2FLOW_DIR to run the notebook on another machine.
NU2FLOW_DATA_DIR = Path(os.environ.get(
    "EVENET_NU2FLOW_DIR",
    "/Users/avencastmini/PycharmProjects/EveNet/workspace/test_data/nu2flow",
))

# One selection shared by both samples so that they cannot drift apart.
# Alternatives that were tried:
#   "(num_bjet >= 0)"
#   "`t1/b/pt` > 25 and `t2/b/pt` > 25 and `t1/l/pt` > 15 and `t2/l/pt` > 15"
BASELINE_SELECTION = "(num_bjet == 2) and `t1/b/pt` > 0 and `t2/b/pt` > 0 and `t1/l/pt` > 0 and `t2/l/pt` > 0"

# map_location="cpu" keeps the load independent of the device the file was saved from.
nu_pretrain = process_data(
    torch.load(NU2FLOW_DATA_DIR / "prediction-mg5-300.pretrain.pt", map_location="cpu"),
    baseline_selections=BASELINE_SELECTION,
)

nu_scratch = process_data(
    torch.load(NU2FLOW_DATA_DIR / "prediction-mg5-300.pt", map_location="cpu"),
    baseline_selections=BASELINE_SELECTION,
)

print("Selected Events: ", len(nu_pretrain["predict"]), len(nu_scratch["predict"]))

Selected Events:  26729 26729


In [17]:
# Unfolding: compute the observables for both trainings and unfold each table
from unfolding.unfold import main, hist_setup
from downstreams.plotting.unfolding import plot_block_response, plot_uncertainty_with_ratio

bin_edges = hist_setup()

# Order matters for the printed log: pretrain first, then scratch
processed_nu_pretrain, processed_nu_scratch = (
    preprocess(particles) for particles in (nu_pretrain, nu_scratch)
)

df_unfolded_pretrain, df_unfolded_scratch = (
    main(df=observables) for observables in (processed_nu_pretrain, processed_nu_scratch)
)

Category: truthnu - Variable: dphi_ll - done
Category: truthnu - Variable: pt_t1 - done
Category: truthnu - Variable: pt_t2 - done
Category: truthnu - Variable: pt_tt - done
Category: truthnu - Variable: y_tt - done
Category: prednu - Variable: dphi_ll - done
Category: prednu - Variable: pt_t1 - done
Category: prednu - Variable: pt_t2 - done
Category: prednu - Variable: pt_tt - done
Category: prednu - Variable: y_tt - done
Category: truthnu - Variable: dphi_ll - done
Category: truthnu - Variable: pt_t1 - done
Category: truthnu - Variable: pt_t2 - done
Category: truthnu - Variable: pt_tt - done
Category: truthnu - Variable: y_tt - done
Category: prednu - Variable: dphi_ll - done
Category: prednu - Variable: pt_t1 - done
Category: prednu - Variable: pt_t2 - done
Category: prednu - Variable: pt_tt - done
Category: prednu - Variable: y_tt - done


In [18]:
def build_16x16_response(
        df,
        mtt_truth_col,
        mtt_reco_col,
        var_truth_col,
        var_reco_col,
        mtt_bins,
        var_bins
):
    """Build a block response matrix of a variable in bins of m_tt.

    The matrix is made of ``n_mtt x n_mtt`` blocks, where block ``(i, j)``
    holds the 2-D histogram (reco variable along rows, truth variable along
    columns) of the events whose reco m_tt falls in bin ``i`` and whose truth
    m_tt falls in bin ``j``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the four columns named below.
    mtt_truth_col, mtt_reco_col : str
        Columns used to pick the block column / block row.
    var_truth_col, var_reco_col : str
        Columns histogrammed inside each block.
    mtt_bins, var_bins : sequence of float
        Bin edges. The two sequences may have different lengths; with four
        bins each the result is the 16x16 matrix the name refers to.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_mtt * n_var, n_mtt * n_var)`` with event counts.

    Notes
    -----
    m_tt bins are half-open ``[low, high)``, so values outside the outermost
    edges (or equal to the last edge) are not counted. Inside a block the
    binning follows ``numpy.histogram2d``.
    """
    # The number of blocks follows the m_tt binning, the block size follows the
    # variable binning; the two are independent.
    n_mtt = len(mtt_bins) - 1
    n_var = len(var_bins) - 1
    size = n_mtt * n_var
    response = np.zeros((size, size))

    def _bin_masks(values):
        # One boolean mask per m_tt bin, computed once instead of per block
        return [
            (values >= mtt_bins[k]) & (values < mtt_bins[k + 1])
            for k in range(n_mtt)
        ]

    reco_masks = _bin_masks(df[mtt_reco_col])
    truth_masks = _bin_masks(df[mtt_truth_col])

    for i, reco_mask in enumerate(reco_masks):  # reco mtt bin -> block row
        row_start = i * n_var
        for j, truth_mask in enumerate(truth_masks):  # truth mtt bin -> block column
            col_start = j * n_var
            df_sel = df[reco_mask & truth_mask]

            h2d, _, _ = np.histogram2d(
                df_sel[var_reco_col],
                df_sel[var_truth_col],
                bins=[var_bins, var_bins]
            )

            response[row_start:row_start + n_var, col_start:col_start + n_var] = h2d

    return response


# Bin edges and axis labels shared by the response-matrix and uncertainty plots
bins_mtt = bin_edges["m_tt"]
mtt_labels = [
    r"$m_{t\bar{t}} < 400$",
    r"$400 < m_{t\bar{t}} < 500$",
    r"$500 < m_{t\bar{t}} < 800$",
    r"$m_{t\bar{t}} \geq 800$"
]

# (variable, axis title, one tick label per bin)
_observable_display = (
    ("dphi_ll", r"$\Delta\phi(\ell^+,\ell^-) / \pi$ [rad/$\pi$]", ["0–0.25", "0.25–0.5", "0.5–0.75", "0.75–1.0"]),
    ("pt_t1", r"$p_T^t$ [GeV]", ["<75", "75–125", "125–175", "≥175"]),
    ("pt_tt", r"$p_{T}^{t\bar{t}}$ [GeV]", ["<70", "70–140", "140–200", "≥200"]),
    ("y_tt", r"$y_{t\bar{t}}$", ["<–1", "–1–0", "0–1", ">1"]),
)

# Per variable: axis title, bin edges, name of the truth column and tick labels
common_labels = {
    variable: {
        "name": axis_title,
        "bins": bin_edges[variable],
        "truth_col": f"{variable}_truth",
        "labels": tick_labels,
    }
    for variable, axis_title, tick_labels in _observable_display
}

# Draw one block response matrix per (scenario, training, variable).
# `scenario` selects which reconstructed columns are used: the ones built with
# the target neutrinos ("reco_truthnu") or with the predicted ones ("reco_prednu").
for scenario, s_name in zip(["reco_truthnu", "reco_prednu"], ["Truth_Nu", "Pred_Nu"]):
    variable_configs = [
        {"reco_col": f"{var_name}_{scenario}", **var_labels}
        for var_name, var_labels in common_labels.items()
    ]

    # Build and plot all variables
    for df_plot, tag in zip([
        processed_nu_pretrain,
        processed_nu_scratch
    ], [
        'pretrain',
        'scratch'
    ]):
        for var in variable_configs:
            response = build_16x16_response(
                df_plot,
                mtt_truth_col="m_tt_truth",
                # The reco m_tt must come from the same scenario as the plotted
                # variable; it used to be fixed to the predicted-neutrino column,
                # which mixed both scenarios in the Truth_Nu matrices.
                mtt_reco_col=f"m_tt_{scenario}",
                var_truth_col=var["truth_col"],
                var_reco_col=var["reco_col"],
                mtt_bins=bins_mtt,
                var_bins=var["bins"]
            )

            plot_block_response(
                response,
                title=f"EveNet: {s_name}",
                var_labels=var["labels"],
                mtt_labels=mtt_labels,
                xlabel=f"Detector-level {var['name']}",
                ylabel=f"Parton-level {var['name']}",
                p_dir=p_dir,
                save_name=f"{s_name}_{var['truth_col'].replace('_truth', '')}.{tag}.pdf"
            )

# Reference numbers for the comparison plots: one table per variable, one
# column per method, one row per bin (16 bins, indexed 0..15).
columns = list(range(16))
_paper_methods = ("nu-weighting", "Ellipse", "nu2-flows", "nu2-flows-pythia8")
_paper_values = {
    # rows follow the order of _paper_methods
    "y_tt": (
        [2.9, 3.2, 3.0, 2.6, 2.9, 3.4, 3.1, 2.6, 2.7, 2.8, 2.9, 2.6, 2.4, 2.5, 2.6, 2.7],
        [3.5, 3.6, 3.2, 2.4, 2.2, 3.6, 3.5, 3.1, 3.6, 3.8, 4.1, 3.5, 3.1, 3.4, 3.6, 3.6],
        [1.8, 1.9, 1.9, 1.8, 1.9, 2.1, 1.9, 1.8, 1.8, 1.8, 1.8, 1.7, 1.6, 1.6, 1.6, 1.7],
        [1.9, 1.9, 1.9, 1.8, 1.9, 2.1, 2.0, 1.8, 1.8, 1.8, 1.8, 1.7, 1.7, 1.6, 1.7, 1.8],
    ),
    "pt_tt": (
        [3.5, 2.6, 1.6, 2.1, 3.4, 3.0, 2.4, 2.3, 2.5, 2.7, 2.5, 2.2, 2.1, 2.2, 2.2, 2.3],
        [7.6, 5.3, 2.2, 3.7, 7.3, 5.7, 4.3, 4.1, 4.0, 4.7, 4.4, 3.8, 3.6, 3.3, 3.5, 3.8],
        [1.9, 1.6, 1.2, 1.4, 2.0, 1.7, 1.5, 1.5, 1.6, 1.6, 1.6, 1.5, 1.5, 1.5, 1.5, 1.5],
        [2.0, 1.6, 1.2, 1.4, 2.0, 1.7, 1.5, 1.5, 1.7, 1.6, 1.6, 1.5, 1.5, 1.5, 1.5, 1.5],
    ),
    "pt_t1": (
        [3.1, 2.3, 1.7, 2.5, 3.2, 3.2, 2.8, 2.9, 2.8, 2.9, 2.9, 2.3, 2.2, 2.2, 2.3, 2.4],
        [4.8, 3.1, 2.5, 3.8, 4.9, 4.9, 3.8, 4.3, 4.0, 4.4, 4.9, 3.0, 3.5, 3.7, 3.6, 3.4],
        [2.2, 1.9, 1.8, 2.1, 2.5, 2.4, 2.0, 2.2, 2.1, 2.0, 2.0, 1.7, 1.9, 1.9, 1.7, 1.6],
        [2.3, 1.9, 1.9, 2.2, 2.5, 2.4, 2.0, 2.3, 2.2, 2.1, 2.0, 1.7, 2.0, 2.0, 1.7, 1.6],
    ),
    "dphi_ll": (
        [2.0, 2.0, 1.5, 1.4, 1.9, 2.1, 2.0, 1.9, 1.9, 2.0, 2.2, 2.2, 2.0, 1.5, 1.6, 1.9],
        [2.2, 2.1, 1.5, 1.3, 1.8, 2.3, 2.4, 2.3, 2.3, 2.5, 2.8, 2.4, 1.7, 1.9, 2.4, 2.6],
        [1.5, 1.5, 1.4, 1.4, 1.6, 1.6, 1.5, 1.5, 1.5, 1.5, 1.5, 1.3, 1.4, 1.5, 1.5, 1.5],
        [1.5, 1.5, 1.4, 1.4, 1.6, 1.6, 1.5, 1.5, 1.5, 1.5, 1.6, 1.5, 1.3, 1.4, 1.5, 1.5],
    ),
}
paper_results = {
    variable: pd.DataFrame(dict(zip(_paper_methods, rows)), index=columns)
    for variable, rows in _paper_values.items()
}

# One uncertainty comparison figure per variable: both EveNet trainings
# (prediction error divided by the truth-neutrino error) next to the reference numbers.
for var, var_cfg in common_labels.items():
    truth_key = f"{var}_truthnu_unfold_error"
    pred_key = f"{var}_prednu_unfold_error"
    reference = paper_results[var]

    truth_pretrain = df_unfolded_pretrain[truth_key].values
    truth_scratch = df_unfolded_scratch[truth_key].values

    # The "nu2-flows-pythia8" reference column exists but is not drawn.
    methods = [
        dict(name=r"EveNet - Scratch", color="red",
             data=df_unfolded_scratch[pred_key] / truth_scratch),
        dict(name=r"EveNet - Pretrain", color="green",
             data=df_unfolded_pretrain[pred_key] / truth_pretrain),
        dict(name=r"$\nu^2$-Flows", color="blue", data=reference["nu2-flows"].values),
        dict(name=r"Ellipse", color="orange", data=reference["Ellipse"].values),
        dict(name=r"$\nu$-weighting", color="purple", data=reference["nu-weighting"].values),
    ]

    plot_uncertainty_with_ratio(
        mtt_labels,
        var_cfg["labels"],
        var_cfg['name'],
        methods,
        ratio_baseline_name=r"$\nu^2$-Flows",
        p_dir=p_dir,
        save_name=f"unfolded_{var}.pdf",
        ratio_baseline_max=0.25,
        ratio_baseline_min=-0.05,
        ratio_y_label=r"Improvement to $\nu^2$-Flows",
    )



min_ratio, max_ratio -0.866666088889274 0.1608187236499177
min_ratio, max_ratio -1.4499992750003625 0.1362996164048001
min_ratio, max_ratio -2.9999984210534625 0.11212744687093991
min_ratio, max_ratio -1.277777067901629 0.12310610985019961


In [19]:
def _comparison_config(superscript, label_stem, columns, pt_max, mass_range=None):
    """Assemble the plotting options of one particle type.

    ``superscript`` is the symbol used in the axis titles, ``label_stem`` the
    text in front of "(scratch)" / "(pretrain)" in the legend, ``columns`` the
    pair of particle keys to compare, ``pt_max`` the upper pt limit and
    ``mass_range`` the mass axis limits (omit it to leave mass out).
    """
    variables = ["pt", "eta", "phi"]
    x_labels = [
        rf"$p_T^{{{superscript}}}$ [GeV]",
        rf"$\eta^{{{superscript}}}$",
        rf"$\phi^{{{superscript}}}$",
    ]
    kin_range = {
        "pt": (0, pt_max),
        "eta": (-np.pi * 1.5, np.pi * 1.5),
        "phi": (-np.pi, np.pi),
    }
    # Only the pt panel uses a logarithmic y axis
    log_y = [True, False, False]

    if mass_range is not None:
        variables.append("mass")
        x_labels.append(rf"$mass^{{{superscript}}}$ [GeV]")
        kin_range["mass"] = mass_range
        log_y.append(False)

    return {
        "variables": variables,
        "x_labels": x_labels,
        "kin_range": kin_range,
        "labels": [rf"{label_stem} (scratch)", rf"{label_stem} (pretrain)"],
        "colors": ['#5bb5ac', '#de526c'],
        "columns": columns,
        "log_y": log_y,
    }


named_configs = {
    "neutrino": _comparison_config(r"\nu", r"$\nu$", ['predict', 'target'], pt_max=350),
    "top": _comparison_config("t", r"$top$", ['top', 'plot_truth_top'], pt_max=600, mass_range=(100, 240)),
    "W": _comparison_config("W", r"$W$", ['W', 'plot_truth_W'], pt_max=350, mass_range=(40, 120)),
}

def _pool_pair(sample, column, var):
    """Return ``var`` of both objects stored under ``sample[column]``, joined end to end."""
    return np.concatenate([
        getattr(sample[column][..., 0], var),
        getattr(sample[column][..., 1], var)
    ], axis=0)


# One figure per particle type and kinematic variable. Inside each figure the
# scratch training comes first and the pretrained one second, matching the
# order of cfg["labels"] and cfg["colors"].
for particle, cfg in named_configs.items():
    compared_col = cfg['columns'][0]
    reference_col = cfg['columns'][1]
    log_y_flags = cfg.get("log_y", [False, False, False, False])

    for i, var in enumerate(cfg["variables"]):
        fig, axs = plt.subplots(
            3, 1, figsize=(10, 16),
            gridspec_kw={'height_ratios': [3, 1, 2], 'hspace': 0.0},
            sharex=True
        )

        plot_kinematics_comparison(
            axs=axs,
            kin=[
                _pool_pair(nu_scratch, compared_col, var),
                _pool_pair(nu_pretrain, compared_col, var),
            ],
            truth_kin=[
                _pool_pair(nu_scratch, reference_col, var),
                _pool_pair(nu_pretrain, reference_col, var),
            ],
            bins=100,
            kin_range=cfg["kin_range"][var],
            labels=cfg["labels"],
            colors=cfg["colors"],
            xlabel=cfg["x_labels"][i],
            normalize_col=cfg.get("normalize_col", False),
            log_z=cfg.get("log_z", True),
            log_y=log_y_flags[i],
            c_percent=np.array([10, 100])
        )

        plt.tight_layout()
        # Make sure the output folder exists before writing the figure
        (p_dir / "kinematics").mkdir(parents=True, exist_ok=True)
        plt.savefig(p_dir / "kinematics" / f"{particle}_{var}.pdf")
        # Close each figure so that they do not pile up in memory
        plt.close(fig)

  ax.contourf(X, Y, Z, levels=levels, cmap=contour_colors[i], alpha=0.5, norm=mcolors.LogNorm())
  ax.contourf(X, Y, Z, levels=levels, cmap=contour_colors[i], alpha=0.5, norm=mcolors.LogNorm())
  ax.contourf(X, Y, Z, levels=levels, cmap=contour_colors[i], alpha=0.5, norm=mcolors.LogNorm())
  ax.contourf(X, Y, Z, levels=levels, cmap=contour_colors[i], alpha=0.5, norm=mcolors.LogNorm())
