In [None]:
import math
import os
import glob

import ROOT
from hepdata_lib import Submission, Table, Variable, Uncertainty, RootFileReader



# --- Configuration -----------------------------------------------------------
# ROOT file opened by build_submission() and the directory main() writes to.
ROOT_FILENAME = "input/ZNuNuGPrePostFitPostFitEBEE.root"
OUTPUT_DIR = "hepdata_output"

# Optional path to an abstract file read by build_submission(); None skips it.
# Defined here so that the notebook survives Restart Kernel -> Run All
# (build_submission() references this name).
ABSTRACT_FILE = None

# Header and units of the binned independent variable of the Figure 4 tables.
OBSERVABLE_NAME = r"$p_{T}^{\gamma}$"
OBSERVABLE_UNITS = "GeV"

# Data uncertainties: asymmetric intervals from poisson_garwood_intervals()
# when True, symmetric sqrt(N) otherwise (see make_data_variable()).
USE_GARWOOD = True
POISSON_CL = 0.6827  # ~1 sigma

CM_ENERGY_GEV = 13000.0  # 13 TeV
PAPER_TITLE = (
    "Measurement of the $Z\\gamma$ production cross section and search for "
    "anomalous neutral triple gauge couplings in pp collisions at $\\sqrt{s}=13$ TeV"
)
CMS_ANALYSIS_ID = "CMS SMP-22-009"

def _figure4_description(ecal_region, legend_only_background):
    """Compose the Figure 4 table description for one ECAL signal region.

    The EB and EE descriptions are identical apart from the detector region
    name and the one background that only that region's legend shows.
    """
    return (
        "Post-fit reconstruction-level photon transverse momentum $p_{T}^{\\gamma}$ "
        "distribution in the " + ecal_region + " signal region for the full Run-2 CMS "
        "dataset (138 fb$^{-1}$ at $\\sqrt{s}=13$ TeV). Black points labelled "
        "'Observed' show the data with Poisson statistical uncertainties. The "
        "background expectation is decomposed into the components listed as "
        "dependent variables in this table. In the published figure, the "
        "components labelled here as fiducial $Z\\gamma$, $W+\\gamma$, "
        + legend_only_background +
        " and misidentified electrons $e\\to\\gamma$ appear explicitly "
        "in the legend. The components labelled here as jet fakes, "
        "out-of-acceptance $Z(\\nu\\nu)\\gamma$ and minor backgrounds are "
        "drawn in the stack but grouped together and shown as 'Other' in the "
        "legend. The variable 'Total background' gives the sum of all non-signal "
        "background components evaluated at their post-fit yields. The variable "
        "'Predicted' gives the full post-fit model prediction (signal plus all "
        "background components), with uncertainties propagated from the fit "
        "covariance."
    )


def _figure4_region(name, location, ecal_region, legend_only_background,
                    root_dir, primary_mc, ooa_hist):
    """Assemble one REGIONS entry; every call creates fresh list objects."""
    return {
        "name": name,
        "location": location,
        "description": _figure4_description(ecal_region, legend_only_background),
        "root_dir": root_dir,
        "data_hist": "data_obs",
        # (column label, histogram name) pairs shown explicitly in the legend
        "primary_mc": primary_mc,
        # components grouped as 'Other' in the published legend
        "other_mc": [
            ("Jet fakes", "jetFakes"),
            (r"Out-of-acceptance $Z(\nu\nu)\gamma$", ooa_hist),
            ("Minor backgrounds", "minor_bkg"),
        ],
        # summed post-fit histograms
        "extra_mc": [
            ("Total background", "TotalBkg"),
            ("Predicted", "TotalProcs"),
        ],
    }


# One entry per Figure 4 panel, consumed by build_table_for_region().
# NOTE(review): the out-of-acceptance histogram is spelled "ooaFixed" for EB
# and "OOAfixed" for EE; both spellings are kept exactly as configured.
REGIONS = [
    _figure4_region(
        name="Figure4_EB_pTgamma",
        location="Figure 4 (left)",
        ecal_region="ECAL barrel (EB)",
        legend_only_background="ECAL spikes",
        root_dir="postfit/EB_SR",
        primary_mc=[
            (r"Fiducial $Z\gamma$", "mergedFiducialZNuNuG"),
            (r"$W+\gamma$", "mergedWLNuG"),
            ("Spikes", "ECAL_spikes"),
            (r"$e\to\gamma$", "eleFakes"),
        ],
        ooa_hist="ooaFixed",
    ),
    _figure4_region(
        name="Figure4_EE_pTgamma",
        location="Figure 4 (right)",
        ecal_region="ECAL endcap (EE)",
        legend_only_background="beam halo",
        root_dir="postfit/EE_SR",
        primary_mc=[
            (r"Fiducial $Z\gamma$", "mergedFiducialZNuNuG"),
            (r"$W+\gamma$", "mergedWLNuG"),
            (r"$e\to\gamma$", "eleFakes"),
            ("Beam halo", "beamHalo"),
        ],
        ooa_hist="OOAfixed",
    ),
]


def make_asym_variable(label, values, err_down, err_up, units=None, unc_name="total"):
    """Build a dependent, unbinned Variable carrying one asymmetric uncertainty.

    Args:
        label: Header name of the variable.
        values: Central values.
        err_down: Magnitudes of the downward errors; stored negated.
        err_up: Magnitudes of the upward errors.
        units: Units passed through to the Variable (default None).
        unc_name: Label of the attached Uncertainty (default "total").

    Returns:
        The Variable with a single asymmetric Uncertainty whose entries are
        ``(-down, +up)`` pairs.
    """
    dep_var = Variable(label, is_independent=False, is_binned=False, units=units)
    dep_var.values = values

    asym_unc = Uncertainty(unc_name, is_symmetric=False)
    signed_pairs = []
    # zip stops at the shorter of the two error sequences
    for down, up in zip(err_down, err_up):
        signed_pairs.append((-down, +up))
    asym_unc.values = signed_pairs

    dep_var.add_uncertainty(asym_unc)
    return dep_var

def build_table3_fiducial_xs():
    """Build the HEPData table of fiducial cross sections ("Table 3").

    The independent variable lists the three phase-space regions; each
    dependent variable (measurement, NLO and NNLO predictions) is given in fb
    with one asymmetric uncertainty. All numbers are hard-coded here.

    Returns:
        The populated Table.
    """
    table = Table("Table3_FiducialCrossSections")
    table.location = "Table 3"
    table.description = (
        "Measured and predicted fiducial cross sections (fb) in the barrel, endcaps, and combined phase space. "
        "Predictions shown at NLO (MADGRAPH5_aMC@NLO) and NNLO (MATRIX)."
    )
    table.keywords["cmenergies"] = [CM_ENERGY_GEV]
    table.keywords["reactions"] = ["P P --> Z GAMMA"]
    # Same observable keyword as the binned cross-section table (Table 4),
    # so that both cross-section tables carry consistent metadata.
    table.keywords["observables"] = ["SIG"]
    table.keywords["phrases"] = ["Fiducial cross section"]

    region = Variable("Region", is_independent=True, is_binned=False)
    region.values = [
        r"Barrel ($|\eta|<1.4442$)",
        r"Endcaps ($1.566<|\eta|<2.5$)",
        "Total",
    ]
    table.add_variable(region)

    # (label, uncertainty label, central values, errors down, errors up);
    # one entry per region, in the order of `region.values`.
    columns = [
        ("Measured", "total",
         [16.7, 7.8, 23.3], [1.0, 0.7, 1.3], [1.0, 0.8, 1.4]),
        (r"NLO (MADGRAPH5_aMC@NLO)", "theory",
         [19.6, 6.4, 26.1], [0.7, 0.3, 1.0], [0.7, 0.3, 1.0]),
        (r"NNLO (MATRIX)", "theory",
         [19.3, 6.21, 25.4], [0.3, 0.09, 0.3], [0.3, 0.07, 0.4]),
    ]
    for label, unc_name, central, down, up in columns:
        table.add_variable(make_asym_variable(
            label, central,
            err_down=down,
            err_up=up,
            units="fb", unc_name=unc_name,
        ))
    return table


def build_table4_xs_vs_ptgamma():
    """Build the HEPData table of cross sections per photon-pT bin ("Table 4").

    The independent variable is binned in GeV; each dependent variable
    (measurement, NLO and NNLO predictions) is given in fb with one
    asymmetric uncertainty. All numbers are hard-coded here.

    Returns:
        The populated Table.
    """
    table = Table("Table4_CrossSectionVsPtGamma")
    table.location = "Table 4"
    table.description = (
        "Measured and predicted cross sections (fb) in bins of photon transverse momentum."
    )
    for keyword, entries in (
        ("cmenergies", [CM_ENERGY_GEV]),
        ("reactions", ["P P --> Z GAMMA"]),
        ("observables", ["SIG"]),
        ("phrases", ["Binned cross section"]),
    ):
        table.keywords[keyword] = entries

    pt_axis = Variable(r"$p_{T}^{\gamma}$", is_independent=True, is_binned=True, units="GeV")
    # Consecutive edges are paired into (low, high) bins.
    edges = [225, 275, 350, 450, 600, 800, 1500]
    pt_axis.values = list(zip(edges[:-1], edges[1:]))
    table.add_variable(pt_axis)

    # (label, uncertainty label, central values, errors down, errors up);
    # one entry per pT bin.
    # NOTE(review): the relative "Measured" uncertainty falls from ~19% in the
    # 450-600 bin to ~3% and ~5% in the last two bins -- confirm those two
    # entries against the paper's Table 4.
    columns = (
        ("Measured", "total",
         [12.45, 6.95, 2.61, 1.08, 0.282, 0.092],
         [0.87, 0.57, 0.33, 0.20, 0.009, 0.005],
         [0.91, 0.60, 0.35, 0.21, 0.010, 0.005]),
        (r"NLO (MADGRAPH5_aMC@NLO)", "theory",
         [12.88, 8.14, 3.34, 1.26, 0.345, 0.107],
         [0.49, 0.30, 0.11, 0.05, 0.019, 0.012],
         [0.42, 0.31, 0.17, 0.08, 0.026, 0.011]),
        (r"NNLO (MATRIX)", "theory",
         [12.83, 7.89, 3.22, 1.22, 0.331, 0.091],
         [0.15, 0.15, 0.06, 0.02, 0.005, 0.014],
         [0.17, 0.16, 0.07, 0.02, 0.007, 0.020]),
    )
    for label, unc_name, central, down, up in columns:
        table.add_variable(make_asym_variable(
            label,
            central,
            err_down=down,
            err_up=up,
            units="fb", unc_name=unc_name,
        ))
    return table


def build_table5_aNTGC_limits():
    """Build the HEPData table of 95% CL coupling intervals ("Table 5").

    The independent variable names the four scaled coupling parameters; the
    four dependent variables hold the expected and observed lower and upper
    interval edges, without uncertainties. All numbers are hard-coded here.

    Returns:
        The populated Table.
    """
    table = Table("Table5_aNTGC_95CL")
    table.location = "Table 5"
    table.description = (
        "Expected and observed 95% CL intervals for anomalous coupling parameters, "
        "with other parameters fixed to zero."
    )
    table.keywords["cmenergies"] = [CM_ENERGY_GEV]
    table.keywords["phrases"] = ["aNTGC", "95% CL", "Limits"]

    parameter = Variable("Parameter", is_independent=True, is_binned=False)
    parameter.values = [
        r"$h_{3}^{\gamma}\times 10^{4}$",
        r"$h_{4}^{\gamma}\times 10^{7}$",
        r"$h_{3}^{Z}\times 10^{4}$",
        r"$h_{4}^{Z}\times 10^{7}$",
    ]
    table.add_variable(parameter)

    # Interval edges, one value per parameter in the order listed above.
    interval_edges = (
        ("Expected lower", [-2.8, -5.9, -1.8, -3.7]),
        ("Expected upper", [2.9, 6.0, 1.9, 3.7]),
        ("Observed lower", [-3.4, -6.8, -2.2, -4.1]),
        ("Observed upper", [3.5, 6.8, 2.2, 4.2]),
    )
    for header, edge_values in interval_edges:
        edge = Variable(header, is_independent=False, is_binned=False)
        edge.values = edge_values
        table.add_variable(edge)
    return table

def poisson_garwood_intervals(values, cl=POISSON_CL):
    """Return asymmetric error magnitudes for a sequence of counts.

    For each count ``n`` the interval edges are taken from chi-square
    quantiles with tail probability ``(1 - cl) / 2``: the lower edge uses
    ``2n`` degrees of freedom (and is fixed to 0 for ``n == 0``), the upper
    edge uses ``2(n + 1)``.

    Args:
        values: Iterable of bin contents; each is converted with ``float``.
        cl: Confidence level of the interval (default ``POISSON_CL``).

    Returns:
        ``(err_down, err_up)``: two lists holding ``n - low`` and ``up - n``.

    Raises:
        ValueError: If any bin content is negative.
    """
    tail_prob = (1.0 - cl) / 2.0
    lower_errs = []
    upper_errs = []
    for raw in values:
        count = float(raw)
        if count < 0:
            raise ValueError(f"Negative data bin content: {count}")

        upper_edge = 0.5 * ROOT.Math.chisquared_quantile_c(tail_prob, 2.0 * (count + 1.0))
        # The lower-edge quantile is not evaluated for an empty bin.
        lower_edge = (
            0.5 * ROOT.Math.chisquared_quantile(tail_prob, 2.0 * count)
            if count != 0.0
            else 0.0
        )

        lower_errs.append(count - lower_edge)
        upper_errs.append(upper_edge - count)
    return lower_errs, upper_errs


def make_mc_variable(label, hist_dict):
    """Build a dependent, unbinned Variable from a histogram dictionary.

    Args:
        label: Header name of the variable.
        hist_dict: Mapping with the values under ``"y"`` and, optionally, the
            errors under ``"dy"``.

    Returns:
        The Variable. When ``"dy"`` is present (not None) a "stat+syst"
        Uncertainty is attached; it is asymmetric if the first error entry
        is a tuple and symmetric otherwise (including an empty list).
    """
    mc_var = Variable(label, is_independent=False, is_binned=False)
    mc_var.values = hist_dict["y"]

    errors = hist_dict.get("dy")
    if errors is None:
        return mc_var

    # Only the first entry is inspected to decide the uncertainty type.
    has_tuple_errors = len(errors) > 0 and isinstance(errors[0], tuple)
    mc_unc = Uncertainty("stat+syst", is_symmetric=not has_tuple_errors)
    mc_unc.values = errors
    mc_var.add_uncertainty(mc_unc)
    return mc_var


def make_data_variable(label, hist_dict):
    """Build the dependent, unbinned Variable for the observed data.

    Args:
        label: Header name of the variable.
        hist_dict: Mapping with the bin contents under ``"y"``.

    Returns:
        The Variable with one "stat" Uncertainty: asymmetric ``(-down, +up)``
        pairs from ``poisson_garwood_intervals`` when ``USE_GARWOOD`` is true,
        otherwise symmetric ``sqrt(y)`` (0.0 where ``y >= 0.0`` does not hold).
    """
    counts = hist_dict["y"]
    data_var = Variable(label, is_independent=False, is_binned=False)
    data_var.values = counts

    if not USE_GARWOOD:
        sqrt_errors = []
        for count in counts:
            # Anything failing `count >= 0.0` gets a zero error.
            sqrt_errors.append(math.sqrt(count) if count >= 0.0 else 0.0)
        stat_unc = Uncertainty("stat", is_symmetric=True)
        stat_unc.values = sqrt_errors
    else:
        lows, highs = poisson_garwood_intervals(counts)
        signed_pairs = [(-low, +high) for low, high in zip(lows, highs)]
        stat_unc = Uncertainty("stat", is_symmetric=False)
        stat_unc.values = signed_pairs

    data_var.add_uncertainty(stat_unc)
    return data_var


def read_hist_1d(reader, dir_path, hist_name):
    """Read a 1D histogram, retrying once without the leading directory.

    The histogram is first requested at ``dir_path/hist_name`` (or just
    ``hist_name`` when ``dir_path`` is empty). If that fails and ``dir_path``
    contains a ``/``, the first path component is dropped and the read is
    attempted again.

    Args:
        reader: Object exposing ``read_hist_1d(path)``.
        dir_path: Directory inside the file; may be empty.
        hist_name: Name of the histogram.

    Returns:
        Whatever ``reader.read_hist_1d`` returns for the first path that works.

    Raises:
        RuntimeError: If no attempted path could be read; the message lists
            the path(s) tried.
    """
    primary = f"{dir_path}/{hist_name}" if dir_path else hist_name
    attempted = [primary]
    try:
        return reader.read_hist_1d(primary)
    except Exception:
        can_strip_prefix = bool(dir_path) and "/" in dir_path
        if not can_strip_prefix:
            raise RuntimeError(
                f"Could not find histogram '{hist_name}' in ROOT file. "
                f"Tried path: {attempted[0]}"
            )

        # Drop everything up to and including the first "/".
        _, _, remainder = dir_path.partition("/")
        fallback = f"{remainder}/{hist_name}"
        attempted.append(fallback)
        try:
            return reader.read_hist_1d(fallback)
        except Exception as fallback_error:
            raise RuntimeError(
                f"Could not find histogram '{hist_name}' in ROOT file. "
                f"Tried paths: {attempted}"
            ) from fallback_error

def build_table_for_region(reader, cfg):
    """Build the table for one entry of ``REGIONS``.

    Args:
        reader: Reader object handed to ``read_hist_1d``.
        cfg: Region configuration with the keys ``name``, ``location``,
            ``description``, ``root_dir`` and ``data_hist``, plus the optional
            lists ``primary_mc``, ``other_mc`` and ``extra_mc`` of
            ``(label, histogram name)`` pairs.

    Returns:
        A Table whose variables are, in order: the binned independent
        variable (bin edges taken from the data histogram), the "Observed"
        data, then one variable per configured histogram.
    """
    hist_dir = cfg["root_dir"]
    observed = read_hist_1d(reader, hist_dir, cfg["data_hist"])

    axis = Variable(
        OBSERVABLE_NAME,
        is_independent=True,
        is_binned=True,
        units=OBSERVABLE_UNITS,
    )
    # Binning of every column is taken from the data histogram.
    axis.values = observed["x_edges"]

    table = Table(cfg["name"])
    table.location = cfg["location"]
    table.description = cfg["description"]

    # Keywords shared by both regions.
    # NOTE(review): the region descriptions speak of post-fit distributions
    # and observed data with Poisson uncertainties -- confirm that "DSIG/DPT"
    # and "Differential cross section" are the intended keywords.
    table.keywords["cmenergies"] = [CM_ENERGY_GEV]
    table.keywords["reactions"] = ["P P --> Z GAMMA"]
    table.keywords["observables"] = ["DSIG/DPT"]
    table.keywords["phrases"] = [
        "Differential cross section",
        "Z to invisible",
        "High-pt photon",
        "Run 2",
    ]

    table.add_variable(axis)
    table.add_variable(make_data_variable("Observed", observed))

    # The three groups are handled identically; only the column order matters.
    for group in ("primary_mc", "other_mc", "extra_mc"):
        for label, hist_name in cfg.get(group, []):
            component = read_hist_1d(reader, hist_dir, hist_name)
            table.add_variable(make_mc_variable(label, component))

    return table


def build_submission():
    """Assemble the complete HEPData submission.

    Sets the submission comment, optionally reads an abstract file, adds one
    table per entry of ``REGIONS`` (read from ``ROOT_FILENAME``) and then the
    hard-coded Tables 3, 4 and 5.

    The abstract is optional: it is read only when a module-level
    ``ABSTRACT_FILE`` exists, is not None and points to an existing path.
    A missing ``ABSTRACT_FILE`` definition is treated like None instead of
    raising ``NameError``.

    Returns:
        The populated Submission (nothing is written to disk here).
    """
    sub = Submission()

    # Short top-level description of the analysis.
    # NOTE(review): this text mentions only the Figure 4 tables although
    # Tables 3-5 are added below as well -- confirm the wording.
    sub.comment = (
        f"{PAPER_TITLE}. CMS analysis {CMS_ANALYSIS_ID}. "
        "Z→νν with an associated high-$p_{T}$ photon, using 138 fb$^{-1}$ "
        "of pp collisions at $\\sqrt{s}=13$ TeV. "
        "The tables in this submission provide the post-fit reconstruction-level "
        "$p_{T}^{\\gamma}$ spectra in the EB and EE signal regions corresponding "
        "to Figure 4 of the paper."
    )

    # Optional abstract file: looked up dynamically so that a notebook which
    # never defines ABSTRACT_FILE still runs from a fresh kernel.
    abstract_file = globals().get("ABSTRACT_FILE")
    if abstract_file is not None and os.path.exists(abstract_file):
        sub.read_abstract(abstract_file)

    reader = RootFileReader(ROOT_FILENAME)
    for cfg in REGIONS:
        sub.add_table(build_table_for_region(reader, cfg))

    sub.add_table(build_table3_fiducial_xs())
    sub.add_table(build_table4_xs_vs_ptgamma())
    sub.add_table(build_table5_aNTGC_limits())

    return sub


def main():
    """Build the submission and write its files into ``OUTPUT_DIR``.

    The output directory is created if it does not exist yet; a one-line
    confirmation is printed afterwards.
    """
    submission = build_submission()
    target_dir = OUTPUT_DIR
    os.makedirs(target_dir, exist_ok=True)
    submission.create_files(target_dir)
    print(f"HEPData submission written to: {target_dir}")



def validate_hepdata_output(output_dir=OUTPUT_DIR):
    """Print a structural summary of the YAML files in ``output_dir``.

    For ``submission.yaml`` the top-level keys of every document are listed.
    For every other ``*.yaml`` file the first document is treated as a table:
    the number of values of each dependent variable is compared with the
    number of values of the first independent variable and reported as
    ``OK`` or ``MISMATCH``.

    Documents that are empty (``None``) or whose top level is not a mapping
    are reported and skipped instead of raising.

    Args:
        output_dir: Directory to scan (default ``OUTPUT_DIR``).

    Returns:
        None. All findings are printed. If PyYAML cannot be imported or no
        YAML file is found, a message is printed and the function returns.
    """
    try:
        import yaml
    except ImportError:
        print("PyYAML is not available; cannot parse YAML for validation.")
        return

    yaml_files = sorted(glob.glob(os.path.join(output_dir, "*.yaml")))
    if not yaml_files:
        print(f"No YAML files found in {output_dir}")
        return

    print(f"\nValidation of HEPData YAML files in: {output_dir}")
    for path in yaml_files:
        with open(path, "r") as f:
            docs = list(yaml.safe_load_all(f))

        fname = os.path.basename(path)
        print(f"\n=== {fname} ===")

        # submission.yaml is a multi-document file with metadata + table refs
        if fname == "submission.yaml":
            print(f"  {fname} contains {len(docs)} document(s).")
            for i, doc in enumerate(docs):
                if doc is None:
                    print(f"    doc {i}: <empty>")
                    continue
                if not isinstance(doc, dict):
                    # e.g. a top-level list or scalar has no keys to show
                    print(f"    doc {i}: <{type(doc).__name__}, not a mapping>")
                    continue
                keys = list(doc.keys())
                print(f"    doc {i}: keys = {keys}")
            continue

        # table_X.yaml is normally a single document; a file holding only a
        # document marker parses to [None], which counts as empty as well.
        if not docs or docs[0] is None:
            print("  WARNING: empty YAML document.")
            continue

        doc = docs[0]
        if not isinstance(doc, dict):
            print(f"  Not a standard table document. Top-level type: {type(doc).__name__}")
            continue
        if "independent_variables" not in doc or "dependent_variables" not in doc:
            print("  Not a standard table document. Keys:", list(doc.keys()))
            continue

        indep = doc["independent_variables"]
        dep = doc["dependent_variables"]
        print(f"  independent_variables: {len(indep)}, dependent_variables: {len(dep)}")

        if not indep:
            print("  WARNING: no independent variables found.")
            continue

        # Only the first independent variable defines the expected bin count.
        n_bins = len(indep[0].get("values", []))
        print(f"  number of bins (from first independent variable): {n_bins}")

        for dv in dep:
            name = dv.get("header", {}).get("name", "<unnamed>")
            n_vals = len(dv.get("values", []))
            status = "OK" if n_vals == n_bins else f"MISMATCH ({n_vals} vs {n_bins})"
            print(f"    {name}: {n_vals} values -> {status}")




In [4]:
# Write the submission files into OUTPUT_DIR, then print a per-file summary
# of the generated YAML (value counts of each dependent variable vs. bins).
main()
validate_hepdata_output()



HEPData submission written to: hepdata_output

Validation of HEPData YAML files in: hepdata_output

=== figure4_eb_ptgamma.yaml ===
  independent_variables: 1, dependent_variables: 10
  number of bins (from first independent variable): 11
    Observed: 11 values -> OK
    Fiducial $Z\gamma$: 11 values -> OK
    $W+\gamma$: 11 values -> OK
    Spikes: 11 values -> OK
    $e\to\gamma$: 11 values -> OK
    Jet fakes: 11 values -> OK
    Out-of-acceptance $Z(\nu\nu)\gamma$: 11 values -> OK
    Minor backgrounds: 11 values -> OK
    Total background: 11 values -> OK
    Predicted: 11 values -> OK

=== figure4_ee_ptgamma.yaml ===
  independent_variables: 1, dependent_variables: 10
  number of bins (from first independent variable): 7
    Observed: 7 values -> OK
    Fiducial $Z\gamma$: 7 values -> OK
    $W+\gamma$: 7 values -> OK
    $e\to\gamma$: 7 values -> OK
    Beam halo: 7 values -> OK
    Jet fakes: 7 values -> OK
    Out-of-acceptance $Z(\nu\nu)\gamma$: 7 values -> OK
    Minor back