In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# import uproot as up
import pandas as pd
import os
# import numpy as np
# import awkward as ak
# import mplhep as hep
import yaml
from coffea.nanoevents import TreeMakerSchema, BaseSchema, NanoEventsFactory
# import argparse
import sys

from helperClasses import *

In [3]:
# Sample switches read by the loading code below: collision data on, MC off.
run_data, run_mc = True, False

def load_events(file_path, schema_class, entry_stop):
    """
    Open the "ntuple" tree of a ROOT file through NanoEventsFactory.

    Args:
        file_path: Path of the ROOT file to read.
        schema_class: Passed through as ``schemaclass``.
        entry_stop: Passed through as ``entry_stop``.

    Returns:
        The result of ``.events()`` on success. If anything raises while
        opening, a message is printed and ``None`` is returned instead, so
        the caller decides whether to continue.
    """
    loaded = None
    try:
        factory = NanoEventsFactory.from_root(
            {file_path: "ntuple"},
            schemaclass=schema_class,
            entry_stop=entry_stop,
        )
        loaded = factory.events()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as err:
        # Best-effort: every other failure is reported, not propagated.
        print(f"Error loading events from '{file_path}': {err}")
    return loaded

def load_config(config_path):
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file contents.

    Exits:
        Prints a message and calls ``sys.exit(1)`` when the file is missing,
        cannot be parsed as YAML, or contains no document at all.
    """
    try:
        # Read as UTF-8 explicitly; without `encoding`, open() uses the
        # platform's locale encoding, which is not UTF-8 everywhere.
        with open(config_path, 'r', encoding='utf-8') as config_file:
            config = yaml.safe_load(config_file)
    except FileNotFoundError:
        print(f"Error: Configuration file '{config_path}' not found.")
        sys.exit(1)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML configuration file: {e}")
        sys.exit(1)

    # safe_load returns None for an empty file; report that here instead of
    # letting the first config[...] lookup fail with an unrelated TypeError.
    if config is None:
        print(f"Error: Configuration file '{config_path}' is empty.")
        sys.exit(1)
    return config

config = load_config('config.yaml')

# Both samples start out unloaded.
events_data = events_mc = None

# Load each requested sample; one that fails to load is switched off so the
# remaining steps skip it.
if run_data:
    events_data = load_events(config['data_path'], BaseSchema, config['entry_stop'])
if run_data and events_data is None:
    print("Failed to load data events. Skipping data analysis.")
    run_data = False

if run_mc:
    events_mc = load_events(config['mc_path'], BaseSchema, config['entry_stop'])
if run_mc and events_mc is None:
    print("Failed to load MC events. Skipping MC analysis.")
    run_mc = False

# Nothing usable was loaded: stop here.
if not (run_data or run_mc):
    print("No valid data or MC events loaded. Exiting.")
    sys.exit(1)

# Restrict each loaded sample to the configured columns and call .compute().
if run_data:
    events_data = events_data[config['columns']].compute()
if run_mc:
    events_mc = events_mc[config['columns']].compute()

In [4]:
# Path read from the 'savepath_base' config entry; it is created as a
# directory in the next cell and later passed to Plotter.
save_path = config['savepath_base']

In [5]:
# Create the output directory (no error if it already exists), then end the
# cell with the path so the notebook displays it. Only the last expression
# of a cell is shown, so the bare name has to come after makedirs.
os.makedirs(save_path, exist_ok=True)
save_path

In [6]:
# Groups of cut ids; the ids match those registered with cutdict.add_cut.
dilepton_mass_cuts = [8, 9]            # "J mass" and "Z mass"
eleID_cuts = list(range(10, 13))       # the three eleID cuts: 10, 11, 12

# Full cut sequence, and the same sequence with cut 1 left out.
eleID_both = [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
cuts_to_show = [cut_id for cut_id in eleID_both if cut_id != 1]

In [7]:
# Register every selection cut under an integer id.
# Each CutType receives:
#   name / long_name  - short id string and human-readable label,
#   mask              - boolean selection built from events_data branches,
#   variables         - list of branch names associated with the cut,
#   plot              - whether the cut has a plot; when True, bins, x_range,
#                       labels and xlabel are supplied as well (presumably
#                       consumed by Plotter - confirm in helperClasses).
cutdict = CutDict()

cutdict.add_cut(0,
                CutType(name="0",
                        long_name="Preselection",
                        mask=events_data['B_Z_mass'] > -1,
                        variables=['B_Z_mass'],
                        plot=False))

# NOTE(review): the mask keeps candidates whose first pT exceeds the second
# for both pairs, i.e. ordered pairs, although the label reads "UnOrdered pT".
cutdict.add_cut(1,
                CutType(name="1",
                        long_name="UnOrdered pT",
                        mask=(events_data['B_Z_pt1'] > events_data['B_Z_pt2']) & (events_data['B_J_pt1'] > events_data['B_J_pt2']),
                        variables=['B_Z_pt1', 'B_Z_pt2', 'B_J_pt1', 'B_J_pt2'],
                        plot=False))

# variables is a one-element list, matching every other cut.
cutdict.add_cut(2,
                CutType(name="2",
                        long_name="Electron Trigger",
                        mask=events_data['B_Z_TriggerPath'] == 1,
                        variables=['B_Z_TriggerPath'],
                        plot=False))

cutdict.add_cut(3,
                CutType(name="3",
                        long_name="Soft Muons",
                        mask=(events_data['B_J_soft1'] == 1) & (events_data['B_J_soft2'] == 1),
                        variables=['B_J_soft1', 'B_J_soft2'],
                        plot=False))

cutdict.add_cut(4,
                CutType(name="4",
                        long_name="Electron Trigger Enforce",
                        mask=(events_data['B_Z_pt1'] > 27),
                        variables=['B_Z_pt1'],
                        plot=True,
                        bins=50,
                        x_range=(0, 300),
                        labels=["Z_pt1"],
                        xlabel="Z pt1 (GeV)"))

cutdict.add_cut(5,
                CutType(name="5",
                        long_name="Dilepton Vtx > 1%",
                        mask=(events_data['B_J_VtxProb'] > 0.01) & (events_data['B_Z_VtxProb'] > 0.01),
                        variables=['B_J_VtxProb', 'B_Z_VtxProb'],
                        plot=True,
                        bins=100,
                        x_range=(0, 1),
                        labels=["J_VtxProb", "Z_VtxProb"],
                        xlabel="Vtx Prob"))

cutdict.add_cut(6,
                CutType(name="6",
                        long_name="FourL Vtx > 1%",
                        mask=(events_data['FourL_VtxProb'] > 0.01),
                        variables=['FourL_VtxProb'],
                        plot=True,
                        bins=100,
                        x_range=(0, 1),
                        labels=["FourL_VtxProb"],
                        xlabel="Vtx Prob"))

cutdict.add_cut(7,
                CutType(name="7",
                        long_name="Detector acceptance",
                        mask=(abs(events_data['B_Z_eta1']) < 2.4) & (abs(events_data['B_Z_eta2']) < 2.4) & (abs(events_data['B_J_eta1']) < 2.5) & (abs(events_data['B_J_eta2']) < 2.5),
                        variables=['B_Z_eta1', 'B_Z_eta2', 'B_J_eta1', 'B_J_eta2'],
                        plot=True,
                        bins=100,
                        x_range=(-3, 3),
                        labels=["Z_eta1", "Z_eta2", "J_eta1", "J_eta2"],
                        xlabel="Eta"))

cutdict.add_cut(8,
                CutType(name="8",
                        long_name="J mass",
                        mask=(events_data['B_J_mass'] > 3.0) & (events_data['B_J_mass'] < 3.2),
                        variables=['B_J_mass'],
                        plot=True,
                        x_range=(3.0, 3.2),
                        bins=5,
                        labels=["J_mass"],
                        xlabel="J mass (GeV)"))

cutdict.add_cut(9,
                CutType(name="9",
                        long_name="Z mass",
                        mask=(events_data['B_Z_mass'] > 70) & (events_data['B_Z_mass'] < 110),
                        variables=['B_Z_mass'],
                        plot=True,
                        bins=40,
                        x_range=(70, 110),
                        labels=["Z_mass"],
                        xlabel="Z mass (GeV)"))

# NOTE(review): cuts 10-12 select on the B_Z_mvaIsoWP90_* flags but list
# B_J_pt* as their variables/labels - confirm that plotting the J-side pT
# for a Z-side ID cut is intended.
cutdict.add_cut(10,
                CutType(name="10",
                        long_name="eleID High pT",
                        mask=(events_data['B_Z_mvaIsoWP90_1']),
                        variables=['B_J_pt1'],
                        plot=True,
                        bins=100,
                        x_range=(0, 100),
                        labels=["J_pt1"],
                        xlabel="J pt1 (GeV)"))

cutdict.add_cut(11,
                CutType(name="11",
                        long_name="eleID Low pT",
                        mask=(events_data['B_Z_mvaIsoWP90_2']),
                        variables=['B_J_pt2'],
                        plot=True,
                        bins=50,
                        x_range=(0, 100),
                        labels=["J_pt2"],
                        xlabel="J pt2 (GeV)"))

# NOTE(review): the mask requires BOTH flags (&) although the label reads
# "eleID either" - confirm which one is meant.
cutdict.add_cut(12,
                CutType(name="12",
                        long_name="eleID either",
                        mask=(events_data['B_Z_mvaIsoWP90_1']) & (events_data['B_Z_mvaIsoWP90_2']),
                        variables=['B_J_pt1', 'B_J_pt2'],
                        plot=True,
                        bins=50,
                        x_range=(0, 100),
                        labels=["J_pt1", "J_pt2"],
                        xlabel="J pt (GeV)"))

cutdict.add_cut(13,
                CutType(name="13",
                        long_name="FourL mass",
                        mask=(events_data['FourL_mass'] > 112) & (events_data['FourL_mass'] < 162),
                        variables=['FourL_mass'],
                        plot=True,
                        bins=25,
                        x_range=(112, 162),
                        labels=["FourL_mass"],
                        xlabel="FourL mass (GeV)"))


In [8]:
# List every registered cut as "<id> <cut>" to check the registration above.
for key, item in cutdict.cutdict.items():
    print(key, item)

0 Cut: 0 (Preselection)
1 Cut: 1 (UnOrdered pT)
2 Cut: 2 (Electron Trigger)
3 Cut: 3 (Soft Muons)
4 Cut: 4 (Electron Trigger Enforce)
5 Cut: 5 (Dilepton Vtx > 1%)
6 Cut: 6 (FourL Vtx > 1%)
7 Cut: 7 (Detector acceptance)
8 Cut: 8 (J mass)
9 Cut: 9 (Z mass)
10 Cut: 10 (eleID High pT)
11 Cut: 11 (eleID Low pT)
12 Cut: 12 (eleID either)
13 Cut: 13 (FourL mass)


In [9]:
# Sequence of cut ids handed to prepare_masks; the ids refer to cuts
# registered in cutdict.
myorder = [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
cut_analysis = CutAnalysis(events_data, cutdict)
# prepare_masks prints event/candidate counts per cut (see the recorded
# output) and returns the summary used by the cells below.
cutSummary = cut_analysis.prepare_masks(myorder)

Cut 0: Preselection - 439851 events, 253773 candidates
Cut 1: UnOrdered pT - 439677 events, 253662 candidates
Cut 3: Soft Muons - 86821 events, 69235 candidates
Cut 12: eleID either - 11399 events, 10410 candidates
Cut 7: Detector acceptance - 10578 events, 9686 candidates
Cut 5: Dilepton Vtx > 1% - 9782 events, 8983 candidates
Cut 8: J mass - 283 events, 283 candidates
Cut 9: Z mass - 283 events, 283 candidates
Cut 6: FourL Vtx > 1% - 259 events, 259 candidates
Cut 13: FourL mass - 100 events, 100 candidates


In [10]:
# Copy the summary without its 'mask' and 'cutobj' entries, then render the
# remaining columns as a table.
cutSummarydf = dict(
    (column, values)
    for column, values in cutSummary.items()
    if column not in ('mask', 'cutobj')
)
pd.DataFrame(cutSummarydf)

Unnamed: 0,cut_id,cut_name,nevents,ncandidates
0,0,Preselection,439851,253773
1,1,UnOrdered pT,439677,253662
2,3,Soft Muons,86821,69235
3,12,eleID either,11399,10410
4,7,Detector acceptance,10578,9686
5,5,Dilepton Vtx > 1%,9782,8983
6,8,J mass,283,283
7,9,Z mass,283,283
8,6,FourL Vtx > 1%,259,259
9,13,FourL mass,100,100


In [11]:
# Plotting helper built from the data events, the registered cuts and the
# output directory.
plotter = Plotter(events_data, cutdict, save_path)

In [12]:
# Per the recorded output: cuts registered with plot=False are reported as
# having no plot, and every other cut is plotted.
plotter.plot_preselection()

Cut 0 does not have a plot.
Cut 1 does not have a plot.
Cut 2 does not have a plot.
Cut 3 does not have a plot.
Plotting cut 4: Electron Trigger Enforce
Plotting cut 5: Dilepton Vtx > 1%
Plotting cut 6: FourL Vtx > 1%
Plotting cut 7: Detector acceptance
Plotting cut 8: J mass
Plotting cut 9: Z mass
Plotting cut 10: eleID High pT
Plotting cut 11: eleID Low pT
Plotting cut 12: eleID either
Plotting cut 13: FourL mass


In [13]:
# Step through the cut summary in order; the recorded output reports, for
# each step, the next cut and the stage at which its plot was drawn.
plotter.plot_summary(cutSummary)

Next cut: 0 Preselection (439851)
Cut 0 does not have a plot.
Next cut: 1 UnOrdered pT (439677)
Cut 1 does not have a plot.
Next cut: 2 Soft Muons (86821)
Cut 3 does not have a plot.
Next cut: 3 eleID either (11399)
Drawn at: 2 Soft Muons (86821)
Next cut: 4 Detector acceptance (10578)
Drawn at: 3 eleID either (11399)
Next cut: 5 Dilepton Vtx > 1% (9782)
Drawn at: 4 Detector acceptance (10578)
Next cut: 6 J mass (283)
Drawn at: 5 Dilepton Vtx > 1% (9782)
Next cut: 7 Z mass (283)
Drawn at: 6 J mass (283)
Next cut: 8 FourL Vtx > 1% (259)
Drawn at: 7 Z mass (283)
Next cut: 9 FourL mass (100)
Drawn at: 8 FourL Vtx > 1% (259)


In [20]:
# NOTE(review): this rebinds cuts_to_show, which the earlier cut-list cell
# defines without cut 1. The cell's execution count (20) is also out of
# sequence with its neighbours - confirm the notebook still reproduces with
# Restart & Run All.
cuts_to_show = [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
plotter.plot_single_variable(cuts_to_show, cutSummary)

Cut 0 does not have a plot.
Cut 1 does not have a plot.
Cut 2 does not have a plot.
Cut 3 does not have a plot.
Plotting variable B_Z_pt1 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_VtxProb at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_Z_VtxProb at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable FourL_VtxProb at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_Z_eta1 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_Z_eta2 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_eta1 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_eta2 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_mass at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_Z_mass at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_pt1 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_pt2 at steps: [0, 1, 3, 12, 7, 5, 8, 9, 6, 13]
Plotting variable B_J_pt1 at steps: [0, 

In [16]:
# Display the raw summary dict. This includes the per-cut 'mask' arrays, so
# the output is long; the DataFrame cell above shows the same counts without
# them.
cutSummary

{'cut_id': [0, 1, 3, 12, 7, 5, 8, 9, 6, 13],
 'cut_name': ['Preselection',
  'UnOrdered pT',
  'Soft Muons',
  'eleID either',
  'Detector acceptance',
  'Dilepton Vtx > 1%',
  'J mass',
  'Z mass',
  'FourL Vtx > 1%',
  'FourL mass'],
 'nevents': [439851, 439677, 86821, 11399, 10578, 9782, 283, 283, 259, 100],
 'ncandidates': [253773, 253662, 69235, 10410, 9686, 8983, 283, 283, 259, 100],
 'mask': [<Array [[True, True, True], ..., [True]] type='253773 * [var * bool[paramet...'>,
  <Array [[True, True, True], [True], ..., [True]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='253773 * var * bool'>,
  <Array [[False, False, False], ..., [False]] type='25

In [17]:
# class Analysis:
#     """
#     Main class to perform the analysis.
#     """
#     def __init__(self, events, savepath):
#         self.events = events
#         self.particle_selection = ParticleSelection(events)
#         self.plotter = Plotter(savepath)


#     def get_count(self, data):
#         """
#         Count the number of candidates and events in the given data.
#         """
#         array = data.B_J_mass
#         nevents = len(array[ak.num(array, axis=1) > 0])
#         ncandidates = ak.sum(ak.num(array, axis=1))
#         return ncandidates, nevents
    

#     def get_summary_of_cuts(self, cut_order):
#         """
#         Generate a cutSummary of cuts applied to the events.
#         """
#         key_list = []
#         ncandidates_list = []
#         nevents_list = []
#         aggMask_list = []
#         var_list = []

#         agg_mask = self.particle_selection.cutdict["0"]["mask"]

#         for i in cut_order:
#             mask = self.particle_selection.cutdict[str(i)]["mask"]
#             agg_mask = agg_mask & mask
#             ncandidates, nevents = self.get_count(self.events[agg_mask])

#             key_list.append(self.particle_selection.cutdict[str(i)]["name"])
#             ncandidates_list.append(ncandidates)
#             nevents_list.append(nevents)   
#             aggMask_list.append(agg_mask)
#             var_list.append(self.particle_selection.cutdict[str(i)].get("var", None))

#         summary_dict = {
#             "Cut": key_list, 
#             "Candidates": ncandidates_list, 
#             "Events": nevents_list, 
#             "Aggregated mask": aggMask_list, 
#             "Var": var_list
#         }
        
#         summary_table = pd.DataFrame({key: summary_dict[key] for key in ["Cut", "Candidates", "Events"]})

#         return summary_dict, summary_table

#     def get_view_at(self, mycut, summary_dict):
#         """
#         Get the view of the data at a specific cut.
#         """
#         mycut_name = self.particle_selection.cutdict[str(mycut)]['name']
#         cut_index = summary_dict['Cut'].index(mycut_name)
#         n_events_after_cut = summary_dict['Events'][cut_index]
#         view_index = cut_index - 1
#         view_index_name = summary_dict['Cut'][view_index]
#         view_index_mask = summary_dict['Aggregated mask'][view_index]
#         n_events_before_cut = summary_dict['Events'][view_index]
#         cut_of_interest = self.events[view_index_mask]
#         text_array = [view_index_name, mycut_name, n_events_before_cut, n_events_after_cut]
#         return cut_of_interest, text_array

#     def apply_cut_progression(self, cut_progression):
#         """
#         Apply a series of cuts and create plots at each stage.
#         """
#         summary_dict, summary_table = self.get_summary_of_cuts(self.events, cut_progression)
#         print("Summary of cuts")
#         print(summary_table)
#         self.show_plots_at_each_cut(summary_dict)
#         return summary_dict, summary_table

#     def show_plots_at_each_cut(self, summary_dict):
#         """
#         Create plots at each cut stage.
#         """
#         # Implement the logic to create plots at each cut stage
#         pass

#     def get_count(self, elePt_low, elePt_high, eleEta, ymass_upper):
#         """
#         Apply cuts and return the count of events passing all cuts.
#         """
#         self.particle_selection.cutdict = self.particle_selection.make_cut_list(elePt_low, elePt_high, eleEta, ymass_upper)
#         eleID_both = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13]
#         summary_dict, summary_table = self.apply_cut_progression(eleID_both)
#         count = summary_table[summary_table['Cut'] == 'FourL mass']['Events'].values[0]
#         return count

# def get_FOM(events_data, events_mc, savepath_data, savepath_mc, elePt_low, elePt_high, eleEta, ymass_upper):
#     """
#     Calculate the Figure of Merit.
#     """
#     analysis_data = Analysis(events_data, savepath_data)
#     analysis_mc = Analysis(events_mc, savepath_mc)
    
#     nb = analysis_data.get_count(elePt_low, elePt_high, eleEta, ymass_upper)
#     nEff = analysis_mc.get_count(elePt_low, elePt_high, eleEta, ymass_upper) / 100
#     FOM = nEff / np.sqrt(nb)
#     return nEff, nb, FOM

# def load_config(config_path):
#     """
#     Load configuration from a YAML file.
#     """
#     try:
#         with open(config_path, 'r') as config_file:
#             return yaml.safe_load(config_file)
#     except FileNotFoundError:
#         print(f"Error: Configuration file '{config_path}' not found.")
#         sys.exit(1)
#     except yaml.YAMLError as e:
#         print(f"Error parsing YAML configuration file: {e}")
#         sys.exit(1)

# def load_events(file_path, schema_class, entry_stop):
#     """
#     Load events from a ROOT file with error handling.
#     """
#     try:
#         events = NanoEventsFactory.from_root(
#             {file_path: "ntuple"}, 
#             schemaclass=schema_class, 
#             entry_stop=entry_stop
#         ).events()
#         return events
#     except FileNotFoundError:
#         print(f"Error: File '{file_path}' not found.")
#         return None
#     except Exception as e:
#         print(f"Error loading events from '{file_path}': {e}")
#         return None

# def main(config_path='config.yaml', run_data=True, run_mc=True):
#     # Load configuration
#     config = load_config(config_path)

#     events_data = None
#     events_mc = None

#     # Load events
#     if run_data:
#         events_data = load_events(config['data_path'], BaseSchema, config['entry_stop'])
#         if events_data is None:
#             print("Failed to load data events. Skipping data analysis.")
#             run_data = False

#     if run_mc:
#         events_mc = load_events(config['mc_path'], BaseSchema, config['entry_stop'])
#         if events_mc is None:
#             print("Failed to load MC events. Skipping MC analysis.")
#             run_mc = False

#     if not run_data and not run_mc:
#         print("No valid data or MC events loaded. Exiting.")
#         sys.exit(1)

#     # Select columns
#     if run_data:
#         events_data = events_data[config['columns']].compute()
#     if run_mc:
#         events_mc = events_mc[config['columns']].compute()

#     # Create DataFrame to store results
#     df = pd.DataFrame(columns=['elePt_low', 'elePt_high', 'eleEta', 'ymass_upper', 'Efficiency', 'Background', 'FOM'])

#     # Loop over different cut values
#     for elePt_low in config['cut_parameters']['elePt_low']:
#         for elePt_high in config['cut_parameters']['elePt_high']:
#             for eleEta in config['cut_parameters']['eleEta']:
#                 for ymass_upper in config['cut_parameters']['ymass_upper']:
#                     savepath_data = f"{config['savepath_data_base']}el_{elePt_low}_eh_{elePt_high}_eE_{eleEta}_yup_{ymass_upper}"
#                     savepath_mc = f"{config['savepath_mc_base']}el_{elePt_low}_eh_{elePt_high}_eE_{eleEta}_yup_{ymass_upper}"

#                     os.makedirs(savepath_data, exist_ok=True)
#                     os.makedirs(savepath_mc, exist_ok=True)

#                     eff, nb, FOM = 0, 0, 0
                    
#                     if run_data and run_mc:
#                         eff, nb, FOM = get_FOM(events_data, events_mc, savepath_data, savepath_mc, elePt_low, elePt_high, eleEta, ymass_upper)
#                     elif run_data:
#                         analysis_data = Analysis(events_data, savepath_data)
#                         nb = analysis_data.get_count(elePt_low, elePt_high, eleEta, ymass_upper)
#                     elif run_mc:
#                         analysis_mc = Analysis(events_mc, savepath_mc)
#                         eff = analysis_mc.get_count(elePt_low, elePt_high, eleEta, ymass_upper) / 100

#                     df.loc[len(df)] = {'elePt_low': elePt_low, 'elePt_high': elePt_high, 'eleEta': eleEta, 'ymass_upper': ymass_upper, 'Efficiency': eff, 'Background': nb, 'FOM': FOM}
#                     print(f"elePt_low: {elePt_low}, elePt_high: {elePt_high}, eleEta: {eleEta}, ymass_upper: {ymass_upper}")
#                     print(f"Efficiency: {eff}")
#                     print(f"Background: {nb}")
#                     print(f"FOM: {FOM}")
#                     print(df)
#                     print("\n")

#     print(df)
#     df.to_csv(config['output_file'], index=False)

# # if __name__ == "__main__":
# #     parser = argparse.ArgumentParser(description="Run particle physics analysis on data and/or MC events.")
# #     parser.add_argument("--config", default="config.yaml", help="Path to the configuration file")
# #     parser.add_argument("--data", action="store_true", help="Run analysis on data events")
# #     parser.add_argument("--mc", action="store_true", help="Run analysis on MC events")
# #     args = parser.parse_args()

# #     if not args.data and not args.mc:
# #         print("Error: You must specify at least one of --data or --mc")
# #         sys.exit(1)

#     # main(config_path=args.config, run_data=args.data, run_mc=args.mc)

In [18]:
# main(config_path="config.yaml", run_data=True, run_mc=False)