In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
import yaml
from coffea.nanoevents import TreeMakerSchema, BaseSchema, NanoEventsFactory
# import argparse
import sys

from utils.helperClasses import *
from utils.helperFunctions import *

In [3]:
# Switches selecting which samples are processed in this session.
run_data = True
run_mc = False

config = load_config('config.yaml')

events_data = events_mc = None

# Open each requested sample. A None result is treated as a failed load and
# switches that sample off instead of stopping the notebook right away.
if run_data:
    events_data = load_events(config['data_path'], BaseSchema, config['entry_stop'])
    run_data = events_data is not None
    if not run_data:
        print("Failed to load data events. Skipping data analysis.")

if run_mc:
    events_mc = load_events(config['mc_path'], BaseSchema, config['entry_stop'])
    run_mc = events_mc is not None
    if not run_mc:
        print("Failed to load MC events. Skipping MC analysis.")

# With neither sample available there is nothing left to analyse.
if not (run_data or run_mc):
    print("No valid data or MC events loaded. Exiting.")
    sys.exit(1)

# Keep only the configured columns of each loaded sample and call .compute() on the selection.
if run_data:
    events_data = events_data[config['columns']].compute()
if run_mc:
    events_mc = events_mc[config['columns']].compute()

In [4]:
# Output directory for this analysis, read from the 'savepath_base' config entry.
save_path = config['savepath_base']
# exist_ok=True keeps a re-run of this cell from failing when the directory already exists.
os.makedirs(save_path, exist_ok=True)

In [5]:
# Run the dimuon-candidate combination helper on the loaded events, passing the
# J window bounds (3.0, 3.2) and the z window bounds (80, 100).
# NOTE(review): events_data is overwritten with the helper's result, so re-running
# this cell feeds the already-combined array back into combine_dimuon_candidates —
# confirm the helper tolerates that, or restart the kernel before re-running.
# NOTE(review): presumably this adds the JPsi_* / Z_* columns used by the later cuts — verify in utils.
events_data = combine_dimuon_candidates(events_data, J_lower=3.0, J_upper=3.2, z_lower=80, z_upper=100)

In [6]:
# Registry of selection steps; each cut is registered under an integer id.
cutdict = CutDict()

# Cut 0: baseline requirement B_J1_mass > -1 (no plot).
cutdict.add_cut(
    0,
    CutType(
        name="0",
        long_name="Preselection",
        mask=events_data['B_J1_mass'] > -1,
        variables=['B_J1_mass'],
        plot=False,
    ),
)

# Cut 1: keep events whose Mu_TriggerPath equals 1 (no plot).
cutdict.add_cut(
    1,
    CutType(
        name="1",
        long_name="Muon Trigger",
        mask=events_data['Mu_TriggerPath'] == 1,
        variables=['Mu_TriggerPath'],
        plot=False,
    ),
)

# Cut 2: all four B_Mu*_soft columns must equal 1 (no plot).
# `variables` names the same B_Mu*_soft columns the mask is built from; the
# previous list referred to B_J*_soft, which the mask never reads.
cutdict.add_cut(
    2,
    CutType(
        name="2",
        long_name="Soft Muons",
        mask=(
            (events_data['B_Mu1_soft'] == 1)
            & (events_data['B_Mu2_soft'] == 1)
            & (events_data['B_Mu3_soft'] == 1)
            & (events_data['B_Mu4_soft'] == 1)
        ),
        variables=['B_Mu1_soft', 'B_Mu2_soft', 'B_Mu3_soft', 'B_Mu4_soft'],
        plot=False,
    ),
)

# Cut 3: each of the four muon pT columns must be above 3.
cutdict.add_cut(
    3,
    CutType(
        name="3",
        long_name="Muon pT",
        mask=(
            (events_data['B_Mu1_pt'] > 3)
            & (events_data['B_Mu2_pt'] > 3)
            & (events_data['B_Mu3_pt'] > 3)
            & (events_data['B_Mu4_pt'] > 3)
        ),
        variables=['B_Mu1_pt', 'B_Mu2_pt', 'B_Mu3_pt', 'B_Mu4_pt'],
        plot=True,
        bins=100,
        x_range=(0, 100),
        labels=["Mu1_pt", "Mu2_pt", "Mu3_pt", "Mu4_pt"],
        xlabel="pT (GeV)",
    ),
)

# Cut 4: each of the four muon eta columns must satisfy |eta| < 2.4.
cutdict.add_cut(
    4,
    CutType(
        name="4",
        long_name="Detector acceptance",
        mask=(
            (abs(events_data['B_Mu1_eta']) < 2.4)
            & (abs(events_data['B_Mu2_eta']) < 2.4)
            & (abs(events_data['B_Mu3_eta']) < 2.4)
            & (abs(events_data['B_Mu4_eta']) < 2.4)
        ),
        variables=['B_Mu1_eta', 'B_Mu2_eta', 'B_Mu3_eta', 'B_Mu4_eta'],
        plot=True,
        bins=100,
        x_range=(-3, 3),
        labels=["Mu1_eta", "Mu2_eta", "Mu3_eta", "Mu4_eta"],
        xlabel="Eta",
    ),
)

# Cut 5: either the (J1, J2) pair or the (J3, J4) pair must have both
# vertex probabilities above 0.01.
cutdict.add_cut(
    5,
    CutType(
        name="5",
        long_name="Any dimuon pair vertex",
        mask=(
            ((events_data['B_J1_VtxProb'] > 0.01) & (events_data['B_J2_VtxProb'] > 0.01))
            | ((events_data['B_J3_VtxProb'] > 0.01) & (events_data['B_J4_VtxProb'] > 0.01))
        ),
        variables=['B_J1_VtxProb', 'B_J2_VtxProb', 'B_J3_VtxProb', 'B_J4_VtxProb'],
        plot=True,
        bins=100,
        x_range=(0, 1),
        labels=["J1_VtxProb", "J2_VtxProb", "J3_VtxProb", "J4_VtxProb"],
        xlabel="Vtx Prob",
    ),
)

# Cut 6: FourL_VtxProb must be above 0.01.
cutdict.add_cut(
    6,
    CutType(
        name="6",
        long_name="Four muon vertex",
        mask=(events_data['FourL_VtxProb'] > 0.01),
        variables=['FourL_VtxProb'],
        plot=True,
        bins=100,
        x_range=(0, 1),
        labels=["FourL_VtxProb"],
        xlabel="Vtx Prob",
    ),
)

# Cut 7: require JPsiMass > 0 (no plot).
# NOTE(review): 'JPsiMass' is a different column from 'JPsi_mass' used in cut 10;
# presumably it is a flag set by the candidate-combination step — confirm in utils.
cutdict.add_cut(
    7,
    CutType(
        name="7",
        long_name="Candidate Combine",
        mask=events_data['JPsiMass'] > 0,
        variables=['JPsiMass'],
        plot=False,
    ),
)

# Cut 8: both JPsi_VtxProb and Z_VtxProb must be above 0.01.
cutdict.add_cut(
    8,
    CutType(
        name="8",
        long_name="Dimuon Vertex Prob",
        mask=(
            (events_data['JPsi_VtxProb'] > 0.01)
            & (events_data['Z_VtxProb'] > 0.01)
        ),
        variables=['JPsi_VtxProb', 'Z_VtxProb'],
        plot=True,
        bins=100,
        x_range=(0, 1),
        labels=["VtxProb1", "VtxProb2"],
        xlabel="Vtx Prob",
    ),
)

# Cut 9: both JPsi_Pt and Z_Pt must be above 5.
cutdict.add_cut(
    9,
    CutType(
        name="9",
        long_name="Dimuon pT",
        mask=(
            (events_data['JPsi_Pt'] > 5)
            & (events_data['Z_Pt'] > 5)
        ),
        variables=['JPsi_Pt', 'Z_Pt'],
        plot=True,
        bins=100,
        x_range=(0, 20),
        labels=["JPsi_Pt", "Z_Pt"],
        xlabel="pT (GeV)",
    ),
)

# Cut 10: JPsi_mass must lie in (2.8, 3.4) and Z_mass in (70, 110).
# The plot range spans 0-120 so that both selection windows are visible; the
# previous (0, 20) range left the entire Z_mass window off the histogram.
cutdict.add_cut(
    10,
    CutType(
        name="10",
        long_name="Dimuon mass",
        mask=(
            (events_data['JPsi_mass'] > 2.8)
            & (events_data['JPsi_mass'] < 3.4)
            & (events_data['Z_mass'] > 70)
            & (events_data['Z_mass'] < 110)
        ),
        variables=['JPsi_mass', 'Z_mass'],
        plot=True,
        bins=100,
        x_range=(0, 120),
        labels=["JPsi_mass", "Z_mass"],
        xlabel="mass (GeV)",
    ),
)


# Cut 11: FourL_pt must be above 5.
cutdict.add_cut(
    11,
    CutType(
        name="11",
        long_name="Four muon pT",
        mask=(events_data['FourL_pt'] > 5),
        variables=['FourL_pt'],
        plot=True,
        bins=100,
        x_range=(0, 100),
        labels=["FourL_pt"],
        xlabel="pT (GeV)",
    ),
)


# Cut 12: FourL_mass must lie in (112, 162); the histogram covers the same window.
cutdict.add_cut(
    12,
    CutType(
        name="12",
        long_name="Four muon mass",
        mask=(
            (events_data['FourL_mass'] > 112)
            & (events_data['FourL_mass'] < 162)
        ),
        variables=['FourL_mass'],
        plot=True,
        bins=50,
        x_range=(112, 162),
        labels=["FourL_mass"],
        xlabel="mass (GeV)",
    ),
)


In [7]:
# List every registered cut id next to the cut's printed representation.
for key, item in cutdict.cutdict.items():
    print(key, item)

0 Cut: 0 (Preselection)
1 Cut: 1 (Muon Trigger)
2 Cut: 2 (Soft Muons)
3 Cut: 3 (Muon pT)
4 Cut: 4 (Detector acceptance)
5 Cut: 5 (Any dimuon pair vertex)
6 Cut: 6 (Four muon vertex)
7 Cut: 7 (Candidate Combine)
8 Cut: 8 (Dimuon Vertex Prob)
9 Cut: 9 (Dimuon pT)
10 Cut: 10 (Dimuon mass)
11 Cut: 11 (Four muon pT)
12 Cut: 12 (Four muon mass)


In [8]:
# Cut ids handed to prepare_masks, here all 13 registered cuts in ascending id order.
myorder = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
cut_analysis = CutAnalysis(events_data, cutdict)
# NOTE(review): presumably applies the cuts in the given order and records per-cut
# masks and counts — confirm in utils.helperClasses.
cutSummary = cut_analysis.prepare_masks(myorder)

In [9]:
# Tabulate the cut summary, leaving out the 'mask' and 'cutobj' entries.
cutSummarydf = {
    column: values
    for column, values in cutSummary.items()
    if column != 'mask' and column != 'cutobj'
}
pd.DataFrame(cutSummarydf)

Unnamed: 0,cut_id,cut_name,nevents,ncandidates
0,0,Preselection,1389300,499503
1,1,Muon Trigger,253858,156201
2,2,Soft Muons,46683,41296
3,3,Muon pT,32259,28024
4,4,Detector acceptance,31107,27119
5,5,Any dimuon pair vertex,30668,26762
6,6,Four muon vertex,27592,24186
7,7,Candidate Combine,720,706
8,8,Dimuon Vertex Prob,699,686
9,9,Dimuon pT,664,653


In [10]:
# The plotter receives the events, the cut registry and the output directory.
plotter = Plotter(events_data, cutdict, save_path)
# Per this cell's printed output, cuts registered with plot=False are reported as having no plot.
plotter.plot_preselection()


Plotting preselection...
Cut 0 does not have a plot
Cut 1 does not have a plot
Cut 2 does not have a plot
Cut 7 does not have a plot


In [11]:
# Plot the summary of cuts from the dictionary returned by prepare_masks.
plotter.plot_summary(cutSummary)


Plotting summary of cuts...


In [12]:
# Cut ids passed to plot_single_variable, here all 13 registered cuts.
cuts_to_show = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
plotter.plot_single_variable(cuts_to_show, cutSummary)


Plotting specific variables at [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [13]:
# Select the events passing the last mask stored in the cut summary.
# NOTE(review): presumably that mask is cumulative over all cuts in `myorder` — confirm in prepare_masks.
final = events_data[cutSummary['mask'][-1]]
# Convert to a pandas DataFrame and replace the index with a plain 0..N-1 range.
# NOTE(review): `ak` is not imported in this notebook's import cell; presumably it arrives
# through the star imports from utils — confirm, or import awkward explicitly.
df = ak.to_dataframe(final).reset_index(drop=True)
# cols_to_keep = ['B_Mu1_pt', 'B_Mu1_eta', 'B_Mu1_phi', 'B_Mu2_pt', 'B_Mu2_eta', 'B_Mu2_phi', 'B_Mu3_pt', 'B_Mu3_eta', 'B_Mu3_phi', 'B_Mu4_pt', 'B_Mu4_eta', 'B_Mu4_phi']
# df = df[cols_to_keep]

In [14]:
# Write the selected events to test.csv, resolved against the current working directory.
# NOTE(review): the file is not placed under save_path, and index=True stores the
# reset 0..N-1 index as an extra unnamed column — confirm both are intended.
df.to_csv("test.csv", index=True)

In [16]:
# Display the selected events array.
final

In [17]:
# Display the DataFrame built from the selected events.
df

Unnamed: 0,Event,Run,LumiBlock,Mu_TriggerPath,FourL_VtxProb,FourL_pt,FourL_mass,B_Mu1_soft,B_Mu2_soft,B_Mu3_soft,...,B_J2_rapidity,B_J3_rapidity,B_J4_rapidity,JPsi_mass,Z_mass,JPsiMass,JPsi_VtxProb,Z_VtxProb,JPsi_Pt,Z_Pt
0,8.637258e+08,274199.0,464.0,True,0.911033,15.478289,116.920921,True,True,True,...,1.726400,2.088897,-0.172939,3.108999,87.825439,1,0.397345,0.697847,12.266626,24.584282
1,2.300706e+07,274955.0,21.0,True,0.260900,76.713989,130.473404,True,True,True,...,0.432218,1.228765,1.505240,3.141289,84.942101,1,0.614676,0.360335,28.542305,92.185432
2,1.229868e+08,275375.0,128.0,True,0.411882,89.155075,122.497864,True,True,True,...,1.110003,1.078518,0.960106,3.156949,89.479218,1,0.741573,0.492061,18.123266,117.920280
3,7.862000e+08,274968.0,433.0,True,0.100745,36.173054,116.809029,True,True,True,...,-0.113013,-1.314998,-1.612163,3.114687,88.734093,1,0.145432,0.146799,32.482761,59.835724
4,1.054212e+09,274969.0,579.0,True,0.462482,8.834723,117.439949,True,True,True,...,0.637997,0.358890,-0.082686,3.020830,90.357307,1,0.837804,0.142202,34.112331,57.231007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,6.187336e+07,321887.0,81.0,True,0.285782,27.968149,160.920059,True,True,True,...,1.013068,-1.197121,-1.750479,3.018502,92.672058,1,0.039332,0.605537,29.423740,64.099518
295,7.854509e+08,321973.0,453.0,True,0.939161,31.693218,124.090714,True,True,True,...,-0.200956,0.674390,-0.493013,3.077051,92.276367,1,0.874259,0.308142,28.459492,54.858582
296,1.265521e+09,322348.0,696.0,True,0.045156,24.981705,134.615753,True,True,True,...,0.439688,-1.618205,0.593384,3.100198,86.333710,1,0.309137,0.658927,9.644593,37.744526
297,6.337103e+08,325022.0,396.0,True,0.198718,39.332626,139.062332,True,True,True,...,-0.014061,1.835325,1.260213,3.110590,91.221916,1,0.486316,0.108131,35.735947,64.013733


In [18]:
import uproot

In [45]:
# # Define a function to check if a record has any non-empty fields
# def has_non_empty_fields(record):
#     return ak.any([ak.num(record[field]) > 0 for field in record.fields])


# # Apply the function to filter out empty records
# filtered_array = final[ak.any([ak.num(final[field], axis=1) > 0 for field in final.fields], axis=0)]

In [34]:
# with uproot.recreate("test.root") as f:
#     # for field in final.fields:
#     f["Events"] = {field: final[field] for field in final.fields}

# with uproot.recreate("filtered.root") as f:
#     # for field in final.fields:
#     f["Events"] = {field: filtered_array[field] for field in filtered_array.fields}

    # f["Events"] = {"branch": final['B_J1_mass']}
    # f["Events"] = uproot.newtree({k: v for k, v in zip(df.columns, df.dtypes)})
    # f["Events"].extend(df.to_dict(orient='list'))


# Function to filter non-empty fields and flatten them if needed
def filter_and_flatten(array):
    """Drop records with no entries in any field, then flatten every field.

    Parameters
    ----------
    array
        Record array exposing ``.fields``; every field must accept
        ``ak.num(..., axis=1)`` and ``ak.flatten(..., axis=1)``.

    Returns
    -------
    dict
        Maps each field name to that field flattened along ``axis=1``,
        computed on the records that survived the filter.
    """
    # One boolean array per field: True where that field has at least one entry.
    per_field_nonempty = [ak.num(array[name], axis=1) > 0 for name in array.fields]

    # A record is kept when any of its fields is non-empty.
    keep = ak.any(per_field_nonempty, axis=0)
    kept = array[keep]

    # Every field is flattened unconditionally (no per-field type check).
    flattened = {}
    for name in kept.fields:
        flattened[name] = ak.flatten(kept[name], axis=1)
    return flattened


# Apply the combined filter and flattening function
# Apply the combined filter and flattening function to the selected events;
# the result is a dict of flattened arrays keyed by field name.
filtered_and_flattened = filter_and_flatten(final)

# Save the processed arrays to a ROOT file in the current working directory.
# Assigning the dict under "myTree" stores its entries in that tree.
with uproot.recreate("output_combined.root") as file:
    file["myTree"] = filtered_and_flattened

print("Array saved to output_combined.root as a TTree named 'myTree'.")

Array saved to output_combined.root as a TTree named 'myTree'.


In [33]:
# Debug aid: print each field of the selected events flattened along axis=1.
for key in final.fields:
    # print(key, final[key])
    # print(key, final[key])
    print(ak.flatten(final[key], axis=1))

[8.64e+08, 2.3e+07, 1.23e+08, 7.86e+08, ..., 1.27e+09, 6.34e+08, 6.06e+07]
[2.74e+05, 2.75e+05, 2.75e+05, 2.75e+05, ..., 3.22e+05, 3.25e+05, 3.25e+05]
[464, 21, 128, 433, 579, 152, 258, 11, ..., 5, 662, 274, 81, 453, 696, 396, 60]
[True, True, True, True, True, True, ..., True, True, True, True, True, True]
[0.911, 0.261, 0.412, 0.101, 0.462, ..., 0.286, 0.939, 0.0452, 0.199, 0.971]
[15.5, 76.7, 89.2, 36.2, 8.83, 9.1, 87.3, ..., 78.8, 28, 31.7, 25, 39.3, 5.62]
[117, 130, 122, 117, 117, 121, 116, 122, ..., 130, 132, 161, 124, 135, 139, 112]
[True, True, True, True, True, True, ..., True, True, True, True, True, True]
[True, True, True, True, True, True, ..., True, True, True, True, True, True]
[True, True, True, True, True, True, ..., True, True, True, True, True, True]
[True, True, True, True, True, True, ..., True, True, True, True, True, True]
[24.6, 92.2, 118, 59.8, 57.2, 33.4, 58.6, ..., 102, 64.1, 54.9, 37.7, 64, 36.8]
[12.3, 28.5, 18.1, 32.5, 34.1, 29.3, ..., 24.2, 29.4, 28.5, 9.

In [159]:
# def get_FOM(events_data, events_mc, savepath_data, savepath_mc, elePt_low, elePt_high, eleEta, ymass_upper):
#     """
#     Calculate the Figure of Merit.
#     """
#     analysis_data = Analysis(events_data, savepath_data)
#     analysis_mc = Analysis(events_mc, savepath_mc)
    
#     nb = analysis_data.get_count(elePt_low, elePt_high, eleEta, ymass_upper)
#     nEff = analysis_mc.get_count(elePt_low, elePt_high, eleEta, ymass_upper) / 100
#     FOM = nEff / np.sqrt(nb)
#     return nEff, nb, FOM

# def load_config(config_path):
#     """
#     Load configuration from a YAML file.
#     """
#     try:
#         with open(config_path, 'r') as config_file:
#             return yaml.safe_load(config_file)
#     except FileNotFoundError:
#         print(f"Error: Configuration file '{config_path}' not found.")
#         sys.exit(1)
#     except yaml.YAMLError as e:
#         print(f"Error parsing YAML configuration file: {e}")
#         sys.exit(1)

# def load_events(file_path, schema_class, entry_stop):
#     """
#     Load events from a ROOT file with error handling.
#     """
#     try:
#         events = NanoEventsFactory.from_root(
#             {file_path: "ntuple"}, 
#             schemaclass=schema_class, 
#             entry_stop=entry_stop
#         ).events()
#         return events
#     except FileNotFoundError:
#         print(f"Error: File '{file_path}' not found.")
#         return None
#     except Exception as e:
#         print(f"Error loading events from '{file_path}': {e}")
#         return None

# def main(config_path='config.yaml', run_data=True, run_mc=True):
#     # Load configuration
#     config = load_config(config_path)

#     events_data = None
#     events_mc = None

#     # Load events
#     if run_data:
#         events_data = load_events(config['data_path'], BaseSchema, config['entry_stop'])
#         if events_data is None:
#             print("Failed to load data events. Skipping data analysis.")
#             run_data = False

#     if run_mc:
#         events_mc = load_events(config['mc_path'], BaseSchema, config['entry_stop'])
#         if events_mc is None:
#             print("Failed to load MC events. Skipping MC analysis.")
#             run_mc = False

#     if not run_data and not run_mc:
#         print("No valid data or MC events loaded. Exiting.")
#         sys.exit(1)

#     # Select columns
#     if run_data:
#         events_data = events_data[config['columns']].compute()
#     if run_mc:
#         events_mc = events_mc[config['columns']].compute()

#     # Create DataFrame to store results
#     df = pd.DataFrame(columns=['elePt_low', 'elePt_high', 'eleEta', 'ymass_upper', 'Efficiency', 'Background', 'FOM'])

#     # Loop over different cut values
#     for elePt_low in config['cut_parameters']['elePt_low']:
#         for elePt_high in config['cut_parameters']['elePt_high']:
#             for eleEta in config['cut_parameters']['eleEta']:
#                 for ymass_upper in config['cut_parameters']['ymass_upper']:
#                     savepath_data = f"{config['savepath_data_base']}el_{elePt_low}_eh_{elePt_high}_eE_{eleEta}_yup_{ymass_upper}"
#                     savepath_mc = f"{config['savepath_mc_base']}el_{elePt_low}_eh_{elePt_high}_eE_{eleEta}_yup_{ymass_upper}"

#                     os.makedirs(savepath_data, exist_ok=True)
#                     os.makedirs(savepath_mc, exist_ok=True)

#                     eff, nb, FOM = 0, 0, 0
                    
#                     if run_data and run_mc:
#                         eff, nb, FOM = get_FOM(events_data, events_mc, savepath_data, savepath_mc, elePt_low, elePt_high, eleEta, ymass_upper)
#                     elif run_data:
#                         analysis_data = Analysis(events_data, savepath_data)
#                         nb = analysis_data.get_count(elePt_low, elePt_high, eleEta, ymass_upper)
#                     elif run_mc:
#                         analysis_mc = Analysis(events_mc, savepath_mc)
#                         eff = analysis_mc.get_count(elePt_low, elePt_high, eleEta, ymass_upper) / 100

#                     df.loc[len(df)] = {'elePt_low': elePt_low, 'elePt_high': elePt_high, 'eleEta': eleEta, 'ymass_upper': ymass_upper, 'Efficiency': eff, 'Background': nb, 'FOM': FOM}
#                     print(f"elePt_low: {elePt_low}, elePt_high: {elePt_high}, eleEta: {eleEta}, ymass_upper: {ymass_upper}")
#                     print(f"Efficiency: {eff}")
#                     print(f"Background: {nb}")
#                     print(f"FOM: {FOM}")
#                     print(df)
#                     print("\n")

#     print(df)
#     df.to_csv(config['output_file'], index=False)

# # if __name__ == "__main__":
# #     parser = argparse.ArgumentParser(description="Run particle physics analysis on data and/or MC events.")
# #     parser.add_argument("--config", default="config.yaml", help="Path to the configuration file")
# #     parser.add_argument("--data", action="store_true", help="Run analysis on data events")
# #     parser.add_argument("--mc", action="store_true", help="Run analysis on MC events")
# #     args = parser.parse_args()

# #     if not args.data and not args.mc:
# #         print("Error: You must specify at least one of --data or --mc")
# #         sys.exit(1)

#     # main(config_path=args.config, run_data=args.data, run_mc=args.mc)

In [160]:
# main(config_path="config.yaml", run_data=True, run_mc=False)