# Neutrino ID loader

### Imports

In [20]:
%matplotlib inline
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [21]:
import uproot
import pickle
import pandas as pd
import numpy as np
import helpers.helpfunction as helper
import gc

<IPython.core.display.Javascript object>

### Constants

Data Run1: 
- data_bnb_run1_C1_high_lifetime (for now, small subset of 5e19) 
    - POT: 2.57e19 (old, 1.922e19)
    - E1DCNT_wcut: 5711101 (old, 4271472)
- data_extbnb_run3_G1_high_lifetime
    - EXT: 6200046
    
Data Run2:
- data_bnb_run2_D2
    - POT: 1.593e+20
    - E1DCNT_wcut: 38186979
- data_extbnb_run2_D2
    - EXT: 14220894
    
Data Run3:
- data_bnb_run3_G1_high_lifetime
    - POT: x
    - E1DCNT_wcut: x
- data_extbnb_run3_G1_high_lifetime
    - EXT: x

In [22]:
# Per-run normalisation constants. Position 0, 1, 2 holds the value for
# run 1, 2, 3 (the lists are indexed with `run - 1` further down).
# The run 3 entries are 0 placeholders: no counts are available yet.

# Beam-on protons on target (POT).
pot = [
    2.57e19,  # run 1
    1.593e20,  # run 2
    0,  # run 3
]

# Beam-on trigger counts (E1DCNT_wcut).
en1dcnt = [
    5711101,  # run 1
    38186979,  # run 2
    0,  # run 3
]

# Beam-off trigger counts (EXT).
ext = [
    6200046,  # run 1
    14220894,  # run 2
    0,  # run 3
]

<IPython.core.display.Javascript object>

In [23]:
# Run to process and the sample names that make up the input file names.
run = 1
data_samples = ["on", "off"]
mc_samples = ["nue", "nu", "dirt"]

# Directory the pickled sample dictionaries are written to.
path = "./input/Jan2020/pickled/"

# Lepton energy thresholds used for the signal definitions: a common minimum
# kinetic energy plus a flavour-dependent offset.
# NOTE(review): the offsets look like the electron and muon rest masses in
# GeV -- confirm the units against the `leptonEnergy` branch.
min_lepton_kine = 0.02
min_e = min_lepton_kine + 0.000511
min_mu = min_lepton_kine + 0.105658

# POT assigned to a single event of each MC sample; multiplied by the number
# of loaded events to get the sample POT.
# NOTE(review): presumably (total sample POT) / (total generated events).
pot_per_event = {
    "nu": 1.201e21 / 957702,
    "nue": 5.12e22 / 85774,
    "dirt": 3.08e20 / 98679,
}

<IPython.core.display.Javascript object>

### Load Samples

#### Load MC

In [24]:
%%time
# Read every MC sample of the selected run into a nested dictionary:
# file_dict[sample] holds the event count, the POT and one dict of branch
# arrays per tree.
file_dict = {}
first = True

for sample in mc_samples:
    file = uproot.open("input/Jan2020/run{}/{}_run{}.root".format(run, sample, run))
    file_neutrinoid = file["pandora"]
    sample_dict = file_dict[sample] = {}

    # Event count and the POT it corresponds to.
    sample_dict["num_events"] = file_neutrinoid["events"].numentries
    sample_dict["pot"] = sample_dict["num_events"] * pot_per_event[sample]
    print(
        sample, "num_events:", sample_dict["num_events"], "pot:", sample_dict["pot"]
    )

    # The three pandora trees are read in full, with branch names decoded to str.
    for tree_name in ("events", "slices", "flashes"):
        sample_dict[tree_name] = file_neutrinoid[tree_name].arrays(namedecode="utf-8")
    sample_dict["showers"] = file["shrreco3d/_rcshr_tree"].arrays(namedecode="utf-8")
    # Only the first metadata entry is read.
    sample_dict["metadata"] = file_neutrinoid["metadata"].arrays(
        entrystop=1, namedecode="utf-8", flatten=True
    )

    # Print the available branches once, for the first sample only.
    if first:
        for banner, tree_name in (
            ("\n--- Event Tree ---", "events"),
            ("\n--- Slice Tree ---", "slices"),
            ("\n--- Flash Tree ---", "flashes"),
            ("\n--- Shower Tree ---", "showers"),
        ):
            print(banner)
            print(sample_dict[tree_name].keys())
        print("\n\n")
        first = False

    # Add the hashes: one value per event, computed by helper.eventHash from
    # the branches matching "nuVertex?" and "*Energy". The hash is the key
    # used to look up the per-event weights in a later cell.
    df_to_hash_slice_id = file_neutrinoid["events"].pandas.df(["nuVertex?", "*Energy"])
    slice_id_nu_hash = helper.eventHash(df_to_hash_slice_id)
    sample_dict["events"]["hash"] = slice_id_nu_hash

nue num_events: 58420 pot: 3.4871919229603376e+22

--- Event Tree ---
dict_keys(['run', 'subRun', 'event', 'evt_time_sec', 'evt_time_nsec', 'nFlashes', 'nFlashesInBeamWindow', 'hasBeamFlash', 'nSlices', 'nSlicesAfterPrecuts', 'foundATargetSlice', 'targetSliceMethod', 'bestCosmicMatch', 'cosmicMatchHypothesis', 'bestCosmicMatchRatio', 'nuMode', 'nuX', 'nuW', 'nuPt', 'nuTheta', 'nuCCNC', 'nuEnergy', 'leptonEnergy', 'nuInteractionTime', 'nuPdgCode', 'nuVertexX', 'nuVertexY', 'nuVertexZ'])

--- Slice Tree ---
dict_keys(['sliceId', 'run', 'subRun', 'event', 'evt_time_sec', 'evt_time_nsec', 'hasDeposition', 'totalCharge', 'centerX', 'centerY', 'centerZ', 'minCRTdist', 'CRTtime', 'CRTplane', 'CRTtracklength', 'CRTnumtracks', 'deltaY', 'deltaZ', 'deltaYSigma', 'deltaZSigma', 'chargeToLightRatio', 'xclVariable', 'passesPreCuts', 'flashMatchScore', 'totalPEHypothesis', 'peHypothesisSpectrum', 'isTaggedAsTarget', 'targetMethod', 'isConsideredByFlashId', 'topologicalScore', 'hasBestTopologicalScor

<IPython.core.display.Javascript object>

In [33]:
%%time
# Collect the newest weights for the MC samples:
# The weights live in a separate slimmed pickle; they are matched to the
# events loaded above through the hash of the true vertex and energies.
# NOTE(review): the path is absolute and hard-coded to run1 regardless of
# `run` -- consider moving it to the constants cell.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(
    "/home/wouter/Documents/Jupyter/searchingfornues/input/16Jan/run1_slimmed.pckl",
    "rb",
) as weight_file:
    # The context manager closes the handle even if unpickling fails.
    file = pickle.load(weight_file)

# Columns of the slimmed file that are fed to the hash (same for every sample).
keys = ["true_nu_vtx_x", "true_nu_vtx_y", "true_nu_vtx_z", "nu_e", "lep_e"]

for sample in mc_samples:
    mc_truth = file[sample]["mc"]
    this_weight_dict = {key: mc_truth[key] for key in keys}
    weight_hash = helper.eventHash(pd.DataFrame(this_weight_dict))
    weights = mc_truth["weightSplineTimesTune"]
    mapper = dict(zip(weight_hash, weights))

    # Look every event up once; events without a matching hash are NaN here,
    # so describe() reports how many events actually found a weight.
    new_weights = file_dict[sample]["events"]["hash"].map(mapper)
    print(new_weights.describe())
    print(sample)
    # Unmatched events fall back to a weight of 1.
    file_dict[sample]["events"]["weight"] = new_weights.fillna(1).values

    # Repeat the event weight once per flash / slice of that event.
    for tree_name, count_key in (("flashes", "nFlashes"), ("slices", "nSlices")):
        file_dict[sample][tree_name]["weight"] = np.repeat(
            file_dict[sample]["events"]["weight"],
            file_dict[sample]["events"][count_key],
        )

# Release the slimmed file before continuing.
del file
gc.collect()

count    56013.000000
mean         1.173108
std          0.917618
min          0.559345
25%          1.000000
50%          1.000000
75%          1.225310
max         89.603714
dtype: float64
nue
count    53228.000000
mean         1.150156
std          0.581137
min          0.238485
25%          1.000000
50%          1.000000
75%          1.218523
max         54.544491
dtype: float64
nu
count    18557.000000
mean         1.106060
std          0.351910
min          0.421535
25%          1.000000
50%          1.000000
75%          1.203281
max         37.972496
dtype: float64
dirt
CPU times: user 20.4 s, sys: 825 ms, total: 21.3 s
Wall time: 21.1 s


0

<IPython.core.display.Javascript object>

In [None]:
# Inspect the per-event hash stored for the "nu" sample (shown as cell output).
file_dict['nu']["events"]["hash"]

In [26]:
%%time
# Define signal categories:
# per event, flag the true vertex fiducial-volume decision and the two
# inclusive signal definitions, then copy each flag to the flash and slice
# trees.
for sample in mc_samples:
    events = file_dict[sample]["events"]

    # Fiducial-volume decision on the true neutrino vertex (helper.is_fid).
    x = events["nuVertexX"]
    y = events["nuVertexY"]
    z = events["nuVertexZ"]
    events["true_fidvol"] = helper.is_fid(x, y, z)

    # A category needs: lepton energy above threshold, matching |PDG code|,
    # nuCCNC == 0 and a vertex inside the fiducial volume.
    # NOTE(review): nuCCNC == 0 presumably selects charged-current events.
    for category, pdg_code, min_lepton_energy in (
        ("nueccinc", 12, min_e),
        ("numuccinc", 14, min_mu),
    ):
        events[category] = (
            (events["leptonEnergy"] > min_lepton_energy)
            & (abs(events["nuPdgCode"]) == pdg_code)
            & (events["nuCCNC"] == 0)
            & events["true_fidvol"]
        )

    # Repeat each per-event flag once per flash / slice of that event.
    for category in ("nueccinc", "numuccinc", "true_fidvol"):
        for tree_name, count_key in (("flashes", "nFlashes"), ("slices", "nSlices")):
            file_dict[sample][tree_name][category] = np.repeat(
                events[category], events[count_key]
            )

CPU times: user 27.3 ms, sys: 0 ns, total: 27.3 ms
Wall time: 26.5 ms


<IPython.core.display.Javascript object>

In [27]:
# Write the MC dictionary of this run to disk. The context manager closes
# the file even if pickling raises part-way through.
with open(path + "mc_run{}.pckl".format(run), "wb") as out_file:
    pickle.dump(file_dict, out_file)

<IPython.core.display.Javascript object>

#### Load Data

In [13]:
# Run number used for the data samples below; repeats the value set in the
# constants cell so this section can be re-run with a different run.
run = 1

<IPython.core.display.Javascript object>

In [17]:
%%time
# Read the data samples ("on" / "off") of the selected run into a nested
# dictionary: file_dict[sample] holds the event count and one dict of branch
# arrays per tree. This replaces any MC dictionary still held in `file_dict`.
file_dict = {}
first = True

for sample in data_samples:
    file = uproot.open("input/Jan2020/run{}/{}_run{}.root".format(run, sample, run))
    file_neutrinoid = file["pandora"]
    sample_dict = file_dict[sample] = {}

    sample_dict["num_events"] = file_neutrinoid["events"].numentries
    print(sample, "num_events:", sample_dict["num_events"])

    # The three pandora trees are read in full, with branch names decoded to str.
    for tree_name in ("events", "slices", "flashes"):
        sample_dict[tree_name] = file_neutrinoid[tree_name].arrays(namedecode="utf-8")
    sample_dict["showers"] = file["shrreco3d/_rcshr_tree"].arrays(namedecode="utf-8")
    # Only the first metadata entry is read.
    sample_dict["metadata"] = file_neutrinoid["metadata"].arrays(
        entrystop=1, namedecode="utf-8", flatten=True
    )

    # Print the available branches once, for the first sample only.
    if first:
        for banner, tree_name in (
            ("\n--- Event Tree ---", "events"),
            ("\n--- Slice Tree ---", "slices"),
            ("\n--- Flash Tree ---", "flashes"),
            ("\n--- Shower Tree ---", "showers"),
        ):
            print(banner)
            print(sample_dict[tree_name].keys())
        print("\n\n")
        first = False

on num_events: 77433

--- Event Tree ---
dict_keys(['run', 'subRun', 'event', 'evt_time_sec', 'evt_time_nsec', 'nFlashes', 'nFlashesInBeamWindow', 'hasBeamFlash', 'nSlices', 'nSlicesAfterPrecuts', 'foundATargetSlice', 'targetSliceMethod', 'bestCosmicMatch', 'cosmicMatchHypothesis', 'bestCosmicMatchRatio'])

--- Slice Tree ---
dict_keys(['sliceId', 'run', 'subRun', 'event', 'evt_time_sec', 'evt_time_nsec', 'hasDeposition', 'totalCharge', 'centerX', 'centerY', 'centerZ', 'minCRTdist', 'CRTtime', 'CRTplane', 'CRTtracklength', 'CRTnumtracks', 'deltaY', 'deltaZ', 'deltaYSigma', 'deltaZSigma', 'chargeToLightRatio', 'xclVariable', 'passesPreCuts', 'flashMatchScore', 'totalPEHypothesis', 'peHypothesisSpectrum', 'isTaggedAsTarget', 'targetMethod', 'isConsideredByFlashId', 'topologicalScore', 'hasBestTopologicalScore', 'hasBestFlashMatchScore', 'nHits', 'maxDeltaLLMCS', 'lengthDeltaLLMCS', 'ct_result_michel_plane0', 'ct_result_michel_plane1', 'ct_result_michel_plane2', 'ct_result_bragg_plane0', 

<IPython.core.display.Javascript object>

In [18]:
# Add the POT counting fields for samples:
# the constant lists are ordered by run, so run N is found at position N - 1.
run_index = run - 1

# Beam-on sample: protons on target and trigger count.
file_dict["on"]["pot"] = pot[run_index]
file_dict["on"]["en1dcnt"] = en1dcnt[run_index]

# Beam-off sample: external trigger count.
file_dict["off"]["ext"] = ext[run_index]

<IPython.core.display.Javascript object>

In [19]:
# Write the data dictionary of this run to disk. The context manager closes
# the file even if pickling raises part-way through.
with open(path + "data_run{}.pckl".format(run), "wb") as out_file:
    pickle.dump(file_dict, out_file)

<IPython.core.display.Javascript object>

### Done