# Make SBND Event DF

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
# from util import *

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Particle masses used in this notebook. Units are presumably GeV (the values
# match the muon/neutron/proton masses in GeV/c^2) -- TODO confirm.
MASS_MUON = 0.105658
MASS_NEUTRON = 0.9395654
MASS_PROTON = 0.938272
# Nucleus mass built from 22 neutrons and 18 protons minus 0.34381
# (presumably the total nuclear binding energy -- TODO confirm).
MASS_A = 22*MASS_NEUTRON + 18*MASS_PROTON - 0.34381
# NOTE(review): BE looks like a nucleon removal/binding energy -- confirm.
BE = 0.0295
# MASS_A with one neutron mass removed and BE added back.
MASS_Ap = MASS_A - MASS_NEUTRON + BE

In [None]:
def mag(x, y, z):
    """Return the Euclidean norm of the vector with components x, y, z.

    Works element-wise on anything numpy can square and add (scalars,
    arrays, pandas Series).
    """
    squared_norm = x**2 + y**2 + z**2
    return np.sqrt(squared_norm)


def magdf(df):
    """Return the magnitude of the vector stored in the x, y, z fields of df."""
    x, y, z = df.x, df.y, df.z
    return mag(x, y, z)


def dmagdf(df1, df2):
    """Return the distance between the points held in df1 and df2.

    Both arguments are read through their x, y and z fields.
    """
    dx = df1.x - df2.x
    dy = df1.y - df2.y
    dz = df1.z - df2.z
    return mag(dx, dy, dz)


def dotdf(df1, df2):
    """Return the dot product of the x, y, z fields of df1 and df2."""
    xx = df1.x*df2.x
    yy = df1.y*df2.y
    zz = df1.z*df2.z
    return xx + yy + zz


def unitdf(df):
    """Return df with every row divided by that row's vector magnitude."""
    norm = magdf(df)
    return df.divide(norm, axis=0)

In [None]:
def broadcast(v, df):
    """Repeat the values of v so that they line up with the rows of df.

    The index level names of v must equal the leading index level names of
    df. If v already has as many index levels as df it is returned as is.
    Otherwise each value of v is repeated once per row of df that falls in
    the matching group of leading index levels, and the result is returned
    as a Series carrying df's index and v's name.

    The repeated values are attached to df.index by position, in the group
    order produced by groupby -- this assumes df's rows are already ordered
    by those leading levels.

    Raises:
        ValueError: if an index level name differs, or v has more index
            levels than df.
    """
    v_names = v.index.names
    df_names = df.index.names

    for v_name, df_name in zip(v_names, df_names):
        if v_name != df_name:
            raise ValueError("Value index (%s) does not match index (%s)." % (str(v_name), str(df_name)))

    n_v, n_df = len(v_names), len(df_names)
    if n_v > n_df:
        raise ValueError("Value index too long.")
    if n_v == n_df:
        return v

    # Number of df rows under each key of the levels shared with v.
    shared_levels = list(range(v.index.nlevels))
    rows_per_key = df.groupby(level=shared_levels).size()

    # Only keys present on both sides contribute values.
    matched = v.index.intersection(rows_per_key.index)
    repeated = np.repeat(v.loc[matched].values, rows_per_key)

    return pd.Series(repeated, df.index).rename(v.name)

def multicol_concat(lhs, rhs):
    """Concatenate two frames column-wise after equalising column depth.

    Column labels of both frames are padded with trailing empty strings up
    to the deeper of the two column indexes, then the frames are joined with
    pd.concat(axis=1).

    Note: the columns of lhs and rhs are replaced in place with the padded
    MultiIndex, so the caller's frames are modified.

    Args:
        lhs: left DataFrame.
        rhs: right DataFrame.

    Returns:
        The concatenated DataFrame with MultiIndex columns.
    """
    nlevel = max(lhs.columns.nlevels, rhs.columns.nlevels)

    def pad(c):
        # A single-level frame yields bare labels (e.g. strings). Treat those
        # as one level; list("ab") would wrongly split the name into
        # characters.
        parts = list(c) if isinstance(c, tuple) else [c]
        return tuple(parts + [""]*(nlevel - len(parts)))

    lhs.columns = pd.MultiIndex.from_tuples([pad(c) for c in lhs.columns])
    rhs.columns = pd.MultiIndex.from_tuples([pad(c) for c in rhs.columns])

    return pd.concat([lhs, rhs], axis=1)

def multicol_add(df, s, **panda_kwargs):
    """Join the named Series s onto df, padding column depth as needed.

    If s has a plain string name and df has single-level columns this is a
    plain df.join(s). Otherwise the Series name and the frame's column
    labels are padded with trailing empty strings to a common depth before
    joining.

    Note: s.name and df.columns may be rewritten in place with their padded
    forms, so the caller's objects are modified.

    Args:
        df: DataFrame to extend.
        s: Series whose name is a string or a tuple of column levels.
        **panda_kwargs: forwarded to DataFrame.join.

    Returns:
        The joined DataFrame.
    """
    # if both the series and the df is one level, we can do a simple join()
    if isinstance(s.name, str) and df.columns.nlevels == 1:
        return df.join(s, **panda_kwargs)

    if isinstance(s.name, str):
        s.name = (s.name,)

    nlevel = max(df.columns.nlevels, len(s.name))

    def pad(c):
        # Bare labels from a single-level frame count as one level; calling
        # list() on a string would split it into characters.
        parts = list(c) if isinstance(c, tuple) else [c]
        return tuple(parts + [""]*(nlevel - len(parts)))

    if df.columns.nlevels < nlevel:
        df.columns = pd.MultiIndex.from_tuples([pad(c) for c in df.columns])
    if len(s.name) < nlevel:
        s.name = pad(s.name)

    return df.join(s, **panda_kwargs)

def multicol_merge(lhs, rhs, **panda_kwargs):
    """Merge two frames after padding their column labels to equal depth.

    Every column label on both sides is extended with trailing empty strings
    up to the deeper of the two column indexes; a plain string label counts
    as a single level. The frames are then combined with
    lhs.merge(rhs, **panda_kwargs).

    Note: the columns of lhs and rhs are replaced in place with the padded
    MultiIndex, so the caller's frames are modified.
    """
    depth = max(lhs.columns.nlevels, rhs.columns.nlevels)

    def padded(label):
        parts = [label] if isinstance(label, str) else list(label)
        return tuple(parts + [""]*(depth - len(parts)))

    # Build both padded label lists before touching either frame.
    lhs_labels = [padded(label) for label in lhs.columns]
    rhs_labels = [padded(label) for label in rhs.columns]

    lhs.columns = pd.MultiIndex.from_tuples(lhs_labels)
    rhs.columns = pd.MultiIndex.from_tuples(rhs_labels)

    return lhs.merge(rhs, **panda_kwargs)

def detect_vectors(tree, branch):
    """Return the prefixes of branch that have a "..length" key in tree.

    The dotted branch name is expanded into its successive prefixes
    ("a", "a.b", "a.b.c", ...); a prefix is kept when tree.keys() contains
    the prefix followed by "..length". Prefixes are returned outermost first.

    Args:
        tree: object exposing keys(), which returns the available branch names.
        branch: dotted branch name.

    Returns:
        List of matching prefixes (possibly empty).
    """
    # Fetch the key list once instead of once per prefix, and use a set so
    # each membership test is constant time.
    available = set(tree.keys())

    hierarchy = branch.split(".")
    prefixes = (".".join(hierarchy[:depth]) for depth in range(1, len(hierarchy) + 1))
    return [prefix for prefix in prefixes if prefix + "..length" in available]

def idarray(ids, lens):
    """Repeat each value of ids by the matching count in lens.

    Both arguments are read through their .values attribute; the result is a
    flat array.
    """
    id_values = ids.values
    repeat_counts = lens.values
    return np.repeat(id_values, repeat_counts)

def loadbranches(tree, branches, **uprargs):
    """Load branches from tree into one DataFrame with hierarchical columns.

    Each branch's nesting is detected through detect_vectors(); every
    branch that has any nesting must share the nesting of the first branch.
    The "..length" branch of each nesting level is loaded and merged level
    by level with the requested data, so the result is indexed by "entry"
    plus one "<vector>..index" level per nesting level.

    Column names are the dotted branch names split on "." and padded with
    empty strings into a MultiIndex; the part "index" is renamed to "idx"
    and parts starting with a digit get an "I" prefix.

    Args:
        tree: object providing keys() and arrays(names, library="pd", ...)
            (looks like an uproot tree -- TODO confirm).
        branches: list of dotted branch names to load.
        **uprargs: forwarded to every tree.arrays() call.

    Returns:
        DataFrame holding the requested branches.

    Raises:
        ValueError: if two branches have different vector structures.
    """
    vectors = []
    for i,branch in enumerate(branches):
        this_vectors = detect_vectors(tree, branch)
        if i == 0:
            # The first branch defines the reference vector structure
            vectors = this_vectors
        elif len(this_vectors) == 0: # This case is ok since it will automatically broadcast
            pass
        # All the branches must have the same vector structure for this to work
        elif vectors != this_vectors:
            raise ValueError("Branches %s and %s have different vector structures in the CAF." % (branches[0], branch))

    # One table of lengths per nesting level, plus the requested data itself
    lengths = [tree.arrays([v+"..length"], library="pd", **uprargs) for v in vectors]
    data = tree.arrays(branches, library="pd", **uprargs)

    # If there's no vectors, we can just return the top guy
    if len(lengths) == 0:
        data.index.name = "entry"
        df = data
    else:
        tomerge = lengths + [data]
        # Otherwise, iteratively merge the branches
        df = tomerge[0]
        df.index.name = "entry"

        # handle the rest
        for i in range(1, len(tomerge)):
            thismerge = tomerge[i]
            v_ind = i - 1

            # Build the information in the right-hand table needed to do the join
            # The "upidx" will be matched to the index vector-by-vector
            # NOTE: this inner loop reuses the name `i`; thismerge and v_ind were
            # bound above, so the outer iteration is unaffected.
            for i in range(v_ind):
                thismerge[vectors[v_ind] + "..upidx" + str(i)] = idarray(df[vectors[i]+ "..index"], df[vectors[v_ind] + "..length"])

            # Inner join! Throw away rows in the right-hand with no match in the left-hand
            df = pd.merge(df, thismerge, how="inner",
                         left_on = ["entry"] + [v+"..index" for v in vectors[:v_ind]],
                         right_on = ["entry"] + [vectors[v_ind] + "..upidx" + str(i) for i in range(v_ind)],
                         validate="one_to_many")

            # Make sure no rows in the right-hand were dropped
            # NOTE(review): an assert is skipped when Python runs with -O.
            assert(df.shape[0] == thismerge.shape[0])

            # postprocess: build the index
            # (running counter of rows within each parent group)
            df[vectors[v_ind] + "..index"] = df.groupby(["entry"] + [v+"..index" for v in vectors[:v_ind]]).cumcount()

        # Set the index
        df.set_index([v+"..index" for v in vectors], append=True, verify_integrity=True, inplace=True)

        # Drop all the metadata info we don't need anymore
        df = df[branches]

    # Setup branch names so df reflects structure of CAF file
    bsplit = [b.split(".") for b in branches]
    # Replace any reserved names
    def unreserve(s):
        if s == "index":
            return "idx"
        if s[0].isdigit(): # make the name a legal field
            return "I" + s
        return s

    bsplit = [[unreserve(s) for s in b] for b in bsplit]

    # Pad every split name to the deepest branch so the tuples form a MultiIndex
    depth = max([len(b) for b in bsplit])

    def pad(b):
        return tuple(b + [""]*(depth - len(b)))

    df.columns = pd.MultiIndex.from_tuples([pad(b) for b in bsplit])

    return df

In [None]:
# Truth-level neutrino table ("mcnu" key) from the main dataframe file.
fname = "/exp/sbnd/data/users/munjung/osc/sbnd_gump.df"
mcdf = pd.read_hdf(fname, "mcnu")
# slcdf = pd.read_hdf(fname, "slc_trk")


# fname = "/exp/sbnd/data/users/munjung/sbnd_gump.df"
# fname_icarus = "/exp/sbnd/data/users/gputnam/gump.df"

# From here on `fname` points at the stub file; the next cell reads from it.
fname = "/exp/sbnd/data/users/munjung/osc/stub.df"
# Open read-only: the default mode ("a") needs write access and would create
# an empty file if the path were wrong, when all we want is to list the keys.
with pd.HDFStore(fname, mode="r") as store:
    print(store.keys())

In [None]:
# Read the slice, track and stub tables (keys "slc", "trk", "stub") from the
# file currently named by `fname`.
slcdf, trkdf, stubdf = (pd.read_hdf(fname, key) for key in ("slc", "trk", "stub"))

In [None]:
# Display the table read from the "mcnu" key for inspection.
mcdf

In [None]:
# Display the table read from the "slc" key for inspection.
slcdf

In [None]:
# `mc.LineCollection` is used below, but `mc` is not imported anywhere else in
# this notebook; bring matplotlib's collections module into scope here.
from matplotlib import collections as mc

# Histogram binning: stub length on x, charge per unit length on y.
binx = np.linspace(0, 5,11)
biny = np.linspace(0, 800000, 17)

# Line segments drawn on top of both histograms. Their corners sit at lengths
# 0.5 / 1.5 / 3 and dQ/dx values 3e5 / 4e5 / 5.5e5.
lines = [[(3, 3e5), (3, 1e6)], [(1.5, 3e5), (1.5, 4e5)], [(0.5, 4e5), (0.5, 5.5e5)],
         [(1.5, 3e5), (3, 3e5)], [(0.5, 4e5), (1.5, 4e5)], [(0, 5.5e5), (0.5, 5.5e5)]]

# Stubs truth-matched to a proton with genE - MASS_PROTON below 0.05.
fig, ax = plt.subplots()
when = (np.abs(stubdf.truth.p.pdg) == 2212) & (stubdf.truth.p.genE - MASS_PROTON < 0.05) #& (stubdf.nplane == 1)
_ = plt.hist2d(stubdf.length[when], (stubdf.inc_sub_charge / stubdf.length)[when], bins=[binx, biny])
plt.ticklabel_format(axis='y', style='sci', scilimits=(5,5))
lc = mc.LineCollection(lines, linewidths=2, color="red", linestyle="--")
ax.add_collection(lc)

plt.xlabel("Length [cm]")
plt.ylabel("dQ/dx [#elec/cm]")
plt.title("Protons")
plt.show();

# Stubs truth-matched to a non-proton with interaction_id > 0.
# NOTE(review): this panel uses `charge` while the proton panel uses
# `inc_sub_charge` -- confirm the difference is intended.
fig, ax = plt.subplots()
when = (np.abs(stubdf.truth.p.pdg) != 2212) & (stubdf.truth.p.interaction_id > 0)
_ = plt.hist2d(stubdf.length[when], (stubdf.charge / stubdf.length)[when], bins=[binx, biny])
plt.ticklabel_format(axis='y', style='sci', scilimits=(5,5))
lc = mc.LineCollection(lines, linewidths=2, color="red", linestyle="--")
ax.add_collection(lc)

plt.xlabel("Length [cm]")
plt.ylabel("dQ/dx [#elec/cm]")
plt.title("False Positive")

In [None]:
# add stub info
# Stub selection inputs: length and charge per unit length.
length = stubdf.length
dqdx = stubdf.inc_sub_charge/length

# The minimum dQ/dx is 5.5e5 for any positive length, 4e5 above 0.5 and
# 3e5 above 1.5.
passes_dqdx = (
    ((length > 0.) & (dqdx > 5.5e5))
    | ((length > 0.5) & (dqdx > 4e5))
    | ((length > 1.5) & (dqdx > 3e5))
)

# Only stubs shorter than 3 are kept.
is_stub = (length < 3.) & passes_dqdx

In [None]:
# Display the current slice table for inspection.
slcdf

In [None]:
# True for every group of the first three index levels that contains at least
# one selected stub.
has_stub = is_stub.groupby(level=[0,1,2]).any().rename(("slc","has_stub"))
slcdf = multicol_add(slcdf, has_stub)
# Rows with no match in has_stub come back as NaN from the join. Fill them
# and cast explicitly so the column is a real bool column regardless of
# whether pandas downcasts the filled object column on its own.
slcdf[("slc","has_stub")] = slcdf[("slc","has_stub")].fillna(False).astype(bool)

In [None]:
# Selection switches: maximum vertex-to-track-start distance (a value <= 0
# disables the cut) and whether to drop slices flagged as clear cosmics.
trkDistCut = 10
cutClearCosmic = True

# Attach the slice columns to every track row (right join keeps all tracks).
slcdf = multicol_merge(slcdf, trkdf, left_index=True, right_index=True, how="right", validate="one_to_many")

# distance from vertex to track start
dist_to_vertex = dmagdf(slcdf.slc.vertex, slcdf.pfp.trk.start)
slcdf = multicol_add(slcdf, dist_to_vertex.rename(("pfp", "dist_to_vertex")))

if trkDistCut > 0:
    near_vertex = slcdf.pfp.dist_to_vertex < trkDistCut
    slcdf = slcdf[near_vertex]
if cutClearCosmic:
    not_clear_cosmic = slcdf.slc.is_clear_cosmic==0
    slcdf = slcdf[not_clear_cosmic]

## PID Info

In [None]:
# PID
#
# Assign one label per row in ("pfp", "pid"):
#   -1   track score not above 0.5 (shower-like)
#   13   muon, 2212 proton, 211 everything else that passes the track score

# use trackscore
ts_cut = (slcdf.pfp.trackScore > 0.5)

pid_shw = ~ts_cut

# muon
MUSEL_MUSCORE_TH = 25
MUSEL_PSCORE_TH = 100
MUSEL_LEN_TH = 50

# TODO: use average over planes
# muon_chi2 = (Avg(df, "muon", drop_0=True) < MUSEL_MUSCORE_TH) & (Avg(df, "proton", drop_0=True) > MUSEL_PSCORE_TH)

# TODO: used BDT scores
# len_cut = (masterdf.len.squeeze() > MUSEL_LEN_TH)
# dazzle_muon = (masterdf.dazzle.muonScore > 0.6)
# muon_cut = (muon_chi2) & (len_cut | dazzle_muon)

# Muon: low muon chi2, high proton chi2 and a long track.
mu_score_cut = (slcdf.pfp.trk.chi2pid.I2.chi2_muon < MUSEL_MUSCORE_TH) & \
    (slcdf.pfp.trk.chi2pid.I2.chi2_proton > MUSEL_PSCORE_TH)
mu_len_cut = (slcdf.pfp.trk.len > MUSEL_LEN_TH)
mu_cut = (mu_score_cut) & (mu_len_cut)
pid_mu = (ts_cut) & (mu_cut)

# proton
PSEL_MUSCORE_TH = 0
PSEL_PSCORE_TH = 90
# The proton-score threshold is applied to chi2_proton. The previous version
# compared chi2_muon against PSEL_PSCORE_TH, so the proton chi2 was never used
# in the proton selection.
p_score_cut = (slcdf.pfp.trk.chi2pid.I2.chi2_muon > PSEL_MUSCORE_TH) & \
    (slcdf.pfp.trk.chi2pid.I2.chi2_proton < PSEL_PSCORE_TH)
# Muon candidates take precedence over protons.
p_cut = ~mu_cut & p_score_cut
pid_p = (ts_cut) & (p_cut)

# rest is pion
pi_cut = ~(mu_cut | p_cut)
pid_pi = (ts_cut) & (pi_cut)

# TODO: don't use trackscore

# ---------------------------

# store PID info
slcdf[("pfp", "pid", "", "", "", "")] = np.nan
slcdf.loc[pid_shw, ("pfp","pid")] = -1
slcdf.loc[pid_mu, ("pfp","pid")] = 13
slcdf.loc[pid_p, ("pfp","pid")] = 2212
slcdf.loc[pid_pi, ("pfp","pid")] = 211

In [None]:
# Boolean masks comparing the assigned PID label with the absolute PDG code
# of the truth-matched particle, for muons, protons and pions.
reco_pid = slcdf.pfp.pid
true_abs_pdg = np.abs(slcdf.pfp.trk.truth.p.pdg)

pid_mu = (reco_pid == 13)
pid_p = (reco_pid == 2212)
pid_pi = (reco_pid == 211)

truth_mu = (true_abs_pdg == 13)
truth_p = (true_abs_pdg == 2212)
truth_pi = (true_abs_pdg == 211)

In [None]:
# Normalised chi2 score distributions split by true particle type, with the
# muon-selection (red) and proton-selection (blue) thresholds overlaid.
# Each entry: (score column, bin edges, MUSEL threshold, PSEL threshold, x label)
score_plots = [
    ("chi2_muon", np.linspace(0,80,101), MUSEL_MUSCORE_TH, PSEL_MUSCORE_TH, "Muon Score"),
    ("chi2_proton", np.linspace(0,200,101), MUSEL_PSCORE_TH, PSEL_PSCORE_TH, "Proton Score"),
]

for score_name, score_bins, musel_th, psel_th, score_label in score_plots:
    var = getattr(slcdf.pfp.trk.chi2pid.I2, score_name)
    pvar = [var[truth_mu], var[truth_p], var[truth_pi]]
    plt.hist(pvar, bins=score_bins, histtype='step',
             label=['muon', 'proton', 'pion'], density=True)
    plt.axvline(musel_th, color='r', label="MUSEL")
    plt.axvline(psel_th, color='b', label="PSEL")
    plt.xlabel(score_label)
    plt.legend()
    plt.show()

In [None]:
# Track-length spectra for the muon selection, plus its purity and efficiency.
var = slcdf.pfp.trk.len
length_bins = np.linspace(0,400,21)
n_selected_true_mu = len(var[pid_mu & truth_mu])

# Selected as muon, split by true particle type.
pvar = [var[pid_mu & truth_mu], var[pid_mu & truth_p], var[pid_mu & truth_pi]]
plt.hist(pvar, bins=length_bins, histtype="step",
         label=["muon", "proton", "pion"])

n_selected = len(var[pid_mu])
print("muon selection purity {:.2f} %".format(100*n_selected_true_mu/n_selected))
plt.legend()
plt.show()

# True muons, split by the PID label they received.
pvar = [var[truth_mu & pid_mu], var[truth_mu & pid_p], var[truth_mu & pid_pi]]
plt.hist(pvar, bins=length_bins, histtype="step",
         label=["muon", "proton", "pion"])

n_true_mu = len(var[truth_mu])
print("muon selection efficiency {:.2f} %".format(100*n_selected_true_mu/n_true_mu))
plt.legend()
plt.show()

In [None]:
def InFV(data): # cm
    """Return whether each point of data lies strictly inside the fiducial box.

    data is read through its x, y and z fields. The box is
    (-199.15, 199.15) x (-200, 200) x (0, 500) shrunk by 10 on every face
    except the upper z face, which is shrunk by 50.
    """
    x_lo, x_hi = -199.15 + 10, 199.15 - 10
    y_lo, y_hi = -200. + 10, 200. - 10
    z_lo, z_hi = 0.0 + 10, 500. - 50

    in_x = (data.x > x_lo) & (data.x < x_hi)
    in_y = (data.y > y_lo) & (data.y < y_hi)
    in_z = (data.z > z_lo) & (data.z < z_hi)
    return in_x & in_y & in_z

In [None]:
# A track counts as contained when both its start and end are inside the FV.
slcdf[("pfp", "trk", "is_contained", "", "", "")] = (InFV(slcdf.pfp.trk.start)) & (InFV(slcdf.pfp.trk.end))

# For each particle hypothesis fill ("pfp", "trk", "P", <name>) with the
# rangeP value for contained tracks and the mcsP forward value otherwise.
# Each entry: (rangeP / output field, mcsP field)
momentum_fields = [
    ("p_muon", "fwdP_muon"),
    ("p_pion", "fwdP_pion"),
    ("p_proton", "fwdP_proton"),
]

for range_field, mcs_field in momentum_fields:
    best_col = ("pfp", "trk", "P", range_field, "", "")
    range_col = ("pfp", "trk", "rangeP", range_field, "", "")
    mcs_col = ("pfp", "trk", "mcsP", mcs_field, "", "")

    slcdf[best_col] = np.nan
    contained = slcdf.pfp.trk.is_contained
    exiting = ~contained
    slcdf.loc[contained, best_col] = slcdf.loc[contained, range_col]
    slcdf.loc[exiting, best_col] = slcdf.loc[exiting, mcs_col]

In [None]:
# Per-axis direction: (end - start) along the axis divided by the track's len.
trk_start = slcdf.pfp.trk.start
trk_end = slcdf.pfp.trk.end
trk_len = slcdf.pfp.trk.len

for axis in ("x", "y", "z"):
    displacement = getattr(trk_end, axis) - getattr(trk_start, axis)
    slcdf[("pfp", "trk", "cos", axis, "", "")] = displacement/trk_len

In [None]:
# Longest muon candidate per group of the first three index levels: sort by
# track length within each group and take the last entry. All of its columns
# are then nested under a new top level "mu".
# NOTE(review): groupby(...).last() returns the last non-null value of each
# column, so columns may come from different rows if the longest track has
# NaNs -- confirm this is acceptable.
mudf = slcdf[(slcdf.pfp.pid == 13)].sort_values(slcdf.pfp.index.names[:-1] + [("pfp", "trk", "len", "", "", "")]).groupby(level=[0,1,2]).last()
mudf.columns = pd.MultiIndex.from_tuples([tuple(["mu"] + list(c)) for c in mudf.columns])

# Same for the longest proton candidate, nested under "p".
pdf = slcdf[(slcdf.pfp.pid == 2212)].sort_values(slcdf.pfp.index.names[:-1] + [("pfp", "trk", "len", "", "", "")]).groupby(level=[0,1,2]).last()
pdf.columns = pd.MultiIndex.from_tuples([tuple(["p"] + list(c)) for c in pdf.columns])

# Attach the leading muon / proton columns to every row. mudf and pdf carry
# one more column level than slcdf, so multicol_merge pads slcdf's columns.
slcdf = multicol_merge(slcdf, mudf, left_index=True, right_index=True, how="left", validate="one_to_one")
slcdf = multicol_merge(slcdf, pdf, left_index=True, right_index=True, how="left", validate="one_to_one")

# in case we want to cut out other objects -- save the highest energy of each other particle
# (what is stored is the maximum track length per group)
lead_shw_length = slcdf.pfp.trk.len[(slcdf.pfp.pid < 0)].groupby(level=[0,1,2]).max().rename("lead_shw_length")
slcdf = multicol_add(slcdf, lead_shw_length)

lead_pion_length = slcdf.pfp.trk.len[(slcdf.pfp.pid == 211)].groupby(level=[0,1,2]).max().rename("lead_pion_length")
slcdf = multicol_add(slcdf, lead_pion_length)

# Second-longest muon / proton candidate per group: nth(-2) after sorting by length.
# NOTE(review): the sort key below is still a 6-element tuple although the
# merges above deepened slcdf's columns by one level -- verify it still
# resolves to the length column. Also confirm the index returned by nth() in
# the pandas version in use matches what multicol_add expects to join on.
subl_muon_length = slcdf[(slcdf.pfp.pid == 13)].sort_values(slcdf.pfp.index.names[:-1] + [("pfp", "trk", "len", "", "", "")]).pfp.trk.len.groupby(level=[0,1,2]).nth(-2).rename("subl_muon_length")
slcdf = multicol_add(slcdf, subl_muon_length)

subl_proton_length = slcdf[(slcdf.pfp.pid == 2212)].sort_values(slcdf.pfp.index.names[:-1] + [("pfp", "trk", "len", "", "", "")]).pfp.trk.len.groupby(level=[0,1,2]).nth(-2).rename("subl_proton_length")
slcdf = multicol_add(slcdf, subl_proton_length)

## Truth Match

In [None]:
# truth match

# Discard the truth match of rows that have a match index (idx >= 0) but whose
# match efficiency is not above 0.5.
bad_tmatch = np.invert(slcdf.slc.tmatch.eff > 0.5) & (slcdf.slc.tmatch.idx >= 0)
slcdf.loc[bad_tmatch, ("slc","tmatch","idx", "", "", "", "")] = np.nan

# match # of column levels
# Pad mcdf's column tuples up to the depth of slcdf's columns instead of by a
# fixed count, so re-running the cell does not pad them a second time.
n_pad = slcdf.columns.nlevels - mcdf.columns.nlevels
if n_pad > 0:
    mcdf.columns = pd.MultiIndex.from_tuples(
        [tuple(list(c if isinstance(c, tuple) else (c,)) + [""]*n_pad) for c in mcdf.columns]
    )

# Left-join the truth table onto each row via (ntuple, entry, matched index).
df = pd.merge(slcdf.reset_index(),
              mcdf.reset_index(),
              left_on=[("__ntuple", "", "",),
                       ("entry", "", "",),
                       ("slc", "tmatch", "idx")],
              right_on=[("__ntuple", "", ""),
                        ("entry", "", ""),
                        ("rec.mc.nu..index", "", "")],
              how="left"
              )

# Restore slcdf's index; verify_integrity rejects duplicated index entries.
df = df.set_index(slcdf.index.names, verify_integrity=True)

In [None]:
# Display the merged table for inspection.
df 

In [None]:
# Store the merged table under the "evt" key in fixed format.
with pd.HDFStore("/exp/sbnd/data/users/munjung/osc/sbnd_gump.df") as store:
    store.put("evt", df, format="fixed")