In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from helpfunction import CheckBorderFixed,CheckBorderTPC

%matplotlib inline

In [10]:
# Pickled input samples that are read by the loading loop.
sample_list = [
    '../Input/intime/intime.pckl',
    '../Input/nue/nue.pckl',
    '../Input/nu/nu_pure_close_600k.pckl',
]

pd_showers_raw = pd.DataFrame()

# Fields that hold one entry per shower; they are indexed by shower number
# when the samples are flattened.
columns_req = [
    'shower_dedx',
    'shower_dedx_avg',
    'shower_dedx_hits',
    'shower_pca',
    'shower_open_angle',
    'shower_start_x',
    'shower_start_y',
    'shower_start_z',
    'shower_length',
    'shower_daughter',
    'matched_showers',  # This is the label-generating category
]

# Fields that are copied unchanged onto every shower row of an event.
columns_flat = [
    'vx',
    'vy',
    'vz',
    'shower_containment_q',
    'shower_sp_profile',
]

# Fields to engineer:
# - length
# - 10cm contained (bool)
# - has shower daughter (bool)
# - has shower daughter
#   (NOTE(review): duplicate of the line above -- possibly meant "has track daughter")
# - is the daughter of something else
# - distance from vertex start point
# - distance from vertex end point

# Fiducial volume borders in x,y,z:
fid_arr = [
    [10, 10],
    [20, 20],
    [10, 50],
]

In [11]:
# Start from an empty frame that has one column for every requested field;
# the sample-loading loop fills it with one row per shower.
shower_columns = columns_req + columns_flat
df_showers = pd.DataFrame(columns=shower_columns)


In [12]:
# Flatten the event-level samples into one row per shower.
#
# Rows are collected as plain dicts and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call (quadratic cost) and
# was removed in pandas 2.0. Building the frame from scratch also makes this
# cell idempotent -- re-running it no longer duplicates every shower.
MAX_EVENTS_PER_SAMPLE = 40000  # only the first N events of each sample are read
columns_keep = columns_req + columns_flat
shower_rows = []

for sample in sample_list:
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df_sample = pd.read_pickle(sample)
    # Keep only the columns that are needed below.
    df_sample = df_sample[[col for col in df_sample.columns if col in columns_keep]]

    for index, row in df_sample.head(MAX_EVENTS_PER_SAMPLE).iterrows():
        # Skip events whose vertex fails the border check
        # (presumably: vertex outside the fiducial volume -- confirm in helpfunction).
        if not CheckBorderTPC(row['vx'], row['vy'], row['vz'], fid_arr):
            continue

        # One output row per entry of 'matched_showers'.
        for tr in range(len(row['matched_showers'])):
            # Per-shower fields are indexed by the shower number ...
            d_tr = {field: row[field][tr] for field in columns_req}
            # ... while the remaining fields are copied as they are.
            d_tr.update({field: row[field] for field in columns_flat})
            shower_rows.append(d_tr)

df_showers = pd.DataFrame(shower_rows, columns=columns_keep)
df_showers.head()

Unnamed: 0,shower_dedx,shower_dedx_avg,shower_dedx_hits,shower_pca,shower_open_angle,shower_start_x,shower_start_y,shower_start_z,shower_length,shower_daughter,matched_showers,vx,vy,vz,shower_containment_q,shower_sp_profile
0,3.135025,16.841373,16.0,0.916504,0.221802,232.25,56.90625,135.75,22.125,0.0,0.0,236.426971,68.910492,127.865067,0.393415,0.648575
1,3.051837,8.656431,7.0,0.968262,0.094971,84.5,-107.3125,329.5,20.015625,0.0,0.0,236.426971,68.910492,127.865067,0.393415,0.648575
2,0.00656,0.00646,4.0,0.629395,0.06543,236.375,68.9375,128.875,9.226562,0.0,0.0,236.426971,68.910492,127.865067,0.393415,0.648575
3,0.09736,0.417215,9.0,0.939941,0.21936,59.03125,-109.1875,811.5,37.59375,0.0,0.0,36.073578,-71.722626,811.450012,0.320908,0.134881
4,1.697985,4.193395,4.0,0.927246,0.464355,38.09375,-74.1875,813.5,4.945312,0.0,0.0,36.073578,-71.722626,811.450012,0.320908,0.134881


In [13]:
# Number of showers per value of the 'matched_showers' label
# (values look like PDG particle codes -- TODO confirm).
matched_labels = df_showers['matched_showers']
matched_labels.value_counts()

 11.0      44896
 0.0       28269
 2212.0    24478
 22.0      18439
 211.0      9112
 13.0       5898
-13.0       2920
 2112.0     2893
-211.0      1215
-11.0        246
 321.0       132
 3222.0       97
-321.0        17
 3112.0        3
Name: matched_showers, dtype: int64

In [14]:
def shower_freatures(row):
    """Derive the engineered features and truth labels for one shower row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'shower_start_x', 'shower_start_y', 'shower_start_z',
        'vx', 'vy', 'vz', 'shower_daughter' and 'matched_showers'.

    Returns
    -------
    pandas.Series
        shower_sh_daughter : True when 'shower_daughter' equals 1.
        shower_tr_daughter : True when 'shower_daughter' equals 2.
        shower_is_daughter : True when 'matched_showers' equals 0.
        shower_vtx         : Euclidean distance between the shower start
                             point and (vx, vy, vz).
        true_mu            : True when abs('matched_showers') equals 13.
        true_e             : True when 'matched_showers' equals 11.
    """
    label = row['matched_showers']
    daughter_code = row['shower_daughter']

    start_point = (row["shower_start_x"], row["shower_start_y"], row["shower_start_z"])
    vertex = (row["vx"], row["vy"], row["vz"])
    offset = [s - v for s, v in zip(start_point, vertex)]

    features = {
        'shower_sh_daughter': bool(daughter_code == 1),
        'shower_tr_daughter': bool(daughter_code == 2),
        # NOTE(review): derived from the label column, not from a daughter
        # field -- confirm that this is the intended definition.
        'shower_is_daughter': bool(label == 0),
        'shower_vtx': np.linalg.norm(offset),
        # Class
        'true_mu': bool(abs(label) == 13),
        # NOTE(review): unlike true_mu, the sign is not ignored here (-11 is False).
        'true_e': bool(label == 11),
    }
    return pd.Series(features)

In [15]:
# Compute the engineered columns row by row and attach them to the frame.
engineered = df_showers.apply(shower_freatures, axis=1)
df_showers = pd.concat([df_showers, engineered], axis=1)

# Drop the raw columns that were only needed as inputs to shower_freatures.
droplist = [
    'shower_start_x', 'shower_start_y', 'shower_start_z',
    'matched_showers', 'shower_daughter',
    'vx', 'vy', 'vz',
]
df_showers = df_showers.drop(columns=droplist)

In [16]:
# Class balance of the 'true_mu' label
true_mu_labels = df_showers['true_mu']
true_mu_labels.value_counts()

False    129797
True       8818
Name: true_mu, dtype: int64

In [17]:
# Class balance of the 'true_e' label
true_e_labels = df_showers['true_e']
true_e_labels.value_counts()

False    93719
True     44896
Name: true_e, dtype: int64

In [18]:
# Write the engineered shower frame to disk as a pickle.
output_path = '../showerFrame.pckl'
df_showers.to_pickle(output_path)