In [1]:
import numpy as np
import pandas as pd

from helpfunction import CheckBorderFixed,CheckBorderTPC

In [2]:
sample_list = ['../Input/nue/nue_0.pckl','../Input/nue/nue_1.pckl','../Input/nue/nue_2.pckl',
               '../Input/nue/nue_3.pckl','../Input/nue/nue_4.pckl','../Input/nue/nue_5.pckl',
               '../Input/nue/nue_6.pckl','../Input/nue/nue_7.pckl','../Input/nue/nue_8.pckl','../Input/nue/nue_9.pckl'
               ]

df_tracks = pd.DataFrame()
df_showers = pd.DataFrame()

columns_req_track = [  'track_dedx','track_dedx_avg','track_dedx_hits','track_pca',
                       'predict_em','predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
                       'track_start_x', 'track_start_y','track_start_z',
                       'track_end_x', 'track_end_y', 'track_end_z','track_daughter',
                        'matched_tracks' #This is the label generationg category
                      ]

columns_req_shower = [ 'shower_dedx','shower_dedx_avg','shower_dedx_hits','shower_pca','shower_open_angle',
                       'shower_start_x', 'shower_start_y','shower_start_z',
                       'shower_length','shower_daughter',
                       'matched_showers' #This is the label generationg category
                      ]

columns_flat = ['vx','vy','vz','true_vx_sce','true_vy_sce','true_vz_sce','shower_containment_q','shower_sp_profile']

columns_all = list(set(columns_req_shower) | set(columns_req_track))+columns_flat

# Fiducial volume borders in x,y,z:
fid_arr= [[10,10],[20,20],[10,50]]

In [3]:
def shower_features(row):
    x=row["shower_start_x"]
    y=row["shower_start_y"]
    z=row["shower_start_z"]
    vx=row["vx"]
    vy=row["vy"]
    vz=row["vz"]
    
    shower_sh_daughter = True if row['shower_daughter']==1 else False
    shower_tr_daughter = True if row['shower_daughter']==2 else False
    shower_is_daughter = True if row['matched_showers']==0 else False
    shower_vtx         = np.linalg.norm([x-vx,y-vy,z-vz])
    
    # Class
    true_mu        = True if abs(row['matched_showers'])==13 else False
    true_e         = True if row['matched_showers']==11 else False
    
    return pd.Series({ 'shower_sh_daughter':shower_sh_daughter,
             'shower_tr_daughter':shower_tr_daughter,
             'shower_is_daughter':shower_is_daughter,
             'shower_vtx':shower_vtx,
             'true_mu':true_mu,
             'true_e':true_e
             })


def track_features(row):  
    x=row["track_start_x"]
    y=row["track_start_y"]
    z=row["track_start_z"]
    x_e=row["track_end_x"]
    y_e=row["track_end_y"]
    z_e=row["track_end_z"]
    vx=row["vx"]
    vy=row["vy"]
    vz=row["vz"]
    
    track_length      = np.linalg.norm([x-x_e,y-y_e,z-z_e])
    track_containment = CheckBorderFixed(x_e,y_e,z_e,tolerance=10) 
    track_sh_daughter = True if row['track_daughter']==1 else False
    track_tr_daughter = True if row['track_daughter']==2 else False
    track_is_daughter = True if row['matched_tracks']==0 else False
    track_vtx         = np.linalg.norm([x-vx,y-vy,z-vz])
    track_vtx_end     = np.linalg.norm([vx-x_e,vy-y_e,vz-z_e])
    
    # Class
    true_mu        = True if abs(row['matched_tracks'])==13 else False
    true_e         = True if row['matched_tracks']==11 else False
    
    return pd.Series({ 'track_length':track_length,
             'track_containment':track_containment,
             'track_sh_daughter':track_sh_daughter,
             'track_tr_daughter':track_tr_daughter,
             'track_is_daughter':track_is_daughter,
             'track_vtx':track_vtx,
             'track_vtx_end':track_vtx_end,
             'true_mu':true_mu,
             'true_e':true_e
                     })

def ClosestElectronObject(row):
    # Initialise the return fields:
    sh_cl_e_arr=np.full(len(row['matched_showers']), False)
    tr_cl_e_arr=np.full(len(row['matched_tracks']), False)

    if (11 not in row['matched_showers']) and (11 not in row['matched_tracks']):
        return pd.Series({'sh_cl_e_arr': sh_cl_e_arr, 'tr_cl_e_arr': tr_cl_e_arr}) 
    
    d=10 # If the closest object is further than 10cm, forget it anyway
    i = -1
    
    true_x = row["true_vx_sce"]
    true_y = row["true_vy_sce"]
    true_z = row["true_vz_sce"]
    
    if (11 in row['matched_showers']):
        sh_x_arr = row["shower_start_x"]
        sh_y_arr = row["shower_start_y"]
        sh_z_arr = row["shower_start_z"]
        sh_pdg_arr = row['matched_showers']
    
        for sh_i,(sh_x,sh_y,sh_z,sh_pdg) in enumerate(zip(sh_x_arr,sh_y_arr,sh_z_arr,sh_pdg_arr)):
            if sh_pdg==11:
                dist = np.linalg.norm([sh_x-true_x,sh_y-true_y,sh_z-true_z])
                if dist<d:
                    d=dist
                    i = sh_i
        if i> -1:
            sh_cl_e_arr[i]=True
            i=-1
    
    if (11 in row['matched_tracks']):
        tr_x_arr = row["track_start_x"]
        tr_y_arr = row["track_start_y"]
        tr_z_arr = row["track_start_z"]
        tr_pdg_arr = row['matched_tracks']
    
        for tr_i,(tr_x,tr_y,tr_z,tr_pdg) in enumerate(zip(tr_x_arr,tr_y_arr,tr_z_arr,tr_pdg_arr)):
            if tr_pdg==11:
                dist = np.linalg.norm([tr_x-true_x,tr_y-true_y,tr_z-true_z])
                if dist<d:
                    d=dist
                    i = tr_i
                    sh_cl_e_arr=np.full(len(row['matched_showers']), False)
        if i>-1:
            tr_cl_e_arr[i]=True
            
    return  pd.Series({'sh_cl_e_arr': sh_cl_e_arr, 'tr_cl_e_arr': tr_cl_e_arr})  

In [4]:
droplist_tr = ['track_start_x','track_start_y','track_start_z','matched_tracks','track_daughter',
               'track_end_x','track_end_y','track_end_z','vx','vy','vz','true_vx_sce','true_vy_sce','true_vz_sce',
               'shower_containment_q','shower_sp_profile'
              ]

droplist_sh = ['shower_start_x','shower_start_y','shower_start_z','matched_showers','shower_daughter',
               'vx','vy','vz','true_vx_sce','true_vy_sce','true_vz_sce']

In [5]:
for i,sample in enumerate(sample_list):
    
    df_tracks = pd.DataFrame()
    df_showers = pd.DataFrame()

    df_sample = pd.read_pickle(sample)
    droplist = [x for x  in df_sample.columns if (x not in columns_all)]
    df_sample.drop(droplist,inplace=True,axis=1)
    df_sample = pd.concat([df_sample, df_sample.apply(ClosestElectronObject,axis=1)], axis=1)
    print(len(df_sample.index)) 
    
    for index, row in df_sample.iterrows():
        if CheckBorderTPC(row['vx'],row['vy'],row['vz'],fid_arr):
            # Tracks!
            for tr in range(len(row['matched_tracks'])):
                d_tr = {}
                for field in columns_req_track:
                    d_tr[field]=row[field][tr]
                    d_tr['true_e_cl']=row['tr_cl_e_arr'][tr]
                for field in columns_flat:
                    d_tr[field]=row[field]
                df_tracks=df_tracks.append(d_tr,ignore_index=True)
    
            # Showers!
            for sh in range(len(row['matched_showers'])):
                d_sh = {}
                for field in columns_req_shower:
                    d_sh[field]=row[field][sh]
                    d_sh['true_e_cl']=row['sh_cl_e_arr'][sh]
                for field in columns_flat:
                    d_sh[field]=row[field]
                df_showers=df_showers.append(d_sh,ignore_index=True)
                
    df_tracks['true_e_cl']=df_tracks['true_e_cl'].astype(bool, copy=False)
    df_tracks = pd.concat([df_tracks, df_tracks.apply(track_features,axis=1)], axis=1)
    df_tracks.drop(droplist_tr,inplace=True,axis=1) 
    df_tracks.to_pickle('../trackFrame_nue_'+str(i)+'.pckl')
    
    df_showers['true_e_cl']=df_showers['true_e_cl'].astype(bool, copy=False)
    df_showers = pd.concat([df_showers, df_showers.apply(shower_features,axis=1)], axis=1)
    df_showers.drop(droplist_sh,inplace=True,axis=1)    
    df_showers.to_pickle('../showerFrame_nue_'+str(i)+'.pckl')
    

10333
10413
10454
10203
10541
10382
10546
10530
10437
7702


In [6]:
print(df_tracks['matched_tracks'].value_counts())
df_tracks[df_tracks['matched_tracks']==11].head()

KeyError: 'matched_tracks'

In [None]:
print(df_showers['matched_showers'].value_counts())

df_showers.head()

In [None]:

df_tracks.head()

In [None]:
df_tracks.columns
df_tracks['true_mu'].value_counts()

In [None]:
df_tracks['true_e'].value_counts()

In [None]:
df_tracks['true_e_cl'].value_counts()

In [None]:
df_showers['true_mu'].value_counts()

In [None]:
df_showers['true_e'].value_counts()

In [None]:
df_showers.head()