In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from operator import itemgetter

from helpfunction import CheckBorderFixed,CheckBorderTPC
from sklearn.externals import joblib

pd.options.display.max_columns = 999

In [15]:
# Fiducial volume borders in x,y,z:
fid_arr= [[10,10],[20,20],[10,50]]

min_dedx_hits=3

In [16]:
def shower_features(row):
    x=row["shower_start_x"]
    y=row["shower_start_y"]
    z=row["shower_start_z"]
    vx=row["vx"]
    vy=row["vy"]
    vz=row["vz"]
    
    shower_sh_daughter = True if row['shower_daughter']==1 else False
    shower_tr_daughter = True if row['shower_daughter']==2 else False
    shower_is_daughter = True if row['matched_showers']==0 else False
    shower_vtx         = np.linalg.norm([x-vx,y-vy,z-vz])
    
    return { 'shower_sh_daughter':shower_sh_daughter,
             'shower_tr_daughter':shower_tr_daughter,
             'shower_is_daughter':shower_is_daughter,
             'shower_vtx':shower_vtx,
             }


def track_features(row):  
    x=row["track_start_x"]
    y=row["track_start_y"]
    z=row["track_start_z"]
    x_e=row["track_end_x"]
    y_e=row["track_end_y"]
    z_e=row["track_end_z"]
    vx=row["vx"]
    vy=row["vy"]
    vz=row["vz"]
    
    track_length      = np.linalg.norm([x-x_e,y-y_e,z-z_e])
    track_containment = CheckBorderFixed(x_e,y_e,z_e,tolerance=10) 
    track_sh_daughter = True if row['track_daughter']==1 else False
    track_tr_daughter = True if row['track_daughter']==2 else False
    track_is_daughter = True if row['matched_tracks']==0 else False
    track_vtx         = np.linalg.norm([x-vx,y-vy,z-vz])
    track_vtx_end     = np.linalg.norm([vx-x_e,vy-y_e,vz-z_e])
    
    return { 'track_length':track_length,
             'track_containment':track_containment,
             'track_sh_daughter':track_sh_daughter,
             'track_tr_daughter':track_tr_daughter,
             'track_is_daughter':track_is_daughter,
             'track_vtx':track_vtx,
             'track_vtx_end':track_vtx_end,
                     }

In [17]:
columns_req_shower = ['shower_dedx','shower_dedx_avg','shower_dedx_hits','shower_pca','shower_open_angle',
                      'shower_start_x', 'shower_start_y','shower_start_z',
                      'shower_length','shower_daughter',
                      'matched_showers'#This is the label generationg category
                     ]

columns_req_track  = ['track_dedx','track_dedx_avg','track_dedx_hits','track_pca',
                      'predict_em','predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
                      'track_start_x', 'track_start_y','track_start_z',
                      'track_end_x', 'track_end_y', 'track_end_z','track_daughter',
                      'matched_tracks'#This is the label generationg category
                     ]

columns_flat = ['vx','vy','vz',
                'bnbweight','reconstructed_energy','flash_PE','flash_time',
                #'true_vx_sce','true_vy_sce','true_vz_sce','nu_E','nu_pdg',
                #'category','distance','CC_daughter_E',
                'shower_containment_q','shower_sp_profile',
                'subrun', 'run','shower_nhits','track_nhits'
               ]

columns_track_XGB = ['track_dedx', 'track_dedx_avg', 'track_dedx_hits', 'track_pca',
                     'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
                     'track_containment', 'track_is_daughter', 'track_length',
                     'track_sh_daughter', 'track_tr_daughter', 'track_vtx', 'track_vtx_end']

columns_shower_XGB = [ 'shower_dedx', 'shower_dedx_avg', 'shower_dedx_hits', 'shower_pca',
                       'shower_open_angle', 'shower_length', 'shower_containment_q',
                       'shower_sp_profile', 'shower_is_daughter', 'shower_sh_daughter',
                       'shower_tr_daughter', 'shower_vtx']


columns_all = list(set(columns_req_shower) | set(columns_req_track))+columns_flat

In [19]:
df_sample = pd.read_pickle('../Input/bnb/data_bnb.pckl')
df_sample['matched_tracks'].head()

KeyError: 'matched_tracks'

In [5]:
sample_list = ['../Input/nue/nue_3.pckl','../Input/nue/nue_4.pckl','../Input/nue/nue_5.pckl']
sample_list = ['../Input/nu/nu_3.pckl','../Input/nu/nu_4.pckl','../Input/nu/nu_5.pckl']

sample_list = ['../Input/nu/nu_6.pckl']

In [6]:
print(columns_all)

['track_end_y', 'track_start_y', 'track_end_x', 'track_start_x', 'track_dedx', 'shower_start_y', 'shower_dedx_hits', 'predict_cos', 'predict_pi', 'shower_start_x', 'predict_p', 'track_daughter', 'shower_length', 'predict_em', 'track_dedx_hits', 'predict_mu', 'shower_daughter', 'track_end_z', 'shower_dedx_avg', 'shower_pca', 'track_start_z', 'shower_start_z', 'track_pca', 'shower_open_angle', 'shower_dedx', 'track_dedx_avg', 'vx', 'vy', 'vz', 'bnbweight', 'reconstructed_energy', 'flash_PE', 'flash_time', 'shower_containment_q', 'shower_sp_profile', 'subrun', 'run', 'shower_nhits', 'track_nhits']


In [7]:
model_sh_e  = joblib.load('../Input/XGBoost/model_sh_e.pkl')
model_sh_cle  = joblib.load('../Input/XGBoost/model_sh_cle.pkl')
model_sh_mu = joblib.load('../Input/XGBoost/model_sh_mu.pkl')

model_tr_e  = joblib.load('../Input/XGBoost/model_tr_e.pkl')
model_tr_cle  = joblib.load('../Input/XGBoost/model_tr_cle.pkl')
model_tr_mu = joblib.load('../Input/XGBoost/model_tr_mu.pkl')

In [8]:
def XGBoostClassification(row):
    tr_e, tr_cle, tr_mu = [],[],[]
    sh_e, sh_cle, sh_mu = [],[],[]
    if row['fidvol']:
        if len(row['track_end_y']) > 0:
            tr_e ,tr_cle, tr_mu = TrackClassification(row)
        sh_e, sh_cle, sh_mu = ShowerClassification(row)
    
    return pd.Series({
        'track_electron_score': tr_e,
        'track_closest_electron_score': tr_cle,
        'track_muon_score': tr_mu,
        'shower_electron_score': sh_e,
        'shower_closest_electron_score': sh_cle,
        'shower_muon_score': sh_mu,
    })
    
    
def TrackClassification(row):
    pred_tr_e = []
    pred_tr_mu = []
    pred_tr_cle= []
    XGB_input = np.zeros([sum(row['track_dedx_hits']>=min_dedx_hits),len(columns_track_XGB)])
    
    tr_ok=0
    for tr in range(len(row['track_end_y'])):
        if row['track_dedx_hits'][tr]>=min_dedx_hits:
            d_tr={}
            for field in columns_req_track:
                d_tr[field]=row[field][tr]
            for field in ['vx','vy','vz']:
                d_tr[field]=row[field]
            d_tr = {**d_tr,**track_features(d_tr)}
            
            XGB_input[tr_ok] = np.asarray(itemgetter(*columns_track_XGB)(d_tr))
            tr_ok+=1
            
        pred_tr_e = model_tr_e.predict_proba( XGB_input )[:,1] 
        pred_tr_cle = model_tr_cle.predict_proba( XGB_input )[:,1]
        pred_tr_mu = model_tr_mu.predict_proba( XGB_input)[:,1]  
        
    return pred_tr_e, pred_tr_cle, pred_tr_mu
    
    
def ShowerClassification(row):
    pred_sh_e = []
    pred_sh_mu = []
    pred_sh_cle = []
    
    XGB_input = np.zeros([sum(row['shower_dedx_hits']>=min_dedx_hits),len(columns_shower_XGB)])
    
    sh_ok=0
    for sh in range(len(row['matched_showers'])):
        if row['shower_dedx_hits'][sh]>=min_dedx_hits:
            d_sh={}
            for field in columns_req_shower:
                d_sh[field]=row[field][sh]
            for field in ['vx','vy','vz','shower_containment_q','shower_sp_profile']:
                d_sh[field]=row[field]
            d_sh = {**d_sh,**shower_features(d_sh)}  

            XGB_input[sh_ok] = np.asarray( itemgetter(*columns_shower_XGB)(d_sh)) 
            sh_ok+=1
        
    pred_sh_e =  model_sh_e.predict_proba( XGB_input )[:,1] 
    pred_sh_cle =  model_sh_cle.predict_proba( XGB_input )[:,1] 
    pred_sh_mu=  model_sh_mu.predict_proba( XGB_input )[:,1] 
        
    return pred_sh_e, pred_sh_cle, pred_sh_mu

In [9]:
columns_final = ['vx','vy','vz','event', 'subrun', 'run','shower_nhits','track_nhits',
                 'bnbweight','reconstructed_energy','flash_PE','flash_time',
                 #'true_vx_sce','true_vy_sce','true_vz_sce','nu_E','nu_pdg',
                 #'category','distance','CC_daughter_E','true_fidvol',
                 'shower_containment_q','shower_sp_profile',
                 'shower_electron_score','track_muon_score','shower_muon_score','track_electron_score',
                 'track_closest_electron_score','shower_closest_electron_score','fidvol'
                ]

In [10]:
df_joined = pd.DataFrame()

for sample in sample_list:
    df_sample = pd.read_pickle(sample)
    df_sample = df_sample.head(10)
    print(df_sample.columns)
    droplist = [x for x  in df_sample.columns if (x not in columns_all)]
    print(len(df_sample.index))
    df_sample.drop(droplist,inplace=True,axis=1)
    #df_sample['true_fidvol'] =df_sample.apply(lambda row: CheckBorderTPC(row['true_vx_sce'],row['true_vy_sce'],row['true_vz_sce'],fid_arr) ,axis=1)
    df_sample['fidvol'] =df_sample.apply(lambda row: CheckBorderTPC(row['vx'],row['vy'],row['vz'],fid_arr) ,axis=1)
    df_sample = pd.concat([df_sample, df_sample.apply(XGBoostClassification,axis=1)], axis=1)    
    df_sample = df_sample[columns_final]
    df_joined = pd.concat([df_joined,df_sample],ignore_index=True,copy=False)
    print('done sample')
df_joined.head(10)

Index(['event', 'subrun', 'run', 'nu_pdg', 'nu_E', 'true_vx_sce',
       'true_vy_sce', 'true_vz_sce', 'distance', 'category', 'vx', 'vy', 'vz',
       'bnbweight', 'passed', 'shower_open_angle', 'shower_length',
       'shower_start_x', 'shower_start_y', 'shower_start_z', 'shower_dir_x',
       'shower_dir_y', 'shower_dir_z', 'shower_pca', 'track_start_x',
       'track_start_y', 'track_start_z', 'track_end_x', 'track_end_y',
       'track_end_z', 'track_dir_x', 'track_dir_y', 'track_dir_z', 'track_pca',
       'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
       'nu_daughters_pdg', 'nu_daughters_E', 'nu_daughters_px',
       'nu_daughters_py', 'nu_daughters_pz', 'nu_daughters_endx',
       'nu_daughters_endy', 'nu_daughters_endz', 'true_shower_pdg',
       'true_shower_x_sce', 'true_shower_y_sce', 'true_shower_z_sce',
       'true_shower_depE', 'shower_daughter', 'track_daughter',
       'shower_containment_q', 'shower_sp_profile', 'reconstructed_energy',
    

KeyError: ('matched_tracks', 'occurred at index 0')

In [None]:
df_joined.to_pickle('nu_score_6.pckl')

In [None]:
print(len(df_sample.index))
print(df_sample['category'].value_counts())

In [None]:
df_pure=df_sample[df_sample.apply(lambda x: np.all(np.array(x['track_muon_score'])<0.10) and
                                            np.all(np.array(x['shower_muon_score'])<0.15) and
                                         (np.any(np.array(x['shower_electron_score'])>0.9) or np.any(np.array(x['track_electron_score'])>0.99) ),
                                  axis=1)]

In [None]:
print(len(df_pure.index))
print(df_pure['category'].value_counts())

Unnamed: 0,subrun,run,vx,vy,vz,bnbweight,shower_open_angle,shower_length,shower_start_x,shower_start_y,...,track_nhits,shower_dedx,shower_dedx_avg,shower_dedx_hits,track_dedx,track_dedx_avg,track_dedx_hits,flash_PE,flash_time,fidvol
0,60,5147,59.609886,3.348907,975.683105,-2147483648,[0.000617],[8.6],[59.6],[3.54],...,[158.0],[2.409960113262952],[8.404563866844752],[4.0],[1.086502648947119],[2.7157828191120217],[3.0],586.106689,3.33125,True
1,192,5804,172.495834,-69.986298,193.566223,-2147483648,"[0.2578, 0.04083]","[23.75, 35.12]","[85.1, 172.0]","[85.56, -69.94]",...,"[140.0, 16.0]","[0.18326435243720587, 3.253782700415696]","[0.7725998786895283, 26.905033995678046]","[6.0, 15.0]","[1.7715411583947391, 2.555193382743279e-308]","[6.397796641796328, 1.0908509705198797e-307]","[8.0, 9.0]",126.048508,4.366875,True
2,35,5731,207.262039,74.179749,887.702148,-2147483648,[0.05875],[47.2],[151.6],[45.47],...,"[252.0, 82.0, 13.0]",[5.1488384842237],[52.79129893833483],[11.0],"[1.5986814870866983, 1.530424876017253, 6.3209...","[7.341068917756814, 2.720965297224062, 26.2921...","[13.0, 5.0, 13.0]",1123.879883,3.63125,True
3,44,5367,225.08075,90.818245,512.745178,-2147483648,[0.4026],[20.66],[221.2],[77.0],...,[589.0],[0.3331920441249615],[4.114074002507645],[10.0],[1.4637157517356303],[2.107851007400168],[2.0],109.823792,4.41125,True
4,44,5367,224.076004,45.061203,285.856232,-2147483648,[0.0716],[14.86],[62.53],[71.44],...,"[361.0, 20.0]",[4.983496730998916],[40.00623147874592],[10.0],"[5.373462908475707, 1.8494496847650954]","[19.437010588401076, 3.5806497511651676]","[8.0, 3.0]",1286.20813,3.3325,True
