In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from operator import itemgetter

from helpfunction import CheckBorderFixed,CheckBorderTPC
from sklearn.externals import joblib

pd.options.display.max_columns = 999

In [2]:
# Fiducial volume borders in x,y,z:
# One [a, b] pair per axis; passed as `array=` to CheckBorderTPC when the
# 'fidvol' flag is computed for every event.
# NOTE(review): presumably the margins from the low/high TPC faces in cm -- confirm in helpfunction.
fid_arr= [[10,10],[20,20],[10,50]]

# Minimum value of track_dedx_hits / shower_dedx_hits an object needs in order
# to receive an XGBoost score (see TrackClassification / ShowerClassification).
min_dedx_hits=3

In [3]:
def shower_features(row):
    """Compute the engineered features of a single shower.

    Parameters
    ----------
    row : mapping
        Must provide the scalar entries ``shower_start_x/y/z``, ``vx/vy/vz``,
        ``shower_dedx``, ``shower_dedx_cali``, ``shower_daughter`` and
        ``shower_is_daughter``.

    Returns
    -------
    dict
        Daughter flags (plain booleans decoded from the integer codes),
        ``shower_vtx`` (Euclidean distance between the shower start and the
        vertex) and ``shower_dedx_med`` (``shower_dedx`` multiplied by its
        calibration factor).
    """
    start_point = (row["shower_start_x"], row["shower_start_y"], row["shower_start_z"])
    vertex = (row["vx"], row["vy"], row["vz"])

    calibration = row['shower_dedx_cali']
    dedx_med = row['shower_dedx'] * calibration

    # Integer codes: 1 = shower, 2 = track, 3 = several daughters.
    daughter_code = row['shower_daughter']
    parent_code = row['shower_is_daughter']

    # Component-wise difference kept on scalars so the result dtype is unchanged.
    offset = [s - v for s, v in zip(start_point, vertex)]

    return {
        'shower_sh_daughter': bool(daughter_code == 1),
        'shower_tr_daughter': bool(daughter_code == 2),
        'shower_multi_daughter': bool(daughter_code == 3),
        'shower_is_tr_daughter': bool(parent_code == 2),
        'shower_is_sh_daughter': bool(parent_code == 1),
        'shower_vtx': np.linalg.norm(offset),
        'shower_dedx_med': dedx_med,
    }

In [4]:
def track_features(row):
    """Compute the engineered features of a single track.

    Parameters
    ----------
    row : mapping
        Must provide the scalar entries ``track_start_x/y/z``,
        ``track_end_x/y/z``, ``vx/vy/vz``, ``track_dedx``,
        ``track_dedx_cali``, ``track_daughter`` and ``track_is_daughter``.

    Returns
    -------
    dict
        ``track_length`` (start-to-end distance), ``track_containment``
        (result of ``CheckBorderFixed`` on the end point with tolerance 10),
        the daughter flags as plain booleans, ``track_vtx`` /
        ``track_vtx_end`` (distance of the vertex to the track start / end)
        and ``track_dedx_med`` (``track_dedx`` times its calibration factor).
    """
    start_point = (row["track_start_x"], row["track_start_y"], row["track_start_z"])
    end_point = (row["track_end_x"], row["track_end_y"], row["track_end_z"])
    vertex = (row["vx"], row["vy"], row["vz"])

    calibration = row['track_dedx_cali']
    dedx_med = row['track_dedx'] * calibration

    def separation(p, q):
        # Euclidean norm of the component-wise difference p - q.
        return np.linalg.norm([a - b for a, b in zip(p, q)])

    length = separation(start_point, end_point)
    contained = CheckBorderFixed(*end_point, tolerance=10)

    # Integer codes: 1 = shower, 2 = track, 3 = several daughters.
    daughter_code = row['track_daughter']
    parent_code = row['track_is_daughter']

    return {
        'track_length': length,
        'track_containment': contained,
        'track_sh_daughter': bool(daughter_code == 1),
        'track_tr_daughter': bool(daughter_code == 2),
        'track_multi_daughter': bool(daughter_code == 3),
        'track_is_tr_daughter': bool(parent_code == 2),
        'track_is_sh_daughter': bool(parent_code == 1),
        'track_vtx': separation(start_point, vertex),
        'track_vtx_end': separation(vertex, end_point),
        'track_dedx_med': dedx_med,
    }

In [5]:
# Per-track columns read from the input frame. Each is indexed per track
# (row[field][tr]) when the classifier input is assembled.
columns_req_track = [  'track_dedx','track_dedx_avg','track_dedx_hits','track_pca',
                       'predict_em','predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
                       'track_start_x', 'track_start_y','track_start_z',
                       'track_end_x', 'track_end_y', 'track_end_z','track_daughter','track_is_daughter',
                       'track_cali','track_dedx_cali','track_hits_ratio','track_maxangle',
                       #'matched_tracks','matched_tracks_energy', # Comment out for data; not used for training
                       'track_energy','track_nhits' # NOT USED FOR TRAINING
                      ]

# Per-shower columns read from the input frame. Each is indexed per shower
# (row[field][sh]) when the classifier input is assembled.
columns_req_shower = [ 'shower_dedx','shower_dedx_avg','shower_dedx_hits','shower_pca','shower_open_angle',
                       'shower_start_x', 'shower_start_y','shower_start_z',
                       'shower_length','shower_daughter','shower_is_daughter',
                       'shower_cali','shower_dedx_cali','shower_hits_ratio','shower_maxangle',
                       #'matched_showers', 'matched_showers_energy', # Comment out for data; not used for training
                       'shower_energy','shower_nhits'  #NOT USED FOR TRAINING
                      ]


# Event-level columns kept alongside the per-object ones.
# The commented-out names are alternative columns that are currently not requested.
columns_flat = ['vx','vy','vz',"candidate_pdg",'numu_cuts',#'ccnc','qsqr','theta',
                'bnbweight','reconstructed_energy','flash_PE','flash_time',
                #'true_vx_sce','true_vy_sce','true_vz_sce','nu_E','nu_pdg',
                'category',#'distance','CC_daughter_E','nu_daughters_pdg','nu_daughters_E',
                'shower_containment_q','shower_sp_profile',
                'subrun', 'run','event','nu_daughters_pdg','nu_daughters_E'
               ]

# Feature columns of the track classifiers. The order of this list is the
# column order of the matrix handed to predict_proba.
# NOTE(review): presumably this must match the order used at training time -- confirm.
columns_track_XGB = ['track_dedx_med', 'track_dedx_avg', 'track_pca',
                     'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
                     'track_length', 'track_vtx', 'track_vtx_end',
                     'track_sh_daughter', 'track_tr_daughter',
                     'track_is_tr_daughter','track_is_sh_daughter',
                     'track_hits_ratio',
                     'track_dedx_hits','track_multi_daughter','track_containment','track_maxangle'
                    ] 

# Feature columns of the shower classifiers (same ordering caveat as for tracks).
columns_shower_XGB = [ 'shower_dedx_med', 'shower_dedx_avg', 'shower_dedx_hits', 'shower_pca',
                       'shower_open_angle', 'shower_length', 'shower_vtx', 'shower_containment_q','shower_sp_profile', 
                       #'shower_sh_daughter','shower_tr_daughter', 'shower_multi_daughter',
                       'shower_is_sh_daughter','shower_is_tr_daughter','shower_maxangle',
                       'shower_hits_ratio']


# Every input column that is kept when a sample is loaded. The set union makes
# the order of the per-object part arbitrary; the list is only used for
# membership tests, so the order does not matter.
columns_all = list(set(columns_req_shower) | set(columns_req_track))+columns_flat

In [6]:
#df_sample = pd.read_pickle('../Input/bnb/data_bnb.pckl')
#df_sample = pd.read_pickle('../Input/nue/nue_0.pckl')

In [7]:
# Alternative nue sample selection, kept for reference:
#sample_list = ['../Input/nue/nue_4.pckl','../Input/nue/nue_5.pckl','../Input/nue/nue_6.pckl',
#               '../Input/nue/nue_7.pckl','../Input/nue/nue_8.pckl','../Input/nue/nue_9.pckl'
#              ]
# NOTE: this nue list is overridden by the assignment further down in this
# cell, so it has no effect unless that later assignment is removed.
sample_list = [ '../Input/nue/nue_0.pckl',
                '../Input/nue/nue_1.pckl',
                '../Input/nue/nue_2.pckl',
                '../Input/nue/nue_3.pckl',
                '../Input/nue/nue_4.pckl']

# Effective selection: only the intime sample is processed.
sample_list = ['../Input/intime/intime.pckl']


In [8]:
# Show which input columns are kept; the order of the per-object columns comes
# from a set union and can therefore differ between kernel sessions.
print(columns_all)

['shower_start_y', 'shower_is_daughter', 'track_end_y', 'shower_length', 'shower_energy', 'track_maxangle', 'track_cali', 'shower_dedx_hits', 'track_nhits', 'track_dedx_avg', 'predict_em', 'shower_dedx_avg', 'shower_maxangle', 'shower_cali', 'shower_hits_ratio', 'track_hits_ratio', 'track_energy', 'predict_p', 'track_dedx', 'track_pca', 'track_is_daughter', 'shower_pca', 'shower_open_angle', 'track_dedx_cali', 'track_start_x', 'shower_daughter', 'track_end_z', 'shower_dedx_cali', 'track_dedx_hits', 'shower_start_x', 'predict_pi', 'shower_start_z', 'shower_dedx', 'shower_nhits', 'predict_cos', 'track_end_x', 'track_daughter', 'track_start_z', 'predict_mu', 'track_start_y', 'vx', 'vy', 'vz', 'candidate_pdg', 'numu_cuts', 'bnbweight', 'reconstructed_energy', 'flash_PE', 'flash_time', 'category', 'shower_containment_q', 'shower_sp_profile', 'subrun', 'run', 'event', 'nu_daughters_pdg', 'nu_daughters_E']


In [9]:
# Load the pre-trained classifiers: one electron ("e"), closest-electron ("cle")
# and muon ("mu") model each for showers ("sh") and tracks ("tr").
# SECURITY: joblib.load unpickles the file and can execute arbitrary code --
# only load model files from a trusted source.
# NOTE(review): `sklearn.externals.joblib` (imported at the top) is no longer
# available in recent scikit-learn releases; `import joblib` is the replacement -- confirm
# against the installed version.
model_sh_e  = joblib.load('../Input/XGBoost/model_sh_e.pkl')
model_sh_cle  = joblib.load('../Input/XGBoost/model_sh_cle.pkl')
model_sh_mu = joblib.load('../Input/XGBoost/model_sh_mu.pkl')

model_tr_e  = joblib.load('../Input/XGBoost/model_tr_e.pkl')
model_tr_cle  = joblib.load('../Input/XGBoost/model_tr_cle.pkl')
model_tr_mu = joblib.load('../Input/XGBoost/model_tr_mu.pkl')

In [10]:
def XGBoostClassification(row):
    """Score all tracks and showers of one event.

    Events whose ``fidvol`` entry is falsy are not classified and receive empty
    lists for every score. Inside the fiducial volume the showers are always
    classified; the tracks only when ``track_end_y`` is non-empty.

    Returns
    -------
    pandas.Series
        Six entries (electron, closest-electron and muon score for tracks and
        for showers), each holding the per-object scores of the event.
    """
    track_scores = ([], [], [])
    shower_scores = ([], [], [])

    if row['fidvol']:
        has_tracks = len(row['track_end_y']) > 0
        if has_tracks:
            track_scores = TrackClassification(row)
        shower_scores = ShowerClassification(row)

    track_e, track_cle, track_mu = track_scores
    shower_e, shower_cle, shower_mu = shower_scores

    scores = {}
    scores['track_electron_score'] = track_e
    scores['track_closest_electron_score'] = track_cle
    scores['track_muon_score'] = track_mu
    scores['shower_electron_score'] = shower_e
    scores['shower_closest_electron_score'] = shower_cle
    scores['shower_muon_score'] = shower_mu
    return pd.Series(scores)
    
    
def TrackClassification(row):
    """Return the electron, closest-electron and muon scores of an event's tracks.

    Only tracks with at least ``min_dedx_hits`` dE/dx hits are scored; the
    remaining tracks are not removed from the event, they simply get no score.
    The returned arrays therefore have one entry per *qualifying* track, in
    track order.

    Parameters
    ----------
    row : mapping
        One event. The entries listed in ``columns_req_track`` must be
        indexable per track (``track_dedx_hits`` must support an element-wise
        ``>=`` comparison), ``vx``/``vy``/``vz`` are event-level scalars.

    Returns
    -------
    tuple
        ``(pred_tr_e, pred_tr_cle, pred_tr_mu)``: the class-1 probabilities of
        the three track models. For an event without any track, three empty
        lists are returned and the models are not called.
    """
    pred_tr_e = []
    pred_tr_mu = []
    pred_tr_cle = []

    n_tracks = len(row['track_start_y'])
    # One input row per track that passes the dE/dx hit requirement.
    XGB_input = np.zeros([sum(row['track_dedx_hits']>=min_dedx_hits),len(columns_track_XGB)])

    tr_ok = 0
    for tr in range(n_tracks):
        if row['track_dedx_hits'][tr]>=min_dedx_hits:
            # Flatten the per-track entries and the event vertex into one record.
            d_tr = {}
            for field in columns_req_track:
                d_tr[field] = row[field][tr]
            for field in ['vx','vy','vz']:
                d_tr[field] = row[field]
            d_tr = {**d_tr,**track_features(d_tr)}

            XGB_input[tr_ok] = np.asarray(itemgetter(*columns_track_XGB)(d_tr))
            tr_ok += 1

    # Predict once, after the input matrix is complete. The models used to be
    # evaluated inside the loop, i.e. once per track on a partially filled
    # matrix, with every result but the last one thrown away.
    if n_tracks > 0:
        pred_tr_e = model_tr_e.predict_proba( XGB_input )[:,1]
        pred_tr_cle = model_tr_cle.predict_proba( XGB_input )[:,1]
        pred_tr_mu = model_tr_mu.predict_proba( XGB_input )[:,1]

    return pred_tr_e, pred_tr_cle, pred_tr_mu
    
    
def ShowerClassification(row):
    """Return the electron, closest-electron and muon scores of an event's showers.

    Only showers with at least ``min_dedx_hits`` dE/dx hits are scored, so the
    returned arrays have one entry per qualifying shower, in shower order.

    Parameters
    ----------
    row : mapping
        One event. The entries listed in ``columns_req_shower`` must be
        indexable per shower; ``vx``, ``vy``, ``vz``, ``shower_containment_q``
        and ``shower_sp_profile`` are taken as event-level values.

    Returns
    -------
    tuple
        ``(pred_sh_e, pred_sh_cle, pred_sh_mu)``: the class-1 probabilities of
        the three shower models.
    """
    event_level_fields = ['vx','vy','vz','shower_containment_q','shower_sp_profile']

    n_selected = sum(row['shower_dedx_hits']>=min_dedx_hits)
    XGB_input = np.zeros([n_selected, len(columns_shower_XGB)])

    next_slot = 0
    for sh in range(len(row['shower_start_y'])):
        if not row['shower_dedx_hits'][sh]>=min_dedx_hits:
            continue

        # Per-shower entries first, then the event-level ones, then the derived features.
        record = {field: row[field][sh] for field in columns_req_shower}
        record.update({field: row[field] for field in event_level_fields})
        record.update(shower_features(record))

        XGB_input[next_slot] = np.asarray([record[name] for name in columns_shower_XGB])
        next_slot += 1

    # Column 1 of predict_proba is kept as the score of each model.
    electron_scores = model_sh_e.predict_proba(XGB_input)[:,1]
    closest_electron_scores = model_sh_cle.predict_proba(XGB_input)[:,1]
    muon_scores = model_sh_mu.predict_proba(XGB_input)[:,1]

    return electron_scores, closest_electron_scores, muon_scores

In [11]:
# Columns written to the output frame: event-level information, the per-object
# energies / hit counts, the six classifier scores and the 'fidvol' flag.
# The commented-out names are alternative columns that are currently not kept.
columns_final = ['vx','vy','vz','event', 'subrun', 'run','shower_nhits','track_nhits',
                 "candidate_pdg",'numu_cuts',#'ccnc','qsqr','theta',
                 'bnbweight','reconstructed_energy','flash_PE','flash_time',
                 #'true_vx_sce','true_vy_sce','true_vz_sce','nu_E','nu_pdg',
                 'category',#'distance','true_fidvol','CC_daughter_E','nu_daughters_pdg','nu_daughters_E',
                 'shower_containment_q','shower_sp_profile','shower_energy','track_energy',
                 'shower_electron_score','track_muon_score','shower_muon_score','track_electron_score',
                 'track_closest_electron_score','shower_closest_electron_score','fidvol'
                ]

In [12]:
# Score every sample and join the reduced frames.
# The frames are collected in a list and concatenated once at the end; growing
# df_joined with pd.concat inside the loop re-copies all earlier rows for
# every additional sample.
scored_frames = []

for sample in sample_list:
    # SECURITY: read_pickle unpickles the file -- only read trusted inputs.
    df_sample = pd.read_pickle(sample)
    print(len(df_sample.index))

    # Keep only the columns needed for the classification and the output.
    droplist = [x for x in df_sample.columns if (x not in columns_all)]
    df_sample = df_sample.drop(droplist, axis=1)

    # For simulated samples with truth information the true vertex can be flagged as well:
    #df_sample['true_fidvol'] =df_sample.apply(lambda row: CheckBorderTPC(*row[['true_vx_sce','true_vy_sce','true_vz_sce']],array=fid_arr) ,axis=1)
    df_sample['fidvol'] = df_sample.apply(
        lambda row: CheckBorderTPC(*row[['vx','vy','vz']], array=fid_arr), axis=1)

    # Append the six score columns, then reduce to the output columns.
    df_sample = pd.concat([df_sample, df_sample.apply(XGBoostClassification, axis=1)], axis=1)
    df_sample = df_sample[columns_final]

    scored_frames.append(df_sample)
    print('done sample')

# An empty sample_list yields an empty frame, as before.
df_joined = pd.concat(scored_frames, ignore_index=True) if scored_frames else pd.DataFrame()
df_joined.head(10)

18122
done sample


Unnamed: 0,vx,vy,vz,event,subrun,run,shower_nhits,track_nhits,candidate_pdg,numu_cuts,bnbweight,reconstructed_energy,flash_PE,flash_time,category,shower_containment_q,shower_sp_profile,shower_energy,track_energy,shower_electron_score,track_muon_score,shower_muon_score,track_electron_score,track_closest_electron_score,shower_closest_electron_score,fidvol
0,163.432816,74.66523,581.301636,4025566,8052,1,"[36.0, 0.0]",[],12,0,-2147483648,0.068619,218.955429,4.82625,1,0.999998,0.714286,"[0.0686187468554774, 0.0]",[],[0.14872448],[],[0.34780127],[],[],[0.08588807],True
1,90.191078,-48.167835,392.18158,869013,1739,3,[11.0],[175.0],14,0,-2147483648,0.388204,2388.502686,3.991875,1,0.999995,1.0,[0.014716051448132914],[0.3734877553111967],[0.082016096],[0.98222566],[0.5907143],[0.0038417191],[0.0023788025],[0.047815703],True
2,225.798126,-10.140754,147.729767,869253,1739,3,[25.0],[279.0],14,0,-2147483648,0.331134,2017.335693,4.176875,1,0.999996,0.833333,[0.01907214392573449],[0.3120614786103255],[0.089001],[0.9358497],[0.5077585],[0.0031435785],[0.0013477992],[0.034519367],True
3,133.970856,13.782758,890.239746,1356676,2714,1,[902.0],"[317.0, 116.0]",14,0,-2147483648,3.19402,5744.176758,3.691875,1,0.607727,0.460036,[2.2068621390912777],"[0.8728641316513368, 0.1142935273413197]",[0.27780297],"[0.92168134, 0.9704663]",[0.53504217],"[0.33377364, 0.00613852]","[0.12270158, 0.0016427977]",[0.029720282],True
4,198.020279,28.606606,837.43811,1841101,3683,3,[21.0],"[245.0, 398.0]",14,0,-2147483648,0.814348,629.560608,4.471875,1,0.999997,0.555556,[0.03786367137361111],"[0.3684846285674063, 0.4079994861220946]",[0.22409517],"[0.98246956, 0.9974909]",[0.7204483],"[0.005515914, 0.025099006]","[0.0014903687, 0.0022474474]",[0.0091330735],True
5,96.5746,-60.03199,848.300537,1983412,3967,3,[29.0],[225.0],14,0,-2147483648,0.408034,1946.516113,5.0875,1,0.999996,1.181818,[0.027893559444415184],[0.380140678985528],[0.19074881],[0.9956678],[0.57857937],[0.01981507],[0.00110288],[0.0727183],True
6,230.79425,-92.10305,798.432739,1887074,3775,3,[14.0],"[18.0, 12.0]",14,0,-2147483648,0.17307,594.730591,4.441875,1,0.999998,1.5,[0.04852772970541063],"[0.07430925468537114, 0.05023321366158762]",[0.14581028],"[0.98564494, 0.9955701]",[0.36430198],"[0.026354471, 0.028670166]","[0.0092050675, 0.0019282273]",[0.004612783],True
7,156.714493,85.625397,87.72496,682805,1366,2,[50.0],"[54.0, 10.0]",14,0,-2147483648,0.322874,3396.790771,4.051875,1,0.902126,1.173913,[0.145309740145228],"[0.1636484939370863, 0.013916130088135505]",[0.119179316],[0.5938435],[0.8265963],[0.008402553],[0.002929466],[0.045438975],True
8,99.029999,36.1064,482.147247,1165683,2332,1,[11.0],"[92.0, 10.0]",14,0,-2147483648,0.233321,1430.671021,5.0675,1,0.999997,0.75,[0.028309425041930906],"[0.18721906430850482, 0.017792476471082626]",[],"[0.9657376, 0.9786961]",[],"[0.017561337, 0.014205255]","[0.0016556231, 0.001000716]",[],True
9,204.416122,64.374382,249.880249,1165881,2332,1,[16.0],"[177.0, 88.0, 104.0, 0.0]",14,0,-2147483648,0.656196,898.89209,5.071875,1,0.999997,0.230769,[0.022699030430787612],"[0.4070294107266396, 0.10914657353664524, 0.11...",[0.08260123],"[0.9951426, 0.99778074, 0.9982673]",[0.6009582],"[0.033760592, 0.020246489, 0.02264263]","[0.002148024, 0.00073562015, 0.0011993336]",[0.0033756802],True


In [13]:
# Persist the scored events.
# NOTE(review): the output name is fixed and does not depend on sample_list --
# confirm it matches the sample that was actually processed.
df_joined.to_pickle('../bnbext_xgb.pckl')

In [14]:
# Number of scored events and their breakdown by 'category'.
print(len(df_joined.index))
print(df_joined['category'].value_counts())

18122
1    18058
0       64
Name: category, dtype: int64


In [15]:
# Electron-candidate selection on the scored events:
#   * every track has a muon score below 0.10,
#   * every shower has a muon score below 0.15,
#   * at least one shower has an electron score above 0.9, or at least one
#     track has an electron score above 0.99.
# Events without tracks / showers pass the corresponding "every" requirement.
# The selection runs on df_joined (all samples); df_sample only holds the
# sample processed last and would miss the others when sample_list has more
# than one entry.
is_pure = df_joined.apply(lambda x: np.all(np.array(x['track_muon_score'])<0.10) and
                                    np.all(np.array(x['shower_muon_score'])<0.15) and
                                    (np.any(np.array(x['shower_electron_score'])>0.9) or
                                     np.any(np.array(x['track_electron_score'])>0.99)),
                          axis=1)
df_pure = df_joined[is_pure]

In [16]:
# Number of events passing the selection and their breakdown by 'category'.
print(len(df_pure.index))
print(df_pure['category'].value_counts())

7
1    7
Name: category, dtype: int64
