In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from operator import itemgetter

from helpfunction import CheckBorderFixed,CheckBorderTPC
from sklearn.externals import joblib

In [2]:
# Fiducial volume borders, one [a, b] pair per axis in the order x, y, z.
# The list is passed unchanged to CheckBorderTPC when selecting events.
# NOTE(review): presumably each pair is the margin kept from the two detector
# faces along that axis -- confirm against helpfunction.CheckBorderTPC.
fid_arr = [
    [10, 10],  # x
    [20, 20],  # y
    [10, 50],  # z
]

# Tracks/showers with fewer dE/dx hits than this are not passed to the classifiers.
min_dedx_hits = 2

In [3]:
# Per-shower input columns (one entry per reconstructed shower in each event).
columns_req_shower = [
    'shower_dedx', 'shower_dedx_avg', 'shower_dedx_hits', 'shower_pca', 'shower_open_angle',
    'shower_start_x', 'shower_start_y', 'shower_start_z',
    'shower_length', 'shower_daughter',
    'matched_showers',  # this is the label-generating category
]

# Per-track input columns (one entry per reconstructed track in each event).
columns_req_track = [
    'track_dedx', 'track_dedx_avg', 'track_dedx_hits', 'track_pca',
    'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
    'track_start_x', 'track_start_y', 'track_start_z',
    'track_end_x', 'track_end_y', 'track_end_z', 'track_daughter',
    'matched_tracks',  # this is the label-generating category
]

# Event-level (scalar per event) columns.
columns_flat = [
    'vx', 'vy', 'vz',
    'bnbweight', 'reconstructed_energy', 'flash_PE', 'flash_time', 'nu_E', 'nu_pdg',
    'true_x_sce', 'true_y_sce', 'true_z_sce',
    'category', 'distance', 'CC_daughter_E', 'shower_containment_q', 'shower_sp_profile',
]

# Feature order used to fill the track classifier input matrix.
columns_track_XGB = [
    'track_dedx', 'track_dedx_avg', 'track_dedx_hits', 'track_pca',
    'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
    'track_containment', 'track_is_daughter', 'track_length',
    'track_sh_daughter', 'track_tr_daughter', 'track_vtx', 'track_vtx_end',
]

# Feature order used to fill the shower classifier input matrix.
columns_shower_XGB = [
    'shower_dedx', 'shower_dedx_avg', 'shower_dedx_hits', 'shower_pca',
    'shower_open_angle', 'shower_length', 'shower_containment_q',
    'shower_sp_profile', 'shower_is_daughter', 'shower_sh_daughter',
    'shower_tr_daughter', 'shower_vtx',
]

# Every column that must survive the initial column pruning of a sample.
# The per-object part comes from a set, so its order is arbitrary; the list is
# only used for membership tests.
columns_all = list(set(columns_req_shower).union(columns_req_track)) + columns_flat

In [4]:
# Pickled input samples to process; each entry is read with pd.read_pickle.
sample_list = [
    '../Input/nu/nu_Track_BDT.pckl',
]

In [5]:
# Load the four pre-trained classifiers (shower/track x electron/muon).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model_sh_e, model_sh_mu, model_tr_e, model_tr_mu = map(
    joblib.load,
    [
        '../Input/XGBoost/model_sh_e.pkl',
        '../Input/XGBoost/model_sh_mu.pkl',
        '../Input/XGBoost/model_tr_e.pkl',
        '../Input/XGBoost/model_tr_mu.pkl',
    ],
)

In [6]:
def XGBoostClassification(row):
    """Score all track and shower candidates of a single event.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One event. ``row['matched_tracks']`` decides whether the track
        classifier is run at all; the shower classifier is always run.

    Returns
    -------
    pandas.Series
        Four entries -- ``track_electron_score``, ``track_muon_score``,
        ``shower_electron_score``, ``shower_muon_score`` -- each holding the
        per-object scores returned by the corresponding classifier function.
        The two track entries are empty lists when the event has no tracks.
    """
    if len(row['matched_tracks']) > 0:
        track_scores = TrackClassification(row)
    else:
        # No tracks in this event: skip the track models entirely.
        track_scores = ([], [])
    shower_scores = ShowerClassification(row)

    scores = {}
    scores['track_electron_score'], scores['track_muon_score'] = track_scores
    scores['shower_electron_score'], scores['shower_muon_score'] = shower_scores
    return pd.Series(scores)
    
    
def TrackClassification(row):
    """Return electron and muon classifier scores for the tracks of one event.

    Only tracks with at least ``min_dedx_hits`` dE/dx hits are scored. For each
    such track a feature dict is assembled from the per-track columns in
    ``columns_req_track`` (indexed by track position), the event-level vertex
    ``vx``/``vy``/``vz`` and the output of ``track_features``; the features
    named in ``columns_track_XGB`` are then written, in that order, into one
    row of the model input matrix.

    NOTE(review): ``track_features`` is not defined in the visible part of this
    notebook; it must be in scope and must supply every ``columns_track_XGB``
    entry that is not in ``columns_req_track`` -- confirm where it comes from.

    Parameters
    ----------
    row : pandas.Series
        One event, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    (pred_tr_e, pred_tr_mu)
        Column 1 of ``predict_proba`` from ``model_tr_e`` and ``model_tr_mu``
        (presumably the positive-class probability), one entry per selected
        track in the original track order. Two empty lists when no track
        passes the hit requirement.
    """
    pred_tr_e = []
    pred_tr_mu = []
    n_selected = int(np.count_nonzero(np.asarray(row['track_dedx_hits']) >= min_dedx_hits))
    XGB_input = np.zeros([n_selected, len(columns_track_XGB)])

    tr_ok = 0
    for tr in range(len(row['matched_tracks'])):
        if row['track_dedx_hits'][tr] >= min_dedx_hits:
            d_tr = {field: row[field][tr] for field in columns_req_track}
            for field in ['vx', 'vy', 'vz']:
                d_tr[field] = row[field]
            d_tr = {**d_tr, **track_features(d_tr)}

            XGB_input[tr_ok] = np.asarray(itemgetter(*columns_track_XGB)(d_tr))
            tr_ok += 1

    if tr_ok == 0:
        # Nothing to score: do not hand a zero-row matrix to the models.
        return pred_tr_e, pred_tr_mu

    # Predict once, after the matrix is completely filled. The previous version
    # re-ran both models on the partially filled matrix for every track.
    pred_tr_e = model_tr_e.predict_proba(XGB_input)[:, 1]
    pred_tr_mu = model_tr_mu.predict_proba(XGB_input)[:, 1]

    return pred_tr_e, pred_tr_mu
    
    
def ShowerClassification(row):
    """Return electron and muon classifier scores for the showers of one event.

    Only showers with at least ``min_dedx_hits`` dE/dx hits are scored. For
    each such shower a feature dict is assembled from the per-shower columns in
    ``columns_req_shower`` (indexed by shower position), the event-level values
    ``vx``, ``vy``, ``vz``, ``shower_containment_q`` and ``shower_sp_profile``,
    and the output of ``shower_features``; the features named in
    ``columns_shower_XGB`` are then written, in that order, into one row of the
    model input matrix.

    NOTE(review): ``shower_features`` is not defined in the visible part of
    this notebook; it must be in scope and must supply every
    ``columns_shower_XGB`` entry not set here -- confirm where it comes from.

    Parameters
    ----------
    row : pandas.Series
        One event, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    (pred_sh_e, pred_sh_mu)
        Column 1 of ``predict_proba`` from ``model_sh_e`` and ``model_sh_mu``
        (presumably the positive-class probability), one entry per selected
        shower in the original shower order. Two empty lists when no shower
        passes the hit requirement.
    """
    pred_sh_e = []
    pred_sh_mu = []
    n_selected = int(np.count_nonzero(np.asarray(row['shower_dedx_hits']) >= min_dedx_hits))
    XGB_input = np.zeros([n_selected, len(columns_shower_XGB)])

    sh_ok = 0
    for sh in range(len(row['matched_showers'])):
        if row['shower_dedx_hits'][sh] >= min_dedx_hits:
            d_sh = {field: row[field][sh] for field in columns_req_shower}
            # These two shower_* quantities are stored per event, not per shower.
            for field in ['vx', 'vy', 'vz', 'shower_containment_q', 'shower_sp_profile']:
                d_sh[field] = row[field]
            d_sh = {**d_sh, **shower_features(d_sh)}

            XGB_input[sh_ok] = np.asarray(itemgetter(*columns_shower_XGB)(d_sh))
            sh_ok += 1

    if sh_ok == 0:
        # Nothing to score: do not hand a zero-row matrix to the models
        # (same convention as the track path, which yields empty lists).
        return pred_sh_e, pred_sh_mu

    pred_sh_e = model_sh_e.predict_proba(XGB_input)[:, 1]
    pred_sh_mu = model_sh_mu.predict_proba(XGB_input)[:, 1]

    return pred_sh_e, pred_sh_mu

In [None]:
# Columns kept in the final output frame: the event-level columns followed by
# the four classifier score columns produced by XGBoostClassification.
# 'track_muon_score' was previously missing ('track_electron_score' was listed
# twice), although the later purity selection reads it.
columns_final = ['vx','vy','vz',
                'bnbweight','reconstructed_energy','flash_PE','flash_time','nu_E','nu_pdg',
                'true_x_sce','true_y_sce','true_z_sce',
                'category','distance','CC_daughter_E','shower_containment_q','shower_sp_profile',
                'shower_electron_score','shower_muon_score','track_electron_score','track_muon_score'
               ]

In [7]:
# Build the scored event table: for every input sample keep the needed columns,
# select events passing the CheckBorderTPC fiducial cut, attach the classifier
# scores and keep the final column set. Frames are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
scored_samples = []

for sample in sample_list:
    # NOTE: read_pickle unpickles the file -- only use trusted inputs.
    df_sample = pd.read_pickle(sample)
    print(len(df_sample.index))
    # Drop everything that is not needed downstream (new frame, no in-place mutation).
    df_sample = df_sample[[col for col in df_sample.columns if col in columns_all]]
    in_fiducial = df_sample.apply(lambda row: CheckBorderTPC(row['vx'],row['vy'],row['vz'],fid_arr), axis=1)
    df_sample = df_sample[in_fiducial]
    df_sample = pd.concat([df_sample, df_sample.apply(XGBoostClassification,axis=1)], axis=1)
    df_sample = df_sample[columns_final]
    scored_samples.append(df_sample)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df_joined = pd.concat(scored_samples, ignore_index=True) if scored_samples else pd.DataFrame()
df_joined.head(10)

Unnamed: 0,nu_pdg,nu_E,distance,category,vx,vy,vz,bnbweight,shower_open_angle,shower_length,...,track_dedx_hits,flash_PE,flash_time,CC_daughter_E,matched_showers,matched_tracks,shower_electron_score,shower_muon_score,track_electron_score,track_muon_score
0,14,1.6252,14.595528,7,233.783386,-53.704464,943.809265,0.990291,"[0.361, 0.523]","[72.3, 3.898]",...,"[14.0, 4.0, 12.0]",666.404602,3.87125,0.212016,"[13, 2112]","[2112, 2112, -13]","[0.4620925, 0.22241624]","[0.02675151, 0.054553673]","[0.00095558714, 0.0022868544, 0.06702287]","[0.0017884423, 0.0038805632, 0.7133571]"
2,14,0.606236,88.4907,1,198.662201,-59.815376,700.93219,0.991531,"[0.2145, 0.3438]","[22.16, 8.16]",...,[],58.534115,3.78125,-1.0,"[-13, -13]",[],"[0.1997226, 0.22575743]","[0.065051556, 0.15024912]",[],[]
4,14,0.541416,10.631845,3,120.68148,-11.807071,165.075089,0.994432,"[0.0738, 0.5405]","[12.695, 4.83]",...,[],795.488281,4.32125,0.120262,"[211, 211]",[],[0.11959982],[0.114267744],[],[]
5,14,1.04298,10.893191,3,173.90033,95.340721,351.875519,0.9958,[0.03546],[18.72],...,"[14.0, 6.0]",666.502136,3.69125,0.224553,[2212],"[2212, 13]",[0.24744806],[0.11607253],"[0.014236772, 0.1647531]","[0.0018994451, 0.16249429]"
7,14,1.439656,0.173543,3,197.177277,88.53228,677.950012,0.987504,"[0.0964, 0.1375, 0.7114, 0.4946]","[52.4, 76.2, 3.188, 2.387]",...,"[4.0, 2.0]",1200.724976,4.56125,0.208927,"[13, 0, 2212, 2112]","[211, 2212]","[0.5837737, 0.0010686477, 0.1136777, 0.2705093]","[0.033540033, 0.00067740626, 0.024442133, 0.06...","[0.035162926, 0.008414617]","[0.11614513, 0.0025745805]"
9,14,0.717259,17.242661,4,73.47599,-32.119797,501.983459,0.991032,"[0.6606, 0.1874]","[49.16, 6.254]",...,[],2664.784668,3.27125,-1.0,"[22, 22]",[],"[0.43908706, 0.1394679]","[0.011997285, 0.034118474]",[],[]
10,14,1.148458,35.152428,5,225.485718,-94.026604,432.923126,0.991477,"[0.518, 0.3733]","[24.23, 15.06]",...,[9.0],232.868317,4.50625,-1.0,"[22, 22]",[22],"[0.30520895, 0.30525252]","[0.04108333, 0.07083924]",[0.38631493],[0.14412805]
12,14,1.743248,5.339813,3,239.180954,-51.177376,724.291992,0.991446,[0.1345],[36.88],...,"[0.0, 0.0]",671.590332,4.510625,0.90023,[2112],"[2112, 0]",[0.3698149],[0.26330668],[],[]
13,14,0.770566,1.341341,3,221.011185,80.790207,907.703247,0.996072,[0.2336],[19.16],...,"[14.0, 0.0]",267.199036,4.02125,0.310085,[211],"[2212, 211]",[0.122377135],[0.09368877],[0.001013069],[0.0017884423]
14,14,0.566884,7.83043,3,210.068542,52.676941,739.563538,0.994138,[0.0957],[27.45],...,"[15.0, 0.0]",586.407166,4.101875,0.176551,[0],"[2212, 13]",[],[],[0.0015520828],[0.0017014521]


In [8]:
# Persist the scored events of all samples for later analysis steps.
df_joined.to_pickle(path='nue_score.pckl')

In [9]:
# Size and per-category event counts of the last processed sample
# (df_sample is the loop variable left over from the sample loop).
print(df_sample.shape[0])
print(df_sample.loc[:, 'category'].value_counts())

123306
3    41792
4    28280
1    25636
7    18995
5     6042
2     2561
Name: category, dtype: int64


In [24]:
def _passes_purity_cuts(event):
    """True when no object of the event looks muon-like and at least one looks electron-like."""
    def scores(name):
        return np.array(event[name])

    # Evaluated lazily, in the same order as the cut list reads:
    # all track muon scores < 0.10, all shower muon scores < 0.15, and
    # any shower electron score > 0.9 or any track electron score > 0.99.
    return ((scores('track_muon_score') < 0.10).all()
            and (scores('shower_muon_score') < 0.15).all()
            and ((scores('shower_electron_score') > 0.9).any()
                 or (scores('track_electron_score') > 0.99).any()))


df_pure = df_sample.loc[df_sample.apply(_passes_purity_cuts, axis=1)]

In [25]:
# Size and per-category event counts of the purity-selected events.
print(df_pure.shape[0])
print(df_pure.loc[:, 'category'].value_counts())

1735
2    711
4    454
3    261
1    198
7     56
5     55
Name: category, dtype: int64


0.487790013158806

0.07538802660753881

0.0220125786163522