In [1]:
import numpy as np
import pandas as pd
from operator import itemgetter
from sklearn.externals import joblib

pd.options.display.max_columns = 999

In [2]:
min_dedx_hits=3
min_reco_e=0.03
z_dead_start = 675
z_dead_end=z_dead_start+100
input_dir = "../Input/pandora_pdg_cut/"

In [3]:
def track_features(row):  
    
    track_daughter_1 = 1 if row['track_daughter']==1 else 0
    track_daughter_2 = 1 if row['track_daughter']==2 else 0
    track_daughter_3 = 1 if row['track_daughter']==3 else 0
    track_is_daughter_1 = 1 if row['track_is_daughter']==1 else 0
    track_is_daughter_2 = 1 if row['track_is_daughter']==2 else 0
    
    return { 'track_daughter_1':track_daughter_1,
             'track_daughter_2':track_daughter_2,
             'track_daughter_3':track_daughter_3,
             'track_is_daughter_1':track_is_daughter_1,
             'track_is_daughter_2':track_is_daughter_2,
                     }

In [4]:
def shower_features(row):  
    
    shower_daughter_1 = 1 if row['shower_daughter']==1 else 0
    shower_daughter_2 = 1 if row['shower_daughter']==2 else 0
    shower_daughter_3 = 1 if row['shower_daughter']==3 else 0
    shower_is_daughter_1 = 1 if row['shower_is_daughter']==1 else 0
    shower_is_daughter_2 = 1 if row['shower_is_daughter']==2 else 0
    
    return { 'shower_daughter_1':shower_daughter_1,
             'shower_daughter_2':shower_daughter_2,
             'shower_daughter_3':shower_daughter_3,
             'shower_is_daughter_1':shower_is_daughter_1,
             'shower_is_daughter_2':shower_is_daughter_2,
                     }

In [5]:
vec_columns_shower  = ["shower_open_angle","shower_length","shower_pca",
                       "shower_maxangle","shower_vtxdistance","shower_daughter","shower_is_daughter",
                       "shower_fidvol_ratio","shower_spacepoint_dqdx_ratio",
                       "shower_dedx_hits_w","shower_dedx_w","shower_dedx_best_w",
                       "shower_energy_w", "shower_hitsratio_w","shower_hits_w",
                       "shower_theta", "shower_phi", "shower_energy_product","shower_start_z"
                      ]

vec_columns_track = [ "track_pca","track_start_z",
                      "predict_em","predict_mu","predict_cos","predict_pi","predict_p",  # Katrin's BDT
                      "track_res_mean","track_res_std", 
                      "track_maxangle","track_vtxdistance","track_daughter","track_is_daughter",
                      "track_spacepoint_dqdx_ratio","track_containment",
                      "track_dedx_hits_w","track_dedx_w","track_dedx_best_w",
                      "track_energy_w", "track_hitsratio_w","track_hits_w",
                      "track_theta","track_phi", "track_len"
                     ]
             
vec_columns_truth = [ "shower_cle", "matched_showers", "matched_showers_energy",
                      "track_cle", "matched_tracks", "matched_tracks_energy",
                      "nu_daughters_pdg","nu_daughters_E"
                    ]

flat_columns_truth = ["nu_pdg","nu_E","true_vx_sce","true_vy_sce","true_vz_sce",
                      "distance",'ccnc','qsqr','theta',"true_1eX_signal","true_nu_fiducial",
                      "lepton_E","lepton_theta",
                     ]


flat_columns_reco = ["event","subrun","run","category","vx","vy","vz","bnbweight","candidate_pdg",
                     "numu_cuts",
                     "n_showers","n_tracks","flash_time_max","flash_PE_max",
                     "chargecenter_x","chargecenter_y","chargecenter_z",
                     "total_spacepoint_containment","vtx_activity_nr"
                    ]

# Columns for the training
columns_track_XGB =       ['track_vtxdistance', 'track_maxangle',
                           'track_spacepoint_dqdx_ratio', 'predict_cos', 'track_pca',
                           'track_dedx_best_w',
                           'predict_mu', 'predict_pi',
                           'track_dedx_hits_w', 'predict_p',
                           'track_dedx_w',
                           'track_hitsratio_w', 'predict_em', 
                           "track_len"
                          ]
columns_track_XGB_mu = columns_track_XGB+["track_hits_w"]


columns_shower_XGB  = ["shower_open_angle","shower_length","n_showers",
                       "shower_pca", "shower_maxangle","shower_vtxdistance",
                       "shower_fidvol_ratio","shower_spacepoint_dqdx_ratio",
                       "shower_dedx_hits_w","shower_dedx_w","shower_dedx_best_w","shower_hitsratio_w"
                      ]

columns_shower_XGB_mu = columns_shower_XGB+["shower_hits_w"]

columns_all_data = vec_columns_shower + vec_columns_track + flat_columns_reco
columns_all_mc = columns_all_data + flat_columns_truth + vec_columns_truth

In [6]:
print(columns_all_data)

['shower_open_angle', 'shower_length', 'shower_pca', 'shower_maxangle', 'shower_vtxdistance', 'shower_daughter', 'shower_is_daughter', 'shower_fidvol_ratio', 'shower_spacepoint_dqdx_ratio', 'shower_dedx_hits_w', 'shower_dedx_w', 'shower_dedx_best_w', 'shower_energy_w', 'shower_hitsratio_w', 'shower_hits_w', 'shower_theta', 'shower_phi', 'shower_energy_product', 'shower_start_z', 'track_pca', 'track_start_z', 'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p', 'track_res_mean', 'track_res_std', 'track_maxangle', 'track_vtxdistance', 'track_daughter', 'track_is_daughter', 'track_spacepoint_dqdx_ratio', 'track_containment', 'track_dedx_hits_w', 'track_dedx_w', 'track_dedx_best_w', 'track_energy_w', 'track_hitsratio_w', 'track_hits_w', 'track_theta', 'track_phi', 'track_len', 'event', 'subrun', 'run', 'category', 'vx', 'vy', 'vz', 'bnbweight', 'candidate_pdg', 'numu_cuts', 'n_showers', 'n_tracks', 'flash_time_max', 'flash_PE_max', 'chargecenter_x', 'chargecenter_y', 'char

In [7]:
model_sh_e  = joblib.load(input_dir+'XGBoost/model_sh_e.pkl')
model_sh_cle  = joblib.load(input_dir+'XGBoost/model_sh_cle.pkl')
model_sh_cle_lee  = joblib.load(input_dir+'XGBoost/model_sh_cle_lee.pkl')
model_sh_mu = joblib.load(input_dir+'XGBoost/model_sh_mu.pkl')

model_tr_e  = joblib.load(input_dir+'XGBoost/model_tr_e.pkl')
model_tr_cle  = joblib.load(input_dir+'XGBoost/model_tr_cle.pkl')
model_tr_mu = joblib.load(input_dir+'XGBoost/model_tr_mu.pkl')

In [8]:
def XGBoostClassification(row):
    tr_e, tr_cle, tr_mu, tr_pred_index = [],[],[],[]
    sh_e, sh_cle, sh_mu, sh_pred_index = [],[],[],[]

    if row["n_tracks"] > 0:
        tr_e ,tr_cle, tr_mu,tr_pred_index = TrackClassification(row)
        
    if row["n_showers"] > 0:
        sh_e, sh_cle, sh_cle_lee, sh_mu,sh_pred_index = ShowerClassification(row)
    
    return pd.Series({
        'track_electron_score': tr_e,
        'track_closest_electron_score': tr_cle,
        'track_muon_score': tr_mu,
        'shower_electron_score': sh_e,
        'shower_closest_electron_score': sh_cle,
        'shower_cle_lee_score': sh_cle_lee,
        'shower_muon_score': sh_mu,
        
        'track_score_index': tr_pred_index,
        'shower_score_index': sh_pred_index
    })
    
    
def TrackClassification(row):
    pred_tr_e  = []
    pred_tr_mu = []
    pred_tr_cle= []
    pred_index = []
    
    XGB_input = []
    XGB_input_mu = []
    
    for tr in range(row["n_tracks"]):
        if (row['track_dedx_hits_w'][tr]>=min_dedx_hits) and (row['track_energy_w'][tr]>=min_reco_e):
            if(z_dead_start>row["track_start_z"][tr] or z_dead_end<row["track_start_z"][tr]):
                d_tr={}

                for field in vec_columns_track:
                    d_tr[field]=row[field][tr]
                d_tr = {**d_tr,**track_features(d_tr)}
                XGB_input.append( np.asarray(itemgetter(*columns_track_XGB)(d_tr)))
                XGB_input_mu.append( np.asarray(itemgetter(*columns_track_XGB_mu)(d_tr)))
                pred_index.append(tr)
                
                
    if(len(XGB_input)>0):      
        XGB_input = np.array(XGB_input)
        pred_tr_e = model_tr_e.predict_proba( XGB_input )[:,1] 
        pred_tr_cle = model_tr_cle.predict_proba( XGB_input )[:,1]
        pred_tr_mu = model_tr_mu.predict_proba( XGB_input_mu)[:,1]  
    return pred_tr_e, pred_tr_cle, pred_tr_mu,pred_index
    
    
def ShowerClassification(row):
    pred_sh_e = []
    pred_sh_mu = []
    pred_sh_cle= []
    pred_sh_cle_lee =[]
    pred_index = []
    
    XGB_input = []
    XGB_input_mu = []
    
    for sh in range(row["n_showers"]):
        if (row['shower_dedx_hits_w'][sh]>=min_dedx_hits) and (row['shower_energy_w'][sh]>=min_reco_e):
            if(z_dead_start>row["shower_start_z"][sh] or z_dead_end<row["shower_start_z"][sh]):
                d_sh={}

                for field in vec_columns_shower:
                    d_sh[field]=row[field][sh]
                d_sh = {**d_sh,**shower_features(d_sh)}
                d_sh["n_showers"]= row["n_showers"]
                XGB_input.append(np.asarray(itemgetter(*columns_shower_XGB)(d_sh)))
                XGB_input_mu.append( np.asarray(itemgetter(*columns_shower_XGB_mu)(d_sh)))
                pred_index.append(sh)
                
    if(len(XGB_input)>0):      
        XGB_input = np.array(XGB_input)      
        pred_sh_e = model_sh_e.predict_proba( XGB_input )[:,1] 
        pred_sh_cle = model_sh_cle.predict_proba( XGB_input )[:,1]
        pred_sh_mu = model_sh_mu.predict_proba( XGB_input_mu)[:,1]  
        pred_sh_cle_lee = model_sh_cle_lee.predict_proba( XGB_input)[:,1]  
    return pred_sh_e, pred_sh_cle, pred_sh_cle_lee, pred_sh_mu,pred_index

In [9]:
columns_final_reco_vec = ['shower_electron_score','track_muon_score',
                          'shower_muon_score','track_electron_score',
                          'track_closest_electron_score','shower_closest_electron_score',"shower_cle_lee_score",
                          "track_energy_w","shower_energy_w",'track_score_index','shower_score_index',"track_theta","shower_theta"
                         ]

columns_final_truth_vec = vec_columns_truth 


columns_final_data = flat_columns_reco + columns_final_reco_vec 
columns_final_mc = columns_final_data + columns_final_truth_vec + flat_columns_truth

In [10]:
nue_test_list=[input_dir+'nue/nue_5.pckl',
               input_dir+'nue/nue_6.pckl',
               input_dir+'nue/nue_7.pckl',
               input_dir+'nue/nue_8.pckl',
               input_dir+'nue/nue_9.pckl'
              ]
nue_train_list=[input_dir+'nue/nue_5.pckl',
               input_dir+'nue/nue_6.pckl',
               input_dir+'nue/nue_7.pckl',
               input_dir+'nue/nue_8.pckl',
               input_dir+'nue/nue_9.pckl'
              ]
nu_train_list= [input_dir+'nu/nu_4.pckl',
               input_dir+'nu/nu_5.pckl',
               input_dir+'nu/nu_6.pckl',
               input_dir+'nu/nu_7.pckl',
              ]
nu_test_list= [input_dir+'nu/nu_0.pckl',
               input_dir+'nu/nu_1.pckl',
               input_dir+'nu/nu_2.pckl',
               input_dir+'nu/nu_3.pckl',
              ]

intime = [input_dir+'nu/nu_0.pckl']

In [11]:
sample_list = nue_test_list

In [12]:
##########
data=False
##########

df_joined = pd.DataFrame()
columns_all = columns_all_mc
columns_final = columns_final_mc
if data:
    columns_all = columns_all_data
    columns_final = columns_final_data
    
for sample in sample_list:
    print("Reading in the sample (data:",data,")")
    df_sample = pd.read_pickle(sample)
    #df_sample = df_sample.head(15)
    duplicates = df_sample.duplicated(subset=["run","subrun","event"]).sum()
    print("There were duplicates in the beginning:",duplicates)

    droplist = [x for x  in df_sample.columns if (x not in columns_all)]
    df_sample.drop(droplist,inplace=True,axis=1)
    print("Entries in current sample:",len(df_sample.index))
    #print(df_sample.columns)
    df_sample["vtx_activity"] = df_sample[['n_tracks','n_showers']].sum(axis=1) > 1
    df_scores = df_sample.apply(XGBoostClassification,axis=1)
    df_sample = pd.concat([df_sample, df_scores], axis=1)    
    df_sample = df_sample[columns_final]
    df_joined = pd.concat([df_joined,df_sample],ignore_index=True,copy=False)
    print('done sample')

    
df_joined.head(10)
duplicates = df_joined.duplicated(subset=["run","subrun","event"]).sum()
print("There were duplicates in the end:",duplicates)

Reading in the sample (data: False )
There were duplicates in the beginning: 0
Entries in current sample: 6889
done sample
Reading in the sample (data: False )
There were duplicates in the beginning: 0
Entries in current sample: 6896
done sample
Reading in the sample (data: False )
There were duplicates in the beginning: 24
Entries in current sample: 6822
done sample
Reading in the sample (data: False )
There were duplicates in the beginning: 5
Entries in current sample: 6766
done sample
Reading in the sample (data: False )
There were duplicates in the beginning: 0
Entries in current sample: 6756
done sample
There were duplicates in the end: 71


In [13]:
df_joined.to_pickle('../nue_test56789_xgb.pckl')

In [14]:
print(len(df_joined.index))
print(df_joined['category'].value_counts())

34129
2    27317
7     6696
1      116
Name: category, dtype: int64


In [15]:
def nue_select_manual(row):
    # Reasonable cuts
    if(row["n_tracks"]+row["n_showers"]>4):
        return False
    if(row["total_spacepoint_containment"]<0.95):
        return False
    # Muon cuts
    if(np.any(np.array(row['track_muon_score'])>0.05)):
        return False
    if(np.any(np.array(row['shower_muon_score'])>0.05)):
        return False
    # Electron cuts
    if(np.any(np.array(row['track_electron_score'])>0.9) and np.any(np.array(row['track_closest_electron_score'])>0.7)):
        if(np.any(np.array(row['shower_electron_score'])>0.6)):
            return True
    if(np.any(np.array(row['shower_electron_score'])>0.8)):
        if(np.any(np.array(row['shower_cle_lee_score'])>0.7) or np.any(np.array(row['shower_closest_electron_score'])>0.7)):
            return True
    return False

          
df_pure=df_sample[df_sample.apply(nue_select_manual, axis=1)]

In [16]:
print(len(df_pure.index))
print(df_pure['category'].value_counts())

1303
2    1231
7      71
1       1
Name: category, dtype: int64


In [17]:
#Roberto's Nue
#df_joined[(df_joined["event"]==1515) & (df_joined["run"]==5328)].head()
df_joined[(df_joined["event"]==31) & (df_joined["run"]==5513)].head()


Unnamed: 0,event,subrun,run,category,vx,vy,vz,bnbweight,candidate_pdg,numu_cuts,n_showers,n_tracks,flash_time_max,flash_PE_max,chargecenter_x,chargecenter_y,chargecenter_z,total_spacepoint_containment,vtx_activity_nr,shower_electron_score,track_muon_score,shower_muon_score,track_electron_score,track_closest_electron_score,shower_closest_electron_score,shower_cle_lee_score,track_energy_w,shower_energy_w,track_score_index,shower_score_index,track_theta,shower_theta,shower_cle,matched_showers,matched_showers_energy,track_cle,matched_tracks,matched_tracks_energy,nu_daughters_pdg,nu_daughters_E,nu_pdg,nu_E,true_vx_sce,true_vy_sce,true_vz_sce,distance,ccnc,qsqr,theta,true_1eX_signal,true_nu_fiducial,lepton_E,lepton_theta
