In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from helpfunction import CheckBorderFixed,CheckBorderTPC

%matplotlib inline

# Idea: use ntracks and nshowers, but this is dangerous because of data/MC discrepancies.

In [2]:
# Pickled input samples, read in this order by the loading cell.
sample_list = [
    '../Input/intime/intime.pckl',
    '../Input/nue/nue.pckl',
    '../Input/nu/nu_pure_close_600k.pckl',
]

# NOTE(review): not referenced by any other visible cell -- confirm before removing.
pd_tracks_raw = pd.DataFrame()

# Columns that the loading cell indexes once per track.
columns_req = [
    'track_dedx', 'track_dedx_avg', 'track_dedx_hits', 'track_pca',
    'predict_em', 'predict_mu', 'predict_cos', 'predict_pi', 'predict_p',
    'track_start_x', 'track_start_y', 'track_start_z',
    'track_end_x', 'track_end_y', 'track_end_z',
    'track_daughter',
    'matched_tracks',  # this is the label-generating category
]

# Columns that the loading cell copies unchanged onto every track of a row.
columns_flat = ['vx', 'vy', 'vz']

# Fields to engineer:
# - length
# - 10cm contained (bool)
# - has shower daughter (bool)
# - has track daughter
# - is the daughter of something else
# - distance from vertex start point
# - distance from vertex end point

# Fiducial volume borders in x, y, z (one pair per axis).
# NOTE(review): presumably [low-side, high-side] margins -- confirm against CheckBorderTPC.
fid_arr = [[10, 10], [20, 20], [10, 50]]

In [3]:
# Start from an empty frame that already carries every per-track and flat column.
df_tracks = pd.DataFrame(columns=[*columns_req, *columns_flat])



In [4]:
# Flatten every sample into one record per track and build df_tracks from them.
#
# Each sample row carries list-like per-track columns (columns_req) plus the
# scalar columns in columns_flat; every track index becomes its own record.
#
# The records are gathered in a plain list and turned into a DataFrame once:
# growing a frame with DataFrame.append inside the loop copies the whole frame
# on every track (quadratic cost) and DataFrame.append no longer exists in
# pandas >= 2.0. Building the frame from scratch also makes this cell safe to
# re-run, because it no longer appends to a frame left over from a previous run.
#
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so
# only point sample_list at trusted files.
max_rows_per_sample = 40000  # only the first rows of each sample are scanned
wanted_columns = columns_req + columns_flat

track_records = []
for sample in sample_list:
    df_sample = pd.read_pickle(sample)
    # Keep only the needed columns that are actually present in this sample.
    df_sample = df_sample[[col for col in df_sample.columns if col in wanted_columns]]

    for index, row in df_sample.head(max_rows_per_sample).iterrows():
        # Only rows whose (vx, vy, vz) passes CheckBorderTPC against fid_arr are used.
        if not CheckBorderTPC(row['vx'], row['vy'], row['vz'], fid_arr):
            continue
        # The length of 'matched_tracks' fixes the number of tracks in the row.
        for tr in range(len(row['matched_tracks'])):
            d_tr = {field: row[field][tr] for field in columns_req}
            d_tr.update({field: row[field] for field in columns_flat})
            track_records.append(d_tr)

# Passing columns= keeps the expected column order and still yields a frame with
# all columns when no track was selected; dtypes are inferred from the records.
df_tracks = pd.DataFrame(track_records, columns=wanted_columns)

df_tracks.head()

Unnamed: 0,track_dedx,track_dedx_avg,track_dedx_hits,track_pca,predict_em,predict_mu,predict_cos,predict_pi,predict_p,track_start_x,track_start_y,track_start_z,track_end_x,track_end_y,track_end_z,track_daughter,matched_tracks,vx,vy,vz
0,0.093979,0.102463,4.0,0.99707,0.007759,0.750977,0.008698,0.231079,0.001492,36.03125,-71.75,811.5,0.513184,-15.242188,814.0,2.0,0.0,36.073578,-71.722626,811.450012
1,0.373662,0.326146,4.0,0.960449,0.288086,0.144897,0.003736,0.556641,0.006557,19.609375,-31.0,812.5,0.925293,-16.296875,811.0,0.0,0.0,36.073578,-71.722626,811.450012
2,0.241747,0.218862,3.0,0.999023,0.014725,0.011597,0.961914,0.008034,0.003828,36.15625,-71.9375,811.5,62.75,-113.625,811.0,1.0,0.0,36.073578,-71.722626,811.450012
3,2.122207,13.154626,12.0,1.0,4.6e-05,0.022476,0.974609,0.001173,0.001669,244.125,-84.8125,274.75,146.375,104.875,415.75,0.0,0.0,244.315079,-85.193787,274.58551
4,1.911418,10.771297,12.0,0.999512,0.000382,0.06543,0.919922,0.013466,0.000851,193.0,-103.0625,266.25,255.75,-24.8125,349.5,0.0,0.0,244.315079,-85.193787,274.58551


In [5]:
# Tally how many flattened tracks carry each matched_tracks value.
# NOTE(review): the values look like PDG particle codes, with 0 presumably meaning "no match" -- confirm.
df_tracks['matched_tracks'].value_counts()

 2212.0    48533
 13.0      31276
 0.0       17409
 211.0     13464
 11.0       7453
-13.0       4708
 22.0       3192
-211.0      2128
 2112.0     1698
 321.0       170
 3222.0       69
-11.0         30
-321.0         7
 3112.0        5
Name: matched_tracks, dtype: int64

In [6]:
def track_freatures(row):
    """Compute the engineered features and truth labels for a single track.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide 'track_start_x/y/z', 'track_end_x/y/z', 'vx', 'vy', 'vz',
        'track_daughter' and 'matched_tracks'.

    Returns
    -------
    pd.Series
        'track_length', 'track_containment', 'track_sh_daughter',
        'track_tr_daughter', 'track_is_daughter', 'track_vtx',
        'track_vtx_end', 'true_mu' and 'true_e', in that order.
    """
    start = [row["track_start_x"], row["track_start_y"], row["track_start_z"]]
    end = [row["track_end_x"], row["track_end_y"], row["track_end_z"]]
    vertex = [row["vx"], row["vy"], row["vz"]]

    daughter_code = row['track_daughter']
    label = row['matched_tracks']

    features = {}

    # Straight-line distance between the track start and end points.
    features['track_length'] = np.linalg.norm([s - e for s, e in zip(start, end)])
    # Result of CheckBorderFixed for the track end point with a tolerance of 10.
    features['track_containment'] = CheckBorderFixed(*end, tolerance=10)
    # track_daughter == 1 flags a shower daughter, == 2 a track daughter.
    features['track_sh_daughter'] = bool(daughter_code == 1)
    features['track_tr_daughter'] = bool(daughter_code == 2)
    # NOTE(review): this flag is derived from the truth label (matched_tracks == 0),
    # not from reconstructed information -- confirm that is intended for a feature.
    features['track_is_daughter'] = bool(label == 0)
    # Distances from the vertex to the track start and to the track end.
    features['track_vtx'] = np.linalg.norm([s - v for s, v in zip(start, vertex)])
    features['track_vtx_end'] = np.linalg.norm([v - e for v, e in zip(vertex, end)])

    # Class labels.
    # NOTE(review): true_mu accepts both signs (|label| == 13) while true_e only
    # accepts +11, so -11 is not counted -- confirm the asymmetry is deliberate.
    features['true_mu'] = bool(abs(label) == 13)
    features['true_e'] = bool(label == 11)

    return pd.Series(features)

In [7]:
# Raw columns that are no longer needed once the engineered columns exist.
droplist = [
    'track_start_x', 'track_start_y', 'track_start_z',
    'matched_tracks', 'track_daughter',
    'track_end_x', 'track_end_y', 'track_end_z',
    'vx', 'vy', 'vz',
]

# Put the per-row output of track_freatures next to the existing columns, then
# remove the raw inputs. This cell is not re-runnable on its own: after the
# drop, track_freatures can no longer find the columns it reads.
df_tracks = pd.concat(
    [df_tracks, df_tracks.apply(track_freatures, axis=1)],
    axis=1,
).drop(columns=droplist)

In [8]:
# Class balance of the true_mu label (True when |matched_tracks| == 13 in track_freatures).
df_tracks['true_mu'].value_counts()

False    94158
True     35984
Name: true_mu, dtype: int64

In [9]:
# Class balance of the true_e label (True only when matched_tracks == 11 in track_freatures).
df_tracks['true_e'].value_counts()

False    122689
True       7453
Name: true_e, dtype: int64

In [10]:
# Persist the engineered per-track frame; the path is relative to the current working directory.
df_tracks.to_pickle('../trackFrame.pckl')