## Appendix B: Feature engineering. Get ready for classification modeling

**UPDATE**: Use **relative** hit positions ONLY for feature engineering and modeling.

In [1]:
import h5py 
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import glob
import warnings

#### 1.0: Read hit data

In [2]:
# read hit data from local
df_hit_data = pd.read_csv("../../../data_0/df_hit_data.csv")

In [3]:
# only keep in-time hits
df_hit_data = df_hit_data.query('abs(chod_delta) <= 0.5')

In [4]:
# drop irrelevant cols to reduce size of dt
df_hit_data = df_hit_data.drop(['x', 
                                'y',
                                'mirror',
                                'hit_time',
                                'chod_time',
                                'chod_delta'
                                ], axis=1)
df_hit_data

Unnamed: 0.1,Unnamed: 0,x_realigned,y_realigned,label,event,momentum,track_pos_x,track_pos_y,ring_radius_cal,event_id
0,0,-101.8,-269.219998,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
1,1,-173.8,-113.329999,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
2,2,-191.8,42.549998,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
3,3,-92.8,120.500003,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
5,5,87.2,-35.390000,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
...,...,...,...,...,...,...,...,...,...,...
38058006,38058006,1.3,-25.090000,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058007,38058007,-349.7,-103.029999,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058008,38058008,-124.7,-180.970001,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058009,38058009,-322.7,37.270000,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557


In [5]:
df_hit_data = df_hit_data.reset_index()

In [6]:
## update hit position as relative to track_pos

df_hit_data['x_realigned'] = df_hit_data['x_realigned'] - df_hit_data['track_pos_x']
df_hit_data['y_realigned'] = df_hit_data['y_realigned'] - df_hit_data['track_pos_y']

#### 2.0: Process and get engineered features

In [7]:
# comupter distance
df_hit_data['distance'] = (
    (df_hit_data["track_pos_x"] - df_hit_data["x_realigned"]) ** 2 +
    (df_hit_data["track_pos_y"] - df_hit_data["y_realigned"]) ** 2
) ** 0.5

In [8]:
# comupter x+y and x-y
df_hit_data['x+y'] = df_hit_data['x_realigned'] + df_hit_data['y_realigned']
df_hit_data['x-y'] = df_hit_data['x_realigned'] - df_hit_data['y_realigned']

In [9]:
%%time
rms = lambda x: np.sqrt(np.mean(x**2))
grouped_hit_data = df_hit_data.groupby('event').agg(
    {'x_realigned':['min', 'max', 'median'], 
     'y_realigned':['min','max', 'median'],
     'x+y': ['idxmin', 'idxmax'],
     'x-y': ['idxmin', 'idxmax'],
     'distance': ['min', 'max', 'mean', 'median', rms],
     'momentum': ['mean'],
     'label': ['mean'],
     # 'ring_radius_cal': ['mean'],
     # 'track_pos_x': ['mean'],
     # 'track_pos_y': ['mean']
    })
grouped_hit_data.columns = ['x_realigned_min', 
                            'x_realigned_max',
                            'x_realigned_median',
                            'y_realigned_min', 
                            'y_realigned_max',
                            'y_realigned_median',
                            'index_min(x+y)', 
                            'index_max(x+y)', 
                            'index_min(x-y)', 
                            'index_max(x-y)', 
                            'min_hit_radius', 
                            'max_hit_radius', 
                            'mean_hit_radius', 
                            'median_hit_radius', 
                            'rms_hit_radius',
                            'momentum',
                            'label',
                            # 'ring_radius_cal',
                            # 'track_pos_x',
                            # 'track_pos_y'
                            ]
grouped_hit_data = grouped_hit_data.reset_index()

CPU times: user 2min 42s, sys: 6.25 s, total: 2min 48s
Wall time: 2min 43s


In [10]:
grouped_hit_data['x_min_sum'] = grouped_hit_data['index_min(x+y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_min_sum'] = grouped_hit_data['index_min(x+y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_max_sum'] = grouped_hit_data['index_max(x+y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_max_sum'] = grouped_hit_data['index_max(x+y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_min_diff'] = grouped_hit_data['index_min(x-y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_min_diff'] = grouped_hit_data['index_min(x-y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_max_diff'] = grouped_hit_data['index_max(x-y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_max_diff'] = grouped_hit_data['index_max(x-y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

In [11]:
grouped_hit_data = grouped_hit_data.drop(columns=['index_min(x+y)', 
                                                  'index_max(x+y)', 
                                                  'index_min(x-y)', 
                                                  'index_max(x-y)'])

In [12]:
grouped_hit_data['max_x-min_x'] = grouped_hit_data['x_realigned_max']-grouped_hit_data['x_realigned_min']
grouped_hit_data['max_y-min_y'] = grouped_hit_data['y_realigned_max']-grouped_hit_data['y_realigned_min']

In [13]:
grouped_hit_data

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,x_min_sum,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y
0,0,-133.598230,145.401770,-43.598230,-246.605468,143.114533,-12.775470,89.062290,225.010553,170.484039,...,-79.598230,-246.605468,145.401770,-12.775470,-133.598230,65.164528,-43.598230,-246.605468,279.0,389.720001
1,1,-120.844790,167.155210,122.155210,-173.964260,-33.664257,-111.614262,302.700149,425.678402,386.590638,...,-120.844790,-127.194260,167.155210,-33.664257,-120.844790,-127.194260,122.155210,-111.614262,288.0,140.300003
2,2,-162.284234,179.715766,62.715766,-177.274909,40.965089,-68.159911,117.046458,255.175408,217.406335,...,-117.284234,-130.514910,179.715766,9.785089,-162.284234,40.965089,125.715766,-114.924910,342.0,218.239998
3,3,-181.836470,160.163530,-51.336470,-178.299338,133.470658,-53.589339,64.169418,261.847041,159.538870,...,-181.836470,-84.759345,160.163530,8.770662,-145.836470,102.300660,142.163530,-84.759345,342.0,311.769997
4,4,-171.362450,170.637550,-90.362450,-167.676860,159.683133,-27.376861,57.424149,292.773931,222.763034,...,-99.362450,-167.676860,170.637550,50.563138,-54.362450,159.683133,170.637550,-11.786861,342.0,327.359993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900646,900695,-119.722800,186.277200,73.777200,-194.382323,179.737680,-54.082320,108.012478,349.000844,262.665393,...,-119.722800,-163.202322,96.277200,148.567682,-101.722800,179.737680,123.277200,-147.612318,306.0,374.120003
900647,900696,-135.834860,80.165140,-81.834860,-201.099023,173.020980,-138.749020,59.446186,318.969563,191.483891,...,-135.834860,-169.919022,-63.834860,173.020980,-117.834860,173.020980,80.165140,-169.919022,216.0,374.120003
900648,900697,-230.394048,12.605952,-117.894048,-153.634297,236.075710,72.390708,169.593660,258.315514,209.110429,...,-122.394048,-153.634297,12.605952,236.075710,-230.394048,33.425708,-104.394048,-153.634297,243.0,389.710007
900649,900698,-211.585942,58.414058,-148.585942,-177.019040,25.630962,-99.079037,178.789260,244.878926,217.994928,...,-184.585942,-99.079037,58.414058,-177.019040,-211.585942,10.040965,58.414058,-177.019040,270.0,202.650002


In [14]:
grouped_hit_data['num_hits'] = df_hit_data.groupby('event').size()
#grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')

In [15]:
grouped_hit_data.query('num_hits.isnull()')

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y,num_hits
13870,13871,-148.47112,139.52888,13.52888,-148.075636,116.92436,-54.545638,105.101662,406.808545,255.553205,...,-85.725645,94.52888,116.92436,-121.47112,85.74436,76.52888,-132.48564,288.0,264.999996,
24824,24826,-176.5701,165.4299,-109.0701,-141.572717,170.19728,-94.807715,126.639505,195.825725,163.939347,...,-110.392716,165.4299,45.487288,-104.5701,139.017287,165.4299,45.487288,342.0,311.769997,
26782,26785,-169.87568,163.12432,-16.87568,-183.35097,159.589032,3.709028,50.982533,306.705193,185.171937,...,-152.180972,118.12432,128.41903,-97.87568,128.41903,163.12432,-43.060969,333.0,342.940002,
28721,28725,-170.717114,162.282886,22.782886,-188.969029,169.570979,-9.699023,109.066998,237.742449,183.002706,...,-95.439023,153.282886,76.040976,-170.717114,107.210978,144.282886,-95.439023,333.0,358.540009,
40377,40382,-169.62189,172.37811,5.87811,-169.581391,173.368613,-37.081393,145.458403,203.713255,182.221124,...,-122.811394,55.37811,173.368613,28.37811,157.778602,-16.62189,-169.581391,342.0,342.950005,
41987,41993,-135.0957,170.9043,-49.5957,-159.087388,168.272605,12.382606,30.186736,242.474829,168.363419,...,-143.497392,134.9043,105.912604,-126.0957,121.502608,143.9043,-96.737397,306.0,327.359993,
68126,68133,-328.48536,171.41464,-13.08536,-199.256556,174.863439,18.983438,132.709219,332.089493,188.334362,...,-136.906565,144.41464,96.923437,-310.48536,153.993441,45.41464,-199.256556,499.9,374.119995,
83862,83870,-177.45175,119.54825,-60.45175,-174.81072,105.789278,-112.45072,139.178785,388.948507,282.41606,...,-159.22072,119.54825,105.789278,-159.45175,-3.330717,119.54825,-112.45072,297.0,280.599998,
86118,86127,-187.01665,154.98335,-115.01665,-24.479021,162.58098,69.050978,137.558246,442.466169,284.007233,...,-24.479021,154.98335,69.050978,-106.01665,146.99098,154.98335,69.050978,342.0,187.060001,
115159,115169,-117.587685,161.412315,116.412315,-207.41115,135.528852,-98.291155,146.215102,208.233528,179.753476,...,-207.41115,152.412315,73.178846,-117.587685,135.528852,134.412315,-113.881151,279.0,342.940002,


In [16]:
grouped_hit_data = grouped_hit_data.dropna(axis=0) # drop rows which contain missing values
grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')


In [17]:
grouped_hit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900602 entries, 0 to 900650
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   event               900602 non-null  int64  
 1   x_realigned_min     900602 non-null  float64
 2   x_realigned_max     900602 non-null  float64
 3   x_realigned_median  900602 non-null  float64
 4   y_realigned_min     900602 non-null  float64
 5   y_realigned_max     900602 non-null  float64
 6   y_realigned_median  900602 non-null  float64
 7   min_hit_radius      900602 non-null  float64
 8   max_hit_radius      900602 non-null  float64
 9   mean_hit_radius     900602 non-null  float64
 10  median_hit_radius   900602 non-null  float64
 11  rms_hit_radius      900602 non-null  float64
 12  momentum            900602 non-null  float64
 13  label               900602 non-null  float64
 14  x_min_sum           900602 non-null  float64
 15  y_min_sum           900602 non-nul

In [18]:
grouped_hit_data.describe()

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y,num_hits
count,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,...,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0
mean,450356.234069,-191.883323,148.59984,-37.445512,-185.95094,155.811869,-20.717891,89.649465,326.918203,225.447298,...,-130.813032,102.872274,103.214181,-142.067995,108.012859,98.56652,-138.769621,340.483163,341.76281,17.247942
std,260008.89198,33.382534,52.780302,75.334058,37.475724,47.790588,77.863747,54.630502,76.803907,49.192199,...,57.760517,80.695346,78.953331,52.499955,59.218867,65.618091,58.581893,60.470456,58.764819,5.983424
min,0.0,-497.015619,-347.107518,-401.223224,-470.455953,-380.343275,-411.523268,0.040606,23.864236,15.699232,...,-470.455953,-347.107518,-411.523268,-497.015619,-380.343275,-374.58247,-470.455953,0.0,0.0,1.0
25%,225184.25,-208.269685,130.838168,-97.066191,-205.814963,147.973258,-81.538517,45.322431,262.003299,188.577847,...,-170.026768,73.909862,72.129264,-174.652295,81.679593,68.223502,-175.610183,315.0,322.069997,13.0
50%,450360.5,-195.204225,168.450085,-44.9613,-190.789835,170.705384,-23.413407,84.360218,330.07344,213.817159,...,-137.454987,126.59088,126.027568,-147.778154,117.943083,110.646045,-148.919529,355.9,358.529999,17.0
75%,675528.75,-179.93586,180.569235,16.601978,-174.280331,180.536899,38.400134,128.276622,381.445268,253.659895,...,-98.447174,159.840422,158.999355,-114.44851,147.039173,143.960473,-113.535311,378.0,374.129997,21.0
max,900699.0,404.46505,464.4869,404.46505,342.483655,438.183453,358.073652,709.329379,789.257938,712.093105,...,342.483655,461.23708,438.183453,404.46505,438.183453,444.01528,342.483655,643.9,602.659985,89.0


In [19]:
# grouped_hit_data.to_csv('../../../data_0/grouped_hit_data_relative.csv')