## Appendix B: Feature engineering. Get ready for classification modeling

In [1]:
import h5py 
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import glob
import warnings

#### 1.0: Read hit data

In [2]:
df_hit_data = pd.read_csv("data_0/df_hit_data.csv")

In [3]:
# only keep in-time hits
df_hit_data = df_hit_data.query('abs(chod_delta) <= 0.5')

In [4]:
# drop irrelevant cols to reduce size of dt
df_hit_data = df_hit_data.drop(['x', 
                                'y',
                                'mirror',
                                'hit_time',
                                'chod_time',
                                'chod_delta'
                                ], axis=1)
df_hit_data

Unnamed: 0.1,Unnamed: 0,x_realigned,y_realigned,label,event,momentum,track_pos_x,track_pos_y,ring_radius_cal,event_id
0,0,-101.8,-269.219998,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
1,1,-173.8,-113.329999,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
2,2,-191.8,42.549998,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
3,3,-92.8,120.500003,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
5,5,87.2,-35.390000,1,0,20.778881,-58.20177,-22.614530,152.531027,2155500
...,...,...,...,...,...,...,...,...,...,...
38058006,38058006,1.3,-25.090000,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058007,38058007,-349.7,-103.029999,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058008,38058008,-124.7,-180.970001,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557
38058009,38058009,-322.7,37.270000,0,900699,44.893456,-171.59705,-0.669099,186.290323,778557


In [5]:
df_hit_data = df_hit_data.reset_index()

#### 2.0: Process and get engineered features

In [6]:
# comupter distance
df_hit_data['distance'] = (
    (df_hit_data["track_pos_x"] - df_hit_data["x_realigned"]) ** 2 +
    (df_hit_data["track_pos_y"] - df_hit_data["y_realigned"]) ** 2
) ** 0.5

In [7]:
# comupter x+y and x-y
df_hit_data['x+y'] = df_hit_data['x_realigned'] + df_hit_data['y_realigned']
df_hit_data['x-y'] = df_hit_data['x_realigned'] - df_hit_data['y_realigned']

In [9]:
%%time
rms = lambda x: np.sqrt(np.mean(x**2))
grouped_hit_data = df_hit_data.groupby('event').agg(
    {'x_realigned':['min', 'max', 'median'], 
     'y_realigned':['min','max', 'median'],
     'x+y': ['idxmin', 'idxmax'],
     'x-y': ['idxmin', 'idxmax'],
     'distance': ['min', 'max', 'mean', 'median', rms],
     'momentum': ['mean'],
     'label': ['mean'],
     'ring_radius_cal': ['mean'],
     'track_pos_x': ['mean'],
     'track_pos_y': ['mean']
    })
grouped_hit_data.columns = ['x_realigned_min', 
                            'x_realigned_max',
                            'x_realigned_median',
                            'y_realigned_min', 
                            'y_realigned_max',
                            'y_realigned_median',
                            'index_min(x+y)', 
                            'index_max(x+y)', 
                            'index_min(x-y)', 
                            'index_max(x-y)', 
                            'min_hit_radius', 
                            'max_hit_radius', 
                            'mean_hit_radius', 
                            'median_hit_radius', 
                            'rms_hit_radius',
                            'momentum',
                            'label',
                            'ring_radius_cal',
                            'track_pos_x',
                            'track_pos_y'
                            ]
grouped_hit_data = grouped_hit_data.reset_index()

CPU times: user 2min 42s, sys: 6.87 s, total: 2min 48s
Wall time: 2min 43s


In [10]:
grouped_hit_data['x_min_sum'] = grouped_hit_data['index_min(x+y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_min_sum'] = grouped_hit_data['index_min(x+y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_max_sum'] = grouped_hit_data['index_max(x+y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_max_sum'] = grouped_hit_data['index_max(x+y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_min_diff'] = grouped_hit_data['index_min(x-y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_min_diff'] = grouped_hit_data['index_min(x-y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

grouped_hit_data['x_max_diff'] = grouped_hit_data['index_max(x-y)'].apply(lambda x: df_hit_data.iloc[x]['x_realigned'])
grouped_hit_data['y_max_diff'] = grouped_hit_data['index_max(x-y)'].apply(lambda x: df_hit_data.iloc[x]['y_realigned'])

In [11]:
grouped_hit_data = grouped_hit_data.drop(columns=['index_min(x+y)', 
                                                  'index_max(x+y)', 
                                                  'index_min(x-y)', 
                                                  'index_max(x-y)'])

In [12]:
grouped_hit_data['max_x-min_x'] = grouped_hit_data['x_realigned_max']-grouped_hit_data['x_realigned_min']
grouped_hit_data['max_y-min_y'] = grouped_hit_data['y_realigned_max']-grouped_hit_data['y_realigned_min']

In [14]:
grouped_hit_data

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,x_min_sum,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y
0,0,-191.8,87.2,-101.8,-269.219998,120.500003,-35.390000,110.145177,259.133431,172.642014,...,-137.8,-269.219998,87.2,-35.390000,-191.8,42.549998,-101.8,-269.219998,279.0,389.720001
1,1,-322.7,-34.7,-79.7,-9.500000,130.800003,52.849998,146.309490,175.447550,166.862196,...,-322.7,37.270000,-34.7,130.800003,-322.7,37.270000,-79.7,52.849998,288.0,140.300003
2,2,-209.8,132.2,15.2,-113.329999,104.909999,-4.215001,163.124895,180.984803,172.711909,...,-164.8,-66.570000,132.2,73.729999,-209.8,104.909999,78.2,-50.980000,342.0,218.239998
3,3,-263.8,78.2,-133.3,-269.219998,42.549998,-144.509999,124.472915,200.620658,167.970376,...,-263.8,-175.680005,78.2,-82.149998,-227.8,11.380000,60.2,-175.680005,342.0,311.769997
4,4,-263.8,78.2,-182.8,-50.980000,276.379993,89.319999,143.818986,194.906198,169.622280,...,-191.8,-50.980000,78.2,167.259998,-146.8,276.379993,78.2,104.909999,342.0,327.359993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900646,900695,-250.7,55.3,-57.2,-118.620003,255.500000,21.680000,141.092828,206.526419,186.128292,...,-250.7,-87.440002,-34.7,224.330002,-232.7,255.500000,-7.7,-71.849998,306.0,374.120003
900647,900696,-250.7,-34.7,-196.7,-118.620003,255.500000,-56.270000,182.033911,217.539843,194.252565,...,-250.7,-87.440002,-178.7,255.500000,-232.7,255.500000,-34.7,-87.440002,216.0,374.120003
900648,900697,-227.8,15.2,-115.3,-175.680005,214.030002,50.345000,182.146891,236.412036,199.895585,...,-119.8,-175.680005,15.2,214.030002,-227.8,11.380000,-101.8,-175.680005,243.0,389.710007
900649,900698,-218.8,51.2,-155.8,-128.920003,73.729999,-50.980000,178.590139,211.824059,193.054460,...,-191.8,-50.980000,51.2,-128.920003,-218.8,58.140002,51.2,-128.920003,270.0,202.650002


In [24]:
grouped_hit_data['num_hits'] = df_hit_data.groupby('event').size()
#grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')

In [30]:
grouped_hit_data.query('num_hits.isnull()')

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y,num_hits
13870,13871,-367.7,-79.7,-205.7,-290.089996,-25.09,-196.559998,139.742533,158.173639,150.221268,...,-227.740005,-124.7,-25.09,-340.7,-56.27,-142.7,-274.5,288.0,264.999996,
24824,24826,-200.8,141.2,-133.3,-128.920003,182.849994,-82.155001,141.621626,208.23917,170.230661,...,-97.740002,141.2,58.140002,-128.8,151.670001,141.2,58.140002,342.0,311.769997,
26782,26785,-254.8,78.2,-101.8,-284.8,58.140002,-97.740002,137.967669,183.520039,167.537772,...,-253.630002,33.2,26.97,-182.8,26.97,78.2,-144.509999,333.0,342.940002,
28721,28725,-218.8,114.2,-25.3,-238.040005,120.500003,-58.769999,141.054666,201.589997,171.727969,...,-144.509999,105.2,26.97,-218.8,58.140002,96.2,-144.509999,333.0,358.540009,
40377,40382,-200.8,141.2,-25.3,-144.509999,198.440005,-12.010001,146.425054,209.414002,174.288431,...,-97.740002,24.2,198.440005,-2.8,182.849994,-47.8,-144.509999,342.0,342.950005,
41987,41993,-173.8,132.2,-88.3,-222.449994,104.909999,-50.98,102.790889,197.084625,163.099682,...,-206.859998,96.2,42.549998,-164.8,58.140002,105.2,-160.100003,306.0,327.359993,
68126,68133,-358.7,141.2,-43.3,-222.449994,151.670001,-4.21,136.738786,350.693275,189.522191,...,-160.100003,114.2,73.729999,-340.7,130.800003,15.2,-222.449994,499.9,374.119995,
83862,83870,-358.7,-61.7,-241.7,-40.68,239.919998,21.68,112.012747,190.97463,164.052725,...,-25.09,-61.7,239.919998,-340.7,130.800003,-61.7,21.68,297.0,280.599998,
86118,86127,-412.7,-70.7,-340.7,-180.970001,6.09,-87.440002,137.568438,188.611903,170.450245,...,-180.970001,-70.7,-87.440002,-331.7,-9.5,-70.7,-87.440002,342.0,187.060001,
115159,115169,-137.8,141.2,96.2,-206.859998,136.080005,-97.740002,127.307843,207.63263,167.99014,...,-206.859998,132.2,73.729999,-137.8,136.080005,114.2,-113.329999,279.0,342.940002,


In [31]:
grouped_hit_data = grouped_hit_data.dropna(axis=0) # drop rows which contain missing values
grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped_hit_data['num_hits'] = grouped_hit_data['num_hits'].astype('int')


In [32]:
grouped_hit_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900602 entries, 0 to 900650
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   event               900602 non-null  int64  
 1   x_realigned_min     900602 non-null  float64
 2   x_realigned_max     900602 non-null  float64
 3   x_realigned_median  900602 non-null  float64
 4   y_realigned_min     900602 non-null  float64
 5   y_realigned_max     900602 non-null  float64
 6   y_realigned_median  900602 non-null  float64
 7   min_hit_radius      900602 non-null  float64
 8   max_hit_radius      900602 non-null  float64
 9   mean_hit_radius     900602 non-null  float64
 10  median_hit_radius   900602 non-null  float64
 11  rms_hit_radius      900602 non-null  float64
 12  momentum            900602 non-null  float64
 13  label               900602 non-null  float64
 14  ring_radius_cal     900602 non-null  float64
 15  track_pos_x         900602 non-nul

In [34]:
grouped_hit_data.describe()

Unnamed: 0,event,x_realigned_min,x_realigned_max,x_realigned_median,y_realigned_min,y_realigned_max,y_realigned_median,min_hit_radius,max_hit_radius,mean_hit_radius,...,y_min_sum,x_max_sum,y_max_sum,x_min_diff,y_min_diff,x_max_diff,y_max_diff,max_x-min_x,max_y-min_y,num_hits
count,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,...,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0,900602.0
mean,450356.234069,-296.975266,43.507897,-142.537454,-185.275735,156.487075,-20.042686,149.391772,219.922764,182.869971,...,-130.137827,-2.219669,103.889386,-247.159938,108.688064,-6.525422,-138.094416,340.483163,341.76281,17.247942
std,260008.89198,84.272724,70.376959,85.921143,83.788914,76.762505,87.434871,21.903477,30.38017,9.294614,...,96.158227,91.605983,88.719766,94.292675,92.405989,82.874152,95.979273,60.470456,58.764819,5.983424
min,0.0,-493.7,-457.7,-475.7,-315.979993,-300.389996,-300.389996,0.468424,57.126125,35.117506,...,-315.979993,-475.7,-315.979993,-493.7,-315.979993,-475.7,-315.979993,0.0,0.0,1.0
25%,225184.25,-367.7,-7.7,-196.7,-258.919998,89.320003,-82.149998,141.306831,206.458261,178.029189,...,-206.859998,-61.7,42.549998,-322.7,42.549998,-61.7,-212.149994,315.0,322.069997,13.0
50%,450360.5,-290.8,51.2,-138.2,-191.270001,167.259998,-17.295,151.321513,215.794863,183.635207,...,-128.920003,6.2,104.909999,-236.8,115.209999,1.3,-144.509999,355.9,358.529999,17.0
75%,675528.75,-227.8,105.2,-83.8,-118.620003,224.330002,42.549998,162.224746,224.433838,187.976924,...,-50.98,69.2,177.559998,-173.8,182.849994,60.2,-66.57,378.0,374.129997,21.0
max,900699.0,150.2,150.2,150.2,255.5,286.679993,263.294998,407.138399,510.473803,453.741037,...,255.5,150.2,286.679993,150.2,286.679993,150.2,271.089996,643.9,602.659985,89.0


In [36]:
# grouped_hit_data.to_csv('data_0/grouped_hit_data.csv')