## Appendix A: Read raw data, uniform sample, and create df_hit_data

In [1]:
import h5py 
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import glob
import warnings

In [2]:
# import functions from a local py file
from helpers import events_to_pandas
from helpers import calc_ring_radius
from helpers import get_string_label
from helpers import get_hit_info_df_rev_0
from helpers import compute_seq_id
from helpers import uniform_sampling

#### 1.0: Read raw data and remove irrelevant columns and rows

In [3]:
# Open the raw HDF5 file supplied by the TRIUMF team for the MDS 2023 capstone
# project. The handle is opened read-only and kept open because later cells
# pass `f` to the helper functions.
f = h5py.File(name='data/CaloRICH_Run11100_CTRL_v1.h5', mode='r')
f

<HDF5 file "CaloRICH_Run11100_CTRL_v1.h5" (mode r)>

In [4]:
# Collect the names of the top-level members stored in the HDF5 file.
datasets = [member_name for member_name in f.keys()]
datasets

['Events', 'HitMapping', 'Hits']

In [5]:
# Read the event dataset from the open HDF5 handle into a pandas DataFrame
# using the project helper, then preview the first rows.
# NOTE(review): the exact columns are determined by events_to_pandas() in
# helpers.py, which is not shown here.
df = events_to_pandas(f)
df.head()

Unnamed: 0,run_id,burst_id,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,first_hit,last_hit,total_hits
0,11100,1468,11235,0,22.761024,7.789327,-195.126602,-153.930786,172.41095,-190.112167,-154.579514,2.083958e-12,1.0,1.914225e-24,0,0,20,20
1,11100,1468,11812,0,23.600529,0.000198,-316.197571,-57.275291,175.251694,-309.305939,-54.84441,7.73201e-10,1.0,8.275685999999999e-19,0,20,39,19
2,11100,1468,14104,0,16.275131,11.789481,-88.681786,58.657421,155.040802,-88.66584,59.053833,1.216099e-37,1.0,1.216099e-37,0,39,65,26
3,11100,1468,14634,1,36.436443,7.426493,-39.124882,81.853058,185.832642,-35.864372,80.70858,0.008006046,1.0,0.03626212,0,65,117,52
4,11100,1468,18030,0,16.525362,8.923427,-66.697784,-15.932317,158.641846,-65.450981,-20.469883,1.216099e-37,1.0,1.216099e-37,0,117,141,24


In [6]:
# Shrink df by removing identifier columns that are not needed downstream.
# event_id is the only id column kept, because it is used to map events to
# the hit data.
df = df.drop(columns=['run_id', 'burst_id'])
df.head()

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,first_hit,last_hit,total_hits
0,11235,0,22.761024,7.789327,-195.126602,-153.930786,172.41095,-190.112167,-154.579514,2.083958e-12,1.0,1.914225e-24,0,0,20,20
1,11812,0,23.600529,0.000198,-316.197571,-57.275291,175.251694,-309.305939,-54.84441,7.73201e-10,1.0,8.275685999999999e-19,0,20,39,19
2,14104,0,16.275131,11.789481,-88.681786,58.657421,155.040802,-88.66584,59.053833,1.216099e-37,1.0,1.216099e-37,0,39,65,26
3,14634,1,36.436443,7.426493,-39.124882,81.853058,185.832642,-35.864372,80.70858,0.008006046,1.0,0.03626212,0,65,117,52
4,18030,0,16.525362,8.923427,-66.697784,-15.932317,158.641846,-65.450981,-20.469883,1.216099e-37,1.0,1.216099e-37,0,117,141,24


In [7]:
# Count how many events carry each particle label.
df.label.value_counts()

0    2160219
1     215955
2      28515
Name: label, dtype: int64

In [8]:
# Positrons (label 2) are out of scope for now, so remove those rows and
# re-check the remaining label counts.
df = df.drop(index=df.loc[df['label'] == 2].index)
df['label'].value_counts()

0    2160219
1     215955
Name: label, dtype: int64

In [9]:
# Shrink df further: these columns are outputs of the current NA62 algorithm
# (fitted ring geometry and per-particle likelihoods) and are not used here.
df = df.drop(
    columns=[
        'ring_radius',
        'ring_centre_pos_x',
        'ring_centre_pos_y',
        'ring_likelihood_pion',
        'ring_likelihood_muon',
        'ring_likelihood_positron',
    ]
)
df.head()

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,label,first_hit,last_hit,total_hits
0,11235,0,22.761024,7.789327,-195.126602,-153.930786,0,0,20,20
1,11812,0,23.600529,0.000198,-316.197571,-57.275291,0,20,39,19
2,14104,0,16.275131,11.789481,-88.681786,58.657421,0,39,65,26
3,14634,1,36.436443,7.426493,-39.124882,81.853058,0,65,117,52
4,18030,0,16.525362,8.923427,-66.697784,-15.932317,0,117,141,24


In [10]:
# Keep only events whose track_momentum lies inside the closed window [20, 45]:
# first remove everything below the lower bound, then everything above the
# upper bound, and summarise what is left.
df = df.drop(index=df.loc[df['track_momentum'] < 20].index)
df = df.drop(index=df.loc[df['track_momentum'] > 45].index)
df.describe()

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,label,first_hit,last_hit,total_hits
count,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0,1552842.0
mean,1348888.0,0.11434,33.06397,12.36835,-105.8141,0.7665038,0.1161445,50041590.0,50041630.0,42.34876
std,736480.7,0.346462,6.647698,8.560482,81.51135,83.55496,0.3203982,29282740.0,29282740.0,23.27619
min,9983.0,0.0,20.0,-24.38476,-344.7701,-171.7946,0.0,0.0,20.0,5.0
25%,731090.0,0.0,27.76289,5.854976,-174.3368,-76.60971,0.0,24307990.0,24308040.0,25.0
50%,1339128.0,0.0,33.18998,12.43226,-97.40569,1.570445,0.0,49991020.0,49991050.0,38.0
75%,1920988.0,0.0,38.51507,19.01669,-29.83885,78.53598,0.0,75744040.0,75744090.0,55.0
max,2823886.0,9.0,44.99998,49.30447,16.13446,171.7478,1.0,99397030.0,99397080.0,379.0


In [11]:
# Attach the particle mass (MeV/c^2) for each event, as required by
# calc_ring_radius() (see that function in the TRIUMF repo for details):
# label 0 gets 105.66, every other remaining label gets 139.57.
df['mass'] = np.where(df['label'] == 0, 105.66, 139.57)

In [12]:
# Compute the theoretical ring radius and store it as ring_radius_cal, in mm
# so that it is comparable with ring_radius from the event dataset.
# Unit handling:
#   * track_momentum * 1000 converts GeV/c to MeV/c before the call
#   * result * 1000 converts the returned radius from m to mm
df = df.assign(
    ring_radius_cal=calc_ring_radius(df['mass'], df['track_momentum'] * 1000) * 1000
)

In [13]:
# Rows were removed above, so renumber them 0..n-1; the old labels are
# discarded rather than kept as a column.
df = df.reset_index(level=None, drop=True)
df.head(n=5)

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,label,first_hit,last_hit,total_hits,mass,ring_radius_cal
0,11235,0,22.761024,7.789327,-195.126602,-153.930786,0,0,20,20,105.66,173.43763
1,11812,0,23.600529,0.000198,-316.197571,-57.275291,0,20,39,19,105.66,174.649395
2,14634,1,36.436443,7.426493,-39.124882,81.853058,0,65,117,52,105.66,184.053395
3,18340,0,34.514111,0.684758,-206.711899,66.719482,0,141,190,49,105.66,183.284136
4,25603,0,41.155918,20.199379,-152.587402,78.473106,0,190,232,42,105.66,185.510338


In [14]:
# 'mass' was only needed to compute ring_radius_cal; remove it again.
df = df.drop(columns='mass')
df.head()

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,label,first_hit,last_hit,total_hits,ring_radius_cal
0,11235,0,22.761024,7.789327,-195.126602,-153.930786,0,0,20,20,173.43763
1,11812,0,23.600529,0.000198,-316.197571,-57.275291,0,20,39,19,174.649395
2,14634,1,36.436443,7.426493,-39.124882,81.853058,0,65,117,52,184.053395
3,18340,0,34.514111,0.684758,-206.711899,66.719482,0,141,190,49,183.284136
4,25603,0,41.155918,20.199379,-152.587402,78.473106,0,190,232,42,185.510338


#### 2.0: Uniformly sample the data

In [15]:
# Draw the working sample from the cleaned event table using the project
# helper. The displayed result carries an extra `momentum_bin` column.
# NOTE(review): uniform_sampling() lives in helpers.py and is not shown here;
# presumably it samples evenly across momentum bins. Confirm, and confirm
# whether it is seeded so that the sample is reproducible.
df_sample = uniform_sampling(df)
df_sample

Unnamed: 0,event_id,track_id,track_momentum,chod_time,track_pos_x,track_pos_y,label,first_hit,last_hit,total_hits,ring_radius_cal,momentum_bin
0,2155500,0,20.778881,13.055243,-58.201771,-22.614531,1,96059752,96059769,17,152.531027,"(20, 21]"
1,2477694,0,20.048994,10.540932,-201.855209,164.464264,0,57291267,57291283,16,168.160161,"(20, 21]"
2,1173336,0,20.881472,20.406666,-47.515766,63.944908,0,85098090,85098108,18,170.029766,"(20, 21]"
3,2612686,0,20.109674,6.305044,-81.963531,-90.920662,0,63719162,63719188,26,168.333388,"(20, 21]"
4,2479300,0,20.928585,6.572837,-92.437553,116.696861,0,53813387,53813408,21,170.144083,"(20, 21]"
...,...,...,...,...,...,...,...,...,...,...,...,...
900695,1483227,0,44.295181,20.453075,-130.977203,75.762321,0,34630754,34630844,90,186.192080,"(44, 45]"
900696,763612,0,44.697300,1.890692,-114.865143,82.479019,0,54776649,54776741,92,186.264814,"(44, 45]"
900697,2484287,0,44.018299,24.262856,2.594048,-22.045708,0,49145037,49145086,49,186.136482,"(44, 45]"
900698,1475065,0,44.554790,4.770075,-7.214058,48.099037,0,12805241,12805270,29,186.261281,"(44, 45]"


In [16]:
# Optional checkpoint (currently disabled): uncomment the next line to write
# the sampled events to disk.
# df_sample.to_csv("data_0/df_sample.csv")

In [17]:
%%time
hit_data = [get_hit_info_df_rev_0(f, df_sample, index) for index in df_sample.index]

CPU times: user 42min 29s, sys: 1min 21s, total: 43min 51s
Wall time: 44min 47s


In [18]:
%%time
df_hit_data = pd.concat(hit_data, axis=0, ignore_index=True)

CPU times: user 1min 49s, sys: 5min 28s, total: 7min 17s
Wall time: 9min 57s


In [19]:
%%time
# Optional checkpoint (currently disabled): uncomment the next line to write
# the full, unfiltered hit table to disk.
# NOTE(review): the timing recorded under this cell looks like it came from a
# run with the write enabled; confirm before relying on it.
# df_hit_data.to_csv("data_0/df_hit_data.csv")

CPU times: user 3min 8s, sys: 6.63 s, total: 3min 15s
Wall time: 3min 23s


In [20]:
# Preview the combined hit table before filtering.
df_hit_data

Unnamed: 0,x,y,mirror,x_realigned,y_realigned,hit_time,chod_time,chod_delta,label,event,momentum,track_pos_x,track_pos_y,ring_radius_cal,event_id
0,45.0,-249.419998,0.0,-101.8,-269.219998,13.163244,13.055243,0.108001,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
1,-27.0,-93.529999,0.0,-173.8,-113.329999,13.077265,13.055243,0.022021,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
2,-45.0,62.349998,0.0,-191.8,42.549998,13.421317,13.055243,0.366074,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
3,54.0,140.300003,0.0,-92.8,120.500003,13.255676,13.055243,0.200433,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
4,243.0,-31.180000,0.0,96.2,-50.980000,12.519780,13.055243,-0.535463,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38058006,198.0,-15.590000,1.0,1.3,-25.090000,8.470926,8.657274,-0.186348,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058007,-153.0,-93.529999,1.0,-349.7,-103.029999,8.188066,8.657274,-0.469208,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058008,72.0,-171.470001,1.0,-124.7,-180.970001,8.767607,8.657274,0.110332,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058009,-126.0,46.770000,1.0,-322.7,37.270000,8.683517,8.657274,0.026242,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557


In [21]:
# Keep only in-time hits: rows whose |chod_delta| is at most 0.5.
# NOTE(review): the 0.5 threshold is a fixed cut; its unit is presumably the
# same as hit_time / chod_time. Confirm.
in_time_mask = df_hit_data['chod_delta'].abs() <= 0.5
df_hit_data = df_hit_data[in_time_mask]

In [22]:
# Shrink the hit table by removing columns that are no longer needed: the raw
# hit position and mirror flag, and the timing columns already used by the
# in-time filter.
df_hit_data = df_hit_data.drop(
    columns=['x', 'y', 'mirror', 'hit_time', 'chod_time', 'chod_delta']
)
df_hit_data

Unnamed: 0,x_realigned,y_realigned,label,event,momentum,track_pos_x,track_pos_y,ring_radius_cal,event_id
0,-101.8,-269.219998,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
1,-173.8,-113.329999,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
2,-191.8,42.549998,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
3,-92.8,120.500003,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
5,87.2,-35.390000,1,0,20.778881,-58.201771,-22.614531,152.531027,2155500
...,...,...,...,...,...,...,...,...,...
38058006,1.3,-25.090000,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058007,-349.7,-103.029999,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058008,-124.7,-180.970001,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557
38058009,-322.7,37.270000,0,900699,44.893456,-171.597046,-0.669099,186.290323,778557


In [23]:
# Renumber the rows 0..n-1 after the in-time filter. With drop=False (the
# pandas default) the previous row labels are kept in a new 'index' column.
# NOTE(review): the event-level table earlier in this notebook was reset with
# drop=True; confirm the extra 'index' column is actually wanted here.
df_hit_data = df_hit_data.reset_index(drop=False)