## Size of dataset

#### Training

In [1]:
from dataset.rich_dataset import combine_datset

# Accumulate entry counts over every dataset of the "train" split.
# Per-particle counts are the gaps between consecutive offsets:
# muon -> pion, pion -> positron, positron -> entries.
input_size = muon_size = pion_size = positron_size = 0
for _, dataset in combine_datset("train").items():
    offsets = dataset.offsets
    total, muon_at, pion_at, positron_at = (
        offsets[key] for key in ("entries", "muon", "pion", "positron")
    )
    input_size += total
    muon_size += pion_at - muon_at
    pion_size += positron_at - pion_at
    positron_size += total - positron_at

# One call, one line per counter.
print(
    f"Input Size: {input_size}",
    f"Muon Size: {muon_size}",
    f"Pion Size: {pion_size}",
    f"Positron Size: {positron_size}",
    sep="\n",
)


2022-05-18 23:32:36,682  INFO      Train directories: ['/data/bvelghe/capstone2022/B', '/data/bvelghe/capstone2022/C']
2022-05-18 23:32:36,685  INFO      data_version: 8565aa47-4e6a-4acd-b1c7-453688daa0c7
2022-05-18 23:32:36,686  INFO      description: NA62 RICH PID dataset - MDS Capstone project - Fri 22 Apr 2022 10:15:02 AM <bob.velghe@triumf.ca>
2022-05-18 23:32:36,687  INFO      entries: 1194703
2022-05-18 23:32:36,687  INFO      muon_offset: 0
2022-05-18 23:32:36,688  INFO      pion_offset: 1066606
2022-05-18 23:32:36,689  INFO      positron_offset: 1183353
2022-05-18 23:32:36,749  INFO      hit map size: 9557632 bytes
2022-05-18 23:32:36,751  INFO      Offsets: {'entries': 1194703, 'muon': 0, 'pion': 1066606, 'positron': 1183353}
2022-05-18 23:32:36,752  INFO      Entries: 1194703
2022-05-18 23:32:36,753  INFO      Muons start at index: 0
2022-05-18 23:32:36,753  INFO      Pions start at index: 1066606
2022-05-18 23:32:36,754  INFO      Positron start at index: 1183353
2022-05-18

Input Size: 10857472
Muon Size: 9650101
Pion Size: 1100902
Positron Size: 106469


#### Testing

In [2]:
from dataset.rich_dataset import combine_datset

# Accumulate entry counts over every dataset of the "test" split.
# Per-particle counts are the gaps between consecutive offsets:
# muon -> pion, pion -> positron, positron -> entries.
input_size = muon_size = pion_size = positron_size = 0
for _, dataset in combine_datset("test").items():
    offsets = dataset.offsets
    total, muon_at, pion_at, positron_at = (
        offsets[key] for key in ("entries", "muon", "pion", "positron")
    )
    input_size += total
    muon_size += pion_at - muon_at
    pion_size += positron_at - pion_at
    positron_size += total - positron_at

# One call, one line per counter.
print(
    f"Input Size: {input_size}",
    f"Muon Size: {muon_size}",
    f"Pion Size: {pion_size}",
    f"Positron Size: {positron_size}",
    sep="\n",
)


2022-05-18 23:32:37,895  INFO      Train directories: ['/data/bvelghe/capstone2022/A']
2022-05-18 23:32:37,898  INFO      data_version: 397411f5-71f4-49ab-b876-7960946ec1e9
2022-05-18 23:32:37,898  INFO      description: NA62 RICH PID dataset - MDS Capstone project - Fri 22 Apr 2022 10:28:31 AM <bob.velghe@triumf.ca>
2022-05-18 23:32:37,899  INFO      entries: 181847
2022-05-18 23:32:37,900  INFO      muon_offset: 0
2022-05-18 23:32:37,901  INFO      pion_offset: 162478
2022-05-18 23:32:37,902  INFO      positron_offset: 180080
2022-05-18 23:32:37,916  INFO      hit map size: 1454784 bytes
2022-05-18 23:32:37,918  INFO      Offsets: {'entries': 181847, 'muon': 0, 'pion': 162478, 'positron': 180080}
2022-05-18 23:32:37,918  INFO      Entries: 181847
2022-05-18 23:32:37,919  INFO      Muons start at index: 0
2022-05-18 23:32:37,920  INFO      Pions start at index: 162478
2022-05-18 23:32:37,920  INFO      Positron start at index: 180080
2022-05-18 23:32:37,924  INFO      hit array mmap s

Input Size: 706497
Muon Size: 631200
Pion Size: 68654
Positron Size: 6643


## Pion efficiency and precision

In [3]:
import pandas as pd

# Load the saved predictions. The code below relies on two columns,
# `labels` (true class) and `predictions` (predicted class).
df = pd.read_csv('saved_models/predictions.csv')

# Flag the rows where the predicted class equals the true class.
df['correct'] = df['predictions'] == df['labels']

# Class code that identifies pions in both columns.
PION_CLASS = 1

# Efficiency: among rows labelled as pion, the fraction predicted correctly.
# The mean of the boolean flag divides by the number of selected rows, so the
# result does not depend on the numeric value of the class code (summing the
# label column only equals the row count when that code happens to be 1).
pion = df[df['labels'] == PION_CLASS]
pion_efficiency = pion['correct'].mean()

# Precision: among rows predicted as pion, the fraction that really are pions.
predicted_pion = df[df['predictions'] == PION_CLASS]
pion_precision = predicted_pion['correct'].mean()

print(f"Pion efficiency: {pion_efficiency:.2f}")
print(f"Pion precision: {pion_precision:.2f}")

Pion efficiency: 0.97
Pion prescision: 0.89


**Pion precision:** Of all particles predicted as pions, what fraction actually are pions? High precision corresponds to a low false-positive rate.

**Pion efficiency:** Of all particles labelled as pions, what fraction were correctly predicted as pions?

## Movement of particles

In [4]:
# Replace the integer class codes in `labels` and `predictions` with particle
# names: 0 -> muon, 1 -> pion, 2 -> positron.
CLASS_NAMES = {0: 'muon', 1: 'pion', 2: 'positron'}


def _class_name(code):
    """Return the particle name for a class code; any other value is kept as is."""
    return CLASS_NAMES.get(code, code)


# Build the renamed columns with `map` instead of writing strings into the
# integer columns through `.loc`, which is an incompatible-dtype assignment.
# Values that are not class codes (including names produced by an earlier run
# of this cell) pass through unchanged, so re-running the cell is safe.
df = df.assign(
    labels=df['labels'].map(_class_name),
    predictions=df['predictions'].map(_class_name),
)

# movement: rows are true labels, columns are predicted labels, cells are counts.
# fill_value=0 keeps integer counts when a (label, prediction) pair never occurs.
mov_df = df.groupby(["labels", "predictions"]).size().unstack(fill_value=0)
mov_df

predictions,muon,pion,positron
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
muon,628954,2149,97
pion,1988,66612,54
positron,199,6437,7


In [5]:
# movement percentages: divide each row of counts by its row total, so every
# row shows how the particles of one true label were distributed over the
# predicted labels.
mov_pct_df = mov_df.div(mov_df.sum(axis=1), axis=0).round(4) * 100

# Format every cell as a percentage string. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas releases.
mov_pct_df = mov_pct_df.apply(lambda col: col.map(lambda x: f"{x:.2f}%"))
mov_pct_df



predictions,muon,pion,positron
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
muon,99.64%,0.34%,0.02%
pion,2.90%,97.03%,0.08%
positron,3.00%,96.90%,0.11%
