# Creating balanced data

- **Purpose**: The purpose of this notebook is to demonstrate the balanced data generation process.
- **Author**: Shiv Jena
- **module used**: `dataset.balance_data.py`

In [1]:
cd ../

/home/shiv/richai


In [2]:
# importing module
from dataset.balance_data import *
from utils.helpers import get_config

## Changes in `configs/config.yaml` to create new save paths

The original source path is the key in the `train` unit, and the save path is the value. In order to securely create balanced dataset without overwriting the original data, it is suggested that the save path be changed as shown below.
>```
    dataset:
          delta: 0.5 #pointnet: 0.5, dgcnn: 0.3
    train:
          /data/bvelghe/capstone2022/B/2018B.Sample.EOSlist.CTRL.p.v2.0.4_f.v2.0.4_patched.h5:
              /fast_scratch_1/capstone_2022/combined_datasets/B.2018B_copy.h5
          /data/bvelghe/capstone2022/C/2018E.EOSlist.CTRL_patched.h5:
              /fast_scratch_1/capstone_2022/datasetC_combined_copy.h5
>```

In [3]:
%%time
# Creating and saving balanced data using balance_data()
balance_data(dset_path = get_config("dataset.train").items())

CPU times: user 12 s, sys: 1min 13s, total: 1min 25s
Wall time: 1min 20s


In [4]:
# defining original and sample file paths for matching
path_original_b = '/fast_scratch_1/capstone_2022/combined_datasets/B.2018B.h5'
path_sample_b = '/fast_scratch_1/capstone_2022/combined_datasets/B.2018B_copy.h5'
path_original_c = '/fast_scratch_1/capstone_2022/datasetC_combined.h5'
path_sample_c = '/fast_scratch_1/capstone_2022/datasetC_combined.h5'

## Testing for Original and Sample B

In [5]:
# Comparing original and sample files of B dataset
df_b_org = pd.read_hdf(path_original_b)
df_b_sample = pd.read_hdf(path_sample_b)

In [6]:
df_b_org

Unnamed: 0,run_id,burst_id,event_id,track_id,track_momentum,chod_time,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,original_index,momentum-bin
0,8584,11,467162,0,20.545891,26.191343,172.459305,-187.848038,-163.433380,7.914828e-19,1.000000,4.462974e-15,0,313820,15-25
1,8585,644,90178,0,18.684219,23.152292,166.742630,-148.628265,-150.348267,1.216099e-37,1.000000,4.858268e-37,0,346918,15-25
2,8594,396,839550,0,19.536823,21.162600,166.987915,-63.194630,82.889381,1.216099e-37,1.000000,1.216099e-37,0,647962,15-25
3,8596,1116,499071,0,19.072475,-4.647523,165.470795,-50.019455,-39.349384,1.216099e-37,1.000000,1.216099e-37,0,862768,15-25
4,8584,812,242733,2,17.260925,29.883760,162.446198,-120.595657,125.187935,1.216099e-37,1.000000,1.216099e-37,0,228333,15-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205391,8597,11,1134060,0,39.990234,9.208418,178.909622,-12.610793,-4.642335,1.000000e+00,0.018242,1.016588e-08,1,1183344,35-45
205392,8597,11,1185901,0,42.756977,20.629349,180.795471,-149.515915,-49.405823,1.000000e+00,0.013767,4.464561e-10,1,1183345,35-45
205393,8597,11,1485217,0,43.556774,7.807468,183.350784,-137.691193,61.226639,1.000000e+00,0.008168,1.698273e-10,1,1183346,35-45
205394,8597,11,1492138,0,35.692371,16.451027,180.316269,-38.615685,-61.617775,1.000000e+00,0.130762,4.901688e-08,1,1183347,35-45


In [7]:
df_b_sample

Unnamed: 0,run_id,burst_id,event_id,track_id,track_momentum,chod_time,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,original_index,momentum-bin
0,8600,505,1255898,0,22.134642,7.573945,171.025391,-21.651001,0.106048,1.674560e-13,1.000000,1.456128e-28,0,95271,15-25
1,8584,471,900821,0,16.544842,14.156938,158.820816,-81.064362,62.893135,1.216099e-37,1.000000,1.216099e-37,0,267471,15-25
2,8586,520,1157037,0,22.175705,3.747281,173.562256,-68.062523,-85.532005,4.248587e-27,1.000000,8.202211e-29,0,479092,15-25
3,8596,311,1008844,0,23.371750,10.589095,175.663101,-168.908798,151.677185,4.882544e-22,1.000000,4.632577e-19,0,947047,15-25
4,8585,321,1865296,0,21.112137,9.583098,171.693588,-27.746183,-19.747335,9.201775e-30,1.000000,5.465181e-35,0,382234,15-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205391,8597,11,1134060,0,39.990234,9.208418,178.909622,-12.610793,-4.642335,1.000000e+00,0.018242,1.016588e-08,1,1183344,35-45
205392,8597,11,1185901,0,42.756977,20.629349,180.795471,-149.515915,-49.405823,1.000000e+00,0.013767,4.464561e-10,1,1183345,35-45
205393,8597,11,1485217,0,43.556774,7.807468,183.350784,-137.691193,61.226639,1.000000e+00,0.008168,1.698273e-10,1,1183346,35-45
205394,8597,11,1492138,0,35.692371,16.451027,180.316269,-38.615685,-61.617775,1.000000e+00,0.130762,4.901688e-08,1,1183347,35-45


In [8]:
# Comparing sample and original for label==1 (pions)
df_b_sample_pions = df_b_sample.query('label==1')
df_b_org_pions = df_b_org.query('label==1')
df_b_sample_pions.equals(df_b_org_pions)

True

**Conclusion**: This shows two datasets are equal for pion entries. Muon entries could also be equal, if generated with random seed.

## Testing for Original and Sample C

In [9]:
# Comparing original and sample files of B dataset
df_c_org = pd.read_hdf(path_original_c)
df_c_sample = pd.read_hdf(path_sample_c)

In [10]:
df_c_org

Unnamed: 0,run_id,burst_id,event_id,track_id,track_momentum,chod_time,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,original_index,momentum-bin
0,9011,887,1800376,0,24.691673,16.837833,177.461960,-60.107861,106.794441,5.328789e-10,1.000000,9.393904e-21,0,2687658,15-25
1,8992,1059,770241,0,23.396029,9.284823,175.237564,-12.228768,11.319837,1.080940e-22,1.000000,6.127901e-07,0,4676657,15-25
2,9007,477,1922012,0,15.891424,22.654669,152.091827,-241.150375,-172.708389,1.216099e-37,1.000000,1.216099e-37,0,3262259,15-25
3,8987,1295,1634792,0,16.026428,23.254803,155.005783,-65.635712,-0.601753,1.216099e-37,1.000000,1.216099e-37,0,5473943,15-25
4,8971,654,2010374,0,21.888950,24.337191,172.810287,-231.750687,159.534821,4.307081e-14,1.000000,7.490480e-21,0,8071459,15-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731221,8968,7,563388,0,35.011139,16.212551,177.854492,-213.486633,-2.936464,1.000000e+00,0.000043,2.887881e-20,1,9567636,35-45
1731222,8968,7,572378,0,38.958267,19.714800,182.430420,-98.754082,92.138535,1.000000e+00,0.146916,3.528800e-07,1,9567637,35-45
1731223,8968,7,583135,0,38.474777,13.620609,176.691132,-145.546219,-77.260124,1.000000e+00,0.001099,5.301323e-11,1,9567638,35-45
1731224,8968,7,1461058,0,36.989967,14.916398,180.464676,-186.481369,-47.899498,1.000000e+00,0.004252,8.353033e-16,1,9567646,35-45


In [11]:
df_c_sample

Unnamed: 0,run_id,burst_id,event_id,track_id,track_momentum,chod_time,ring_radius,ring_centre_pos_x,ring_centre_pos_y,ring_likelihood_pion,ring_likelihood_muon,ring_likelihood_positron,label,original_index,momentum-bin
0,9011,887,1800376,0,24.691673,16.837833,177.461960,-60.107861,106.794441,5.328789e-10,1.000000,9.393904e-21,0,2687658,15-25
1,8992,1059,770241,0,23.396029,9.284823,175.237564,-12.228768,11.319837,1.080940e-22,1.000000,6.127901e-07,0,4676657,15-25
2,9007,477,1922012,0,15.891424,22.654669,152.091827,-241.150375,-172.708389,1.216099e-37,1.000000,1.216099e-37,0,3262259,15-25
3,8987,1295,1634792,0,16.026428,23.254803,155.005783,-65.635712,-0.601753,1.216099e-37,1.000000,1.216099e-37,0,5473943,15-25
4,8971,654,2010374,0,21.888950,24.337191,172.810287,-231.750687,159.534821,4.307081e-14,1.000000,7.490480e-21,0,8071459,15-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731221,8968,7,563388,0,35.011139,16.212551,177.854492,-213.486633,-2.936464,1.000000e+00,0.000043,2.887881e-20,1,9567636,35-45
1731222,8968,7,572378,0,38.958267,19.714800,182.430420,-98.754082,92.138535,1.000000e+00,0.146916,3.528800e-07,1,9567637,35-45
1731223,8968,7,583135,0,38.474777,13.620609,176.691132,-145.546219,-77.260124,1.000000e+00,0.001099,5.301323e-11,1,9567638,35-45
1731224,8968,7,1461058,0,36.989967,14.916398,180.464676,-186.481369,-47.899498,1.000000e+00,0.004252,8.353033e-16,1,9567646,35-45


In [12]:
# Comparing both dataframes
df_c_sample.equals(df_c_org)

True

**Conclusion**: This shows two datasets are identical.