In [1]:
from pprint import pprint
import pandas as pd
import json
import os 
import sys
import velopix_tracking as velopix 

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
from py_modules.validation_to_datasets import output_distributions, output_aggregates

In [2]:
events = []
json_events = []
n_files = 100

for i in range(0, n_files):
    if i == 51:
        continue
    else:
        event_file = open(os.path.join("../DB/raw", f"velo_event_{i}.json"))
        json_data = json.loads(event_file.read())
        events.append(velopix.Event(json_data))
        json_events.append(json_data)
        event_file.close()

In [3]:
wrapper = [] # this wrapper is required because we want to eval multiple set of parameters which can be added into a list
solver = velopix.TrackFollowing(max_slopes=(0.1, 0.1), max_tolerance=(0.1, 0.1), max_scatter=0.1)
tracks = solver.solve_parallel(events=events)
results = velopix.validate_to_json_nested(json_events, tracks, verbose=False)
results["parameters"] = {
        "max_slopes": (0.1, 0.1),
        "max_tolerance": (0.1, 0.1),
        "scatter": 0.1
    }
wrapper.append(results)

In [4]:
pprint(wrapper, indent=4)

[   {   'categories': [   {   'avg_hiteff': 0.0,
                              'avg_purity': 0.0,
                              'avg_recoeff': 57.65765765765766,
                              'clone_percentage': 7.8125,
                              'hit_eff_percentage': 90.89633735248711,
                              'label': 'long_fromb',
                              'n_clones': 20,
                              'n_particles': 444,
                              'n_reco': 256,
                              'purityT': 99.21207264957266,
                              'recoeffT': 57.65765765765766},
                          {   'avg_hiteff': 100.0,
                              'avg_purity': 100.0,
                              'avg_recoeff': 93.10344827586206,
                              'clone_percentage': 4.320987654320987,
                              'hit_eff_percentage': 94.51195747053735,
                              'label': 'long_strange>5GeV',
                           

In [5]:
overall , catagory, event = output_distributions(wrapper)

### Lets review data size generated

In [9]:
track_size = sys.getsizeof(tracks)
output_raw = sys.getsizeof(wrapper) + sys.getsizeof(results)
db1 = sys.getsizeof(overall)
db2 = sys.getsizeof(catagory)
db3 = sys.getsizeof(event)

print("{:<10} {:>10}".format("Variable", "Byte Size"))
print("{:<10} {:>10}".format("--------", "---------"))
print("{:<10} {:>10}".format("tracks", track_size))
print("{:<10} {:>10}".format("wrapper", output_raw))
print("{:<10} {:>10}".format("overall", db1))
print("{:<10} {:>10}".format("catagory", db2))
print("{:<10} {:>10}".format("event", db3))

Variable    Byte Size
--------    ---------
tracks            848
wrapper           360
overall           364
catagory         2202
event            6340


## Data Generation Analysis

We have **99 events** (excluding event **51**, which is corrupted) and **1 run**.  
For this analysis, we assume that there are no differences in data generation between runs and that a sample size of **99** is representative of average data collection.

### Estimation of Data Generation

We estimate the data generation of each method/operation using the following equation:

```math
n_{\text{events}} \times n_{\text{runs}} \times C_V = \text{bytes}
```

where $C_V$ is a constant for each variable/operation. This constant can be derived by rewriting the equation as:

```math
C_V = \frac{\text{bytes}}{99}
```

### Calculated Values for $C_V$

| Variable  | $C_V$  |
|-----------|--------:|
| tracks    | **8.57**  |
| wrapper   | **3.64**  |
| overall   | **3.68**  |
| category  | **22.24** |
| event     | **64.04** |
| **Total DB output** | **89.96** |

This table represents the constant $C_V$ values calculated for different variables.
