# Data Preparation Script

This script prepares data for the framework: it converts events from the `trackml` format into a format that the framework can read.

In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from trackml.dataset import load_event
import os
import pandas as pd
import sys
sys.path.insert(1, './../../')
from utils import *
import json

In [None]:
# Location the converted arrays and the extrema file are written to.
parsed_data_dir = (
    "/Users/zefwolffs/Documents/phd/tracking/trackformers/data/experiment_3d_noisy-100k-events-50-to-100-helical-tracks/parsed_data_memeff_1m"
)

# Event numbers to convert (the end of the range is exclusive).
eventrange = range(21100, 21102)

In [None]:
df_all = None

# Running coordinate bounds over all events in `eventrange`.
# They remain None when `eventrange` yields no events.
x_min = x_max = None
y_min = y_max = None
z_min = z_max = None

for event_number in eventrange:
    # Only the hits table is needed to find the coordinate bounds.
    hits_event = load_event(f'./../../data/trackml/train_sample/event0000{event_number}', parts=["hits"])[0]
    # Keep only hits recorded in volumes 7, 8 and 9.
    hits_event = hits_event.loc[hits_event['volume_id'].isin([7, 8, 9])]

    # Per-event bounds for each axis.
    x_lo, x_hi = hits_event["x"].min(), hits_event["x"].max()
    y_lo, y_hi = hits_event["y"].min(), hits_event["y"].max()
    z_lo, z_hi = hits_event["z"].min(), hits_event["z"].max()

    # Widen the running bounds wherever this event exceeds them.
    x_max = x_hi if x_max is None or x_hi > x_max else x_max
    x_min = x_lo if x_min is None or x_lo < x_min else x_min
    y_max = y_hi if y_max is None or y_hi > y_max else y_max
    y_min = y_lo if y_min is None or y_lo < y_min else y_min
    z_max = z_hi if z_max is None or z_hi > z_max else z_max
    z_min = z_lo if z_min is None or z_lo < z_min else z_min

In [3]:
# Normalizer built from the coordinate bounds found over all events.
norm = Normalizer(x_min, x_max, y_min, y_max, z_min, z_max)

# The same bounds, keyed by dimension, kept so they can be stored with the data.
extrema = {
    "dim_1_min": x_min,
    "dim_1_max": x_max,
    "dim_2_min": y_min,
    "dim_2_max": y_max,
    "dim_3_min": z_min,
    "dim_3_max": z_max,
}

In [5]:
# Convert every event in `eventrange` into fixed-size float16 arrays:
#   tracks          (n_tracks, max_size_tracks + 1, 3)  one zero-padded row block per particle
#   hits            (n_events, max_size_hits, 3)        one zero-padded row block per event
#   track_event_map (n_tracks,)                         row index into `hits` for each track
max_size_tracks = 20
max_size_hits = 65000

tracks = np.zeros((0, max_size_tracks + 1, 3), dtype=np.float16)
hits = np.zeros((0, max_size_hits, 3), dtype=np.float16)
track_event_map = np.zeros((0), dtype=np.int32) # Event id per track

# Index of the next row in `hits`; only advanced for events that are actually stored,
# so `track_event_map` always points at the matching `hits` row.
event_i = 0

for event_number in eventrange:
    hits_event, truth_event = load_event(f'./../../data/trackml/train_sample/event0000{event_number}', parts=["hits", "truth"])
    df_event = pd.merge(hits_event, truth_event, on="hit_id")
    del hits_event, truth_event
    # .copy() so the column assignments below write to an independent frame, not a slice.
    df_event = df_event.loc[df_event['volume_id'].isin([7, 8, 9])].copy()

    # Skip oversized events BEFORE anything is appended, so tracks, hits and
    # track_event_map stay consistent with each other.
    n_hits_event = len(df_event)
    if n_hits_event > max_size_hits:
        print(f"event {event_number} skipped: {n_hits_event} hits exceed max_size_hits={max_size_hits}.")
        del df_event
        continue

    df_event["x"], df_event["y"], df_event["z"] = norm.normalize(df_event["x"], df_event["y"], df_event["z"])

    # One group per particle, in order of first appearance; particle id 0 is excluded
    # (the original code dropped it as well).
    track_groups = df_event[df_event["particle_id"] != 0].groupby("particle_id", sort=False)
    n_tracks_event = track_groups.ngroups
    tracks_event = np.zeros((n_tracks_event, max_size_tracks + 1, 3), dtype=np.float16) # n_tracks, n_hits_per_track, n_dims_per_hit

    for i, (_, track_df) in enumerate(track_groups):
        track_coords = track_df[["x", "y", "z"]].to_numpy(dtype=np.float16)
        n_track_hits = track_coords.shape[0]
        if n_track_hits > max_size_tracks:
            raise ValueError(
                f"event {event_number}: a track has {n_track_hits} hits, "
                f"more than max_size_tracks={max_size_tracks}"
            )
        # Row 0 holds the start token, rows 1..n the hits, remaining rows stay zero.
        tracks_event[i, 1:n_track_hits + 1, :] = track_coords
        tracks_event[i, 0, :] = [0, 0, 0.5]
        # NOTE(review): the end token is written at index n, which is the row of the
        # last hit, so that hit is overwritten. Kept as in the original; confirm
        # whether index n + 1 (and a larger track array) was intended.
        tracks_event[i, n_track_hits, :] = [0.1,0,0.5]

    hits_event = df_event[["x", "y", "z"]].to_numpy(dtype=np.float16)
    hits_event = np.expand_dims(hits_event, axis=0)
    hits_event = np.pad(hits_event, ((0, 0), (0, max_size_hits - hits_event.shape[1]), (0, 0)), mode='constant', constant_values=0)

    hits = np.concatenate((hits, hits_event), axis=0)
    tracks = np.concatenate((tracks, tracks_event), axis=0)
    # Explicit int32 keeps the declared dtype instead of letting it be promoted.
    track_event_map = np.concatenate((track_event_map, np.full(n_tracks_event, event_i, dtype=np.int32)))
    event_i += 1

    print(hits.dtype, hits.nbytes)
    print(tracks.dtype, tracks.nbytes)

    print(f"event {event_number} done.")
    del hits_event, tracks_event, df_event

float16 390000
float16 1000944
event 21100 done.
float16 780000
float16 1763874
event 21101 done.


In [None]:
# Write the extrema and the converted arrays into `parsed_data_dir`.
# Paths are joined with os.path.join: plain string concatenation would glue the
# file name onto the directory name when it has no trailing separator.
os.makedirs(parsed_data_dir, exist_ok=True)

with open(os.path.join(parsed_data_dir, "extrema.json"), 'w') as fp:
    # The extrema may be NumPy scalars (e.g. float32), which json cannot serialize;
    # convert them to plain Python floats and leave unset (None) values as they are.
    json.dump({key: None if value is None else float(value) for key, value in extrema.items()}, fp)

# np.save appends the ".npy" extension to each of these names.
np.save(os.path.join(parsed_data_dir, "tracks"), tracks)
np.save(os.path.join(parsed_data_dir, "hits"), hits)
np.save(os.path.join(parsed_data_dir, "track_event_map"), track_event_map)