In [3]:
from trackml import dataset, randomize, score, weights

import numpy as np
import pandas as pd

# https://dynalist.io/d/wBsMA-9ua_pIDikOtGhR_vT-

# hits, cells, particles, truth = dataset.load_event_particles('/home/ec2-user/SageMaker/efs/dataset/train/')
import time
from collections import defaultdict

from helper_functions.file_utilities import file_url


In [10]:
# Path template for the per-event hit-order CSV; `{event_id}` is substituted via str.format.
hit_orders_template = '/home/ec2-user/SageMaker/efs/particles-in-order/{event_id}-hit_orders.csv'

# Preview the first rows of the hit-order table for event000001000
# (columns shown in the output: particle_id, hit_id, hit_order).
pd.read_csv(hit_orders_template.format(event_id='event000001000')).head()

Unnamed: 0,particle_id,hit_id,hit_order
0,4503668346847232,20880,1
1,4503668346847232,29323,2
2,4503668346847232,35621,3
3,4503668346847232,42238,4
4,4503668346847232,73763,5


In [13]:
# Preview the first rows of the hits table.
# NOTE(review): file_url presumably maps ('hits', 1000) to the hits CSV path of
# event 1000 -- confirm in helper_functions.file_utilities.
pd.read_csv(file_url('hits', 1000)).head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.4099,-7.1637,-1502.5,7,2,1
1,2,-55.3361,0.635342,-1502.5,7,2,1
2,3,-83.8305,-1.14301,-1502.5,7,2,1
3,4,-96.1091,-8.24103,-1502.5,7,2,1
4,5,-62.6736,-9.3712,-1502.5,7,2,1


In [12]:
# Preview the first rows of the truth table.
# NOTE(review): file_url presumably maps ('truth', 1000) to the truth CSV path of
# event 1000 -- confirm in helper_functions.file_utilities.
pd.read_csv(file_url('truth', 1000)).head()

Unnamed: 0,hit_id,particle_id,tx,ty,tz,tpx,tpy,tpz,weight
0,1,0,-64.4116,-7.16412,-1502.5,250710.0,-149908.0,-956385.0,0.0
1,2,22525763437723648,-55.3385,0.630805,-1502.5,-0.570605,0.02839,-15.4922,1e-05
2,3,0,-83.828,-1.14558,-1502.5,626295.0,-169767.0,-760877.0,0.0
3,4,297237712845406208,-96.1229,-8.23036,-1502.5,-0.225235,-0.050968,-3.70232,8e-06
4,5,418835796137607168,-62.6594,-9.37504,-1502.5,-0.281806,-0.023487,-6.57318,9e-06


In [None]:
# Empty edge table holding the source (`*_1`) and target (`*_2`) module identifier columns.
# The dtype must be a plain scalar dtype: pandas raises NotImplementedError when a
# compound (structured) NumPy dtype such as np.dtype([('str', 'float')]) is passed to
# the DataFrame constructor. int64 is used because volume/layer/module ids are integers
# in the hits table previewed above -- TODO confirm this matches the intended schema.
pd.DataFrame(
    columns=["volume_id_1", "layer_id_1", "module_id_1", "volume_id_2", "layer_id_2", "module_id_2"],
    dtype=np.int64,
)

In [16]:

# Notebook-level constant: identifier of the event a graph is built for.
EVENT_ID = 'event000001000'

#def create_graph():
    

def create_event_graph(
    event_id,
    hits_file_template='/home/ec2-user/SageMaker/efs/dataset/train/{event_id}-hits.csv',
    truth_file_template='/home/ec2-user/SageMaker/efs/dataset/train/{event_id}-truth.csv',
    hit_orders_template='/home/ec2-user/SageMaker/efs/particles-in-order/{event_id}-hit_orders.csv',
    graph_output_file_template='/home/ec2-user/SageMaker/efs/graph-data/{event_id}_graph_data.csv',
):
    """Build the module-to-module edge-weight table for one event and write it to disk.

    For every particle, each pair of consecutive hits (hit_order k and k + 1)
    contributes one edge from the (volume_id, layer_id, module_id) of hit k
    (columns suffixed `_1`) to that of hit k + 1 (columns suffixed `_2`).
    Identical edges are counted, and the count is stored as `edge_weight`.
    The result is printed and written through write_df_to_csv_default_location.

    All joins are inner joins (the pd.merge default), so hits without a matching
    truth row or hit-order row do not contribute edges.

    Keyword arguments:
    event_id -- the event_id to be used for making a graph
    hits_file_template -- path template for the hits file; `{event_id}` is substituted
    truth_file_template -- path template for the truth file; `{event_id}` is substituted
    hit_orders_template -- path template for the hit_order file (manually generated);
                           `{event_id}` is substituted
    graph_output_file_template -- path template for the output file; `{event_id}` is substituted

    Returns None.
    """

    print("=========Start=========")
    # perf_counter measures elapsed wall-clock time; process_time would only count CPU
    # time and therefore leave out time spent waiting on file reads and writes.
    start_time = time.perf_counter()

    # Step 0: resolve the concrete file paths for this event
    hits_file = hits_file_template.format(event_id=event_id)
    truth_file = truth_file_template.format(event_id=event_id)
    hit_orders_file = hit_orders_template.format(event_id=event_id)
    graph_output_file_name = graph_output_file_template.format(event_id=event_id)

    # Step 1: load the three input tables
    hits_df = pd.read_csv(hits_file)
    truth_df = pd.read_csv(truth_file)
    hit_orders_df = pd.read_csv(hit_orders_file)

    # Step 2: attach the truth columns (including particle_id) to every hit
    hits_truth_df = pd.merge(hits_df, truth_df, on=['hit_id'])

    # Step 3: attach each hit's position along its particle's track
    hits_truth_orders_df = pd.merge(hits_truth_df, hit_orders_df, on=['particle_id', 'hit_id'])

    # Step 4: keep only the module identifier plus the join keys
    module_cols = ["volume_id", "layer_id", "module_id"]
    ordered_hits = hits_truth_orders_df[module_cols + ["particle_id", "hit_order"]]

    # Step 5: build the "source" (`_1`) and "target" (`_2`) views of the same hits.
    # rename/assign return new frames, so no explicit copy is needed.
    edge_sources = ordered_hits.rename(columns={col: col + "_1" for col in module_cols})
    edge_targets = ordered_hits.rename(columns={col: col + "_2" for col in module_cols})

    # Step 6: shift the target's hit_order down by one so that hit k + 1 lines up
    # with hit k of the same particle in the join below.
    edge_targets = edge_targets.assign(hit_order=edge_targets['hit_order'] - 1)

    # Step 7: inner join on (particle_id, hit_order) -> one row per consecutive hit pair
    edges = pd.merge(edge_sources, edge_targets, on=['particle_id', 'hit_order'])

    # Step 8: every hit pair counts once
    edges['edge_weight'] = 1

    # Step 9: sum the weights per (source module, target module) pair
    cols_to_join = ["volume_id_1", "layer_id_1", "module_id_1", "volume_id_2", "layer_id_2", "module_id_2"]
    output_df = edges.groupby(cols_to_join)['edge_weight'].sum()

    # Step 10: write the result to disk
    write_df_to_csv_default_location(output_df, graph_output_file_name)

    end_time = time.perf_counter()
    print("Time taken for event {event_id}: ".format(event_id=event_id) + str(end_time - start_time))

    print(output_df)


def write_df_to_csv_default_location(df, file_name):
    """Write a dataframe (or series) to `file_name` as UTF-8 encoded, tab-separated text.

    Note that the delimiter is a tab character even though callers in this notebook
    use a `.csv` file name. The header row is written, and so is the index
    (pandas' default), which is where the grouping keys of the edge-weight series live.

    Keyword arguments:
    df -- the dataframe to write
    file_name -- the file name to write it out.
    """
    df.to_csv(file_name, sep='\t', encoding='utf-8', header=True)


# Build the graph for the configured event: writes the edge-weight table to disk
# and prints the elapsed time and the resulting table.
create_event_graph(EVENT_ID)


Time taken for event event000001000: 0.6196934239999998
volume_id_1  layer_id_1  module_id_1  volume_id_2  layer_id_2  module_id_2
7            2           2            12           2           1              2
                                                               168            1
                                                   4           1              1
                                                               5              1
                                                   6           1              1
                         3            7            2           5              2
                                      12           4           1              1
                         4            7            2           1              4
                                                               6              2
                         5            12           4           1              1
                                                               5     