In [1]:
from trackml import dataset, randomize, score, weights

import numpy as np
import pandas as pd

# hits, cells, particles, truth = dataset.load_event_particles('/home/ec2-user/SageMaker/efs/dataset/train/')
import time
from collections import defaultdict


In [7]:

# CONSTANTS
# Event processed by this notebook.
EVENT_ID = 'event000001000'

# Path templates; `{event_id}` is substituted per event via str.format().
hits_file_template = '/home/ec2-user/SageMaker/efs/dataset/train/{event_id}-hits.csv'
truth_file_template = '/home/ec2-user/SageMaker/efs/dataset/train/{event_id}-truth.csv'
hit_orders_template = '/home/ec2-user/SageMaker/efs/particles-in-order/{event_id}-hit_orders.csv'

# create_graph() reads the output template under exactly this name; the earlier
# spelling (`graph_output_forfile_template`) left it undefined on a fresh kernel.
graph_output_file_template = '/home/ec2-user/SageMaker/efs/graph-data/{event_id}_graph_data.csv'
# Old spelling kept as an alias so any existing reference keeps working.
graph_output_forfile_template = graph_output_file_template


def _link_consecutive_hits(hits_df, truth_df, hit_orders_df):
    """Pair each ordered hit with the following hit of the same particle.

    Keyword arguments:
    hits_df -- frame with `hit_id`, `volume_id`, `layer_id` and `module_id` columns
    truth_df -- frame with `hit_id` and `particle_id` columns
    hit_orders_df -- frame with `particle_id`, `hit_id` and `hit_order` columns

    Returns a frame with one row per (hit, following hit) pair: the
    (volume_id, layer_id, module_id) triple of the first hit in the `*_1`
    columns, that of the following hit in the `*_2` columns, the shared
    `particle_id`, the `hit_order` of the first hit, and `edge_weight` == 1.
    """
    module_cols = ["volume_id", "layer_id", "module_id"]

    # Attach particle_id (truth) and hit_order to every hit, then keep only
    # the columns needed to build edges.
    ordered_hits = (
        pd.merge(hits_df, truth_df, on=['hit_id'])
        .merge(hit_orders_df, on=['particle_id', 'hit_id'])
    )[module_cols + ["particle_id", "hit_order"]]

    edge_sources = ordered_hits.rename(columns={col: col + "_1" for col in module_cols})
    # Lowering hit_order by one on the target side makes hit k+1 line up with
    # hit k of the same particle in the join below.
    edge_targets = (
        ordered_hits
        .rename(columns={col: col + "_2" for col in module_cols})
        .assign(hit_order=lambda frame: frame["hit_order"] - 1)
    )

    edges = pd.merge(edge_sources, edge_targets, on=['particle_id', 'hit_order'])
    # Every matched pair contributes a weight of 1; weights are summed per
    # module pair by the caller.
    edges['edge_weight'] = 1
    return edges


def create_graph(event_id, hits_file_template, truth_file_template, hit_orders_template,
                 graph_output_template=None):
    """Create a graph from the provided event_id and write it to disk.

    Reads the hits, truth and hit-order CSV files of the event, links every
    hit to the next hit of the same particle, sums the resulting edge weights
    per (volume, layer, module) pair and writes that table out through
    write_df_to_csv_default_location(). Prints the elapsed wall-clock time and
    the un-aggregated edge table. Returns None.

    Keyword arguments:
    event_id -- the event_id to be used for making a graph
    hits_file_template -- the file path template in the EFS for the hits_file
    truth_file_template -- the file path template in the EFS for the truth file
    hit_orders_template -- the file path template in the EFS for the hit_order file (manually generated)
    graph_output_template -- the file path template for the output graph file;
        when omitted, the module-level `graph_output_file_template` is used
    """

    print("=========Start=========")
    # perf_counter() measures wall-clock time, so time spent waiting on the
    # CSV reads is included (process_time() counts CPU time only).
    start_time = time.perf_counter()

    # Resolve the output template before any file is read, so a missing
    # template fails immediately rather than after all the joins.
    if graph_output_template is None:
        graph_output_template = graph_output_file_template

    # Step 1 -- load the three per-event input files
    hits_df = pd.read_csv(hits_file_template.format(event_id=event_id))
    truth_df = pd.read_csv(truth_file_template.format(event_id=event_id))
    hit_orders_df = pd.read_csv(hit_orders_template.format(event_id=event_id))

    # Step 2 -- one row per pair of consecutive hits of a particle
    hits_truth_orders_join_particle_hit_id = _link_consecutive_hits(hits_df, truth_df, hit_orders_df)

    # Step 3 -- total edge weight per (source module, target module) pair
    cols_to_join = ["volume_id_1", "layer_id_1", "module_id_1", "volume_id_2", "layer_id_2", "module_id_2"]
    output_df = hits_truth_orders_join_particle_hit_id.groupby(cols_to_join)['edge_weight'].sum()

    # Step 4 -- write to csv
    graph_output_file_name = graph_output_template.format(event_id=event_id)
    write_df_to_csv_default_location(output_df, graph_output_file_name)

    end_time = time.perf_counter()
    print("Time taken for event {event_id}: ".format(event_id=event_id) + str(end_time - start_time))

    print(hits_truth_orders_join_particle_hit_id)

    
def write_df_to_csv_default_location(df, file_name):
    """Save a pandas object as tab-separated, UTF-8 encoded text.

    Keyword arguments:
    df -- the dataframe (or any object exposing `to_csv`) to save
    file_name -- destination path handed straight to `to_csv`
    """
    # Despite the "csv" in the name, fields are separated by tabs; the header
    # row is always emitted.
    export_options = {"sep": '\t', "encoding": 'utf-8', "header": True}
    df.to_csv(file_name, **export_options)


# Build and write the graph for the configured event.
create_graph(
    event_id=EVENT_ID,
    hits_file_template=hits_file_template,
    truth_file_template=truth_file_template,
    hit_orders_template=hit_orders_template,
)


Time taken for event event000001000: 0.6046939839999999
       volume_id_1  layer_id_1  module_id_1         particle_id  hit_order  \
0                7           2            2    4513426512543744         11   
1                7           2            2  819655956815151104         13   
2                7           2            2  418837307966095360         11   
3                7           2            2  761110055012532224         11   
4                7           2            2  801644925560029184          9   
5                7           2            2  211690073207341056         12   
6                7           2            3  139619285029879808         11   
7                7           2            3  225187609230442496         10   
8                7           2            3  445865090483224576         11   
9                7           2            4  418835796137607168          9   
10               7           2            4  810658172128722944         16   
11      