In [110]:
import numpy as np
import pandas as pd
from time import time


def file_url(category, event_id=None, train_or_test="train"):
    """Build the absolute csv path for a data category (and, usually, an event).

    Arguments:
    category -- the data category, e.g. "cells", "hits", "particles", "truth", "hit_orders",
        "detectors", "sample_submission", or any name starting with "blacklist"
        (such as "blacklist_hits").
    event_id -- integer id of the event. Needed for every category except "detectors" and
        "sample_submission"; leaving it as None for an event-specific category makes the
        zero-padded ``{:09d}`` formatting raise a TypeError.
    train_or_test -- name of the sub-folder of "dataset/" holding the event files
        ("train" by default). Ignored for blacklist, hit_orders, detectors and
        sample_submission paths.

    TODO: Check for valid input.
    """
    # These two files exist once for the whole dataset, so no event id appears in the path.
    event_independent = ('sample_submission', 'detectors')
    if category in event_independent:
        return '/home/ec2-user/SageMaker/efs/dataset/{0}.csv'.format(category)

    # Every other category lives in a per-event file; only the folder differs.
    if category.startswith('blacklist'):
        subdir = 'dataset/blacklist'
    elif category == 'hit_orders':
        subdir = 'particles-in-order'
    else:
        subdir = 'dataset/' + train_or_test

    template = '/home/ec2-user/SageMaker/efs/{0}/event{1:09d}-{2}.csv'
    return template.format(subdir, event_id, category)


def generate_hit_orders(event_id):
    """Build the table of hits, ordered along each particle's track, for one event.

    The truth, blacklist_particles and blacklist_hits csv files of ``event_id`` are read
    (paths come from ``file_url`` with its default folder). Rows are then filtered, each
    remaining hit gets a 1-based ``hit_order`` within its particle, and particles whose
    ordering cannot be confirmed from their hit weights are dropped.

    Arguments:
    event_id -- the integer id of the event to process.

    Returns:
    A DataFrame with columns ``particle_id``, ``hit_id`` and ``hit_order``, sorted by
    particle_id then hit_order, with a fresh 0..n-1 index.

    Side effects:
    Prints the load time, the number of particles that were scored, the number and
    percentage of particles that were successfully placed in order, and the processing time.
    """
    start = time()
    # Load the truth table and both blacklists for the requested event.
    truth = pd.read_csv(file_url('truth', event_id))
    blacklist_particles = pd.read_csv(file_url('blacklist_particles', event_id))
    blacklist_hits = pd.read_csv(file_url('blacklist_hits', event_id))
    print('time to load data:', time() - start)
    start = time()

    # Filter out track 0 (garbage track), tracks with three or fewer hits, and rows with
    # blacklisted hits or particles. Track length is counted on the unfiltered table.
    hits_per_particle = truth.groupby('particle_id')['particle_id'].transform('count')
    keep = (
        (truth.particle_id != 0)
        & ~truth.particle_id.isin(blacklist_particles.particle_id)
        & ~truth.hit_id.isin(blacklist_hits.hit_id)
        & (hits_per_particle > 3)
    )
    del blacklist_particles, blacklist_hits, hits_per_particle

    # Explicit copy: new columns are added below, and writing into a filtered view would
    # trigger pandas' SettingWithCopy behaviour.
    truth = truth.loc[keep, ['particle_id', 'hit_id', 'tz', 'tpz', 'weight']].copy()
    del keep

    # weight_order: each hit's share of its particle's total weight.
    truth['weight_order'] = truth.weight / truth.groupby('particle_id').weight.transform('sum')

    # z_order_dim is tz if the particle's mean z-momentum is positive and -tz otherwise,
    # so that ranking it ascending follows the direction of travel.
    z_direction = np.sign(truth.groupby('particle_id').tpz.transform('mean'))
    truth['z_order_dim'] = z_direction * truth.tz
    del z_direction

    # hit_order: 1-based position of the hit within its particle (ties keep file order).
    truth['hit_order'] = truth.groupby('particle_id').z_order_dim.rank(
        method='first',
        ascending=True
    ).astype(int)

    truth = truth.sort_values(['particle_id', 'hit_order'])

    # Reference weight for each (track length, position): the median over all particles.
    truth['track_length'] = truth.groupby('particle_id').hit_id.transform('count')
    expected_weight_order = truth.groupby(
        ['track_length', 'hit_order']
    ).weight_order.transform('median')

    # A particle counts as correctly ordered only if every one of its hits matches the
    # reference weight for its position. Computed for all hits at once, so no per-particle
    # helper and no module-level state are needed.
    hit_matches = np.isclose(
        truth.weight_order.values, expected_weight_order.values,
        atol=1e-06
    )
    del expected_weight_order

    total_num_particles = len(truth.particle_id.unique())
    out_of_order = truth.loc[~hit_matches, 'particle_id'].unique()
    truth = truth.loc[
        ~truth.particle_id.isin(out_of_order),
        ['particle_id', 'hit_id', 'hit_order']
    ].reset_index(drop=True)
    num_good_particles = len(truth.particle_id.unique())

    # Avoid a ZeroDivisionError when no particle survives the filters.
    if total_num_particles:
        percentage_sorted = 100*num_good_particles/total_num_particles
    else:
        percentage_sorted = float('nan')

    print('total number of scored particles in event:\t', total_num_particles)
    print('number of successfully sorted particles:\t', num_good_particles)
    print('percentage of partices successfully sorted\t:',
          percentage_sorted)

    print('time to prep dataframe', time() - start)
    return truth

def correct_order(particle, true_weight_order=None):
    """Tell whether one particle's hit weights match the reference weights for its length.

    Written as a per-group callback, e.g. for ``DataFrame.groupby('particle_id').apply``.

    Arguments:
    particle -- DataFrame holding the rows of a single particle, with a ``weight_order``
        column. Values are compared positionally, so the rows must be in the same order
        as the reference values.
    true_weight_order -- reference values, indexable with ``.loc[track_length]`` to obtain
        one value per hit position (for instance a Series with a
        (track_length, hit_order) MultiIndex). If omitted, a module-level variable named
        ``true_weight_order`` is used, which keeps the original one-argument call working.

    Returns:
    True if every ``weight_order`` value is within tolerance (atol=1e-06 plus numpy's
    default relative tolerance) of the reference value at the same position.

    Raises:
    NameError -- if ``true_weight_order`` is neither passed nor defined at module level.
    """
    if true_weight_order is None:
        # Legacy behaviour: read the reference table from module scope.
        try:
            true_weight_order = globals()['true_weight_order']
        except KeyError:
            raise NameError(
                "name 'true_weight_order' is not defined; pass it as the second argument"
            ) from None
    return np.all(
        np.isclose(
            particle.weight_order.values, true_weight_order.loc[len(particle)].values,
            atol=1e-06
        )
    )
    
    

In [111]:
# Build the ordered-hit table for event 1000 and preview its first rows.
truth = generate_hit_orders(1000)
truth.head()

time to load data: 0.2171158790588379
total number of scored particles in event:	 9272
number of successfully sorted particles:	 9252
percentage of partices successfully sorted	: 99.78429680759275
time to prep dataframe 6.21956992149353


Unnamed: 0,particle_id,hit_id,hit_order
0,4503668346847232,20880,1
1,4503668346847232,29323,2
2,4503668346847232,35621,3
3,4503668346847232,42238,4
4,4503668346847232,73763,5


In [112]:
# Preview the first 30 rows of the ordered-hit table.
truth.head(30)

Unnamed: 0,particle_id,hit_id,hit_order
0,4503668346847232,20880,1
1,4503668346847232,29323,2
2,4503668346847232,35621,3
3,4503668346847232,42238,4
4,4503668346847232,73763,5
5,4503668346847232,80881,6
6,4503668346847232,87265,7
7,4503668346847232,113565,8
8,4503737066323968,23346,1
9,4503737066323968,31387,2


In [73]:
# NOTE(review): `valid_particles` is not defined in any visible cell, and this cell's
# execution count (73) is lower than the cells above it. It presumably relies on kernel
# state from earlier, since-removed cells; it will fail on Restart & Run All. Confirm the
# source of `valid_particles` or delete this cell.
# Drops the 'level_0' column from valid_particles in place.
valid_particles.drop('level_0', axis=1, inplace=True)

In [79]:
# NOTE(review): `valid_particles` is not defined in any visible cell and this cell's
# execution count (79) is out of sequence; it presumably depends on leftover kernel
# state. Confirm where the frame comes from or delete this cell.
# valid_particles.reset_index(inplace=True)
# Sort in place by hit_order, then weight_order, and show the shape of the array of
# distinct particle ids (i.e. how many distinct particles there are).
valid_particles.sort_values(by=['hit_order', 'weight_order'], inplace=True)
valid_particles.particle_id.unique().shape

(100821,)