In [None]:
'''
This notebook primarily serves to plot the tracks, the hits, their respective parameters, and the track-to-hit matches.
It should be used to verify that the data being processed and used for machine learning is accurate.
'''

## Load Serialized Data into Dataframes

In [2]:
import matplotlib.colors
import matplotlib.pyplot as plt
import msgpack
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D  # also registers the '3d' projection on older Matplotlib

In [26]:
%%time

# Deserialize the four msgpack files back into their dataframes.

# Paths of the serialized dataframes, listed in the order they are unpacked below
filenames_ = [
    'data/track_global_df_.msgpack',
    'data/track_param_global_df_.msgpack',
    'data/rechit_global_df_.msgpack',
    'data/rechit_param_global_df_.msgpack',
]

(track_global_df_,
 track_param_global_df_,
 rechit_global_df_,
 rechit_param_global_df_) = map(pd.read_msgpack, filenames_)


CPU times: user 1.97 s, sys: 91.3 ms, total: 2.06 s
Wall time: 2.07 s


In [27]:
# Preview the first rows rather than rendering the whole frame
rechit_param_global_df_.head(10)

Unnamed: 0,rechit_x,rechit_y,rechit_z,rechit_id,rechit_r,rechit_phi,rechit_eta
0,24.046999,2.173970,-3.617893,0,24.145067,0.090160,-0.149285
1,24.142433,1.838912,-44.622871,1,24.212366,0.076022,-1.371125
2,23.276819,7.080809,-3.526887,2,24.329985,0.295306,-0.144458
3,21.779072,9.699214,-3.829539,3,23.841198,0.418977,-0.159944
4,22.871218,7.792418,-44.648659,4,24.162251,0.328373,-1.373454
5,21.230831,10.659569,-44.980103,5,23.756571,0.465310,-1.394909
6,20.300842,13.080436,-3.615110,6,24.149990,0.572379,-0.149141
7,19.974371,13.423154,-24.236652,7,24.065672,0.591699,-0.886389
8,20.039923,13.354204,-44.692486,8,24.081804,0.587804,-1.377252
9,18.096556,15.394129,-44.975178,9,23.758463,0.704881,-1.394741


## Place the Cuts on Rechits => eta in [-0.9, 0.9]

In [28]:
# Keep only the rechits whose eta lies in [-0.9, 0.9] (both bounds inclusive)
eta_in_window_ = rechit_param_global_df_['rechit_eta'].between(-0.9, 0.9)
rechit_param_global_df_cut_ = rechit_param_global_df_[eta_in_window_]

# Report the surviving fraction relative to the same frame the cut was applied to
num_hits_cut_ = len(rechit_param_global_df_cut_)
num_hits_all_ = len(rechit_param_global_df_)
fraction_remaining_ = float(num_hits_cut_) / num_hits_all_ if num_hits_all_ else 0.0
print("%d of %d %s hits remain" % (num_hits_cut_, num_hits_all_, fraction_remaining_))

# Use the list of remaining rechits to filter the global rechit index
# TODO: Move this cell to a location before the tracks are matched to rechits
# Thus you automatically match only the hits that are valid given the cut

# NOTE: positional lookup -- this relies on rechit_id being equal to the row position
rechit_global_df_cut_ = rechit_global_df_.iloc[rechit_param_global_df_cut_['rechit_id']]
print("%d entries in global rechit df" % len(rechit_global_df_cut_))

# Hash map {rechit_id: index label in the parameter frame} for the surviving rechits.
# Membership tests against this dict are cheaper than searching the dataframe each time.
uncut_rechit_id_map_ = dict(zip(rechit_param_global_df_cut_['rechit_id'],
                                rechit_param_global_df_cut_.index))

# Check that the ID ordering of the global rechit frame still agrees with the
# parameter frame after dropping the rechits that have been cut
for x, y in zip(rechit_global_df_cut_['rechit_id'], rechit_param_global_df_cut_['rechit_id']):
    if x != y:
        print("Rechit ID Mismatch:  %s %s" % (x, y))

152630 of 600347 0.254236300006 hits remain
152630 entries in global rechit df


## Visualize some of the tracks

In [29]:
# solve for a and b
def best_fit(X, Y):
    """Ordinary least-squares fit of the straight line ``y = a + b * x``.

    Parameters
    ----------
    X, Y : sequences of numbers
        Coordinates of the points to fit; both must have the same length.

    Returns
    -------
    (a, b) : tuple of float
        Intercept ``a`` and slope ``b`` of the fitted line.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length (``zip`` would otherwise silently
        drop the surplus points while the means still used all of them).
    ZeroDivisionError
        If ``X`` is empty or all x values are identical (no finite slope).
    """
    n = len(X)
    if n != len(Y):
        raise ValueError("X and Y must have the same length")

    # float() keeps the means exact for integer inputs under Python 2,
    # where int / int truncates.
    xbar = float(sum(X)) / n
    ybar = float(sum(Y)) / n

    numer = sum(xi * yi for xi, yi in zip(X, Y)) - n * xbar * ybar
    denum = sum(xi ** 2 for xi in X) - n * xbar ** 2

    b = numer / denum
    a = ybar - b * xbar

    return a, b

In [None]:
% matplotlib inline
# Plot the tracks that have been matched to rechits (WITHOUT THE CUT)

# Set a criteria to filter only tracks matched to actual hits
criteria_ = [(len(rechit_id_list_) > 0) for rechit_id_list_ in track_global_df_['rechit_ids']]
track_matched_hit_ids_ = track_global_df_[criteria_]['rechit_ids']

# Verify that you have selected the correct tracks with matched rechits
assert len(track_matched_hit_ids_) == len(track_global_df_[track_global_df_['match_count'] > 0]), \
"Incorrect tracks selected; re-check count of tracks matched to rechits"
def plot_filtered_tracks(key, tracks_begin_=10, tracks_end_=45):
    """Plot matched rechits and a best-fit line per track, before and after the rechit cut.

    The left panel uses every rechit matched to a track; the right panel uses
    only the matched rechits whose id is a key of ``uncut_rechit_id_map_``.

    Reads the notebook-level names ``track_matched_hit_ids_``,
    ``rechit_param_global_df_``, ``uncut_rechit_id_map_``, ``best_fit`` and ``plt``.

    Parameters
    ----------
    key : pair of str
        ``(x_column, y_column)`` names in ``rechit_param_global_df_``,
        e.g. ``('rechit_x', 'rechit_y')``.
    tracks_begin_, tracks_end_ : int
        Half-open positional range into ``track_matched_hit_ids_``; the end is
        clamped to the number of matched tracks.

    Raises
    ------
    ValueError
        If ``key`` is not a pair of column names.
    """
    if isinstance(key, str) or len(key) != 2:
        raise ValueError("key must be a (x_column, y_column) pair of column names")
    key_x, key_y = key

    # The range must not run past the number of tracks actually matched to hits
    tracks_end_ = min(tracks_end_, len(track_matched_hit_ids_))

    fig_, (ax_all_, ax_cut_) = plt.subplots(1, 2, figsize=[20, 10])
    panels_ = [(ax_all_, False, 'All matched rechits'),
               (ax_cut_, True, 'Matched rechits surviving the cut')]

    for ax_, apply_cut_, title_ in panels_:
        for i in range(tracks_begin_, tracks_end_):
            matched_ids_ = list(track_matched_hit_ids_.iloc[i])
            if apply_cut_:
                matched_ids_ = [id_ for id_ in matched_ids_ if id_ in uncut_rechit_id_map_]

            # A straight-line fit needs at least two points
            if len(matched_ids_) <= 1:
                print("Skipped all rechits in track %s" % i)
                continue

            # One positional lookup per track instead of one per hit and column
            hits_ = rechit_param_global_df_.iloc[matched_ids_]
            matched_hit_x_ = hits_[key_x].tolist()
            matched_hit_y_ = hits_[key_y].tolist()
            ax_.scatter(matched_hit_x_, matched_hit_y_, s=5)

            sorted_x_, sorted_y_ = zip(*sorted(zip(matched_hit_x_, matched_hit_y_)))
            try:
                a, b = best_fit(sorted_x_, sorted_y_)
            except ZeroDivisionError:
                # All hits share one x value: no finite-slope line, keep the scatter only
                continue
            yfit = [a + b * xi for xi in sorted_x_]
            ax_.plot(sorted_x_, yfit, label=i)  # Best-fit line visualization

        ax_.set_title(title_)
        ax_.set_xlabel(key_x)
        ax_.set_ylabel(key_y)
        ax_.legend()

In [None]:
% matplotlib inline

# Plot the tracks that have been matched to rechits (WITHOUT THE CUT)

# Set a criteria to filter only tracks matched to hits
criteria_ = [(len(rechit_id_list_) > 0) for rechit_id_list_ in track_global_df_['rechit_ids']]
track_matched_hit_ids_ = track_global_df_[criteria_]['rechit_ids']

# Verify that you have selected the correct tracks with matched rechits
assert len(track_matched_hit_ids_) == len(track_global_df_[track_global_df_['match_count'] > 0]), \
"Incorrect tracks selected; re-check count of tracks matched to rechits"

fig_ = plt.figure()
ax_ = Axes3D(fig_)

# Defined in previous cell
tracks_begin_, tracks_end_ = 0, 18
# TODO: Select tracks by event_id
event_id_ = 15

# Num_tracks should be less thatn the number of actual tracks matched to hits

for i in range(tracks_begin_, tracks_end_):
    matched_hit_x_ = []
    matched_hit_y_ = []
    matched_hit_z_ = []
    for matched_id_ in track_matched_hit_ids_.iloc[i]:
        matched_hit_x_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_r']) 
        matched_hit_y_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_eta']) 
        matched_hit_z_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_phi'])   
    sorted_x_, sorted_y_, sorted_z_ = zip(*sorted(zip(matched_hit_x_, matched_hit_y_, matched_hit_z_)))
    ax_.scatter3D(matched_hit_x_, matched_hit_y_, matched_hit_z_, s=5)
    #ax_.plot(sorted_x_, sorted_y_, sorted_z_, label=i)# Better visualization
plt.xlabel('RechitR')
plt.ylabel('RechitEta')
ax_.set_zlabel('RechitPhi')
plt.legend()
# ax_.scatter3D(concat_simhit_x_, concat_simhit_y_, concat_simhit_z_, s=0.6)

fig_ = plt.figure()
ax_ = Axes3D(fig_)

for i in range(tracks_begin_, tracks_end_):
    matched_hit_x_ = []
    matched_hit_y_ = []
    matched_hit_z_ = []
    for matched_id_ in track_matched_hit_ids_.iloc[i]:
        if matched_id_ in uncut_rechit_id_map_:
            matched_hit_x_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_r']) 
            matched_hit_y_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_eta'])
            matched_hit_z_.append(rechit_param_global_df_.iloc[matched_id_]['rechit_phi'])   
    if len(matched_hit_x_) <= 1:
        print "Skipped all rechits in track", i
        continue
    sorted_y_, sorted_x_, sorted_z_ = zip(*sorted(zip(matched_hit_y_, matched_hit_x_, matched_hit_z_)))
    # REMOVED Y AXIS ENTIRELY (R-PHI) -------------------------------------------------------------------------
    ax_.scatter(matched_hit_x_, matched_hit_y_, matched_hit_z_, s=5, label=i)
    #ax_.plot(sorted_x_, sorted_y_, sorted_x_, sorted_z_, label=i)# Better visualization
plt.xlabel('RechitEta')
plt.ylabel('RechitR')
ax_.set_zlabel('RechitPhi')
plt.legend()

## Place the cuts on tracks by Eta and Pt


In [None]:
%%time
# To check the % of tracks remaining after the pt cut alone, replace the mask
# below with: track_param_global_df_['track_pt'] <= 10

# Track acceptance: -0.9 <= eta <= 0.9 and 1 <= pt <= 10 (all bounds inclusive)
eta_ok_ = track_param_global_df_['track_eta'].between(-0.9, 0.9)
pt_ok_ = track_param_global_df_['track_pt'].between(1, 10)
track_param_global_df_cut_ = track_param_global_df_[eta_ok_ & pt_ok_]

num_tracks_cut_ = len(track_param_global_df_cut_)
num_tracks_all_ = len(track_param_global_df_)
percent_remaining_ = float(num_tracks_cut_) * 100 / num_tracks_all_ if num_tracks_all_ else 0.0
print("%d of %d ( %s %%) tracks remain" % (num_tracks_cut_, num_tracks_all_, percent_remaining_))

# NOTE: positional lookup -- this relies on track_id being equal to the row position
track_global_df_cut_ = track_global_df_.iloc[track_param_global_df_cut_['track_id']]
print("%d entries in global track df" % len(track_global_df_cut_))

# Hash map {track_id: index label in the parameter frame} for the surviving tracks.
# Membership tests against this dict are cheaper than searching the dataframe each time.
uncut_track_id_map_ = dict(zip(track_param_global_df_cut_['track_id'],
                               track_param_global_df_cut_.index))

# Check that the ID ordering of the global track frame still agrees with the
# parameter frame after dropping the tracks that have been cut
for x, y in zip(track_global_df_cut_['track_id'], track_param_global_df_cut_['track_id']):
    if x != y:
        print("Track ID Mismatch:  %s %s" % (x, y))

## Plot only the filtered rechits

In [None]:
%%time
# Collect the (x, y) positions of all rechits that survived the cut

# Index labels of the surviving rechits, as stored in the id map
surviving_labels_ = list(uncut_rechit_id_map_.values())

# Single label-based lookup instead of one Python-level lookup per hit and column
surviving_hits_ = rechit_param_global_df_.loc[surviving_labels_, ['rechit_x', 'rechit_y']]
concat_x_ = surviving_hits_['rechit_x'].tolist()
concat_y_ = surviving_hits_['rechit_y'].tolist()

fig_, ax_ = plt.subplots(figsize=(10, 10))
# 2D histogram of the surviving rechits; log colour scale on a black background
ax_.set_title('Uncut Rechit Distribution')
ax_.set_xlabel('rechit_x')
ax_.set_ylabel('rechit_y')
ax_.patch.set_facecolor('black')
ax_.hist2d(concat_x_, concat_y_, bins=500, norm=matplotlib.colors.LogNorm(), cmap='hot')
#plt.savefig('plots/' + gen_event_ + '/stereo/rechitdistribution')
plt.show()

In [None]:
# Comparing df.loc and df.iloc using a random index => iloc is generally faster
# %timeit rechit_param_global_df_.loc[5868]['rechit_x']
# %timeit rechit_param_global_df_.iloc[5868]['rechit_x']

In [None]:
'''
Output the dictionaries of uncut track ids and uncut rechit ids
'''


## Convert to Adjacency Matrix for each Track

In [None]:
import tensorflow as tf

In [None]:

for event_id_ in range(number_of_events_):
    
    # Obtain subset of global dataframe for the respective event
    event_df_ = track_global_df_[track_global_df_['event_id']==event_id_]
    
    