In [1]:
import pandas as pd
import numpy as np

In [2]:
# Number of events in the data; the graph-building loop further down
# iterates over event ids 0 .. number_of_events_ - 1.
number_of_events_ = 100

## Load the DataFrames Stored during Data Preprocessing

In [3]:
'''Refer to Data Preprocessing Notebook to Understand the Semantics and Column Headers'''

'Refer to Data Preprocessing Notebook to Understand the Semantics and Column Headers'

In [4]:
%%time
# Load the four DataFrames stored during data preprocessing (msgpack files under data/).
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in pandas 1.0,
# so this cell only runs on pandas < 1.0 - consider re-exporting the frames in a
# supported format from the preprocessing notebook.
track_global_df_ = pd.read_msgpack('data/track_global_df_.msgpack')
track_param_global_df_ = pd.read_msgpack('data/track_param_global_df_.msgpack')
rechit_global_df_ = pd.read_msgpack('data/rechit_global_df_.msgpack')
rechit_param_global_df_ = pd.read_msgpack('data/rechit_param_global_df_.msgpack')

CPU times: user 475 ms, sys: 41.5 ms, total: 517 ms
Wall time: 603 ms


In [38]:
# Track selection: keep tracks with -0.9 <= track_eta <= 0.9 and 1 <= track_pt <= 10
# (Series.between is inclusive on both bounds, matching the original <= / >= cuts).
eta_in_range_ = track_param_global_df_[b'track_eta'].between(-0.9, 0.9)
pt_in_range_ = track_param_global_df_[b'track_pt'].between(1, 10)
track_param_global_df_ = track_param_global_df_[eta_in_range_ & pt_in_range_]

# Keep only the surviving tracks in the per-track frame. The rows are matched on the
# track_id *column* rather than by position (.iloc): positional lookup is only correct
# while the frame still has its pristine row order, and it picks wrong rows (or raises
# IndexError) as soon as this cell is executed a second time on the filtered frame.
# Rows keep the order they have in track_global_df_.
surviving_track_ids_ = track_param_global_df_[b'track_id']
track_global_df_ = track_global_df_[track_global_df_[b'track_id'].isin(surviving_track_ids_)]

In [5]:
%%time
track_count_ = pd.DataFrame.to_dict(pd.read_csv('data/track_count_.csv'))
rechit_count_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_count_.csv'))
track_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/track_ids_.csv'))
rechit_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_ids_.csv'))

CPU times: user 44.7 ms, sys: 14.7 ms, total: 59.4 ms
Wall time: 83.6 ms


## Create the Graph Tuples from Hit Matches

In [6]:
'''
Graphs are created on a per-event basis. We build an adjacency matrix of rechits for each individual event.
The rechit connections are defined by the track that they belong to.
The target label for each node is taken as the first tp_index in its rechit_tp_list

#TODO: Incorporate a more flexible labeling schema - can you label the edges using the 'extra' tp index?
Can this learn more interesting structures for the graph(s)?
# Solution: Use the same node in different graphs - same as above but implementation-wise easier to do.

#TODO: What information do we use to weight the edges in the graph?
Differences in rechit parameters?
Rechit vs. Track Parameters?
'''

"\nGraphs are created on a per-event basis. We build an adjacency matrix of rechits for each individual event.\nThe rechit connections are defined by the track that they belong to.\nThe target label for each node is taken as the first tp_index in its rechit_tp_list\n\n#TODO: Incorporate a more flexible labeling schema - can you label the edges using the 'extra' tp index?\nCan this learn more interesting structures for the graph(s)?\n# Solution: Use the same node in different graphs - same as above but implementation-wise easier to do.\n\n#TODO: What information do we use to weight the edges in the graph?\nDifferences in rechit parameters?\nRechit vs. Track Parameters?\n"

In [26]:
# This array is unused: the cuts (-0.9 < rechit_eta <= 0.9) have already been
# applied to the rechits.
uncut_rechit_ids_ = np.load('data/uncut_rechit_ids_.npy')

In [1]:
%%time
import numpy as np
'''TODO: Convert keys to string instead of bytes and check overheads'''

data_dict_list = []

# Global Features are track-based so they vary in length per-event
# We find the maximum number of tracks that correspond to max_len of global feature vector
# Is it a good idea to zero-pad global feature vectors less than max_len?
GLOBAL_FEATURES_LEN_ = max([len(track_global_df_[track_global_df_[b'event_id']==event_id_]) for event_id_ in range(100)])

for event_id_ in range(number_of_events_):
    data_dict = {}
    track_event_df_ = track_global_df_[track_global_df_[b'event_id'] == 14]
    track_param_df_ = track_param_global_df_.loc[track_event_df_[b'track_id']]
    track_df_ = track_event_df_.merge(track_param_df_)

    # Sort the tracks according to increasing track_eta and associate a label with each track
    track_df_.sort_values(b'track_eta', ascending=True, inplace=True)
    track_df_.index = pd.RangeIndex(len(track_df_.index))  

    rechit_event_df_ = rechit_global_df_[rechit_global_df_[b'event_id']==event_id_]
    rechit_param_df_ = rechit_param_global_df_[rechit_param_global_df_[b'event_id']==event_id_]
    
    if len(rechit_event_df_) != len(rechit_param_df_):
        print("Error - param data and event data are not of equal length!")
    
    number_of_rechits_in_event_ = len(rechit_event_df_)
    
    # Set the node features as the track features that they belong to
    node_labels_ = np.array(rechit_param_df_[b'rechit_local_id'].tolist())
    
    # Originally, we were setting node-level features based on the nodes but that can be done
    # for the test data set; instead here we can set the node-level features as the track features
    # at least for training and "learn" the track-level features (eta) based on which we can cluster the nodes?
    # The question still remains how do we initialize the edges ???
    
    # Update: Reverting to node-level features for each node as of now 
    # Modify it to combine some form of track-level features (target?)
    
    node_features_ = np.transpose(np.array([rechit_param_df_[b'rechit_r'].tolist(),
                                  rechit_param_df_[b'rechit_eta'].tolist(),
                                  rechit_param_df_[b'rechit_phi'].tolist()]))
    
    # For each track, append to the list of source nodes, destination nodes, and node feature vectors
    # Keep track of the node label equal to len(track_df_)
    node_label_i_ = 0
    for track_rechit_id_array_ in track_df_[b'rechit_local_ids']:
        src_vertices_ = []
        dest_vertices_ = []
        # Sort the rechits based on values of Rechit R
        # Start track building from the inside and move all the way outside
        final_rechits_ = sorted(track_rechit_id_array_, 
                                key=lambda hit_: rechit_param_df_.iloc[int(hit_)][b'rechit_r'])
        if len(final_rechits_) < 2:
            continue
        elif len(final_rechits_) == 2:
            src_vertices_.append(final_rechits_[0])
            dest_vertices_.append(final_rechits_[1])
        # In order to extend this to 2 skip-connections (expanding to the assumption that 3 hits can be 
        # on the same layer, thus all of them should be connected to a hit on the next layer)
        # Create another else case for len(final_rechits_) == 3: and add the corresponding vertices
        # to src and dest arrays. Then you can modify the addition procedure to include src+[3:] and dest+[:-3]
        # So you will have (1,2), (1,3), and (1,4) edges as a simple example of adding 2-skip-connections
        else:
            # Add the edges starting from node a and going to both a+1 and a+2
            # We define this as 1-skip-connection because hits might lie on the same layer
            # We originally sort them by the radius to ensure skip-connections have a meaning
            src_vertices_.extend(final_rechits_[:-1]+final_rechits_[:-2])
            dest_vertices_.extend(final_rechits_[1:]+final_rechits_[2:])
        # Increment the node label
        node_label_i_ += 1
    if node_label_i_ != len(track_df_):
        print("Error: Node Labels exceed the number of tracks - spurious labels generated!")
    
    # Define a zero-padded global feature vector
    if len(track_df_) < GLOBAL_FEATURES_LEN_:
        global_feature_vector_ = np.array(track_df_[b'track_eta'].values + np.zeros(GLOBAL_FEATURES_LEN_ - len(track_df_))
    else:    
        global_feature_vector_ = np.array(track_df_[b'track_eta'].values)
                                          
    data_dict_ = {
    "globals": global_feature_vector_,
    "nodes": nodes_0,
    "edges": edges_0,
    "senders": senders_0,
    "receivers": receivers_0
    }


NameError: name 'number_of_events_' is not defined

In [36]:
# Number of tracks in each event, for event ids 0 .. number_of_events_ - 1
track_counts_per_event_ = [
    int((track_global_df_[b'event_id'] == event_id_).sum())
    for event_id_ in range(number_of_events_)
]
a = track_counts_per_event_  # old short name, kept in case another cell still reads it

# Largest per-event track count (length of the zero-padded global feature vector)
GLOBAL_FEATURES_LEN_ = max(track_counts_per_event_)
print(track_counts_per_event_.index(GLOBAL_FEATURES_LEN_))  # first event id with the most tracks
print(sum(track_counts_per_event_))                         # total number of tracks over all events

55
6041


## Tensorflow Models

## PyTorch Models

In [9]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class GCNConv(MessagePassing):
    """Graph convolution layer built on torch_geometric's ``MessagePassing``.

    The layer adds self-loops, applies a linear map to the node features and
    sums, for every node, the degree-normalised features of its neighbours.

    NOTE(review): ``propagate('add', ...)`` and the direct use of the value
    returned by ``add_self_loops`` follow the torch_geometric API this notebook
    was written against. Later releases changed both (aggregation is passed to
    the constructor, ``add_self_loops`` returns a tuple) - verify against the
    installed version before running.

    Args:
        in_channels: size of each input node feature vector.
        out_channels: size of each output node feature vector.
    """

    def __init__(self, in_channels, out_channels):
        # The original referred to an undefined class `GCNLayer` here, which
        # raised NameError as soon as the layer was instantiated.
        super(GCNConv, self).__init__()
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the new node embeddings.

        Args:
            x: node feature matrix of shape [N, in_channels].
            edge_index: edge list of shape [2, E].
        """
        # Step 1: Add self-loops to the adjacency matrix.
        edge_index = add_self_loops(edge_index, num_nodes=x.size(0))

        # Step 2: Linearly transform node feature matrix.
        x = self.lin(x)

        # Step 3-5: Start propagating messages with "add" aggregation.
        return self.propagate('add', edge_index, x=x, num_nodes=x.size(0))

    def message(self, x_j, edge_index, num_nodes):
        """Scale every message by 1 / sqrt(deg(row) * deg(col)).

        Args:
            x_j: features gathered per edge, shape [E, out_channels].
            edge_index: edge list of shape [2, E] (self-loops included).
            num_nodes: number of nodes N, used to size the degree vector.
        """
        # Step 3: Normalize node features.
        row, col = edge_index
        deg = degree(row, num_nodes, dtype=x_j.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        return norm.view(-1, 1) * x_j

    def update(self, aggr_out):
        """Return the aggregated messages unchanged.

        Args:
            aggr_out: aggregated messages, shape [N, out_channels].
        """
        # Step 5: Return new node embeddings.
        return aggr_out

ModuleNotFoundError: No module named 'torch'

In [None]:
from torch_geometric.data import Data

# Sample Edge Label Definition for Rechits - Adjacency List?
# Edges are written one (source, target) pair per row and then transposed
# into the [2, E] layout.
edge_pairs = torch.tensor([[0, 1],
                           [1, 0],
                           [1, 2],
                           [2, 1]], dtype=torch.long)
edge_index = edge_pairs.t().contiguous()

# Three nodes with a single feature each -> shape [3, 1]
x = torch.tensor([-1, 0, 1], dtype=torch.float).view(-1, 1)

data = Data(x=x, edge_index=edge_index)

In [None]:
'''Define the 2-layer GCN'''

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    """Two-layer GCN: GCNConv -> ReLU -> dropout -> GCNConv -> log-softmax.

    Args:
        in_channels: number of input node features. Defaults to
            ``dataset.num_features`` read from the module-level ``dataset``.
        hidden_channels: width of the hidden layer (16, as before).
        out_channels: number of output classes. Defaults to
            ``dataset.num_classes`` read from the module-level ``dataset``.

    With no arguments the module-level ``dataset`` must already exist; passing
    ``in_channels`` and ``out_channels`` explicitly removes that requirement.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super(Net, self).__init__()
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            # The original read `data.num_classes`; the class count is taken
            # from the dataset, consistent with the input width above.
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities for the graph ``data``.

        Reads ``data.x`` (node features) and ``data.edge_index`` (edge list).
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


In [None]:
# Pick the compute device: CUDA when torch can see a GPU (CUDA available on cmg-gpu1080),
# otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Net().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)

# 200 optimisation steps on the single graph `data`; the loss is evaluated
# only on the nodes selected by data.train_mask.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    train_mask = data.train_mask
    loss = F.nll_loss(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()