In [1]:
import pandas as pd
import numpy as np

In [2]:
# Define number of events in data
number_of_events_ = 100

## Load the DataFrames Stored during Data Preprocessing

In [3]:
'''Refer to Data Preprocessing Notebook to Understand the Semantics and Column Headers'''

'Refer to Data Preprocessing Notebook to Understand the Semantics and Column Headers'

In [4]:
# TODO: Set new string names of column to avoid using b'' syntax to access the rows and columns
def decode_byte_headers(df_=None):
    """Rename byte-string column headers of ``df_`` to plain ``str`` headers.

    Frames read back from msgpack can carry ``bytes`` column names (``b'track_eta'``),
    which forces the ``b''`` syntax on every column access. This helper decodes
    those headers so columns can be addressed with ordinary strings.

    Headers that are not ``bytes`` (for example already-decoded ``str`` names)
    are left untouched, so the function is safe to call twice on the same frame.

    :param df_: DataFrame whose column headers should be decoded; modified in place.
    :return: the same DataFrame object with decoded headers, or ``None`` when
        no frame was passed.
    """
    if df_ is None:
        return None

    df_.columns = [
        column_name.decode() if isinstance(column_name, bytes) else column_name
        for column_name in df_.columns
    ]
    return df_

In [5]:
%%time
# Read the four frames written by the preprocessing notebook and decode their
# byte-string column headers so columns can be addressed with plain strings.
# NOTE(review): `pd.read_msgpack` was deprecated in pandas 0.25 and removed in
# pandas 1.0, so this cell only runs on an older pandas.
track_global_df_ = decode_byte_headers(pd.read_msgpack('data/track_global_df_.msgpack'))
track_param_global_df_ = decode_byte_headers(pd.read_msgpack('data/track_param_global_df_.msgpack'))
rechit_global_df_ = decode_byte_headers(pd.read_msgpack('data/rechit_global_df_.msgpack'))
rechit_param_global_df_ = decode_byte_headers(pd.read_msgpack('data/rechit_param_global_df_.msgpack'))

CPU times: user 60.7 ms, sys: 7.72 ms, total: 68.4 ms
Wall time: 86.9 ms


In [6]:
# Track selection: keep tracks with |track_eta| <= 0.9 and 0.5 <= track_pt <= 5.
# Both bounds of `between` are inclusive, matching the original <= / >= cuts.
eta_in_acceptance_ = track_param_global_df_['track_eta'].between(-0.9, 0.9)
pt_in_acceptance_ = track_param_global_df_['track_pt'].between(0.5, 5)
intermediate_df_ = track_param_global_df_[eta_in_acceptance_ & pt_in_acceptance_]
track_param_global_df_ = intermediate_df_

# Keep only the tracks that survived the cuts. Matching on the `track_id` column
# (instead of positional `.iloc[track_id]`) does not rely on row position being
# equal to track_id, so re-running this cell on the already-filtered frame
# yields the same result instead of selecting wrong rows or raising.
track_global_df_ = track_global_df_[
    track_global_df_['track_id'].isin(track_param_global_df_['track_id'])
]

In [7]:
# %%time
# This cell has been rendered obsolete and is only run to check for backward compatibility
# For the other notebook titled 'Data Preprocessing ...'
'''
track_count_ = pd.DataFrame.to_dict(pd.read_csv('data/track_count_.csv'))
rechit_count_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_count_.csv'))
track_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/track_ids_.csv'))
rechit_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_ids_.csv'))
'''

"\ntrack_count_ = pd.DataFrame.to_dict(pd.read_csv('data/track_count_.csv'))\nrechit_count_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_count_.csv'))\ntrack_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/track_ids_.csv'))\nrechit_ids_ = pd.DataFrame.to_dict(pd.read_csv('data/rechit_ids_.csv'))\n"

## DeepHGCal/TFRecords Data Preparation

In [8]:
import tensorflow as tf

In [9]:
# Intended fixed sizes for the serialized events: number of rows (nodes) per
# event and number of one-hot label columns.
# NOTE(review): `create_tf_example` carries its own defaults with the same
# values (3600 / 100); keep the two in sync if either is changed.
MAX_RECHIT_LEN = 3600
MAX_TRACK_LEN = 100

In [277]:
%%time
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

'''This is the function to create graphs in the form readable by DeepHGCal'''

data_dict_list_ = []
scaler = StandardScaler()

# Build one {"data", "labels"} dictionary per event.
# Rows of "data" are the rechits of the event followed by its tracks. Every row
# has 10 columns laid out as
#   [r, eta, phi, x, y, z, dsz, dxy, qoverp, pt]
# Rechit rows zero-fill the four track-only columns; track rows zero-fill
# r, x, y and z, so that both kinds of node share a single feature matrix.

for event_id_ in range(number_of_events_):
    # Tracks of this event joined with their fitted parameters
    track_event_df_ = track_global_df_[track_global_df_['event_id'] == event_id_]
    track_param_df_ = track_param_global_df_.loc[track_event_df_['track_id']]
    track_df_ = track_event_df_.merge(track_param_df_)

    # Sort the tracks by increasing track_eta and renumber them 0..n-1;
    # the new index (+1) becomes the label of the track further below
    track_df_ = track_df_.sort_values('track_eta', ascending=True).reset_index(drop=True)

    rechit_event_df_ = rechit_global_df_[rechit_global_df_['event_id'] == event_id_]
    rechit_param_df_ = rechit_param_global_df_[
        rechit_param_global_df_['event_id'] == event_id_
    ].reset_index(drop=True)
    if len(rechit_event_df_) != len(rechit_param_df_):
        print("Error - param data and event data are not of equal length!")

    # Originally, we were setting node-level features based on the nodes but that can be done
    # for the test data set; instead here we can set the node-level features as the track features
    # at least for training and "learn" the track-level features (eta) based on which we can cluster the nodes?
    # The question still remains how do we initialize the edges ???

    # Update: Reverting to node-level features for each node as of now
    # Modify it to combine some form of track-level features (target?)

    # Skip the event if it has no rechits
    if len(rechit_param_df_) == 0:
        print("Event ", event_id_, " has no rechits" )
        continue

    rechit_zeros_ = np.zeros(len(rechit_param_df_))
    rechit_feature_vector_ = np.transpose(np.array([
        rechit_param_df_['rechit_r'].tolist(),
        rechit_param_df_['rechit_eta'].tolist(),
        rechit_param_df_['rechit_phi'].tolist(),
        rechit_param_df_['rechit_x'].tolist(),
        rechit_param_df_['rechit_y'].tolist(),
        rechit_param_df_['rechit_z'].tolist(),
        rechit_zeros_,
        rechit_zeros_,
        rechit_zeros_,
        rechit_zeros_,
    ]))

    track_zeros_ = np.zeros(len(track_df_))
    track_feature_vector_ = np.transpose(np.array([
        track_zeros_,
        track_df_['track_eta'].tolist(),
        track_df_['track_phi'].tolist(),
        track_zeros_,
        track_zeros_,
        track_zeros_,
        track_df_['track_dsz'].tolist(),
        track_df_['track_dxy'].tolist(),
        track_df_['track_qoverp'].tolist(),
        track_df_['track_pt'].tolist(),
    ]))

    # Rechits first, tracks last; the scaler statistics are re-fitted for every event
    node_feature_vector_ = np.vstack((rechit_feature_vector_, track_feature_vector_))
    node_feature_vector_ = scaler.fit_transform(node_feature_vector_)

    # Initialize an array of zeros as labels for the nodes
    # Replace each zero with the int (label) associated with the track to which the rechit belongs
    # This becomes the target label to identify for that training example
    # Later, we will one_hot_encode the target label for working with the tensorflow model
    node_label_array_ = [0] * len(rechit_feature_vector_)
    track_labels_ = []

    # Associate a label with each rechit
    for trk_idx_, row in track_df_.iterrows():
        # The (eta-sorted) track index is used as the label of the track,
        # so lower labels are associated with lower track eta
        for id_ in row.rechit_local_ids:
            node_label_array_[int(id_)] = trk_idx_ + 1  # We can now use index '0' as a 'noise' label

        # Also associate a label with each track that we consider as a point among the data
        track_labels_.append(trk_idx_ + 1)

    print(len(node_label_array_))

    # Concatenate rechit and track labels in the same order as the rows of the feature matrix
    nla_ = np.concatenate(((np.array(node_label_array_)), np.array(track_labels_)), axis=0)
    assert node_feature_vector_.shape[0] == nla_.shape[0], "Labels are not equal to training nodes/examples"

    # "data" has n rows (rechits + tracks) and f feature columns;
    # f is constant for all events but n varies with the event thus padding may be necessary
    data_dict_ = {
    "data": node_feature_vector_,
    "labels": nla_,
    }

    data_dict_list_.append(data_dict_)
print(len(data_dict_list_), "graphs generated from data")

1320
1554
2596
2939
1502
1862
1204
1281
2420
1566
10 graphs generated from data
CPU times: user 170 ms, sys: 7.96 ms, total: 178 ms
Wall time: 185 ms


In [278]:
'''Testing np stacking and concatenation speeds
%%time 
print(rechit_feature_vector_.shape)
nla = np.array(node_label_array_)
c = np.concatenate((rechit_feature_vector_, nla[:, None], nla[:, None]), axis = 1)
# Concatenate works faster than column_stack - TODO look into their implementations, seems interesting!
# P.S. Column stack just uses vstack in the backend according to StackOverflow
# c = np.column_stack((rechit_feature_vector_, nla, nla))

print(c.shape)'''

'Testing np stacking and concatenation speeds\n%%time \nprint(rechit_feature_vector_.shape)\nnla = np.array(node_label_array_)\nc = np.concatenate((rechit_feature_vector_, nla[:, None], nla[:, None]), axis = 1)\n# Concatenate works faster than column_stack - TODO look into their implementations, seems interesting!\n# P.S. Column stack just uses vstack in the backend according to StackOverflow\n# c = np.column_stack((rechit_feature_vector_, nla, nla))\n\nprint(c.shape)'

In [311]:
from sklearn.preprocessing import OneHotEncoder

'''Convert the dict into a tf.Example then use a proto_buffer to 
serialize it into a compatible format for TFRecords'''

def create_tf_example(graph_dict=None, max_hits=None, max_tracks=None, set_one_hot_labels=False):
    """
    Serialize one event dictionary into a ``tf.train.Example`` byte string.

    The feature matrix and its labels are brought to exactly ``max_hits`` rows:
    shorter events are zero-padded at the end, longer events keep only their
    LAST ``max_hits`` rows (the tracks sit at the end of the matrix and are
    wanted as 'centroids' for the clustering, so rows are dropped from the front).
    The label columns are appended to the right of the features and the result
    is flattened into a single float feature named ``'data'``.

    :param graph_dict: dictionary with key ``'data'`` (2D array, one row per node)
        and key ``'labels'`` (1D array, one label per node)
    :param max_hits: number of rows of the stored matrix; defaults to 3600
    :param max_tracks: number of one-hot label columns; defaults to 100 and is
        only used when ``set_one_hot_labels`` is True
    :param set_one_hot_labels: when True the labels are one-hot encoded, otherwise
        the integer label is stored twice (two identical columns) for DeepHGCal
    :return: the serialized ``tf.train.Example``
    :raises ValueError: if ``graph_dict`` is None or lacks the 'data' / 'labels' key
    """
    if max_hits is None:
        max_hits = 3600

    if graph_dict is None:
        raise ValueError("No Rechit Feature Matrix provided")
    if 'data' not in graph_dict:
        raise ValueError("Key 'data' not found in rechit data dictionary")
    if 'labels' not in graph_dict:
        raise ValueError("Key 'labels' not found in rechit data dictionary")

    # TODO: Can we eliminate this padding? After all, graphs are meant to be dynamic structures, right?
    original_data = np.asarray(graph_dict['data'])
    number_of_rows, data_dimensions = original_data.shape
    if number_of_rows < max_hits:
        # Append zero rows so that every event has the same number of rows
        padding_rows = np.zeros((max_hits - number_of_rows, data_dimensions), dtype=np.float32)
        padded_data = np.vstack((original_data.astype(np.float32), padding_rows))
    else:
        # Eliminate the first rows of the data (these are definitely rechits).
        # The original sliced with `event_data_size`, which is only assigned in
        # the padding branch above, so this path raised UnboundLocalError.
        padded_data = original_data[-max_hits:]

    # Zero-pad / truncate the labels exactly like the rows above (label '0'
    # corresponds to noisy points in the cloud)
    node_labels = np.asarray(graph_dict['labels'])
    if len(node_labels) > max_hits:
        padded_labels = node_labels[-max_hits:]
    else:
        padded_labels = np.concatenate((node_labels, np.zeros(max_hits - len(node_labels))))

    if set_one_hot_labels is True:
        if max_tracks is None:
            # Set default value of maximum tracks
            # This will add as many dimensions to your data
            max_tracks = 100

        # NOTE(review): the encoder is fitted per event, so column j stands for the
        # j-th distinct label present in THIS event, not necessarily label value j.
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
        target_labels = one_hot_encoder.fit_transform(padded_labels.reshape(-1, 1)).toarray()
        if target_labels.shape[1] > max_tracks:
            # More distinct labels than max_tracks: cut off the last label columns
            target_labels = target_labels[:, :max_tracks]
        else:
            # Append zero-columns as padding up to max_tracks label columns
            padding_columns = np.zeros((max_hits, max_tracks - target_labels.shape[1]))
            target_labels = np.hstack((target_labels, padding_columns))
    else:
        # Store regular labels in a duplicated column for DeepHGCal
        target_labels = np.concatenate((padded_labels[:, None], padded_labels[:, None]), axis=1)

    # Once the labels and data are padded, finalize everything into one 2D matrix
    final_data = np.hstack((padded_data, target_labels))
    print(final_data.shape)

    # The matrix is flattened and wrapped into a FloatList feature named 'data'
    feature_matrix = {
        'data': tf.train.Feature(float_list=tf.train.FloatList(value=final_data.ravel())),
    }

    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature_matrix))
    return example_proto.SerializeToString()


def _parse_function(example_proto, data_dimensions=None):
    '''Parse one serialized tf.Example back into a dict holding the 'data' tensor.

    :param example_proto: serialized tf.Example, as stored in the TFRecord file(s)
    :param data_dimensions: shape of the stored float matrix; (3600, 12) when omitted
    :return: dict mapping 'data' to a float32 tensor of shape data_dimensions

    For reading the TFRecord data in a model, refer to the
    [Dev] Prototyping Graph Neural Networks Notebook.
    '''
    shape = (3600, 12) if data_dimensions is None else data_dimensions

    # Description of the single fixed-length feature stored in every record
    schema = {'data': tf.FixedLenFeature(shape, tf.float32)}

    return tf.parse_single_example(example_proto, schema)



In [312]:
'''Write the TFRecord File'''

# Serialize every event dictionary and append it to one GZIP-compressed TFRecord file.
# Readers must open the file with the matching GZIP compression type.
# NOTE(review): the 'tfrecords/' directory is assumed to exist already — confirm.
with tf.python_io.TFRecordWriter('tfrecords/ttbar-10.tfrecord', 
                                 options=tf.python_io.TFRecordOptions(
                                    tf.python_io.TFRecordCompressionType.GZIP)) as tfwriter:
    for event_number_, data_record_ in enumerate(data_dict_list_):
        # create_tf_example is called with its default padding sizes
        tf_example_ = create_tf_example(data_record_)
        tfwriter.write(tf_example_)
        


(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)
(3600, 12)


## Testing Reading the files from TFRecords

In [313]:
'''Test using the tf.TFRecordDataset to check the values have been read'''

# This is the future-compatible version of reading TFRecord files
# But this requires eager execution to be enabled, which is presently not the case
import os

# First check that the tf_example_ above has been correctly serialized to bytes by parsing it back
# First check that the tf_example_ above has been correctly serialized to bytes by parsing it back
result = tf.train.Example.FromString(tf_example_)
print (type(result))
print("First value: {}".format(result.features.feature['data'].float_list.value[0]))


# filenames = ['jan-tfrecords/' + name for name in os.listdir('jan-tfrecords/')]
# Keep only TFRecord files; this drops macOS specific metadata files like .DS_Store.
# The filter is applied while building the list: removing items from a list
# that is being iterated skips the entry following every removed one.
filenames = sorted(
    'tfrecords/' + name for name in os.listdir('tfrecords/') if '.tfrecord' in name
)
print (filenames)


# The records were written GZIP-compressed, so they must be read the same way
raw_dataset = tf.data.TFRecordDataset(filenames, compression_type="GZIP")
print(raw_dataset.output_shapes)

for raw_record in raw_dataset.take(1):
  print(type(raw_record))

# Decode every serialized example into its {'data': tensor} dictionary
parsed_dataset = raw_dataset.map(_parse_function)
print(parsed_dataset)

for parsed_record in parsed_dataset.take(10):
  print(parsed_record)

'''THIS IS FOR WHEN EAGER EXECUTION IS NOT ENABLED

iterator = parsed_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(10):
        val = sess.run(next_element)
        print(val)'''


<class 'tensorflow.core.example.example_pb2.Example'>
First value: -1.387333631515503
['tfrecords/ttbar-10.tfrecord']
()
<class 'tensorflow.python.framework.ops.EagerTensor'>
<DatasetV1Adapter shapes: {data: (3600, 12)}, types: {data: tf.float32}>
{'data': <tf.Tensor: id=1933, shape=(3600, 12), dtype=float32, numpy=
array([[-1.4635228 , -0.41366753, -0.20038593, ..., -0.03687247,
         0.        ,  0.        ],
       [-1.4556891 , -0.40339887, -0.09332851, ..., -0.03687247,
         0.        ,  0.        ],
       [-1.476408  , -0.43635982, -0.02883724, ..., -0.03687247,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)>}
{'data': <tf.Tensor: id=1935, shape=(3600, 12), dtyp

'THIS IS FOR WHEN EAGER EXECUTION IS NOT ENABLED\n\niterator = parsed_dataset.make_one_shot_iterator()\nnext_element = iterator.get_next()\n\nwith tf.Session() as sess:\n    for i in range(10):\n        val = sess.run(next_element)\n        print(val)'

In [126]:
''' Manually attempting to create a TF Dataset for TFRecords for DeepHGCal
Format BxNxF where:
:param B: batch size
:param N: Number of rechits
:param F: Number of features for the rechit
'''

# This should be handled in DeepHGCal and not manually



' Manually attempting to create a TF Dataset for TFRecords for DeepHGCal\nFormat BxNxF where:\n:param B: batch size\n:param N: Number of rechits\n:param F: Number of features for the rechit\n'

## DeepMind Graph Nets/Tensorflow Data Preparation

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from graph_nets import blocks
from graph_nets import graphs
from graph_nets import modules
from graph_nets import utils_np
from graph_nets import utils_tf

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import sonnet as snt
import tensorflow as tf

## Create the Graph Tuples from Hit Matches

In [None]:
from sklearn.preprocessing import StandardScaler

'''
Graphs are created on a per-event basis. We build an adjacency matrix of rechits for each individual event.
The rechit connections are defined by the track that they belong to.
The target label for each node is taken as the first tp_index in its rechit_tp_list

#TODO: Incorporate a more flexible labeling schema - can you label the edges using the 'extra' tp index?
Can this learn more interesting structures for the graph(s)?
# Solution: Use the same node in different graphs - same as above but implementation-wise easier to do.

#TODO: What information do we use to weight the edges in the graph?
Differences in rechit parameters?
Rechit vs. Track Parameters?
'''

In [None]:
%%time
import numpy as np
from sklearn.preprocessing import StandardScaler

'''This is the function to create graphs in the form readable by the DeepMind Library'''
'''TODO: Use the node_label to actually label the nodes!'''
data_dict_list_ = []
scaler = StandardScaler()


# Global Features are track-based so they vary in length per-event
# We find the maximum number of tracks that correspond to max_len of global feature vector
# Is it a good idea to zero-pad global feature vectors less than max_len?
# The scan covers exactly the events processed by the loop below; the leading 0
# keeps max() defined when number_of_events_ is 0.
GLOBAL_FEATURES_LEN_ = max([0] + [
    len(track_global_df_[track_global_df_['event_id'] == event_id_])
    for event_id_ in range(number_of_events_)
])

for event_id_ in range(number_of_events_):
    senders_ = []
    receivers_ = []

    track_event_df_ = track_global_df_[track_global_df_['event_id'] == event_id_]
    track_param_df_ = track_param_global_df_.loc[track_event_df_['track_id']]
    track_df_ = track_event_df_.merge(track_param_df_)

    # Sort the tracks by increasing track_eta and renumber them 0..n-1
    track_df_ = track_df_.sort_values('track_eta', ascending=True).reset_index(drop=True)

    rechit_event_df_ = rechit_global_df_[rechit_global_df_['event_id'] == event_id_]
    # The 0..n-1 index is what the rechit ids are looked up against further below
    rechit_param_df_ = rechit_param_global_df_[
        rechit_param_global_df_['event_id'] == event_id_
    ].reset_index(drop=True)
    if len(rechit_event_df_) != len(rechit_param_df_):
        print("Error - param data and event data are not of equal length!")

    # TODO: use these ids to actually label the nodes
    node_labels_ = np.array(rechit_param_df_['rechit_local_id'].tolist()).astype(int)

    # Originally, we were setting node-level features based on the nodes but that can be done
    # for the test data set; instead here we can set the node-level features as the track features
    # at least for training and "learn" the track-level features (eta) based on which we can cluster the nodes?
    # The question still remains how do we initialize the edges ???

    # Update: Reverting to node-level features for each node as of now
    # Modify it to combine some form of track-level features (target?)

    # Skip events without rechits
    if len(node_labels_) == 0:
        continue
    rechit_feature_vector_ = np.transpose(np.array([rechit_param_df_['rechit_r'].tolist(),
                                  rechit_param_df_['rechit_eta'].tolist(),
                                  rechit_param_df_['rechit_phi'].tolist()]))
    # The scaler statistics are re-fitted for every event
    rechit_feature_vector_ = scaler.fit_transform(rechit_feature_vector_)
    node_feature_vector_ = rechit_feature_vector_

    # Edge features use the track matches: every edge carries the parameters of its track.
    # One entry per edge, in the same order as senders_/receivers_.
    edge_feature_vector_ = []

    # Number of tracks visited; cross-checked against len(track_df_) after the loop
    node_label_i_ = 0

    for row in track_df_.itertuples():
        node_label_i_ += 1

        # itertuples() rows are accessed by position (position 0 is the index)
        # NOTE(review): position 2 is taken to be the matched rechit ids — confirm
        # against the column order of the merged frame.
        track_rechit_id_array_ = row[2]

        # Sort the rechits based on values of Rechit R
        # Start track building from the inside and move all the way outside
        final_rechits_ = sorted(track_rechit_id_array_,
                                key=lambda hit_: rechit_param_df_.loc[int(hit_)]['rechit_r'])
        final_rechits_ = [int(hit_) for hit_ in final_rechits_]
        if len(final_rechits_) < 2:
            # A single hit cannot form an edge
            continue

        # Add the edges starting from node a and going to both a+1 and a+2
        # We define this as 1-skip-connection because hits might lie on the same layer
        # We originally sort them by the radius to ensure skip-connections have a meaning
        # (for exactly two hits the skip slices are empty, leaving the single edge a -> b)
        #
        # To extend this to 2 skip-connections (3 hits on the same layer, all connected to a
        # hit on the next layer) also add final_rechits_[:-3] / final_rechits_[3:]
        src_vertices_ = final_rechits_[:-1] + final_rechits_[:-2]
        dest_vertices_ = final_rechits_[1:] + final_rechits_[2:]
        senders_.extend(src_vertices_)
        receivers_.extend(dest_vertices_)

        # One feature row per edge of this track
        # Indices 7, 8, 9 correspond to track eta, phi, and qoverp values respectively
        # `extend` keeps the list flat (n_edges x 3); appending a per-track list
        # produced a ragged nesting that no longer lined up with senders_/receivers_.
        edge_feature_vector_.extend([row[7], row[8], row[9]] for _ in src_vertices_)

    if node_label_i_ != len(track_df_):
        print("Error: Node Labels don't match the number of tracks - spurious labels generated?", node_label_i_, len(track_df_))

    # Define a zero-padded global feature vector
    # NOTE(review): this vector is computed but not stored in data_dict_ below.
    if len(track_df_) < GLOBAL_FEATURES_LEN_:
        global_feature_vector_ = track_df_['track_eta'].tolist() + [0]*(GLOBAL_FEATURES_LEN_ - len(track_df_))
    else:
        global_feature_vector_ = track_df_['track_eta'].values

    # Every edge of the event needs exactly one feature row
    if len(edge_feature_vector_) != len(receivers_):
        print(len(edge_feature_vector_), len(receivers_))
        print("Edge features and number of destination edges do not match in event", event_id_)

    data_dict_ = {
    "nodes": node_feature_vector_,
    "edges": edge_feature_vector_,
    "senders": senders_,
    "receivers": receivers_
    }
    data_dict_list_.append(data_dict_)
print(len(data_dict_list_), "graphs generated from data")

In [None]:
def print_graphs_tuple(graphs_tuple):
  """Print the shape of every field of `graphs_tuple`, followed by the field contents."""

  def _shape_or_none(value):
    # Fields that are not set stay None instead of raising on `.shape`
    return None if value is None else value.shape

  print("Shapes of `GraphsTuple`'s fields:")
  print(graphs_tuple.map(_shape_or_none, fields=graphs.ALL_FIELDS))

  print("\nData contained in `GraphsTuple`'s fields:")
  report = (
      ("globals:\n{}", graphs_tuple.globals),
      ("nodes:\n{}", graphs_tuple.nodes),
      ("edges:\n{}", graphs_tuple.edges),
      ("senders:\n{}", graphs_tuple.senders),
      ("receivers:\n{}", graphs_tuple.receivers),
      ("n_node:\n{}", graphs_tuple.n_node),
      ("n_edge:\n{}", graphs_tuple.n_edge),
  )
  for template, value in report:
    print(template.format(value))


In [None]:
# Convert the list of per-event data dicts into a single graphs tuple
# This *should* require a placeholder in order to manage easier padding (in the future?)
# There is an error with the graph creation process so placeholders might be the solution to it
# NOTE(review): the conversion presumably needs "edges", "senders" and "receivers"
# of each dict to have the same length — verify against the graph_nets docs.

tracking_graphs_tuple = utils_np.data_dicts_to_graphs_tuple(data_dict_list_)
# Uncomment to inspect the shapes and contents of the resulting tuple:
#print_graphs_tuple(tracking_graphs_tuple)

In [None]:
%matplotlib notebook

# Draw the first nrows*ncols event graphs, one per subplot, filling the grid row by row
graphs_nx = utils_np.graphs_tuple_to_networkxs(tracking_graphs_tuple)
nrows, ncols = 2, 1
_, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False, figsize=(12, 10))
for iax in range(nrows*ncols):
    # Position of the iax-th panel inside the subplot grid
    rownum, colnum = divmod(iax, ncols)
    panel = axs[rownum][colnum]
    nx.draw(graphs_nx[iax], ax=panel, node_size=2)
    panel.set_title("Graph {}".format(iax))

In [None]:
# Print every field of the graphs tuple on its own line
# (passing a bare generator expression to print() only shows the generator's repr)
print(*tracking_graphs_tuple, sep="\n")

In [None]:
# Build the training / generalization loss ops and the optimizer step.
# NOTE(review): `create_loss_ops`, `target_ph`, `output_ops_tr` and `output_ops_ge`
# are not defined anywhere in the visible notebook; this cell presumably expects
# them from a model-definition cell — confirm before running on a fresh kernel.
tf.reset_default_graph()

# Fixed seed for the numpy random generator `rand`
seed = 2
rand = np.random.RandomState(seed=seed)

# Model parameters.
# Number of processing (message-passing) steps.
num_processing_steps_tr = 10
num_processing_steps_ge = 10

# Training loss.
loss_ops_tr = create_loss_ops(target_ph, output_ops_tr)

# Loss across processing steps.
loss_op_tr = sum(loss_ops_tr) / num_processing_steps_tr

# Test/generalization loss.
loss_ops_ge = create_loss_ops(target_ph, output_ops_ge)
loss_op_ge = loss_ops_ge[-1]  # Loss from final processing step.

# Optimizer.
learning_rate = 1e-3
optimizer = tf.train.AdamOptimizer(learning_rate)
step_op = optimizer.minimize(loss_op_tr)

In [None]:
# This cell resets the Tensorflow session, but keeps the same computational
# graph.
# NOTE(review): the session is never closed in the visible notebook; re-running
# this cell creates a new session without closing the previous one.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training bookkeeping: the iteration to resume from and the logged metric histories
last_iteration = 0
logged_iterations = []
losses_tr = []
corrects_tr = []
solveds_tr = []
losses_ge = []
corrects_ge = []
solveds_ge = []

In [None]:
# Training loop: one optimizer step per iteration, plus an evaluation on a
# generalization batch whenever more than `log_every_seconds` have elapsed.
# NOTE(review): `time`, `num_training_iterations`, `create_feed_dict`,
# `batch_size_tr`, `batch_size_ge`, `num_nodes_min_max_tr`, `num_nodes_min_max_ge`,
# `theta`, `input_ph`, `target_ph` and the `output_ops_*` are not defined or
# imported in the visible notebook — confirm where they are expected to come from.
log_every_seconds = 20

print("# (iteration number), T (elapsed seconds), "
      "Ltr (training loss), Lge (test/generalization loss), "
      "Ctr (training fraction nodes/edges labeled correctly), "
      "Str (training fraction examples solved correctly), "
      "Cge (test/generalization fraction nodes/edges labeled correctly), "
      "Sge (test/generalization fraction examples solved correctly)")

start_time = time.time()
last_log_time = start_time
for iteration in range(last_iteration, num_training_iterations):
  # Remember the progress so that re-running the cell resumes from here
  last_iteration = iteration
  feed_dict, _ = create_feed_dict(rand, batch_size_tr, num_nodes_min_max_tr,
                                  theta, input_ph, target_ph)
  # Fetching "step" runs the optimizer update for this batch
  train_values = sess.run({
      "step": step_op,
      "target": target_ph,
      "loss": loss_op_tr,
      "outputs": output_ops_tr
  },
                          feed_dict=feed_dict)
  the_time = time.time()
  elapsed_since_last_log = the_time - last_log_time
  if elapsed_since_last_log > log_every_seconds:
    last_log_time = the_time
    feed_dict, raw_graphs = create_feed_dict(
        rand, batch_size_ge, num_nodes_min_max_ge, theta, input_ph, target_ph)
    # Evaluation only: no "step" op is fetched here.
    # NOTE(review): `test_values` is computed but never logged or stored, although
    # the header printed above announces per-iteration metrics.
    test_values = sess.run({
        "target": target_ph,
        "loss": loss_op_ge,
        "outputs": output_ops_ge
    },
                           feed_dict=feed_dict)

## PyTorch Models

In [None]:
import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class GCNConv(MessagePassing):
    """Graph convolution layer built on top of ``MessagePassing``.

    The layer adds self-loops, applies a linear transform to the node features
    and sums the degree-normalized messages of the neighbouring nodes.

    NOTE(review): `propagate('add', ...)` and the `message(x_j, edge_index, num_nodes)`
    signature look like an early torch_geometric API — confirm against the
    installed version.

    :param in_channels: size of each input node feature vector
    :param out_channels: size of each output node feature vector
    """

    def __init__(self, in_channels, out_channels):
        # The class must name itself here; the previous `super(GCNLayer, self)`
        # referenced an undefined name and raised NameError on construction.
        super(GCNConv, self).__init__()
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]

        # Step 1: Add self-loops to the adjacency matrix.
        edge_index = add_self_loops(edge_index, num_nodes=x.size(0))

        # Step 2: Linearly transform node feature matrix.
        x = self.lin(x)

        # Step 3-5: Start propagating messages with "add" aggregation.
        return self.propagate('add', edge_index, x=x, num_nodes=x.size(0))

    def message(self, x_j, edge_index, num_nodes):
        # x_j has shape [E, out_channels]

        # Step 3: Normalize node features.
        row, col = edge_index
        deg = degree(row, num_nodes, dtype=x_j.dtype)
        # forward() adds a self-loop to every node before propagating
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        return norm.view(-1, 1) * x_j

    def update(self, aggr_out):
        # aggr_out has shape [N, out_channels]

        # Step 5: Return new node embeddings.
        return aggr_out

In [None]:
from torch_geometric.data import Data

# Sample Edge Label Definition for Rechits - Adjacency List?
# Toy example: `edge_index` is a 2 x 4 integer tensor and `x` a 3 x 1 float tensor.
# Presumably each column of `edge_index` is one directed edge between two of
# the three rows (nodes) of `x` — confirm against the torch_geometric docs.
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)

In [None]:
'''Define the 2-layer GCN'''

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    """Two-layer GCN: conv -> ReLU -> dropout -> conv -> log-softmax over classes.

    The layer sizes are read from the module-level ``dataset``
    (``dataset.num_features`` inputs, 16 hidden channels,
    ``dataset.num_classes`` outputs).
    NOTE(review): ``dataset`` is not defined in the visible notebook — it has to
    exist before ``Net()`` is instantiated.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_features, 16)
        # Both sizes come from `dataset`; the previous `data.num_classes` read the
        # attribute from a single graph object instead of the dataset.
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        """Return per-node log-probabilities for the graph stored in ``data``."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


In [None]:
# CUDA available on cmg-gpu1080
# Train the 2-layer GCN for 200 epochs on the first graph of `dataset`.
# NOTE(review): `dataset` is not defined in the visible notebook — confirm where
# it is loaded. This cell also rebinds the module-level name `data`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed only on the nodes selected by `train_mask`
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()