In [1]:
# Network
import torch
import torch.nn.functional as F

from torch_geometric.nn import GCNConv, ChebConv
from torch_geometric.nn import global_max_pool

# Data
from torch.utils.data.sampler import SubsetRandomSampler

from torch_geometric.data import Batch, Data, Dataset, DataLoader

# General
import numpy as np

# Util
import os.path as osp
import h5py
import pickle

import time


### Generate fake indices

In [2]:
# length = 2937
# splits = 10
# random_shuffle = np.random.permutation(length)
# validation_indicies = random_shuffle[:length//splits]
# test_indicies = random_shuffle[length//splits:2*length//splits]
# train_indicies = random_shuffle[2*length//splits:]

# with open("train_indicies.txt", 'w') as f:
#     f.writelines(["{}\n".format(i) for i in train_indicies])
    
# with open("validation_indicies.txt", 'w') as f:
#     f.writelines(["{}\n".format(i) for i in validation_indicies])

# with open("test_indicies.txt", 'w') as f:
#     f.writelines(["{}\n".format(i) for i in test_indicies])

# Data Loader

In [3]:

class WCH5Dataset(Dataset):
    """
    Graph dataset over Water Cherenkov detector events stored in an hdf5 file.

    The ``event_data`` hdf5 dataset is memory-mapped straight from the file
    rather than read into RAM, so it must be stored uncompressed and unchunked.
    The ``labels`` hdf5 dataset is loaded into memory outright.
    Every sample shares a single ``edge_index`` built from a pickled dict.
    No other data is currently loaded.

    Args:
        path: Path of the hdf5 file holding ``event_data`` and ``labels``.
        train_indices_file: Text file with one training event index per line.
        validation_indices_file: Same format, validation split.
        test_indices_file: Same format, test split.
        edge_index_pickle: Pickle file holding a dict that maps a node index
            to an iterable of neighbouring node indices.
        nodes: Number of graph nodes; edge endpoints are checked against it.
        transform, pre_transform, pre_filter: Forwarded to the base class.
        use_node_attr, use_edge_attr, cleaned: Accepted but currently unused.
    """

    # Override the default implementation: there is nothing to download or
    # pre-process, the hdf5 file is consumed as-is.
    def _download(self):
        pass

    def _process(self):
        pass

    def __init__(self, path, train_indices_file, validation_indices_file, test_indices_file,
                 edge_index_pickle, nodes=15808,
                 transform=None, pre_transform=None, pre_filter=None,
                 use_node_attr=False, use_edge_attr=False, cleaned=False):

        super(WCH5Dataset, self).__init__("", transform, pre_transform,
                                          pre_filter)

        # Keep the hdf5 handle open only long enough to read the metadata and
        # the labels; the event data itself is accessed through np.memmap.
        with h5py.File(path, 'r') as f:
            hdf5_event_data = f["event_data"]
            hdf5_labels = f["labels"]

            assert hdf5_event_data.shape[0] == hdf5_labels.shape[0]

            event_data_shape = hdf5_event_data.shape
            event_data_offset = hdf5_event_data.id.get_offset()
            event_data_dtype = hdf5_event_data.dtype

            # this will fit easily in memory even for huge datasets
            self.labels = np.array(hdf5_labels)

        if event_data_offset is None:
            raise ValueError(
                "'event_data' in {} has no file offset and cannot be memory-mapped; "
                "the detector data must be uncompressed and unchunked".format(path))

        # this creates a memory map - i.e. events are not loaded in memory here
        # only on get()
        self.event_data = np.memmap(path, mode='r', shape=event_data_shape,
                                    offset=event_data_offset, dtype=event_data_dtype)

        self.nodes = nodes
        self.load_edges(edge_index_pickle)

        self.transform = transform

        # train/validation/test splits are read from plain-text index files
        self.train_indices = self.load_indicies(train_indices_file)
        self.val_indices = self.load_indicies(validation_indices_file)
        self.test_indices = self.load_indicies(test_indices_file)

    def load_indicies(self, indicies_file):
        """Read one integer index per line from ``indicies_file``.

        Blank / whitespace-only lines (e.g. a trailing newline at the end of
        the file) are skipped instead of raising ``ValueError``.

        Returns:
            list of int, in file order.
        """
        with open(indicies_file, 'r') as f:
            return [int(line.strip()) for line in f if line.strip()]

    def _node_index(self, idx):
        """Return ``idx`` as a non-negative node index, or raise IndexError.

        Negative values wrap around like ordinary Python/tensor indexing so
        that the accepted range is the same as when indexing a
        ``nodes x nodes`` matrix.
        """
        idx = int(idx)
        if not -self.nodes <= idx < self.nodes:
            raise IndexError(
                "node index {} is out of range for {} nodes".format(idx, self.nodes))
        return idx % self.nodes

    def load_edges(self, edge_index_pickle):
        """Build ``self.edge_index`` (int64, shape ``[2, num_edges]``).

        The pickle must hold a dict mapping a source node to an iterable of
        target nodes. Edges are de-duplicated and sorted by (source, target),
        which is the ordering obtained from the non-zero entries of a dense
        adjacency matrix - but without allocating the ``nodes x nodes`` matrix
        (about 2 GB of int64 for the default 15808 nodes).
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # edge files from a trusted source.
        with open(edge_index_pickle, 'rb') as f:
            edges = pickle.load(f)

        pairs = set()
        for k, vs in edges.items():
            for v in vs:
                pairs.add((self._node_index(k), self._node_index(v)))

        if pairs:
            edge_index = torch.tensor(sorted(pairs), dtype=torch.int64).t().contiguous()
        else:
            # torch.tensor([]) would be 1-D float; keep the [2, 0] int64 shape
            edge_index = torch.empty((2, 0), dtype=torch.int64)

        self.edge_index = edge_index

    def get(self, idx):
        """Return event ``idx`` as a ``Data`` object.

        ``x`` holds the event data, ``y`` the label as a one-element int64
        tensor, and ``edge_index`` is the connectivity shared by all events.
        """
        # Copy the event out of the read-only memory map: torch.from_numpy
        # shares memory with its argument and does not support non-writable
        # arrays.
        x = torch.from_numpy(np.array(self.event_data[idx]))
        y = torch.tensor([self.labels[idx]], dtype=torch.int64)

        return Data(x=x, y=y, edge_index=self.edge_index)

    def len(self):
        """Number of events (one label per event).

        NOTE(review): provided alongside ``__len__`` because some
        torch_geometric versions appear to query ``len()`` instead - confirm
        against the installed version.
        """
        return self.labels.shape[0]

    def __len__(self):
        return self.len()

In [4]:
# Build the dataset from the hdf5 event file, the three split-index files and
# the pickled edge dictionary.
dataset = WCH5Dataset(
    path="/app/IWCDmPMT_4pi_full_tank_test.h5",
    train_indices_file="train_indicies.txt",
    validation_indices_file="validation_indicies.txt",
    test_indices_file="test_indicies.txt",
    edge_index_pickle="../../visualization/edges_dict.pkl",
)

In [5]:
# Draw mini-batches of 32 events, sampled at random from the training split only.
train_sampler = SubsetRandomSampler(dataset.train_indices)
train_loader = DataLoader(
    dataset,
    batch_size=32,
    pin_memory=True,
    sampler=train_sampler,
)

In [6]:
# Grab a single mini-batch; the cells below train and time on this one batch.
batch = next(iter(train_loader))

# Network stuff

In [7]:
class Net(torch.nn.Module):
    """Three-layer GCN that classifies whole graphs.

    Node features pass through three ``GCNConv`` layers (ReLU + dropout after
    the first two), are max-pooled per graph, and are returned as
    log-probabilities over the classes.

    Args:
        in_channels: Number of features per node.
        hidden_channels: Width of the two hidden layers.
        num_classes: Number of output classes.
        cached: Forwarded to every ``GCNConv``. When True the layer reuses the
            normalised adjacency computed on its first call, which is only
            valid while every call receives the same ``edge_index`` (as when
            training on one fixed batch). Pass False when iterating over
            batches whose connectivity or size can differ.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_channels=2, hidden_channels=16, num_classes=5,
                 cached=True, dropout=0.5):
        super(Net, self).__init__()
        self.dropout_p = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=cached)
        self.conv2 = GCNConv(hidden_channels, hidden_channels, cached=cached)
        self.conv3 = GCNConv(hidden_channels, num_classes, cached=cached)

#         self.conv1 = ChebConv(2, 16, K=2)
#         self.conv2 = ChebConv(16, 5, K=2)

    def forward(self, x, edge_index, batch):
        """Return per-graph log-probabilities.

        Args:
            x: Node features for all graphs in the batch.
            edge_index: Connectivity of the batched graph.
            batch: Vector assigning every node to its graph, used for pooling.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.conv3(x, edge_index)
        # Collapse node-level outputs to one row per graph
        x = global_max_pool(x, batch)
        return F.log_softmax(x, dim=1)

In [8]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move both the network and the single training batch onto that device.
model = Net().to(device)
batch = batch.to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)


In [9]:
def train():
    """Take one optimiser step on the module-level ``batch``.

    Puts ``model`` in training mode, clears old gradients, back-propagates the
    negative log-likelihood of the batch and updates the weights through
    ``optimizer``. Returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(batch.x, batch.edge_index, batch.batch)
    loss = F.nll_loss(log_probs, batch.y)
    loss.backward()

    optimizer.step()


def test():
    """Return the accuracy of ``model`` on the module-level ``batch``.

    The model is switched to eval mode (disabling dropout) and the forward
    pass runs under ``torch.no_grad()`` so that no autograd graph is built
    for a pure evaluation.

    Returns:
        float: fraction of graphs in ``batch`` whose predicted class equals
        ``batch.y``.
    """
    model.eval()
    with torch.no_grad():
        logits = model(batch.x, batch.edge_index, batch.batch)
    pred = logits.argmax(1)
    acc = pred.eq(batch.y).sum().item() / batch.y.shape[0]
    return acc

In [10]:
# Time 200 optimisation steps on the single cached batch.
# CUDA kernels are launched asynchronously, so when the batch lives on the GPU
# the device is synchronised before the clock is started and before it is
# read; otherwise the measurement would not cover the actual GPU work.
on_gpu = batch.x.is_cuda
if on_gpu:
    torch.cuda.synchronize()
start = time.perf_counter()

best_acc = 0
# Guarded so that re-running this cell does not convert an already-sparse tensor
if not batch.x.is_sparse:
    batch.x = batch.x.to_sparse()
for epoch in range(1, 201):
    train()
#     train_acc = test()
#     if train_acc > best_acc:
#         best_acc = train_acc
#     log = 'Epoch: {:03d}, Train: {:.4f}, Best: {:.4f}'
#     print(log.format(epoch, train_acc, best_acc))
if on_gpu:
    torch.cuda.synchronize()
print(time.perf_counter() - start)

14.30902886390686
