In [125]:
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.data import DataLoader
import src.GNNtrainer.models as models
from collections import namedtuple
from tensorboardX import SummaryWriter
import torch
import time
from torch.autograd import Variable
from torch_geometric.datasets import TUDataset
import torch_geometric.transforms as T

In [126]:
# For OGBG-MOLHIV
#dataset = PygGraphPropPredDataset(name = "ogbg-molhiv", root = '/io/ogbg')
# For reddit-binary
dataset = TUDataset(root='/io/reddit', name='REDDIT-BINARY', transform=T.Constant(1))

## Train on REDDIT-BINARY

In [127]:
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: REDDIT-BINARY(2000):
Number of graphs: 2000
Number of features: 1
Number of classes: 2

Data(edge_index=[2, 480], x=[218, 1], y=[1])
Number of nodes: 218
Number of edges: 480
Average node degree: 2.20
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


In [129]:
torch.manual_seed(12345)
dataset = dataset.shuffle()

train_dataset = dataset[:1500]
test_dataset = dataset[1500:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 1500
Number of test graphs: 500


In [130]:
from torch_geometric.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
Batch(batch=[26406], edge_index=[2, 61042], x=[26406, 1], y=[64])

Step 2:
Number of graphs in the current batch: 64
Batch(batch=[30471], edge_index=[2, 69260], x=[30471, 1], y=[64])

Step 3:
Number of graphs in the current batch: 64
Batch(batch=[27038], edge_index=[2, 62574], x=[27038, 1], y=[64])

Step 4:
Number of graphs in the current batch: 64
Batch(batch=[21653], edge_index=[2, 50846], x=[21653, 1], y=[64])

Step 5:
Number of graphs in the current batch: 64
Batch(batch=[34406], edge_index=[2, 79612], x=[34406, 1], y=[64])

Step 6:
Number of graphs in the current batch: 64
Batch(batch=[18666], edge_index=[2, 43914], x=[18666, 1], y=[64])

Step 7:
Number of graphs in the current batch: 64
Batch(batch=[37943], edge_index=[2, 86086], x=[37943, 1], y=[64])

Step 8:
Number of graphs in the current batch: 64
Batch(batch=[26932], edge_index=[2, 62426], x=[26932, 1], y=[64])

Step 9:
Number of graphs in the current batch: 64
Batch(batch=[3

In [131]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from ogb.graphproppred.mol_encoder import AtomEncoder


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.emb = AtomEncoder(9)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        #x = self.emb(x)??????????????
        x = x.type(torch.FloatTensor)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (emb): AtomEncoder(
    (atom_embedding_list): ModuleList(
      (0): Embedding(119, 9)
      (1): Embedding(4, 9)
      (2): Embedding(12, 9)
      (3): Embedding(12, 9)
      (4): Embedding(10, 9)
      (5): Embedding(6, 9)
      (6): Embedding(6, 9)
      (7): Embedding(2, 9)
      (8): Embedding(2, 9)
    )
  )
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [144]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = torch.nn.CrossEntropyLoss()

def train():
    
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.      
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y.squeeze())  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader):
    
    model.eval()
    
    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        out = model(data.x, data.edge_index, data.batch)  
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        print(pred)
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.




In [145]:
model.train()
for epoch in range(1, 201):
    epoch_loss = 0.0
    for data in train_loader:  # Iterate in batches over the training dataset.      
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y.squeeze())  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
        epoch_loss += out.shape[0] * loss.item()
    print(f'epoch {epoch} loss: {epoch_loss/len(train_loader.dataset)}')

epoch 1 loss: 0.6954443575541178
epoch 2 loss: 0.6915993765195211
epoch 3 loss: 0.6826803833643595
epoch 4 loss: 0.6320986620585124
epoch 5 loss: 0.5900826771259308
epoch 6 loss: 0.5754357085227967
epoch 7 loss: 0.5884442135492961
epoch 8 loss: 0.5763287183443705
epoch 9 loss: 0.5784719020525615
epoch 10 loss: 0.5745966023604075
epoch 11 loss: 0.5829605614344279
epoch 12 loss: 0.5877226552963257
epoch 13 loss: 0.5781127626101176
epoch 14 loss: 0.5800930013656617
epoch 15 loss: 0.5755476876894633
epoch 16 loss: 0.574506546497345
epoch 17 loss: 0.5759740679264068
epoch 18 loss: 0.5820157322883606
epoch 19 loss: 0.5875841024716695
epoch 20 loss: 0.5747259953022004
epoch 21 loss: 0.5754176880518596
epoch 22 loss: 0.5748744239807129
epoch 23 loss: 0.5761179342269898
epoch 24 loss: 0.5768507223129272
epoch 25 loss: 0.5797089120546977
epoch 26 loss: 0.5797975963751475
epoch 27 loss: 0.5747252044677734
epoch 28 loss: 0.5764869771003723
epoch 29 loss: 0.5844195675849915
epoch 30 loss: 0.5735849

In [152]:
from sklearn.metrics import confusion_matrix

In [153]:
y_true = []
y_pred = []

In [154]:
for data in test_loader:
    y_true.extend(data.y.squeeze())
    y_pred.extend(model(data.x, data.edge_index, data.batch).argmax(dim=1))

In [155]:
import numpy as np
y_true = np.array(y_true)
y_pred = np.array(y_pred)

In [156]:
cm = confusion_matrix(y_true, y_pred)

In [157]:
cm

array([[203,  40],
       [ 26, 231]])

In [158]:
torch.save(model, '/io/pretrained_reddit')

In [None]:
cg_dict = {
        "adj": all_adjs,
        "feat": all_feats,
        "label": all_labels,
        "pred": np.expand_dims(predictions, axis=0),
    }

In [None]:
torch.save(cg_dict, '/io/cg_reddit')