Welcome back! You know the drill:

In [1]:
!pip install deepchem
!pip install 'deepchem[torch]'
!pip install rdkit
!pip install torch_geometric
!pip install torch
!pip install pytorch-ignite


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgr

[MUTAG](https://chrsmrrs.github.io/datasets/docs/datasets/#:~:text=MUTAG,MUTAG) is a very popular datset for graph classification on [PapersWithCode](https://paperswithcode.com/task/graph-classification/latest). Experts know best, so let's stand on the shoulders of giants.

In [2]:
from torch_geometric.datasets import TUDataset
dataset = TUDataset(root="./data/", name="MUTAG")

In [3]:
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [4]:
print("Number of 0s:", list(dataset.data.y).count(0))
print("Number of 1s: ", list(dataset.data.y).count(1))

Number of 0s: 63
Number of 1s:  125




<img src="./images/thanos.jpg" alt="drawing" width="500"/>

In [5]:
dataset.shuffle()
split = int(len(dataset) * 0.8)
train_dataset, test_dataset = dataset[:split], dataset[split:]
print("Train:", len(train_dataset), "| Test:", len(test_dataset))

Train: 150 | Test: 38


In [6]:
import itertools
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print() 

Step 1:
Number of graphs in the current batch: 32
DataBatch(edge_index=[2, 1188], x=[538, 7], edge_attr=[1188, 4], y=[32], batch=[538], ptr=[33])

Step 2:
Number of graphs in the current batch: 32
DataBatch(edge_index=[2, 1266], x=[575, 7], edge_attr=[1266, 4], y=[32], batch=[575], ptr=[33])

Step 3:
Number of graphs in the current batch: 32
DataBatch(edge_index=[2, 1372], x=[619, 7], edge_attr=[1372, 4], y=[32], batch=[619], ptr=[33])

Step 4:
Number of graphs in the current batch: 32
DataBatch(edge_index=[2, 1196], x=[544, 7], edge_attr=[1196, 4], y=[32], batch=[544], ptr=[33])

Step 5:
Number of graphs in the current batch: 22
DataBatch(edge_index=[2, 908], x=[408, 7], edge_attr=[908, 4], y=[22], batch=[408], ptr=[23])



In [7]:
import torch
from torch_geometric.nn import GraphConv
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool


class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GNN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GraphConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1) # Output a binary value (0 or 1) because this is a binary classification problem 

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)

        x = x.to(self.lin.weight.dtype)
        x = self.lin(x)
        
        return x

model = GNN(hidden_channels=64)
print(model)

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=1, bias=True)
)


In [8]:
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [9]:
from ignite.engine import *
from ignite.handlers import *
from ignite.metrics import *
from ignite.utils import *
from ignite.contrib.metrics.regression import *
from ignite.contrib.metrics import *

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.BCEWithLogitsLoss()


def train(loader):
    model.train()

    train_loss, train_acc = 0, 0
    
    for data in loader:  # Iterate in batches over the training dataset.
        try: 
            # 1. Forward pass (model outputs raw logits)
            y_logits = model(data.x, data.edge_index, data.batch)
            y_pred = torch.round(torch.sigmoid(y_logits))
            y_true = data.y # extract target column

            # 2. Calculate loss/accuracy
            loss = loss_fn(y_logits.squeeze(), y_true.float())
            train_loss += loss
            train_acc += accuracy_fn(y_true=y_true, y_pred=y_pred.squeeze())

            # 3. Optimizer zero grad
            optimizer.zero_grad() 

            # 4. Loss backwards
            loss.backward()  

            # 5. Optimizer step
            optimizer.step()
        except:
            continue
    train_loss /= len(loader)
    train_acc /= len(loader)
    print(f"Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%")

def test(loader):
     model.eval()

     test_loss, test_acc = 0, 0
     def eval_step(engine, batch):
        return batch

     default_evaluator = Engine(eval_step)
 
     roc_auc = ROC_AUC()
     roc_auc.attach(default_evaluator, 'roc_auc')
     roc_auc = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         y_logits = model(data.x, data.edge_index, data.batch)
         y_pred = torch.round(torch.sigmoid(y_logits))
         y_true = data.y # extract target column

         test_loss += loss_fn(y_logits.squeeze(), y_true.float())
         test_acc += accuracy_fn(y_true=y_true, y_pred=y_pred.squeeze())
         state = default_evaluator.run([[y_pred.squeeze(), y_true]])
         roc_auc += state.metrics['roc_auc']
     test_loss /= len(loader)
     test_acc /= len(loader)
     roc_auc /= len(loader)
     print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}% | ROC-AUC: {roc_auc:.2f} \n")

In [10]:
epochs = 150
for epoch in range(epochs):
    train(train_loader)
    test(test_loader)

Train loss: 0.81634 | Train accuracy: 47.56%
Test loss: 0.66830 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.63178 | Train accuracy: 65.23%
Test loss: 0.65238 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.59958 | Train accuracy: 65.85%
Test loss: 0.63826 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.58764 | Train accuracy: 66.14%
Test loss: 0.60446 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.58866 | Train accuracy: 64.72%
Test loss: 0.57928 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.52276 | Train accuracy: 65.85%
Test loss: 0.51399 | Test accuracy: 60.94% | ROC-AUC: 0.50 

Train loss: 0.49319 | Train accuracy: 75.80%
Test loss: 0.42835 | Test accuracy: 79.17% | ROC-AUC: 0.81 

Train loss: 0.46928 | Train accuracy: 78.52%
Test loss: 0.54696 | Test accuracy: 72.40% | ROC-AUC: 0.71 

Train loss: 0.53056 | Train accuracy: 74.83%
Test loss: 0.41366 | Test accuracy: 85.94% | ROC-AUC: 0.90 

Train loss: 0.47523 | Train accuracy: 77.33%
T

Very reasonable results! Call us Leibniz-Newton, the way we integrated these changes.