In [1]:
# Install required packages.
import os
import torch
from torchmetrics import AUROC
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

2.2.1


In [2]:
from torch_geometric.datasets import TUDataset
dataset = TUDataset(root='data/TUDataset', name='Tox21_AhR_training')

In [3]:
import torch
from torch_geometric.datasets import TUDataset

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Tox21_AhR_training(8169):
Number of graphs: 8169
Number of features: 50
Number of classes: 2

Data(edge_index=[2, 52], x=[25, 50], edge_attr=[52, 4], y=[1])
Number of nodes: 25
Number of edges: 52
Average node degree: 2.08
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
train_idx, test_idx = train_test_split(np.arange(len(dataset)),
                                             test_size=0.2,
                                             shuffle=True,
                                             stratify=dataset.data.y)

train_dataset = Subset(dataset, train_idx)
test_dataset = Subset(dataset, test_idx)


# split = int(len(dataset) * 0.80)
# train_dataset = dataset[:split]
# test_dataset = dataset[split:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 6535
Number of test graphs: 1634




In [5]:
count = {}
for mol in train_dataset:
    count[mol.y[0].item()] = 1 + count.get(mol.y[0].item(), 0)
count

{0: 5775, 1: 760}

In [6]:
count = {}
for mol in test_dataset:
    count[mol.y[0].item()] = 1 + count.get(mol.y[0].item(), 0)
count

{1: 190, 0: 1444}

In [7]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2604], x=[1271, 50], edge_attr=[2604, 4], y=[64], batch=[1271], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2432], x=[1181, 50], edge_attr=[2432, 4], y=[64], batch=[1181], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2430], x=[1184, 50], edge_attr=[2430, 4], y=[64], batch=[1184], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2342], x=[1136, 50], edge_attr=[2342, 4], y=[64], batch=[1136], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2078], x=[1028, 50], edge_attr=[2078, 4], y=[64], batch=[1028], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2612], x=[1279, 50], edge_attr=[2612, 4], y=[64], batch=[1279], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2546], x=[1225, 50], edge

In [8]:
for step, data in enumerate(test_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2188], x=[1072, 50], edge_attr=[2188, 4], y=[64], batch=[1072], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2366], x=[1153, 50], edge_attr=[2366, 4], y=[64], batch=[1153], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2366], x=[1165, 50], edge_attr=[2366, 4], y=[64], batch=[1165], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2220], x=[1083, 50], edge_attr=[2220, 4], y=[64], batch=[1083], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2370], x=[1154, 50], edge_attr=[2370, 4], y=[64], batch=[1154], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2498], x=[1222, 50], edge_attr=[2498, 4], y=[64], batch=[1222], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2494], x=[1222, 50], edge

In [9]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool, global_max_pool, global_add_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_add_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(50, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [10]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
auroc = AUROC(task="binary")

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return (correct / len(loader.dataset), auroc(out.softmax(dim=1)[:, 1], data.y))  # Derive ratio of correct predictions.


for epoch in range(1, 50):
    train()
    train_acc, train_auc = test(train_loader)
    test_acc, test_auc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

Epoch: 001, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.7172
Epoch: 002, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8069
Epoch: 003, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8276




Epoch: 004, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8690
Epoch: 005, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8966
Epoch: 006, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.9172
Epoch: 007, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8690
Epoch: 008, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8759
Epoch: 009, Train Acc: 0.8828, Test Acc: 0.8843, Test AUC: 0.8000
Epoch: 010, Train Acc: 0.8799, Test Acc: 0.8807, Test AUC: 0.8000
Epoch: 011, Train Acc: 0.8817, Test Acc: 0.8825, Test AUC: 0.8276
Epoch: 012, Train Acc: 0.8831, Test Acc: 0.8825, Test AUC: 0.8069
Epoch: 013, Train Acc: 0.8842, Test Acc: 0.8831, Test AUC: 0.8207
Epoch: 014, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.6552
Epoch: 015, Train Acc: 0.8840, Test Acc: 0.8813, Test AUC: 0.8000
Epoch: 016, Train Acc: 0.8834, Test Acc: 0.8813, Test AUC: 0.7724
Epoch: 017, Train Acc: 0.8840, Test Acc: 0.8825, Test AUC: 0.8414
Epoch: 018, Train Acc: 0.8846, Test Acc: 0.8831, Test AUC: 0.8621
Epoch: 019

In [11]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.nn import global_mean_pool, global_max_pool, global_add_pool


class GAT(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        self.conv1 = GATConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_add_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GAT(hidden_channels=64)
print(model)

GAT(
  (conv1): GATConv(50, 64, heads=1)
  (conv2): GATConv(64, 64, heads=1)
  (conv3): GATConv(64, 64, heads=1)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [12]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model = GAT(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
auroc = AUROC(task="binary")

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return (correct / len(loader.dataset), auroc(out.softmax(dim=1)[:, 1], data.y))  # Derive ratio of correct predictions.

for epoch in range(1, 50):
    train()
    train_acc, train_auc = test(train_loader)
    test_acc, test_auc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

GAT(
  (conv1): GATConv(50, 64, heads=1)
  (conv2): GATConv(64, 64, heads=1)
  (conv3): GATConv(64, 64, heads=1)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)
Epoch: 001, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8138
Epoch: 002, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8414
Epoch: 003, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8345
Epoch: 004, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8828
Epoch: 005, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.9103
Epoch: 006, Train Acc: 0.8836, Test Acc: 0.8831, Test AUC: 0.8759
Epoch: 007, Train Acc: 0.8849, Test Acc: 0.8856, Test AUC: 0.9034
Epoch: 008, Train Acc: 0.8837, Test Acc: 0.8837, Test AUC: 0.8897
Epoch: 009, Train Acc: 0.8854, Test Acc: 0.8874, Test AUC: 0.6483
Epoch: 010, Train Acc: 0.8832, Test Acc: 0.8825, Test AUC: 0.8414
Epoch: 011, Train Acc: 0.8846, Test Acc: 0.8868, Test AUC: 0.8138
Epoch: 012, Train Acc: 0.8860, Test Acc: 0.8856, Test AUC: 0.9103
Epoch: 013, Train Acc: 0.8858, Tes