In [1]:
# Install required packages.
import os
import torch
from torchmetrics import AUROC
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

2.2.1


In [2]:
from torch_geometric.datasets import TUDataset
train_dataset = TUDataset(root='data/TUDataset', name='Tox21_AhR_training')
test_dataset = TUDataset(root='data/TUDataset', name='Tox21_AhR_testing')

In [3]:
import torch
from torch_geometric.datasets import TUDataset

print()
print(f'Dataset: {train_dataset}:')
print('====================')
print(f'Number of graphs: {len(train_dataset)}')
print(f'Number of features: {train_dataset.num_features}')
print(f'Number of classes: {train_dataset.num_classes}')

data = train_dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

print()
print(f'Dataset: {test_dataset}:')
print('====================')
print(f'Number of graphs: {len(test_dataset)}')
print(f'Number of features: {test_dataset.num_features}')
print(f'Number of classes: {test_dataset.num_classes}')

data = test_dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Tox21_AhR_training(8169):
Number of graphs: 8169
Number of features: 50
Number of classes: 2

Data(edge_index=[2, 52], x=[25, 50], edge_attr=[52, 4], y=[1])
Number of nodes: 25
Number of edges: 52
Average node degree: 2.08
Has isolated nodes: False
Has self-loops: False
Is undirected: True

Dataset: Tox21_AhR_testing(272):
Number of graphs: 272
Number of features: 51
Number of classes: 2

Data(edge_index=[2, 44], x=[20, 51], edge_attr=[44, 4], y=[1])
Number of nodes: 20
Number of edges: 44
Average node degree: 2.20
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [4]:
dataset = train_dataset.shuffle()
# test_dataset = test_dataset.shuffle()

split = int(len(dataset) * 0.80)
train_dataset = dataset[:split]
test_dataset = dataset[split:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 6535
Number of test graphs: 1634


In [5]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2342], x=[1157, 50], edge_attr=[2342, 4], y=[64], batch=[1157], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2146], x=[1067, 50], edge_attr=[2146, 4], y=[64], batch=[1067], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2274], x=[1115, 50], edge_attr=[2274, 4], y=[64], batch=[1115], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2172], x=[1074, 50], edge_attr=[2172, 4], y=[64], batch=[1074], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2510], x=[1219, 50], edge_attr=[2510, 4], y=[64], batch=[1219], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2058], x=[1003, 50], edge_attr=[2058, 4], y=[64], batch=[1003], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2366], x=[1168, 50], edge

In [6]:
for step, data in enumerate(test_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 1972], x=[971, 50], edge_attr=[1972, 4], y=[64], batch=[971], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2452], x=[1206, 50], edge_attr=[2452, 4], y=[64], batch=[1206], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2300], x=[1120, 50], edge_attr=[2300, 4], y=[64], batch=[1120], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2390], x=[1168, 50], edge_attr=[2390, 4], y=[64], batch=[1168], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2134], x=[1052, 50], edge_attr=[2134, 4], y=[64], batch=[1052], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2254], x=[1092, 50], edge_attr=[2254, 4], y=[64], batch=[1092], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2062], x=[1021, 50], edge_a

In [7]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool, global_max_pool, global_add_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(train_dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, train_dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_add_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(50, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [8]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
auroc = AUROC(task="binary")

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return (correct / len(loader.dataset), auroc(out.softmax(dim=1)[:, 1], data.y))  # Derive ratio of correct predictions.


for epoch in range(1, 50):
    train()
    train_acc, train_auc = test(train_loader)
    test_acc, test_auc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

Epoch: 001, Train Acc: 0.8834, Test Acc: 0.8849, Test AUC: 0.7202




Epoch: 002, Train Acc: 0.8834, Test Acc: 0.8849, Test AUC: 0.6964
Epoch: 003, Train Acc: 0.8836, Test Acc: 0.8843, Test AUC: 0.7321
Epoch: 004, Train Acc: 0.8837, Test Acc: 0.8849, Test AUC: 0.7024
Epoch: 005, Train Acc: 0.8849, Test Acc: 0.8898, Test AUC: 0.7381
Epoch: 006, Train Acc: 0.8851, Test Acc: 0.8874, Test AUC: 0.6667
Epoch: 007, Train Acc: 0.8848, Test Acc: 0.8874, Test AUC: 0.7798
Epoch: 008, Train Acc: 0.8852, Test Acc: 0.8868, Test AUC: 0.7500
Epoch: 009, Train Acc: 0.8855, Test Acc: 0.8892, Test AUC: 0.8512
Epoch: 010, Train Acc: 0.8842, Test Acc: 0.8874, Test AUC: 0.8036
Epoch: 011, Train Acc: 0.8834, Test Acc: 0.8849, Test AUC: 0.8571
Epoch: 012, Train Acc: 0.8836, Test Acc: 0.8856, Test AUC: 0.8393
Epoch: 013, Train Acc: 0.8855, Test Acc: 0.8886, Test AUC: 0.8631
Epoch: 014, Train Acc: 0.8860, Test Acc: 0.8911, Test AUC: 0.8988
Epoch: 015, Train Acc: 0.8837, Test Acc: 0.8862, Test AUC: 0.8333
Epoch: 016, Train Acc: 0.8857, Test Acc: 0.8911, Test AUC: 0.8929
Epoch: 017

In [9]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.nn import global_mean_pool, global_max_pool, global_add_pool


class GAT(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        self.conv1 = GATConv(train_dataset.num_node_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.conv3 = GATConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, train_dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_add_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GAT(hidden_channels=64)
print(model)

GAT(
  (conv1): GATConv(50, 64, heads=1)
  (conv2): GATConv(64, 64, heads=1)
  (conv3): GATConv(64, 64, heads=1)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [10]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model = GAT(hidden_channels=64)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
auroc = AUROC(task="binary")

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return (correct / len(loader.dataset), auroc(out.softmax(dim=1)[:, 1], data.y))  # Derive ratio of correct predictions.

for epoch in range(1, 50):
    train()
    train_acc, train_auc = test(train_loader)
    test_acc, test_auc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, Test AUC: {test_auc:.4f}')

GAT(
  (conv1): GATConv(50, 64, heads=1)
  (conv2): GATConv(64, 64, heads=1)
  (conv3): GATConv(64, 64, heads=1)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)
Epoch: 001, Train Acc: 0.8834, Test Acc: 0.8849, Test AUC: 0.7024
Epoch: 002, Train Acc: 0.8834, Test Acc: 0.8849, Test AUC: 0.7679
Epoch: 003, Train Acc: 0.8848, Test Acc: 0.8880, Test AUC: 0.7143
Epoch: 004, Train Acc: 0.8840, Test Acc: 0.8898, Test AUC: 0.7143
Epoch: 005, Train Acc: 0.8840, Test Acc: 0.8886, Test AUC: 0.7202
Epoch: 006, Train Acc: 0.8836, Test Acc: 0.8886, Test AUC: 0.7500
Epoch: 007, Train Acc: 0.8832, Test Acc: 0.8856, Test AUC: 0.7857
Epoch: 008, Train Acc: 0.8845, Test Acc: 0.8892, Test AUC: 0.7619
Epoch: 009, Train Acc: 0.8846, Test Acc: 0.8892, Test AUC: 0.7738
Epoch: 010, Train Acc: 0.8836, Test Acc: 0.8849, Test AUC: 0.7500
Epoch: 011, Train Acc: 0.8836, Test Acc: 0.8849, Test AUC: 0.7202
Epoch: 012, Train Acc: 0.8836, Test Acc: 0.8849, Test AUC: 0.7440
Epoch: 013, Train Acc: 0.8849, Tes