This is the main notebook I use for testing code and creating graphs

In [1]:
# check pytorch version
import torch
print(torch.__version__)

2.4.1+cu121


In [2]:
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid, TUDataset
from torch_geometric.nn import GCN
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.metrics import accuracy_score
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GINConv, global_add_pool, Sequential
from torch_geometric.nn import GATConv
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import OneHotDegree

In [3]:
#Import the datasets
cora = Planetoid(root='/tmp/Cora', name='Cora')
imdb = TUDataset(root=f'/tmp/IMDB-BINARY', name='IMDB-BINARY')
enzymes = TUDataset(root=f'/tmp/ENZYMES', name='ENZYMES')

In [4]:
#Define a dict that we will use to store performance for each of our models on each dataset
scores = {"Cora": {}, "Imdb": {}, "Enzymes": {}}

# GCN Performance

#### Node classification

In [5]:
#Define a basic GCN for node classification
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(cora.num_node_features, 16)
        self.conv2 = GCNConv(16, cora.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [6]:
# Get the model, data, and optimizer setup in torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = cora[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [7]:
# train model for 150 epochs

model.train()
for epoch in range(151):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')

Epoch 000, Loss: 1.9566
Epoch 020, Loss: 0.2462
Epoch 040, Loss: 0.0478
Epoch 060, Loss: 0.0686
Epoch 080, Loss: 0.0372
Epoch 100, Loss: 0.0409
Epoch 120, Loss: 0.0305
Epoch 140, Loss: 0.0328


In [8]:
#evaluate the model
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
scores["Cora"]["GCN"] = acc
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7960


#### Graph Classification

In [9]:
#Define a basic GCN for graph classification
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.fc = torch.nn.Linear(16, num_classes)

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

In [10]:
# Helper fuction to make node features for the IMDB dataset
# We just represent each node with its degree
def create_degree_features(data):
    num_nodes = data.num_nodes
    degrees = torch.bincount(data.edge_index[0], minlength=num_nodes)
    
    feature_vector = degrees.view(-1, 1).float()
    
    return feature_vector

In [11]:
# Helper function to load data, train the model, and evaluate it for graph classification
def train_and_evaluate(dataset, epochs = 150):

    # Loading Data
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GCN(dataset.num_node_features or 1, dataset.num_classes).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            # if no node features we add in the degrees
            if data.x is None:
                data.x = create_degree_features(data).to(device)

            out = model(data.x, data.edge_index, data.batch)
            loss = F.nll_loss(out, data.y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if epoch % 20 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    for data in test_loader:
        data = data.to(device)

        # if no node features we add in the degrees
        if data.x is None:
            data.x = create_degree_features(data).to(device)

        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
        total += data.num_graphs

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')
    
    return accuracy

In [12]:
# Test model on Imdb dataset

In [13]:
scores["Imdb"]["GCN"] = train_and_evaluate(imdb)

Epoch 0, Loss: 0.6746
Epoch 20, Loss: 0.6701
Epoch 40, Loss: 0.7241
Epoch 60, Loss: 0.4621
Epoch 80, Loss: 0.5995
Epoch 100, Loss: 0.7216
Epoch 120, Loss: 0.4611
Epoch 140, Loss: 0.4974
Accuracy: 0.6600


In [14]:
# Test model on Enzymes dataset

In [15]:
scores["Enzymes"]["GCN"] = train_and_evaluate(enzymes)

Epoch 0, Loss: 1.7942
Epoch 20, Loss: 1.7589
Epoch 40, Loss: 1.6636
Epoch 60, Loss: 1.6799
Epoch 80, Loss: 1.7140
Epoch 100, Loss: 1.5424
Epoch 120, Loss: 1.7926
Epoch 140, Loss: 1.5380
Accuracy: 0.2583


In [16]:
# Check the scores so far

In [17]:
scores

{'Cora': {'GCN': 0.796},
 'Imdb': {'GCN': 0.66},
 'Enzymes': {'GCN': 0.25833333333333336}}

# GIN Performance

#### Node classification

In [18]:
# Helper function to mind max degree in dataset
def find_max_degree(dataset):
    max_degree = -1
    for data in dataset:
        degree = data.edge_index[0].bincount().max().item()
        if degree > max_degree:
            max_degree = degree
            return max_degree

In [19]:
# Get max of this dataset
cora_max = find_max_degree(cora)

In [20]:
# make a transformed dataset with the one hot encoded degree features
transformed_cora = Planetoid(root='/tmp/Cora', name='Cora', transform=OneHotDegree(cora_max))

In [21]:
#Define a GIN for node classification
class GIN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.mlp1 = torch.nn.Sequential(
            torch.nn.Linear(transformed_cora.num_node_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )
        self.mlp2 = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, transformed_cora.num_classes)
        )
        
        self.conv1 = GINConv(self.mlp1, eps=0.0, train_eps=True)
        self.conv2 = GINConv(self.mlp2, eps=0.0, train_eps=True)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)        
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [22]:
# Get the model, data, and optimizer setup in torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GIN().to(device)
data = transformed_cora[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [23]:
# train model for 150 epochs

model.train()
for epoch in range(151):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')

Epoch 000, Loss: 1.9842
Epoch 020, Loss: 0.7392
Epoch 040, Loss: 0.1852
Epoch 060, Loss: 0.1175
Epoch 080, Loss: 0.0528
Epoch 100, Loss: 0.0361
Epoch 120, Loss: 0.0334
Epoch 140, Loss: 0.0227


In [24]:
#evaluate the model
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
scores["Cora"]["GIN"] = acc
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7090


#### Graph Classification

In [25]:
#Define a GIN for node classification
class GIN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super().__init__()

        self.mlp1 = torch.nn.Sequential(
            torch.nn.Linear(num_node_features, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )
        self.mlp2 = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 16)
        )

        self.conv1 = GINConv(self.mlp1, eps=0.0, train_eps=True)
        self.conv2 = GINConv(self.mlp2, eps=0.0, train_eps=True)

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(16, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, num_classes)
        )

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = global_add_pool(x, batch)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc1(x)

        return F.log_softmax(x, dim=1)

In [30]:
# get max degrees for our datasets
def find_max_degree(dataset):
    max_degree = 0
    for data in dataset:
        max_degree = max(max_degree, int(data.edge_index.max()))
    return max_degree

imdb_max = find_max_degree(imdb)
enzymes_max = find_max_degree(enzymes)

In [31]:
# make a transformed datasets with the one hot encoded degree features
transformed_imdb = TUDataset(root='/tmp/IMDB', name='IMDB-BINARY', transform=OneHotDegree(imdb_max))
transformed_enzymes = TUDataset(root='/tmp/ENZYMES', name='ENZYMES', transform=OneHotDegree(enzymes_max))

In [32]:
# Helper function to load data, train the model, and evaluate it for graph classification
def train_and_evaluate(dataset, epochs = 150):

    # Loading Data
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GIN(dataset.num_node_features or 1, dataset.num_classes).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            out = model(data.x, data.edge_index, data.batch)
            loss = F.nll_loss(out, data.y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if epoch % 20 == 0:
            print(f'Epoch {epoch}, Loss: {loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    for data in test_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)
        correct += int((pred == data.y).sum())
        total += data.num_graphs

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')
    
    return accuracy

In [33]:
train_and_evaluate(transformed_imdb)

TypeError: GIN.forward() takes 2 positional arguments but 4 were given

In [None]:
scores

{'Cora': {'GCN': 0.807, 'GIN': 0.737},
 'Imdb': {'GCN': 0.72},
 'Enzymes': {'GCN': 0.2833333333333333}}