## Importing libraries and dataset path

In [1]:
## Standard libraries
import os
import json
import math
import numpy as np
import time

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()
sns.set()

## Progress bar
from tqdm.notebook import tqdm


## PyTorch (Main library for building and training neural networks)
import torch  # Core PyTorch library for tensors and operations on them.
import torch.nn as nn  # nn contains classes for creating and defining neural network layers.
import torch.nn.functional as F  # F contains functions that perform operations such as activations, convolutions, etc., without needing to define layers explicitly.
import torch.utils.data as data  # Utilities for loading and managing data (like DataLoader).
import torch.optim as optim  # Contains optimization algorithms such as SGD, Adam for training models.

## Torchvision (Used for loading common datasets and performing image transformations)
import torchvision  # Provides utilities for computer vision tasks such as datasets, models, and transformations.
from torchvision.datasets import CIFAR10  # CIFAR10 is a commonly used dataset containing small 32x32 color images across 10 classes.
from torchvision import transforms  # Used to apply transformations (e.g., resize, normalize) to the input data.

## PyTorch Lightning (A framework that simplifies training, testing, and validation of models)
try:
    import pytorch_lightning as pl  # PyTorch Lightning (pl) is an abstraction layer that organizes PyTorch code, making it easier to manage experiments and models.
except ModuleNotFoundError:  # If PyTorch Lightning is not installed (e.g., in Google Colab environments), the script installs it.
    !pip install --quiet pytorch-lightning>=1.4  # Installing PyTorch Lightning (version >= 1.4) if it's missing.
    import pytorch_lightning as pl  # After installing, it imports PyTorch Lightning.

# Callbacks from PyTorch Lightning
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint  
# LearningRateMonitor tracks the learning rate during training.
# ModelCheckpoint saves the best model during training, based on some criteria (e.g., lowest validation loss).

# Path to the folder where the datasets are/should be downloaded (e.g., CIFAR10)
DATASET_PATH = "D:/USC_Course/CSCE 790 Section 007 Neural Networks and Their Applications/cifar-10-python/cifar-10-batches-py"
# This specifies the local path where the CIFAR-10 dataset will be downloaded or loaded from.

# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "D:/USC_Course/CSCE 790 Section 007 Neural Networks and Their Applications/CHECKPOINT_PATH"
# This path specifies where the trained or pre-trained models will be saved or loaded from.

# Setting the seed for reproducibility
pl.seed_everything(42)  # This sets the random seed to ensure results are reproducible. The number 42 is arbitrary, but commonly used in random seeds.

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True  # Ensures that the results on GPU are deterministic by disabling non-deterministic algorithms.
torch.backends.cudnn.benchmark = False  # Disabling benchmarking makes training slower but ensures deterministic behavior.

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# "cuda:0" refers to the first GPU, if available. If no GPU is found, it will use the CPU for computations.

print(device)  # Print the device being used (either "cuda:0" for GPU or "cpu").

  set_matplotlib_formats('svg', 'pdf') # For export
Seed set to 42


cuda:0


## Pre-trained models which is downloadable from the mentioned URL

In [2]:
import urllib.request
from urllib.error import HTTPError
# Github URL where saved models are stored for this tutorial
base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial7/"
# Files to download
CHECKPOINT_PATH = "D:/USC_Course/CSCE 790 Section 007 Neural Networks and Their Applications/CHECKPOINT_PATH/"
pretrained_files = ["NodeLevelMLP.ckpt", "NodeLevelGNN.ckpt", "GraphLevelGraphConv.ckpt"]

# Create checkpoint path if it doesn't exist yet
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# For each file, check whether it already exists. If not, try downloading it.
for file_name in pretrained_files:
    file_path = os.path.join(CHECKPOINT_PATH, file_name)
    if "/" in file_name:
        os.makedirs(file_path.rsplit("/",1)[0], exist_ok=True)
    if not os.path.isfile(file_path):
        file_url = base_url + file_name
        print(f"Downloading {file_url}...")
        try:
            urllib.request.urlretrieve(file_url, file_path)
        except HTTPError as e:
            print("Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", e)

## Definition of the GCNLayer Class

In [3]:
class GCNLayer(nn.Module):

    def __init__(self, c_in, c_out):
        super().__init__()
        # Linear transformation to map input features to output features
        self.projection = nn.Linear(c_in, c_out)

    def forward(self, node_feats, adj_matrix):
        """
        Inputs:
            node_feats - Tensor with node features of shape [batch_size, num_nodes, c_in]
            adj_matrix - Batch of adjacency matrices of the graph. If there is an edge from i to j, adj_matrix[b,i,j]=1 else 0.
                         Supports directed edges by non-symmetric matrices. Assumes to already have added the identity connections.
                         Shape: [batch_size, num_nodes, num_nodes]
        """
        # Calculate number of neighbors by summing adjacency matrix along the last dimension
        num_neighbours = adj_matrix.sum(dim=-1, keepdims=True)
        
        # Apply linear transformation to node features
        node_feats = self.projection(node_feats)
        
        # Perform graph convolution (message passing) by matrix multiplication between adj_matrix and node features
        node_feats = torch.bmm(adj_matrix, node_feats)
        
        # Normalize the updated node features by the number of neighbors (degree normalization)
        node_feats = node_feats / num_neighbours
        
        # Return the updated node features
        return node_feats


## Initializing Node Features and Adjacency Matrix

In [4]:
# Create a tensor of node features with values ranging from 0 to 7 and cast to float32 type
node_feats = torch.arange(8, dtype=torch.float32).view(1, 4, 2)
# The tensor is reshaped to [1, 4, 2] where:
# 1 is the batch size,
# 4 is the number of nodes,
# 2 is the number of features per node.

# Define an adjacency matrix representing the graph structure
adj_matrix = torch.Tensor([[[1, 1, 0, 0],  # Node 0 is connected to node 1
                            [1, 1, 1, 1],  # Node 1 is connected to all nodes
                            [0, 1, 1, 1],  # Node 2 is connected to node 1, 2, and 3
                            [0, 1, 1, 1]]]) # Node 3 is connected to node 1, 2, and 3
# Shape of adj_matrix: [1, 4, 4], indicating 1 batch, 4 nodes, and connections between nodes

# Print node features tensor to verify the initialized values
print("Node features:\n", node_feats)

# Print adjacency matrix to verify the defined graph structure
print("\nAdjacency matrix:\n", adj_matrix)


Node features:
 tensor([[[0., 1.],
         [2., 3.],
         [4., 5.],
         [6., 7.]]])

Adjacency matrix:
 tensor([[[1., 1., 0., 0.],
         [1., 1., 1., 1.],
         [0., 1., 1., 1.],
         [0., 1., 1., 1.]]])


## Applying a GCN Layer with Identity Weight Matrix to Simplify Message Passing

In [5]:
# Initialize a GCN layer with 2 input features (c_in) and 2 output features (c_out)
layer = GCNLayer(c_in=2, c_out=2)

# Set the linear transformation weight matrix to an identity matrix (no transformation on features)
layer.projection.weight.data = torch.Tensor([[1., 0.], [0., 1.]])
# This ensures that the input features remain unchanged during the linear transformation.

# Set the bias term of the linear transformation to zero
layer.projection.bias.data = torch.Tensor([0., 0.])
# No bias will be added to the input features, further simplifying the transformation.

# Perform a forward pass through the GCN layer without computing gradients (for efficiency)
with torch.no_grad():
    # Apply the GCN layer to the node features and adjacency matrix
    out_feats = layer(node_feats, adj_matrix)
    # This step will perform message passing and feature aggregation based on the adjacency matrix.

# Print the adjacency matrix to confirm its structure
print("Adjacency matrix", adj_matrix)

# Print the input node features before the GCN layer operation
print("Input features", node_feats)

# Print the output node features after applying the GCN layer
print("Output features", out_feats)


Adjacency matrix tensor([[[1., 1., 0., 0.],
         [1., 1., 1., 1.],
         [0., 1., 1., 1.],
         [0., 1., 1., 1.]]])
Input features tensor([[[0., 1.],
         [2., 3.],
         [4., 5.],
         [6., 7.]]])
Output features tensor([[[1., 2.],
         [3., 4.],
         [4., 5.],
         [4., 5.]]])


## Implementation of a Graph Attention Layer (GATLayer)

In [6]:
class GATLayer(nn.Module):

    def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2):
        """
        Inputs:
            c_in - Dimensionality of input features
            c_out - Dimensionality of output features
            num_heads - Number of attention heads applied in parallel. The output features are
                        split equally over the heads if concat_heads=True.
            concat_heads - If True, concatenate the output from different heads. Otherwise, average them.
            alpha - Negative slope for the LeakyReLU activation function.
        """
        super().__init__()
        self.num_heads = num_heads
        self.concat_heads = concat_heads
        
        # c_in: Input feature size per node
        # c_out: Output feature size per node
        # If the outputs are concatenated, make sure c_out is divisible by num_heads
        if self.concat_heads:
            assert c_out % num_heads == 0, "Number of output features must be a multiple of the count of heads."
            c_out = c_out // num_heads  # Split output features equally among heads

        # Linear layer to project node features (c_in) to the desired output size (c_out * num_heads)
        self.projection = nn.Linear(c_in, c_out * num_heads)
        
        # Learnable parameters for attention scores, one set for each head
        self.a = nn.Parameter(torch.Tensor(num_heads, 2 * c_out))
        
        # LeakyReLU activation for computing attention logits
        self.leakyrelu = nn.LeakyReLU(alpha)

        # Initialize the weights using Xavier initialization to maintain variance across layers
        nn.init.xavier_uniform_(self.projection.weight.data, gain=1.414)
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

    def forward(self, node_feats, adj_matrix, print_attn_probs=False):
        """
        Inputs:
            node_feats - Node feature matrix of shape [batch_size, num_nodes, c_in]
            adj_matrix - Adjacency matrix with self-connections. Shape: [batch_size, num_nodes, num_nodes]
            print_attn_probs - If True, print attention weights during the forward pass for debugging
        """
        batch_size, num_nodes = node_feats.size(0), node_feats.size(1)

        # node_feats: Input node features of shape [batch_size, num_nodes, c_in].
        # adj_matrix: Adjacency matrix that includes self-loops (connections of nodes with themselves).
        
        # Apply the linear projection to the node features and reshape them for multiple heads
        node_feats = self.projection(node_feats)
        node_feats = node_feats.view(batch_size, num_nodes, self.num_heads, -1)

        # Attention Computation:
        # Get the indices of the edges where there are connections (non-zero values in the adjacency matrix)
        edges = adj_matrix.nonzero(as_tuple=False)  # List of edges as (batch_index, node_i, node_j)
        
        # Flatten node features to make it easier to gather nodes corresponding to the edges
        node_feats_flat = node_feats.view(batch_size * num_nodes, self.num_heads, -1)
        edge_indices_row = edges[:, 0] * num_nodes + edges[:, 1]
        edge_indices_col = edges[:, 0] * num_nodes + edges[:, 2]
        
        # Concatenate the features for node_i and node_j for all edges
        a_input = torch.cat([
            torch.index_select(input=node_feats_flat, index=edge_indices_row, dim=0),
            torch.index_select(input=node_feats_flat, index=edge_indices_col, dim=0)
        ], dim=-1)  # Concatenates features of node pairs along the last dimension

        # Compute attention scores (logits) for each head by applying the attention weight matrix `a`
        attn_logits = torch.einsum('bhc,hc->bh', a_input, self.a)
        attn_logits = self.leakyrelu(attn_logits)  # Apply LeakyReLU non-linearity

        # Initialize the attention matrix and fill with a large negative number to apply masking
        attn_matrix = attn_logits.new_zeros(adj_matrix.shape + (self.num_heads,)).fill_(-9e15)
        
        # Assign the computed attention logits to the corresponding edges in the adjacency matrix
        attn_matrix[adj_matrix[..., None].repeat(1, 1, 1, self.num_heads) == 1] = attn_logits.reshape(-1)

        # Apply softmax to normalize the attention scores for each node's neighbors
        attn_probs = F.softmax(attn_matrix, dim=2)
        if print_attn_probs:
            print("Attention probs\n", attn_probs.permute(0, 3, 1, 2))  # Debugging output of attention probabilities

        # Message Passing:
        # Compute the weighted sum of node features based on attention probabilities
        node_feats = torch.einsum('bijh,bjhc->bihc', attn_probs, node_feats)

        # Concatenation/Averaging:
        # If concat_heads=True, the outputs from different attention heads are concatenated. 
        # Otherwise, they are averaged across heads.
        if self.concat_heads:
            node_feats = node_feats.reshape(batch_size, num_nodes, -1)  # Flatten to concatenate head outputs
        else:
            node_feats = node_feats.mean(dim=2)  # Average across heads

        return node_feats  # Return the updated node features


## Applying Graph Attention Layer with Custom Weights and Multi-Head Attention

In [7]:
# Initialize a Graph Attention Layer with 2 input features, 2 output features, and 2 attention heads
layer = GATLayer(2, 2, num_heads=2)

# Set the projection weight to the identity matrix
# This ensures that the input features are not transformed (identity transformation)
layer.projection.weight.data = torch.Tensor([[1., 0.], [0., 1.]])

# Set the projection bias to zeros
# No bias is added to the node features in this example
layer.projection.bias.data = torch.Tensor([0., 0.])

# Set the learnable attention weight parameters (a) for each attention head
# These values are arbitrary, to generate diverse attention weights
layer.a.data = torch.Tensor([[-0.2, 0.3], [0.1, -0.1]])

# Perform a forward pass through the GAT layer without computing gradients
# This helps in examining the behavior of the layer without updating any learnable parameters
with torch.no_grad():
    # Apply the GAT layer on the node features and adjacency matrix
    # print_attn_probs=True will output the attention probabilities during the forward pass for debugging purposes
    out_feats = layer(node_feats, adj_matrix, print_attn_probs=True)

# Print the adjacency matrix for reference
print("Adjacency matrix", adj_matrix)

# Print the input node features for reference
print("Input features", node_feats)

# Print the output node features after the GAT layer processes the input
print("Output features", out_feats)

# Example of attention probabilities for both heads
Attention_probs = torch.Tensor([[[[0.3543, 0.6457, 0.0000, 0.0000],
                                  [0.1096, 0.1450, 0.2642, 0.4813],
                                  [0.0000, 0.1858, 0.2885, 0.5257],
                                  [0.0000, 0.2391, 0.2696, 0.4913]],
                                 [[0.5100, 0.4900, 0.0000, 0.0000],
                                  [0.2975, 0.2436, 0.2340, 0.2249],
                                  [0.0000, 0.3838, 0.3142, 0.3019],
                                  [0.0000, 0.4018, 0.3289, 0.2693]]]])

# Adjacency matrix representing the graph structure
Adjacency_matrix = torch.Tensor([[[1., 1., 0., 0.],
                                  [1., 1., 1., 1.],
                                  [0., 1., 1., 1.],
                                  [0., 1., 1., 1.]]])

# Input features for each node
Input_features = torch.Tensor([[[0., 1.],
                                [2., 3.],
                                [4., 5.],
                                [6., 7.]]])

# Output features after applying the GAT layer
Output_features = torch.Tensor([[[1.2913, 1.9800],
                                 [4.2344, 3.7725],
                                 [4.6798, 4.8362],
                                 [4.5043, 4.7351]]])

print("Attention probs", Attention_probs)
print("Adjacency matrix", Adjacency_matrix)
print("Input features", Input_features)
print("Output features", Output_features)


Attention probs
 tensor([[[[0.3543, 0.6457, 0.0000, 0.0000],
          [0.1096, 0.1450, 0.2642, 0.4813],
          [0.0000, 0.1858, 0.2885, 0.5257],
          [0.0000, 0.2391, 0.2696, 0.4913]],

         [[0.5100, 0.4900, 0.0000, 0.0000],
          [0.2975, 0.2436, 0.2340, 0.2249],
          [0.0000, 0.3838, 0.3142, 0.3019],
          [0.0000, 0.4018, 0.3289, 0.2693]]]])
Adjacency matrix tensor([[[1., 1., 0., 0.],
         [1., 1., 1., 1.],
         [0., 1., 1., 1.],
         [0., 1., 1., 1.]]])
Input features tensor([[[0., 1.],
         [2., 3.],
         [4., 5.],
         [6., 7.]]])
Output features tensor([[[1.2913, 1.9800],
         [4.2344, 3.7725],
         [4.6798, 4.8362],
         [4.5043, 4.7351]]])
Attention probs tensor([[[[0.3543, 0.6457, 0.0000, 0.0000],
          [0.1096, 0.1450, 0.2642, 0.4813],
          [0.0000, 0.1858, 0.2885, 0.5257],
          [0.0000, 0.2391, 0.2696, 0.4913]],

         [[0.5100, 0.4900, 0.0000, 0.0000],
          [0.2975, 0.2436, 0.2340, 0.2249]

## Installing and Importing PyTorch Geometric with Version Compatibility

In [8]:
# Attempt to import the PyTorch Geometric library
try:
    import torch_geometric
except ModuleNotFoundError:
    # Determine the PyTorch version and CUDA version from the installed PyTorch
    # This helps in fetching the correct compatible versions of PyTorch Geometric dependencies
    TORCH = torch.__version__.split('+')[0]  # Extract the PyTorch version
    CUDA = 'cu' + torch.version.cuda.replace('.', '')  # Extract the CUDA version

    # Install the required PyTorch Geometric packages, specifying the correct versions of torch-scatter, torch-sparse, 
    # torch-cluster, torch-spline-conv, and torch-geometric based on the PyTorch and CUDA version
    # Each URL corresponds to a wheel (.whl) file which is a binary package for installation.
    !pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-geometric  # Install the main torch-geometric package

# After installation, import PyTorch Geometric modules
import torch_geometric

# Import specific submodules from torch_geometric
# geom_nn provides common neural network layers for graph data (e.g., GCN, GAT, etc.)
import torch_geometric.nn as geom_nn

# geom_data provides datasets and data structures for working with graph data
import torch_geometric.data as geom_data


## GNN Layer Selection in PyTorch Geometric

In [9]:
# Define a dictionary that maps layer names (strings) to their corresponding graph layer classes in PyTorch Geometric
gnn_layer_by_name = {
    # "GCN" maps to the GCNConv layer from the geom_nn module (Graph Convolutional Network)
    "GCN": geom_nn.GCNConv,
    
    # "GAT" maps to the GATConv layer from the geom_nn module (Graph Attention Network)
    "GAT": geom_nn.GATConv,
    
    # "GraphConv" maps to the GraphConv layer from the geom_nn module, which is a variant of GCN with additional functionality
    "GraphConv": geom_nn.GraphConv
}


## Loading and Accessing the Cora Dataset in PyTorch Geometric

In [10]:
# Load the Cora dataset using PyTorch Geometric's Planetoid class
cora_dataset = torch_geometric.datasets.Planetoid(root=DATASET_PATH, name="Cora")

# Print the dataset to confirm it's loaded correctly
print(cora_dataset[0])


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [11]:
# Load the Cora dataset using PyTorch Geometric's Planetoid class
cora_dataset = torch_geometric.datasets.Planetoid(root=DATASET_PATH, name="Cora")

# Access the first graph in the dataset
cora_dataset[0]
# This shows the structure: node features (x), edges (edge_index), labels (y), and masks for training/validation/testing

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

## Flexible Graph Neural Network (GNN) Model Implementation

In [12]:
# Define a class for a simple Graph Neural Network (GNN) model
class GNNModel(nn.Module):

    # Initialize the GNNModel with input, hidden, and output dimensions, number of layers, and other parameters
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, layer_name="GCN", dp_rate=0.1, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features per node
            c_hidden - Dimension of hidden features in the intermediate layers
            c_out - Dimension of output features (typically the number of classes in classification tasks)
            num_layers - Number of graph layers to use in the model
            layer_name - Specifies the type of graph layer to use (e.g., GCN, GAT, GraphConv)
            dp_rate - Dropout rate for regularization (used in hidden layers)
            kwargs - Additional parameters passed to the graph layer (e.g., number of heads for GAT)
        """
        super().__init__()  # Call the parent class (nn.Module) initializer
        
        # Retrieve the graph layer class from the dictionary using the layer_name (e.g., GCN, GAT)
        gnn_layer = gnn_layer_by_name[layer_name]

        # Initialize an empty list to store the layers of the GNN
        layers = []
        in_channels, out_channels = c_in, c_hidden  # Set input and hidden channel dimensions

        # Loop to create the intermediate hidden layers (num_layers - 1)
        for l_idx in range(num_layers-1):
            layers += [
                # Add the graph layer with specified input and output dimensions
                gnn_layer(in_channels=in_channels,
                          out_channels=out_channels,
                          **kwargs),
                # Add the ReLU activation function to introduce non-linearity
                nn.ReLU(inplace=True),
                # Add dropout for regularization to avoid overfitting
                nn.Dropout(dp_rate)
            ]
            # Update in_channels to be the size of the hidden layer for the next iteration
            in_channels = c_hidden
        
        # Add the final graph layer (output layer) with output dimension (c_out, usually the number of classes)
        layers += [gnn_layer(in_channels=in_channels,
                             out_channels=c_out,
                             **kwargs)]
        
        # Store the layers in a ModuleList (a PyTorch container for layers)
        self.layers = nn.ModuleList(layers)

    # Define the forward pass through the GNN
    def forward(self, x, edge_index):
        """
        Inputs:
            x - Node feature matrix (input features for each node)
            edge_index - Tensor representing the edges in the graph (using PyTorch Geometric notation)
        """
        # Iterate through each layer in the GNN
        for l in self.layers:
            # Check if the layer is a MessagePassing layer (i.e., a graph layer from PyTorch Geometric)
            if isinstance(l, geom_nn.MessagePassing):
                # For graph layers, pass both the node features (x) and the edge information (edge_index)
                x = l(x, edge_index)
            else:
                # For non-graph layers (e.g., ReLU, Dropout), just pass the node features
                x = l(x)
        # Return the output node features after passing through all the layers
        return x


## Implementation of a Simple Multi-Layer Perceptron (MLP) Model for Node Classification

In [13]:
# Define a class for a simple Multi-Layer Perceptron (MLP) model
class MLPModel(nn.Module):

    # Initialize the MLP model with input, hidden, and output dimensions, number of layers, and dropout rate
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
        """
        Inputs:
            c_in - Dimension of input features (number of features per node)
            c_hidden - Dimension of hidden layers (number of hidden units)
            c_out - Dimension of output features (usually number of classes in classification tasks)
            num_layers - Number of hidden layers in the model (default is 2)
            dp_rate - Dropout rate for regularization (applied between layers)
        """
        super().__init__()  # Call the parent class (nn.Module) initializer

        # Initialize an empty list to store the layers of the MLP
        layers = []
        in_channels, out_channels = c_in, c_hidden  # Set initial input and hidden layer sizes

        # Loop to create the hidden layers (num_layers - 1) with ReLU activation and dropout
        for l_idx in range(num_layers-1):
            layers += [
                # Add a fully connected (linear) layer
                nn.Linear(in_channels, out_channels),
                # Add ReLU activation to introduce non-linearity
                nn.ReLU(inplace=True),
                # Add dropout for regularization to prevent overfitting
                nn.Dropout(dp_rate)
            ]
            # Update the input size for the next layer to match the hidden layer size
            in_channels = c_hidden

        # Add the final fully connected layer (output layer)
        layers += [nn.Linear(in_channels, c_out)]

        # Use nn.Sequential to create a sequential container for the layers
        self.layers = nn.Sequential(*layers)

    # Define the forward pass through the MLP
    def forward(self, x, *args, **kwargs):
        """
        Inputs:
            x - Input features per node (node features or input matrix)
        """
        # Pass the input through the sequence of layers
        return self.layers(x)


## PyTorch Lightning Module for Node-Level GNN and MLP Models

In [14]:
# Define a PyTorch Lightning module for node-level tasks using GNNs
class NodeLevelGNN(pl.LightningModule):

    # Initialize the model with the specified model name and hyperparameters
    def __init__(self, model_name, **model_kwargs):
        super().__init__()
        # Save the hyperparameters (e.g., model configurations) for future reference
        self.save_hyperparameters()

        # Initialize the model: if model_name is "MLP", create an MLP model; otherwise, use a GNN model
        if model_name == "MLP":
            self.model = MLPModel(**model_kwargs)
        else:
            self.model = GNNModel(**model_kwargs)
        
        # Define the loss function for the model, using CrossEntropyLoss for classification tasks
        self.loss_module = nn.CrossEntropyLoss()

    # Define the forward pass of the model
    def forward(self, data, mode="train"):
        # Extract node features and edge indices from the data object
        x, edge_index = data.x, data.edge_index
        # Pass the features and edge information through the model (MLP or GNN)
        x = self.model(x, edge_index)

        # Use the appropriate mask depending on the mode (train, val, or test)
        if mode == "train":
            mask = data.train_mask
        elif mode == "val":
            mask = data.val_mask
        elif mode == "test":
            mask = data.test_mask
        else:
            assert False, f"Unknown forward mode: {mode}"  # Raise an error if the mode is unknown

        # Calculate the loss only for the nodes selected by the mask
        loss = self.loss_module(x[mask], data.y[mask])
        # Calculate accuracy for the selected nodes
        acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum()
        return loss, acc

    # Configure the optimizer for the training process
    def configure_optimizers(self):
        # Use Stochastic Gradient Descent (SGD) with learning rate, momentum, and weight decay
        optimizer = optim.SGD(self.parameters(), lr=0.1, momentum=0.9, weight_decay=2e-3)
        return optimizer

    # Define the training step, which will be called during each training iteration
    def training_step(self, batch, batch_idx):
        # Forward pass to calculate loss and accuracy for the training batch
        loss, acc = self.forward(batch, mode="train")
        # Log the training loss and accuracy for tracking during training
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    # Define the validation step, called during validation
    def validation_step(self, batch, batch_idx):
        # Forward pass to calculate accuracy for the validation batch
        _, acc = self.forward(batch, mode="val")
        # Log the validation accuracy for tracking
        self.log('val_acc', acc)

    # Define the test step, called during testing
    def test_step(self, batch, batch_idx):
        # Forward pass to calculate accuracy for the test batch
        _, acc = self.forward(batch, mode="test")
        # Log the test accuracy for tracking
        self.log('test_acc', acc)


## Training Function for Node-Level GNN/MLP Models with PyTorch Lightning

In [15]:
# Define a function to train the node classifier model (either GNN or MLP)
def train_node_classifier(model_name, dataset, **model_kwargs):
    # Set a random seed for reproducibility
    pl.seed_everything(42)

    # Create a data loader for the node-level dataset (batch size is 1 since it's a single graph)
    node_data_loader = geom_data.DataLoader(dataset, batch_size=1)

    # Define the directory for saving checkpoints during training
    root_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name)
    # Create the directory if it doesn't exist
    os.makedirs(root_dir, exist_ok=True)

    # Create a PyTorch Lightning trainer with checkpointing and GPU/CPU support
    trainer = pl.Trainer(default_root_dir=root_dir,
                         # Save only the best model based on validation accuracy
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
                         # Use GPU if available, otherwise use CPU
                         accelerator="gpu" if str(device).startswith("cuda") else "cpu",
                         devices=1,  # Use one GPU or CPU
                         max_epochs=200,  # Set maximum number of epochs to 200
                         enable_progress_bar=False)  # Disable the progress bar since an epoch is just one step

    # Disable unnecessary logging of the default hyperparameter metric
    trainer.logger._default_hp_metric = None

    # Check if a pretrained model exists; if yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"NodeLevel{model_name}.ckpt")
    if os.path.isfile(pretrained_filename):
        # If a pretrained model is found, load it
        print("Found pretrained model, loading...")
        model = NodeLevelGNN.load_from_checkpoint(pretrained_filename)
    else:
        # If no pretrained model is found, train a new model
        pl.seed_everything()  # Set seed again for reproducibility
        # Create a new instance of NodeLevelGNN with the given model name and dataset features
        model = NodeLevelGNN(model_name=model_name, c_in=dataset.num_node_features, c_out=dataset.num_classes, **model_kwargs)
        # Train the model using the same data loader for training and validation (mask is handled internally)
        trainer.fit(model, node_data_loader, node_data_loader)
        # Load the best model checkpoint after training
        model = NodeLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test the best model on the test set using the same data loader
    test_result = trainer.test(model, node_data_loader, verbose=False)

    # Get a single batch from the data loader and move it to the appropriate device (GPU/CPU)
    batch = next(iter(node_data_loader))
    batch = batch.to(model.device)

    # Calculate the accuracy on the training set using the trained model
    _, train_acc = model.forward(batch, mode="train")
    # Calculate the accuracy on the validation set using the trained model
    _, val_acc = model.forward(batch, mode="val")

    # Return a dictionary containing the train, validation, and test accuracies
    result = {"train": train_acc,
              "val": val_acc,
              "test": test_result[0]['test_acc']}
    return model, result  # Return the trained model and accuracy results


## Training and Evaluating MLP on the Cora Dataset with GPU Support

In [16]:
# Define a function to print the test scores (train, validation, and test accuracies)
def print_results(result_dict):
    # Check if the result dictionary contains the training results
    if "train" in result_dict:
        # Print the training accuracy formatted to 4.2 decimal places
        print(f"Train accuracy: {(100.0*result_dict['train']):4.2f}%")
    
    # Check if the result dictionary contains the validation results
    if "val" in result_dict:
        # Print the validation accuracy formatted to 4.2 decimal places
        print(f"Val accuracy:   {(100.0*result_dict['val']):4.2f}%")
    
    # Print the test accuracy formatted to 4.2 decimal places
    print(f"Test accuracy:  {(100.0*result_dict['test']):4.2f}%")

# Train a node classifier using the MLP model on the Cora dataset
node_mlp_model, node_mlp_result = train_node_classifier(
    model_name="MLP",              # Specify the model type as MLP (Multi-Layer Perceptron)
    dataset=cora_dataset,          # Use the Cora dataset for node classification
    c_hidden=16,                   # Set the hidden layer size to 16
    num_layers=2,                  # Specify that the MLP should have 2 layers
    dp_rate=0.1                    # Set the dropout rate to 0.1 for regularization
)

# Print the train, validation, and test accuracies for the trained MLP model
print_results(node_mlp_result)


Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\Roja\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


Found pretrained model, loading...


Lightning automatically upgraded your loaded checkpoint from v1.0.2 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint D:\USC_Course\CSCE 790 Section 007 Neural Networks and Their Applications\CHECKPOINT_PATH\NodeLevelMLP.ckpt`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train accuracy: 97.14%
Val accuracy:   54.60%
Test accuracy:  60.60%


c:\Users\Roja\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
c:\Users\Roja\anaconda3\Lib\site-packages\pytorch_lightning\utilities\data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 2708. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


In [17]:
# Train a node classifier using a Graph Neural Network (GNN) model on the Cora dataset
node_gnn_model, node_gnn_result = train_node_classifier(
    model_name="GNN",             # Specify the model type as GNN (Graph Neural Network)
    layer_name="GCN",             # Specify the graph layer type as GCN (Graph Convolutional Network)
    dataset=cora_dataset,         # Use the Cora dataset for node classification
    c_hidden=16,                  # Set the hidden layer size to 16
    num_layers=2,                 # Specify that the GNN should have 2 layers
    dp_rate=0.1                   # Set the dropout rate to 0.1 for regularization
)

# Print the train, validation, and test accuracies for the trained GNN model
print_results(node_gnn_result)



Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Lightning automatically upgraded your loaded checkpoint from v1.0.2 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint D:\USC_Course\CSCE 790 Section 007 Neural Networks and Their Applications\CHECKPOINT_PATH\NodeLevelGNN.ckpt`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Found pretrained model, loading...
Train accuracy: 100.00%
Val accuracy:   78.60%
Test accuracy:  82.40%


## Loading the MUTAG Dataset for Graph Classification

In [18]:
# Load the MUTAG dataset using PyTorch Geometric's TUDataset class
tu_dataset = torch_geometric.datasets.TUDataset(root=DATASET_PATH, name="MUTAG")

# Print some basic information about the dataset
print("Data object:", tu_dataset.data)  # Print the structure of the data object storing the graphs
print("Length:", len(tu_dataset))       # Print the number of graphs in the dataset (188 graphs)
# Calculate and print the average label (percentage of graphs labeled with 1)
print(f"Average label: {tu_dataset.data.y.float().mean().item():4.2f}")

Data object: Data(x=[3371, 7], edge_index=[2, 7442], edge_attr=[7442, 4], y=[188])
Length: 188
Average label: 0.66




## Splitting the MUTAG Dataset into Training and Test Sets

In [19]:
# Set a manual random seed to ensure reproducibility when shuffling the dataset
torch.manual_seed(42)

# Shuffle the dataset so that graphs are randomly ordered
tu_dataset.shuffle()

# Split the dataset into training (first 150 graphs) and test (remaining 38 graphs) sets
train_dataset = tu_dataset[:150]  # Select the first 150 graphs as the training set
test_dataset = tu_dataset[150:]   # Select the remaining graphs as the test set


## Creating Data Loaders for Batching Multiple Graphs Efficiently

In [20]:
# Create a DataLoader for the training dataset
graph_train_loader = geom_data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# Create a DataLoader for the validation dataset (can use test dataset as validation due to small size)
graph_val_loader = geom_data.DataLoader(test_dataset, batch_size=64) 

# Create a DataLoader for the test dataset
graph_test_loader = geom_data.DataLoader(test_dataset, batch_size=64) 



## Inspecting a Batch of Graphs from the Data Loader

In [21]:
# Get the next batch of graphs from the test data loader
batch = next(iter(graph_test_loader))

# Print the entire batch object to inspect its structure
print("Batch:", batch)

# Print the first 10 graph labels from the batch (y represents the graph labels)
print("Labels:", batch.y[:10])

# Print the batch indices that map nodes to their corresponding graphs in the batch
print("Batch indices:", batch.batch[:40])


Batch: DataBatch(edge_index=[2, 1512], x=[687, 7], edge_attr=[1512, 4], y=[38], batch=[687], ptr=[39])
Labels: tensor([1, 1, 1, 0, 0, 0, 1, 1, 1, 0])
Batch indices: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])


## Graph-Level GNN Model with Global Pooling for Graph Classification

In [22]:
# Define a graph-level GNN model class for graph classification
class GraphGNNModel(nn.Module):

    # Initialize the model with input, hidden, output dimensions, dropout, and additional GNN arguments
    def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of output features (usually the number of classes)
            dp_rate_linear - Dropout rate before the linear layer (higher than inside GNN)
            kwargs - Additional arguments for the GNN model
        """
        super().__init__()  # Call the parent class (nn.Module) initializer
        # Define the GNN model using the GNNModel class and pass input, hidden, and output dimensions
        self.GNN = GNNModel(c_in=c_in,
                            c_hidden=c_hidden,
                            c_out=c_hidden,  # Intermediate hidden output, not the final output yet
                            **kwargs)  # Pass additional keyword arguments if any

        # Define the head of the model, including dropout and a linear layer for the final classification
        self.head = nn.Sequential(
            nn.Dropout(dp_rate_linear),  # Dropout for regularization before the linear layer
            nn.Linear(c_hidden, c_out)   # Linear layer for mapping hidden features to output classes
        )

    # Define the forward pass for the model
    def forward(self, x, edge_index, batch_idx):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch Geometric notation)
            batch_idx - Index of batch element for each node (used for pooling nodes per graph)
        """
        # Pass the node features and edge information through the GNN model
        x = self.GNN(x, edge_index)

        # Apply global mean pooling over the nodes for each graph, combining node features into a single graph feature
        x = geom_nn.global_mean_pool(x, batch_idx)  # Average pooling for aggregating node features at the graph level

        # Pass the pooled features through the linear head for final prediction (classification)
        x = self.head(x)

        # Return the output (graph-level prediction)
        return x


## Graph-Level GNN Model Training Using PyTorch Lightning for Binary Classification

In [23]:
# Define a PyTorch Lightning module for graph-level tasks using a GNN
class GraphLevelGNN(pl.LightningModule):

    # Initialize the module with hyperparameters and model configuration
    def __init__(self, **model_kwargs):
        super().__init__()
        # Save the hyperparameters (such as model configurations) for future reference
        self.save_hyperparameters()

        # Initialize the graph-level GNN model
        self.model = GraphGNNModel(**model_kwargs)

        # Use Binary Cross Entropy Loss for binary classification, Cross Entropy for multi-class classification
        self.loss_module = nn.BCEWithLogitsLoss() if self.hparams.c_out == 1 else nn.CrossEntropyLoss()

    # Define the forward pass of the model
    def forward(self, data, mode="train"):
        # Extract node features, edge indices, and batch indices from the data object
        x, edge_index, batch_idx = data.x, data.edge_index, data.batch
        # Pass the features through the GNN model
        x = self.model(x, edge_index, batch_idx)
        # Remove extra dimensions if necessary
        x = x.squeeze(dim=-1)

        # Determine predictions based on the output
        if self.hparams.c_out == 1:  # Binary classification case
            preds = (x > 0).float()  # Convert logits to binary predictions
            data.y = data.y.float()  # Ensure target labels are floats
        else:  # Multi-class classification case
            preds = x.argmax(dim=-1)  # Use argmax to get predicted class for each graph

        # Calculate the loss using the appropriate loss function
        loss = self.loss_module(x, data.y)
        # Calculate accuracy
        acc = (preds == data.y).sum().float() / preds.shape[0]
        return loss, acc

    # Configure the optimizer for training
    def configure_optimizers(self):
        # Use AdamW optimizer with a learning rate of 1e-2 and no weight decay (since the dataset is small)
        optimizer = optim.AdamW(self.parameters(), lr=1e-2, weight_decay=0.0)
        return optimizer

    # Define the training step
    def training_step(self, batch, batch_idx):
        # Perform a forward pass and compute loss and accuracy for the training batch
        loss, acc = self.forward(batch, mode="train")
        # Log training loss and accuracy
        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    # Define the validation step
    def validation_step(self, batch, batch_idx):
        # Perform a forward pass and compute accuracy for the validation batch
        _, acc = self.forward(batch, mode="val")
        # Log validation accuracy
        self.log('val_acc', acc)

    # Define the test step
    def test_step(self, batch, batch_idx):
        # Perform a forward pass and compute accuracy for the test batch
        _, acc = self.forward(batch, mode="test")
        # Log test accuracy
        self.log('test_acc', acc)


## Training a Graph-Level GNN with PyTorch Lightning and Model Checkpoints

In [24]:
def train_graph_classifier(model_name, **model_kwargs):
    pl.seed_everything(42)

    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, "GraphLevel" + model_name)
    os.makedirs(root_dir, exist_ok=True)
    
    trainer = pl.Trainer(
        default_root_dir=root_dir,
        callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")],
        accelerator="gpu" if str(device).startswith("cuda") else "cpu",
        devices=1,
        max_epochs=500,
        enable_progress_bar=False
    )
    trainer.logger._default_hp_metric = None  # Optional logging argument that we don't need

    # Check if pretrained model exists, load it if available
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"GraphLevel{model_name}.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = GraphLevelGNN.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything(42)
        model = GraphLevelGNN(
            c_in=tu_dataset.num_node_features,
            c_out=1 if tu_dataset.num_classes == 2 else tu_dataset.num_classes,
            **model_kwargs
        )
        trainer.fit(model, graph_train_loader, graph_val_loader)
        model = GraphLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test best model on validation and test set
    train_result = trainer.test(model, graph_train_loader, verbose=False)  # Test on training data
    test_result = trainer.test(model, graph_test_loader, verbose=False)    # Test on test data

    # Ensure both test and train accuracy are logged in the result
    result = {"test": test_result[0]['test_acc'], "train": train_result[0].get('test_acc', 0)}
    return model, result

## Training and Testing Graph-Level GNN with GraphConv Layer

In [25]:
# Train the graph classification model using the GraphConv layer
model, result = train_graph_classifier(
    model_name="GraphConv",       # Specify the model name as GraphConv
    c_hidden=256,                 # Number of hidden units in the model
    layer_name="GraphConv",       # Use the GraphConv layer for the GNN
    num_layers=3,                 # Define the number of layers in the model (3 layers)
    dp_rate_linear=0.5,           # Dropout rate of 50% before the final linear layer
    dp_rate=0.0                   # Dropout rate of 0% in the GNN layers
)

# Print the training and test performance
print(f"Train performance: {100.0*result['train']:.4f}%")  # Print the training accuracy
print(f"Test performance: {100.0*result['test']:.4f}%")    # Print the test accuracy


Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Lightning automatically upgraded your loaded checkpoint from v1.0.2 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint D:\USC_Course\CSCE 790 Section 007 Neural Networks and Their Applications\CHECKPOINT_PATH\GraphLevelGraphConv.ckpt`
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Roja\anaconda3\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:475: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
c:\Users\Roja\anaconda3\Lib\site-packages\pytorch_lightning\utilities\data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 2. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVIC

Found pretrained model, loading...
Train performance: 93.2765%
Test performance: 92.1053%
