In [4]:
import torch
from torch_geometric.datasets import Planetoid, PPI
from torch_geometric.data import Data, Dataset
from torch_geometric.data.lightning import LightningDataset
from torch.utils.data import random_split
from typing import Optional, Any, Dict, Tuple
import os

# Raw string: the Windows path contains backslashes (\C, \c, \L, \d) that would
# otherwise be parsed as (invalid) escape sequences and trigger a warning.
root = r'D:\ComputerScience\cs_2024_Fall_Deep_Learning\Lab\data'

  root = 'D:\ComputerScience\cs_2024_Fall_Deep_Learning\Lab\data'


In [2]:
def load_planetoid(root, names):
    """Load a Planetoid dataset and print a summary of its first graph.

    Args:
        root (str): Directory the dataset is stored in; forwarded unchanged as
            the first argument of ``Planetoid``.
        names (str): Dataset name forwarded as the second argument of
            ``Planetoid`` (this notebook passes ``'Cora'`` and ``'CiteSeer'``).

    Returns:
        The ``Planetoid`` dataset object. The printed node/edge/mask statistics
        describe ``dataset[0]``.
    """
    dataset = Planetoid(root, names)
    data = dataset[0]

    print(f'Dataset: {dataset}:')
    print('======================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Number of features: {dataset.num_features}')
    print(f'Number of classes: {dataset.num_classes}')

    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Number of training nodes: {data.train_mask.sum()}')
    print(f'Number of validation nodes: {data.val_mask.sum()}')
    print(f'Number of test nodes: {data.test_mask.sum()}')
    print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
    # has_isolated_nodes / has_self_loops replace the deprecated
    # contains_isolated_nodes / contains_self_loops accessors.
    print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
    print(f'Contains self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

    return dataset

def load_ppi(root):
    """Load the train/val/test splits of the PPI dataset and print a summary.

    Args:
        root (str): Base data directory; the dataset is stored in the ``PPI``
            sub-directory of it.

    Returns:
        tuple: ``(train_dataset, val_dataset, test_dataset)``, one ``PPI``
        dataset per split. The printed node/edge statistics describe the
        first training graph only.
    """
    path = os.path.join(root, 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')

    data = train_dataset[0]

    print(f'Dataset: {train_dataset}:')
    print('======================')
    print(f'Number of train graphs: {len(train_dataset)}')
    print(f'Number of val graphs: {len(val_dataset)}')
    print(f'Number of test graphs: {len(test_dataset)}')

    print(f'Number of features: {train_dataset.num_features}')
    print(f'Number of classes: {train_dataset.num_classes}')
    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    # No per-node train/val/test mask statistics here: the splits are loaded as
    # three separate datasets rather than as masks on a single graph.
    # has_isolated_nodes / has_self_loops replace the deprecated
    # contains_isolated_nodes / contains_self_loops accessors.
    print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
    print(f'Contains self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

    return train_dataset, val_dataset, test_dataset


In [8]:
# Load the Cora dataset from the shared data root (also prints its summary).
dataset_cora = load_planetoid(root=root, names='Cora')

Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.05
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True




In [9]:
# Show the first Cora graph, the size of its label tensor, and its first ten labels.
graph0 = dataset_cora[0]
print(graph0, graph0.y.size(), graph0.y[:10], sep='\n')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
torch.Size([2708])
tensor([3, 4, 4, 0, 3, 2, 0, 3, 3, 2])


In [11]:
# Load the CiteSeer dataset from the shared data root (also prints its summary).
dataset_citeseer = load_planetoid(root=root, names='CiteSeer')

Dataset: CiteSeer():
Number of graphs: 1
Number of features: 3703
Number of classes: 6
Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Number of training nodes: 120
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.04
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True




In [12]:
# Show the first CiteSeer graph, the size of its label tensor, and its first ten labels.
# NOTE: this rebinds `graph0`, which previously referred to the Cora graph.
graph0 = dataset_citeseer[0]
print(graph0, graph0.y.size(), graph0.y[:10], sep='\n')

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
torch.Size([3327])
tensor([3, 1, 5, 5, 3, 1, 3, 0, 3, 5])


In [5]:
# Load the three PPI splits in one call (also prints a summary of the first training graph).
(dataset_ppi_train,
 dataset_ppi_val,
 dataset_ppi_test) = load_ppi(root=root)

Dataset: PPI(20):
Number of train graphs: 20
Number of val graphs: 2
Number of test graphs: 2
Number of features: 50
Number of classes: 121
Number of nodes: 1767
Number of edges: 32318
Average node degree: 18.29
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True




In [13]:
# Show the first PPI training graph, the shape of its label matrix, and its first ten label rows.
# NOTE: this rebinds `graph0`, which previously referred to a Planetoid graph.
graph0 = dataset_ppi_train[0]
print(graph0, graph0.y.shape, graph0.y[:10], sep='\n')

Data(x=[1767, 50], edge_index=[2, 32318], y=[1767, 121])
torch.Size([1767, 121])
tensor([[1., 0., 0.,  ..., 1., 1., 0.],
        [1., 0., 0.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 1.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.]])


In [11]:
# `torch_geometric.data.DataLoader` is a deprecated alias; the loader class
# lives in `torch_geometric.loader`.
from torch_geometric.loader import DataLoader

# Wrap the Cora dataset in a loader, one graph per batch, in dataset order.
train_loader = DataLoader(dataset_cora, batch_size=1, shuffle=False)
print(dataset_cora.num_classes)
print(dataset_cora.num_features)
# Print the node/edge counts, mask sizes and label shape of every batch.
for data in train_loader:
    print(data.num_nodes)
    print(data.num_edges)
    print(data.train_mask.sum())
    print(data.val_mask.sum())
    print(data.test_mask.sum())
    print(data.y.size())


7
1433
2708
10556
tensor(140)
tensor(500)
tensor(1000)
torch.Size([2708])
1433


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing
import torch_geometric.utils as utils

class GCNConv(MessagePassing):
    r"""Graph convolution layer implemented on top of ``MessagePassing``.

    Each node sums the linearly transformed features of its neighbours
    (and of itself, via an added self-loop), with every message scaled by
    ``1 / sqrt(d_i * d_j)``.

    Note: the linear layer carries a bias and is applied *before*
    propagation, so the bias term is scaled and aggregated together with
    the features.
    """

    def __init__(self, in_channels, out_channels):
        """
        Initialize the GCN convolution layer.

        Args:
            in_channels (int): Number of input features per node.
            out_channels (int): Number of output features per node.
        """
        # Initialize the MessagePassing class with 'add' aggregation
        super().__init__(aggr='add')

        # Define a linear transformation (weight matrix plus bias)
        self.linear = nn.Linear(in_channels, out_channels, bias=True)

    def forward(self, x, edge_index):
        r"""
        Forward pass of the GCN layer.

        x_i' = \sum_{j \in N(i) \cup \{i\}} (1 / sqrt(d_i * d_j)) * (W x_j + b)

        Args:
            x (Tensor): Node feature matrix of shape [N, in_channels].
            edge_index (LongTensor): Edge indices in COO format of shape [2, E].

        Returns:
            Tensor: Output node features of shape [N, out_channels].
        """
        num_nodes = x.size(0)

        # Add self-loops to the adjacency matrix
        edge_index, _ = utils.add_self_loops(edge_index, num_nodes=num_nodes)

        # With the default 'source_to_target' flow, `row` holds the source
        # node j of every edge and `col` the target node i that receives the
        # message. The degree is counted over the target index so it matches
        # the side on which messages are aggregated; for undirected graphs
        # this equals the degree counted over `row`.
        row, col = edge_index
        deg = utils.degree(col, num_nodes, dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0  # Guard against zero degree

        # Apply linear transformation
        x = self.linear(x)  # [N, out_channels]

        # Initiate message passing; the per-edge coefficient is assembled in
        # `message` from the pieces bundled in `norm`.
        return self.propagate(edge_index, x=x, norm=(deg_inv_sqrt, row, col))

    def message(self, x_j, norm):
        """
        Define the message computation.

        Args:
            x_j (Tensor): Neighbor node features of shape [E, out_channels].
            norm (tuple): ``(deg_inv_sqrt, row, col)`` as passed by ``forward``:
                the per-node inverse square-root degree and the two rows of
                the (self-loop augmented) edge index.

        Returns:
            Tensor: Messages to be aggregated, of shape [E, out_channels].
        """
        deg_inv_sqrt, row, col = norm

        # Source node factor (j): `row` indexes the sending node of each edge
        d_j = deg_inv_sqrt[row]
        # Target node factor (i): `col` indexes the receiving node of each edge
        d_i = deg_inv_sqrt[col]

        # Compute normalization factor for each edge
        alpha = d_i * d_j

        # Scale the messages
        return alpha.view(-1, 1) * x_j  # [E, out_channels]

    def update(self, aggr_out):
        """
        Update node embeddings after aggregation.

        Args:
            aggr_out (Tensor): Aggregated messages of shape [N, out_channels].

        Returns:
            Tensor: The aggregated messages, returned unchanged.
        """
        return aggr_out

class GCN(nn.Module):
    """Two-layer graph network: GCNConv -> ReLU -> GCNConv."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """
        Build the two stacked convolution layers.

        Args:
            in_channels (int): Size of each input node feature vector.
            hidden_channels (int): Size of the intermediate node representation.
            out_channels (int): Size of each output node feature vector.
        """
        super().__init__()

        # First layer maps input features to the hidden width, the second
        # maps the hidden width to the output width.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """
        Run both convolution layers over the graph.

        Args:
            x (Tensor): Node features of shape [N, in_channels].
            edge_index (LongTensor): Edge indices in COO format of shape [2, E].

        Returns:
            Tensor: Output of the second layer, of shape [N, out_channels];
            no activation is applied after it.
        """
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

  """


In [6]:
# Display the concrete class of the object returned by load_planetoid for Cora.
type(dataset_cora)

torch_geometric.datasets.planetoid.Planetoid