In [7]:
import torch
from torch_geometric.datasets import Planetoid, PPI
from torch_geometric.data import Data, Dataset
from torch_geometric.data.lightning import LightningDataset
from torch.utils.data import random_split
from typing import Optional, Any, Dict, Tuple
import os

# Root directory that holds the downloaded datasets.
# Raw string: the Windows path contains backslash sequences (\C, \c, \L, \d) that are
# invalid escapes in a normal string literal and trigger a SyntaxWarning; the resulting
# value is unchanged.
root = r'D:\ComputerScience\cs_2024_Fall_Deep_Learning\Lab\data'

  root = 'D:\ComputerScience\cs_2024_Fall_Deep_Learning\Lab\data'


In [8]:
def load_planetoid(root, names):
    """
    Load a Planetoid dataset and print summary statistics of its first graph.

    Args:
        root: Directory where the dataset is stored / downloaded.
        names: Dataset name, forwarded as the second positional argument of
            ``Planetoid`` (this notebook calls it with 'Cora' and 'CiteSeer').

    Returns:
        The ``Planetoid`` dataset object (index it with ``[0]`` to get the graph).
    """
    dataset = Planetoid(root, names)
    num_features = dataset.num_features
    num_classes = dataset.num_classes
    data = dataset[0]
    num_train_nodes = data.train_mask.sum()

    print(f'Dataset: {dataset}:')
    print('======================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Number of features: {num_features}')
    print(f'Number of classes: {num_classes}')

    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Number of training nodes: {num_train_nodes}')
    print(f'Number of validation nodes: {data.val_mask.sum()}')
    print(f'Number of test nodes: {data.test_mask.sum()}')
    print(f'Training node label rate: {int(num_train_nodes) / data.num_nodes:.2f}')
    # Use the has_* spellings, matching GraphDataset._load_ppi in this notebook
    # (the contains_* names are the deprecated aliases).
    print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
    print(f'Contains self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

    return dataset

def load_ppi(root):
    """
    Load the train/val/test splits of the PPI dataset and print summary statistics.

    Node/edge averages are computed over all training graphs; the isolated-node,
    self-loop and undirected checks are run on the first training graph only.

    Args:
        root: Directory that contains (or will contain) the ``PPI`` sub-directory.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of ``PPI`` datasets.
    """
    path = os.path.join(root, 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')

    first_graph = train_dataset[0]
    train_graphs = len(train_dataset)
    node_num = sum(graph.num_nodes for graph in train_dataset)
    edge_num = sum(graph.num_edges for graph in train_dataset)

    print(f'Dataset: {train_dataset}:')
    print('======================')
    print(f'Number of train graphs: {train_graphs}')
    print(f'Number of val graphs: {len(val_dataset)}')
    print(f'Number of test graphs: {len(test_dataset)}')

    print(f'Number of features: {train_dataset.num_features}')
    print(f'Number of classes: {train_dataset.num_classes}')
    print(f'Avg number of nodes: {node_num / train_graphs:.2f}')
    print(f'Avg number of edges: {edge_num / train_graphs:.2f}')
    print(f'Average node degree: {edge_num / node_num:.2f}')
    # No per-node train/val/test mask statistics here: the split is made of separate
    # datasets of whole graphs rather than masks on a single graph.
    # Use the has_* spellings, matching GraphDataset._load_ppi in this notebook
    # (the contains_* names are the deprecated aliases).
    print(f'Contains isolated nodes: {first_graph.has_isolated_nodes()}')
    print(f'Contains self-loops: {first_graph.has_self_loops()}')
    print(f'Is undirected: {first_graph.is_undirected()}')

    return train_dataset, val_dataset, test_dataset


In [8]:
import torch
from torch_geometric.datasets import Planetoid, PPI
from torch_geometric.data import Data, InMemoryDataset
import torch_geometric.transforms as T
from typing import Optional, Any, Dict, Tuple
import os

class GraphDataset:
    """
    Load a graph benchmark and expose it as train/val/test datasets.

    Supported datasets are the Planetoid graphs (Cora, CiteSeer, PubMed) and PPI.
    For ``task='node-cls'`` only ``T.NormalizeFeatures`` is applied. For
    ``task='link-pred'`` a ``T.RandomLinkSplit`` is applied as well, so every graph
    is returned by the underlying dataset as a (train, val, test) triple of graphs.
    """

    # Lower-cased user-facing name -> exact name passed to Planetoid. A plain
    # str.capitalize() would produce 'Citeseer' / 'Pubmed', which does not match the
    # 'CiteSeer' spelling used elsewhere in this notebook.
    _PLANETOID_NAMES = {'cora': 'Cora', 'citeseer': 'CiteSeer', 'pubmed': 'PubMed'}
    _SUPPORTED_TASKS = ('node-cls', 'link-pred')

    def __init__(self, dataset_name: str, root: str = 'data/', task='node-cls'):
        """
        Initialize the GraphDataset and immediately load the requested dataset.

        Args:
            dataset_name (str): Name of the dataset to load (case-insensitive).
                                Supported names: 'Cora', 'CiteSeer', 'PubMed', 'PPI'.
            root (str): Root directory where the dataset should be saved/downloaded.
            task (str): Either 'node-cls' or 'link-pred'.

        Raises:
            ValueError: If ``task`` or ``dataset_name`` is not supported.
        """
        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        if task not in self._SUPPORTED_TASKS:
            raise ValueError(f"Task {task} not supported.")

        self.dataset_name = dataset_name.lower()
        self.root = root
        # Full Planetoid dataset object; stays None for PPI.
        self.dataset = None
        # Per-split datasets: dataset objects for 'node-cls', lists of graphs for 'link-pred'.
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        # Node feature dimension (set for both tasks).
        self.num_features = None
        # Number of node classes (set for 'node-cls' only; None for 'link-pred').
        self.num_classes = None
        self.task = task

        if task == 'node-cls':
            self.transform = T.NormalizeFeatures()
        else:
            self.transform = T.Compose([
                T.NormalizeFeatures(),
                T.RandomLinkSplit(
                    num_val=0.05,
                    num_test=0.1,
                    is_undirected=True,
                    add_negative_train_samples=False,
                ),
            ])

        self.load_dataset()

    def load_dataset(self):
        """
        Load the dataset selected by the name provided during initialization.

        Raises:
            ValueError: If the dataset name is neither a Planetoid dataset nor 'ppi'.
        """
        if self.dataset_name in self._PLANETOID_NAMES:
            self._load_planetoid()
        elif self.dataset_name == 'ppi':
            self._load_ppi()
        else:
            raise ValueError(f"Dataset {self.dataset_name} not supported.")

    def _load_planetoid(self):
        """
        Load a Planetoid dataset (Cora, CiteSeer, PubMed) and print its statistics.

        For 'node-cls' all three splits point at the same dataset object (the split
        is given by the train/val/test masks of its single graph). For 'link-pred'
        the single graph is split into three graphs, each wrapped in a one-element list.
        """
        planetoid_name = self._PLANETOID_NAMES[self.dataset_name]
        self.dataset = Planetoid(root=self.root, name=planetoid_name, transform=self.transform)
        print(f"Loaded {planetoid_name} dataset:")

        sample_data = self.dataset[0]
        if self.task == 'link-pred':
            # The link-split transform makes dataset[0] a (train, val, test) triple.
            train_graph, val_graph, test_graph = sample_data
            sample_data = train_graph
            # only one graph in each dataset
            self.train_dataset = [train_graph]
            self.val_dataset = [val_graph]
            self.test_dataset = [test_graph]
            # Previously left as None for link prediction; read it from the graph itself.
            self.num_features = train_graph.num_features
            print(f" - Number of features: {self.num_features}")
        else:
            self.train_dataset = self.dataset
            self.val_dataset = self.dataset
            self.test_dataset = self.dataset

            self.num_features = self.dataset.num_node_features
            self.num_classes = self.dataset.num_classes
            print(f" - Number of features: {self.num_features}")
            print(f" - Number of classes: {self.num_classes}")

        print(f" - Number of training nodes: {sample_data.train_mask.sum()}")
        print(f" - Number of validation nodes: {sample_data.val_mask.sum()}")
        print(f" - Number of test nodes: {sample_data.test_mask.sum()}")

        print(f" - Number of nodes: {sample_data.num_nodes}")
        print(f" - Number of edges: {sample_data.num_edges}")

    def _load_ppi(self):
        """
        Load the PPI dataset, which consists of multiple graphs per split, and print
        statistics of the first training graph.

        For 'link-pred' every graph of every original split is link-split into a
        (train, val, test) triple, and the three parts are collected into three
        equally long lists.
        """
        dir_path = os.path.join(self.root, 'PPI')
        print("Loaded PPI dataset:")
        self.train_dataset = PPI(root=dir_path, split='train', transform=self.transform)
        self.val_dataset = PPI(root=dir_path, split='val', transform=self.transform)
        self.test_dataset = PPI(root=dir_path, split='test', transform=self.transform)

        if self.task == 'link-pred':
            train_graphs, val_graphs, test_graphs = [], [], []
            # Same order as before: all 'train' graphs, then 'val', then 'test'.
            for split_dataset in (self.train_dataset, self.val_dataset, self.test_dataset):
                for train_part, val_part, test_part in split_dataset:
                    train_graphs.append(train_part)
                    val_graphs.append(val_part)
                    test_graphs.append(test_part)

            self.train_dataset = train_graphs
            self.val_dataset = val_graphs
            self.test_dataset = test_graphs
            # Previously left as None for link prediction; read it from the first graph.
            self.num_features = train_graphs[0].num_features
        else:
            self.num_features = self.train_dataset.num_features
            self.num_classes = self.train_dataset.num_classes
            print(f'Number of features: {self.num_features}')
            print(f'Number of classes: {self.num_classes}')

        data = self.train_dataset[0]

        print(f'Number of train graphs: {len(self.train_dataset)}')
        print(f'Number of val graphs: {len(self.val_dataset)}')
        print(f'Number of test graphs: {len(self.test_dataset)}')
        print(f'Number of nodes: {data.num_nodes}')
        print(f'Number of edges: {data.num_edges}')
        print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
        print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
        print(f'Contains self-loops: {data.has_self_loops()}')
        print(f'Is undirected: {data.is_undirected()}')

    def get_datasets(self) -> Dict[str, Any]:
        """
        Retrieve the loaded splits.

        Returns:
            dict: Keys 'train', 'val' and 'test'. Values are dataset objects for
            'node-cls' and lists of graphs for 'link-pred'. If no per-split dataset
            was set, all three keys map to ``self.dataset``.
        """
        if self.train_dataset is None:
            return {'train': self.dataset, 'val': self.dataset, 'test': self.dataset}
        return {
            'train': self.train_dataset,
            'val': self.val_dataset,
            'test': self.test_dataset,
        }

In [9]:
# Build the PPI link-prediction splits and report how many graphs each split holds.
graph_dataset = GraphDataset('ppi', '../data', task='link-pred')
datasets = graph_dataset.get_datasets()
print(*(len(datasets[split_name]) for split_name in ('train', 'val', 'test')), sep='\n')

Loaded PPI dataset:
Number of train graphs: 24
Number of val graphs: 24
Number of test graphs: 24
Number of nodes: 1767
Number of edges: 27474
Average node degree: 15.55
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True
24
24
24


In [10]:
from torch_geometric.loader import DataLoader

# Batch the training split four graphs at a time, in shuffled order.
train_loader = DataLoader(datasets['train'], batch_size=4, shuffle=True)

# Peek at the first mini-batch only; `break` stops after one iteration.
for batch in train_loader:
    print(batch)
    break

DataBatch(x=[11098, 50], edge_index=[2, 276138], y=[11098, 121], edge_label=[138069], edge_label_index=[2, 138069], batch=[11098], ptr=[5])


In [9]:
import torch_geometric.transforms as T
# Stand-alone transform pipeline used by the exploratory cells: feature normalization
# followed by a random link split (num_val=0.1, num_test=0.1, undirected graph, no
# negative samples added to the training part).
# NOTE(review): GraphDataset builds its own pipeline with num_val=0.05 — confirm the
# difference is intended.
transform = T.Compose([T.NormalizeFeatures(),
                        T.RandomLinkSplit(num_val=0.1, num_test=0.1, is_undirected=True, add_negative_train_samples=False)])


In [5]:
# load_ppi returns a (train, val, test) tuple. Keep the tuple under the original name
# and also bind each split, because a later cell reads `dataset_ppi_train`, which was
# otherwise never defined in this notebook.
dataset_ppi = load_ppi(root)
dataset_ppi_train, dataset_ppi_val, dataset_ppi_test = dataset_ppi


Dataset: PPI(20):
Number of train graphs: 20
Number of val graphs: 2
Number of test graphs: 2
Number of features: 50
Number of classes: 121
Avg number of nodes: 2245.30
Avg number of edges: 61318.40
Average node degree: 27.31
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True




In [10]:
# Keep the whole Cora dataset under `dataset_cora`: later cells read that name, but it
# was otherwise never defined in this notebook. `dataset` remains the single Cora graph.
dataset_cora = load_planetoid(root, 'Cora')
dataset = dataset_cora[0]

# Same dataset with the link-split transform: indexing now yields a tuple of three
# graphs (train, val, test) instead of one graph.
dataset_coras = Planetoid(root, 'Cora', transform=transform)
print(dataset_coras)
print(type(dataset_coras[0]))
train_dataset, val_dataset, test_dataset = dataset_coras[0]

Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.05
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True
Cora()
<class 'tuple'>


In [11]:
# Show the original graph followed by its train / val / test link-split graphs, one per line.
print(dataset, train_dataset, val_dataset, test_dataset, sep='\n')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[4224], edge_label_index=[2, 4224])
Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054])
Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054])


In [12]:
# Node count of the train graph, then feature count and edge labels of the test graph.
print(
    train_dataset.num_nodes,
    test_dataset.num_features,
    test_dataset.edge_label,
    sep='\n',
)

2708
1433
tensor([1., 1., 1.,  ..., 0., 0., 0.])


In [15]:
# Materialize the edges as plain (src, dst) int tuples. Indexing the tensors element
# by element produced tuples of 0-dim tensors, which is slow to build and makes every
# later comparison a tensor operation.
edge_index = [tuple(edge) for edge in train_dataset.edge_index.t().tolist()]
# NOTE(review): despite its name, this list is built from `val_dataset`.
test_edge_label_index = [tuple(edge) for edge in val_dataset.edge_label_index.t().tolist()]

In [16]:
# Count labelled edges that are absent from the training edge list.
# A set of int pairs gives constant-time lookups instead of scanning the whole list
# for every labelled edge; int() accepts both plain ints and 0-dim tensors.
known_edges = {(int(src), int(dst)) for src, dst in edge_index}
sum_neg = sum((int(src), int(dst)) not in known_edges for src, dst in test_edge_label_index)
print(sum_neg)

1054


In [9]:
# First (and only) Cora graph: its summary, label tensor size and first ten labels.
graph0 = dataset_cora[0]
print(graph0, graph0.y.size(), graph0.y[:10], sep='\n')

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
torch.Size([2708])
tensor([3, 4, 4, 0, 3, 2, 0, 3, 3, 2])


In [11]:
# Load CiteSeer and print its summary statistics; load_planetoid returns the dataset object.
dataset_citeseer = load_planetoid(root, 'CiteSeer')

Dataset: CiteSeer():
Number of graphs: 1
Number of features: 3703
Number of classes: 6
Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Number of training nodes: 120
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.04
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True




In [12]:
# First CiteSeer graph: its summary, label tensor size and first ten labels.
graph0 = dataset_citeseer[0]
print(graph0, graph0.y.size(), graph0.y[:10], sep='\n')

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
torch.Size([3327])
tensor([3, 1, 5, 5, 3, 1, 3, 0, 3, 5])


In [15]:
# PPI train / val / test datasets, each with the link-split transform attached.
path = os.path.join(root, 'PPI')
dataset_ppi0, dataset_ppi1, dataset_ppi2 = (
    PPI(path, transform=transform, split=split_name)
    for split_name in ('train', 'val', 'test')
)



In [16]:
# Show the (train, val, test) graphs produced for the first PPI training graph only.
for ta, va, te in dataset_ppi0:
    print(ta, va, te, sep='\n')
    break

Data(x=[1767, 50], edge_index=[2, 25858], y=[1767, 121], edge_label=[12929], edge_label_index=[2, 12929])
Data(x=[1767, 50], edge_index=[2, 25858], y=[1767, 121], edge_label=[3230], edge_label_index=[2, 3230])
Data(x=[1767, 50], edge_index=[2, 29088], y=[1767, 121], edge_label=[3230], edge_label_index=[2, 3230])


In [13]:
# First PPI training graph: its summary, label matrix shape and first ten label rows.
graph0 = dataset_ppi_train[0]
print(graph0, graph0.y.shape, graph0.y[:10], sep='\n')

Data(x=[1767, 50], edge_index=[2, 32318], y=[1767, 121])
torch.Size([1767, 121])
tensor([[1., 0., 0.,  ..., 1., 1., 0.],
        [1., 0., 0.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 1.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.]])


In [11]:
# DataLoader lives in torch_geometric.loader (the same import an earlier cell of this
# notebook uses); the torch_geometric.data location is the deprecated one.
from torch_geometric.loader import DataLoader

# One graph per batch, in dataset order.
train_loader = DataLoader(dataset_cora, batch_size=1, shuffle=False)
print(dataset_cora.num_classes)
print(dataset_cora.num_features)

# Print size and split statistics for every batch the loader yields.
for data in train_loader:
    print(data.num_nodes)
    print(data.num_edges)
    print(data.train_mask.sum())
    print(data.val_mask.sum())
    print(data.test_mask.sum())
    print(data.y.size())


7
1433
2708
10556
tensor(140)
tensor(500)
tensor(1000)
torch.Size([2708])
1433


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing
import torch_geometric.utils as utils

class GCNConv(MessagePassing):
    r"""
    Graph convolution layer with symmetric degree normalization and sum aggregation.

    With self-loops added to every node, the layer computes

        x_i' = \sum_{j \in N(i) \cup \{i\}} (1 / sqrt(d_i * d_j)) * (W x_j + b)

    where d is the node degree counted after the self-loops were added.

    NOTE(review): the bias of ``self.linear`` is applied before aggregation, so it is
    scaled by the normalization coefficients as well — confirm this is intended.
    """

    def __init__(self, in_channels, out_channels):
        """
        Initialize the GCN convolution layer.

        Args:
            in_channels (int): Number of input features per node.
            out_channels (int): Number of output features per node.
        """
        # Initialize the MessagePassing class with 'add' aggregation
        super(GCNConv, self).__init__(aggr='add')

        # Linear transformation (weight matrix and bias) applied to the node features
        self.linear = nn.Linear(in_channels, out_channels, bias=True)

    def forward(self, x, edge_index):
        r"""
        Forward pass of the GCN layer.

        Raw docstring: the formula contains backslashes that are invalid escape
        sequences in a normal string literal.

        x_i' = \sum_{j \in N(i) \cup \{i\}} (1/sqrt(d_i * d_j)) * (W x_j + b)

        Args:
            x (Tensor): Node feature matrix of shape [N, in_channels].
            edge_index (LongTensor): Edge indices in COO format of shape [2, E].

        Returns:
            Tensor: Output node features of shape [N, out_channels].
        """
        # Add self-loops to the adjacency matrix
        edge_index, _ = utils.add_self_loops(edge_index, num_nodes=x.size(0))

        # Compute normalization coefficients
        row, col = edge_index
        # Degree of each node, counted over the first row of edge_index (self-loops included)
        deg = utils.degree(row, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0  # Zero-degree nodes: 0 ** -0.5 is inf

        # Apply linear transformation
        x = self.linear(x)  # [N, out_channels]

        # Initiate message passing; `norm` is handed through to message() unchanged
        return self.propagate(edge_index, x=x, norm=(deg_inv_sqrt, row, col))

    def message(self, x_j, norm):
        """
        Define the message computation.

        Args:
            x_j (Tensor): Neighbor node features of shape [E, out_channels].
            norm (tuple): ``(deg_inv_sqrt, row, col)`` as built in ``forward``.

        Returns:
            Tensor: Messages to be aggregated.
        """
        deg_inv_sqrt, row, col = norm

        # Inverse-sqrt degree of both endpoints of every edge. The coefficient is their
        # product, which is symmetric, so it does not matter which endpoint is i or j.
        norm_col = deg_inv_sqrt[col]
        norm_row = deg_inv_sqrt[row]

        # Compute normalization factor for each edge
        alpha = norm_row * norm_col

        # Scale the messages
        return alpha.view(-1, 1) * x_j  # [E, out_channels]

    def update(self, aggr_out):
        """
        Update node embeddings after aggregation (identity: the sum is returned as is).

        Args:
            aggr_out (Tensor): Aggregated messages of shape [N, out_channels].

        Returns:
            Tensor: Updated node features.
        """
        return aggr_out

class GCN(nn.Module):
    """Two-layer GCN: GCNConv -> ReLU -> GCNConv."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """
        Build the two convolution layers.

        Args:
            in_channels (int): Size of the input node features.
            hidden_channels (int): Size of the representation between the two layers.
            out_channels (int): Size of the output node features.
        """
        super().__init__()

        # Input -> hidden, then hidden -> output.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """
        Run both layers, with a ReLU after the first one only.

        Args:
            x (Tensor): Node features of shape [N, in_channels].
            edge_index (LongTensor): Edge indices in COO format of shape [2, E].

        Returns:
            Tensor: Output features of shape [N, out_channels].
        """
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

  """


In [6]:
# Show the class of the Cora dataset object (relies on `dataset_cora` from an earlier cell).
type(dataset_cora)

torch_geometric.datasets.planetoid.Planetoid