## Environment Setup

In [15]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -q -r requirements.txt

In [26]:
# Import libraries
import torch
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch_geometric.nn import SAGEConv
from torch_geometric.data import NeighborSampler
import os.path as osp
import pandas as pd
import numpy as np
import collections
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
import seaborn as sns
import matplotlib.pyplot as plt
import umap
from scipy.special import softmax

# Seaborn look-and-feel: "ticks" style plus the default figure size, set in one call
sns.set_theme(style="ticks", rc={'figure.figsize': (16.7, 8.27)})


### Dataset exploration

In [27]:
import os

# Directory handed to the dataset constructor as its `root`
root = 'home/data/products'  # You can change this to your desired path

# Only the status messages depend on whether the folder already exists;
# the constructor call itself is the same in both cases.
dataset_dir_exists = os.path.exists(root)
if dataset_dir_exists:
    print("Dataset found. Loading without downloading...")
    done_message = "Dataset loaded successfully."
else:
    print("Dataset folder not found! Downloading now...")
    done_message = "Dataset downloaded and loaded successfully."

# Per the original note, initializing the dataset object triggers the download when needed
dataset = PygNodePropPredDataset(name='ogbn-products', root=root)
print(done_message)

Dataset found. Loading without downloading...


  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  self.data, self.slices = torch.load(self.processed_paths[0])


Dataset loaded successfully.


In [28]:
# First item of the dataset, its index split, and the matching evaluator
data = dataset[0]
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-products')

# Size statistics, one per output line
summary_lines = [
    f"Number of nodes: {data.num_nodes}",
    f"Number of edges: {data.num_edges}",
    f"Node feature shape: {data.x.shape}",
    f"Edge index shape: {data.edge_index.shape}",
    f"Number of classes: {dataset.num_classes}",
]
print("\n".join(summary_lines))

Number of nodes: 2449029
Number of edges: 123718280
Node feature shape: torch.Size([2449029, 100])
Edge index shape: torch.Size([2, 123718280])
Number of classes: 47


## Label Mapping and Class Distribution

In [29]:
# Read the label-index -> product-category table stored under the dataset root
mapping_file = osp.join(root, 'ogbn_products', 'mapping', 'labelidx2productcategory.csv.gz')
df = pd.read_csv(mapping_file)

# First CSV column supplies the keys, second column the values
key_column = df.iloc[:, 0]
value_column = df.iloc[:, 1]
label_mapping = dict(zip(key_column, value_column))

# Tally how often each label value occurs in data.y
y = data.y.squeeze().tolist()
label_counts = collections.Counter()
label_counts.update(y)
print("Label counts:", label_counts)


Label counts: Counter({4: 668950, 7: 172199, 6: 158771, 3: 151061, 12: 131886, 2: 116043, 0: 114294, 8: 110796, 1: 109832, 13: 101541, 16: 83594, 21: 80795, 9: 67358, 10: 52345, 18: 49019, 24: 45406, 17: 42337, 5: 40715, 11: 32937, 42: 32500, 15: 26911, 20: 22575, 19: 17438, 23: 3653, 14: 3079, 25: 3024, 28: 1969, 29: 1561, 43: 1399, 22: 879, 36: 630, 44: 566, 26: 553, 37: 514, 32: 513, 31: 418, 30: 277, 27: 259, 34: 154, 38: 91, 41: 61, 35: 44, 39: 37, 33: 29, 45: 9, 40: 6, 46: 1})


## Model Definition

In [30]:
class GraphSAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers with ReLU + dropout between layers.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Width of the final layer's output.
        num_layers: Number of ``SAGEConv`` layers (must be >= 1). With a
            single layer the model maps ``in_channels`` straight to
            ``out_channels``.
        dropout: Dropout probability applied after every non-final layer
            in :meth:`forward`.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=3, dropout=0.5):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.num_layers = num_layers
        self.dropout = dropout

        # Layer widths: in -> hidden -> ... -> hidden -> out.  Building the
        # convolutions from consecutive pairs guarantees exactly `num_layers`
        # of them, including the num_layers == 1 case.
        dims = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = nn.ModuleList(
            [SAGEConv(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])]
        )

    def reset_parameters(self):
        """Optional: Reset parameters (good for re-initializing before retraining)"""
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self, x, adjs):
        """Forward used during training with neighbor sampling.

        Args:
            x: Features of the sampled nodes; the target nodes of each hop
                must occupy the leading rows.
            adjs: One ``(edge_index, _, size)`` triple per layer, consumed in
                order; ``size[1]`` is the number of target nodes of that hop.

        Returns:
            Output of the last convolution (no activation applied to it).
        """
        last = self.num_layers - 1
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first
            x = self.convs[i]((x, x_target), edge_index)
            if i != last:
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x

    @torch.no_grad()
    def inference(self, x_all, subgraph_loader, device):
        """Layer-wise inference over every batch produced by ``subgraph_loader``.

        Each layer is applied to all batches before the next layer starts;
        the concatenated outputs of one layer are the inputs of the next.

        Args:
            x_all: Input features, indexed by the ``n_id`` values the loader yields.
            subgraph_loader: Iterable yielding ``(batch_size, n_id, adj)`` with
                ``adj`` unpacking to ``(edge_index, _, size)``.
                NOTE(review): the concatenation assumes the loader visits the
                target nodes in node-index order without shuffling - confirm
                against the loader construction.
            device: Device on which the convolutions are evaluated.

        Returns:
            Final-layer outputs for all visited nodes, placed on ``device``.
        """
        last = self.num_layers - 1
        # Context manager closes the bar even if a batch raises.
        with tqdm(total=x_all.size(0) * self.num_layers, desc='Evaluating') as pbar:
            for i in range(self.num_layers):
                xs = []
                for batch_size, n_id, adj in subgraph_loader:
                    edge_index, _, size = adj
                    edge_index = edge_index.to(device)

                    x = x_all[n_id].to(device)
                    x_target = x[:size[1]]  # target nodes come first
                    x = self.convs[i]((x, x_target), edge_index)

                    if i != last:
                        x = F.relu(x)

                    # Stage per-batch results on the CPU so the accelerator never
                    # holds a whole layer of full-graph activations at once.
                    xs.append(x.cpu())
                    pbar.update(batch_size)
                x_all = torch.cat(xs, dim=0)

        # Hand the result back on the compute device, as callers received before.
        return x_all.to(device)

    

## Training

In [38]:
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(dataset.num_features, 256, dataset.num_classes, num_layers=3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Features and flattened labels are moved to `device` once, up front
x = data.x.to(device)
y = data.y.squeeze().to(device)

# NeighborSampler for mini-batch training
train_loader = NeighborSampler(
    data.edge_index,
    node_idx=split_idx['train'],
    sizes=[15, 10, 5],
    batch_size=1024,
    shuffle=True,
)


def train(model, loader):
    """Run one optimisation pass over ``loader``.

    Relies on the notebook-level ``optimizer``, ``x``, ``y``, ``device`` and
    ``split_idx`` defined in this notebook.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches, and the
        number of correct predictions divided by the size of the train split.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    for batch_size, n_id, adjs in loader:
        adjs = [adj.to(device) for adj in adjs]
        # The first `batch_size` entries of n_id are the nodes being predicted
        targets = y[n_id[:batch_size]]

        optimizer.zero_grad()
        log_probs = F.log_softmax(model(x[n_id], adjs), dim=-1)
        loss = F.nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        n_correct += (log_probs.argmax(dim=-1) == targets).sum().item()

    mean_loss = loss_sum / len(loader)
    accuracy = n_correct / split_idx['train'].size(0)
    return mean_loss, accuracy


# Train for 20 epochs
for epoch in range(1, 21):
    loss, acc = train(model, train_loader)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Train Accuracy: {acc:.4f}')


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Save and Reload Model

In [None]:
# NOTE: this cell needs `model` and `device` from the Training cell; run that first.
# Single place to change the checkpoint file name
checkpoint_path = 'graphsage_model_v1.pth'

# Save model
torch.save(model.state_dict(), checkpoint_path)

# Later: reload model
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a CUDA machine still loads when the notebook falls back to the CPU.
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)


NameError: name 'model' is not defined