# **Node Classification in the Amazon Product Graph**

## **Libraries**

In [1]:
!pip install torch torchvision torchaudio
!pip install torch-geometric ogb



In [13]:
import torch
from torch.serialization import add_safe_globals
from torch_geometric.data.data import Data
from torch_geometric.data.data import DataEdgeAttr
from torch_geometric.data import Data
from ogb.nodeproppred import NodePropPredDataset
from torch_geometric.loader import NeighborLoader
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv
from sklearn.metrics import f1_score

In [3]:
# add_safe_globals expects a list of the objects themselves. Passing a
# {name: class} dict registers only the dict's string keys, so the classes
# would never be allow-listed for torch.load(weights_only=True).
add_safe_globals([Data, DataEdgeAttr])

## **Loading the ogbn-products Dataset**

In [4]:
# Load ogbn-products; the first run downloads and processes the archive.
dataset = NodePropPredDataset(name='ogbn-products')

# Index 0 yields the graph (a dict of arrays) together with the node labels.
graph, labels = dataset[0]

# Node indices of the train / valid / test splits.
split_idx = dataset.get_idx_split()

print(graph)

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/products.zip


Downloaded 1.38 GB: 100%|██████████| 1414/1414 [00:27<00:00, 51.44it/s]


Extracting dataset/products.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


Saving...
{'edge_index': array([[      0,  152857,       0, ..., 2449028,   53324, 2449028],
       [ 152857,       0,   32104, ...,  162836, 2449028,   53324]]), 'edge_feat': None, 'node_feat': array([[ 0.03193326, -0.1958605 ,  0.0519961 , ...,  0.07669606,
        -0.3929545 , -0.06478424],
       [-0.02405794,  0.63032097,  1.0605699 , ..., -1.6874819 ,
         3.5866776 ,  0.818219  ],
       [ 0.33269015, -0.5585958 , -0.28860757, ..., -0.37157044,
         0.2520575 ,  0.04153213],
       ...,
       [ 0.10660695,  0.2654852 , -0.00567423, ...,  1.0867023 ,
         0.07590195, -1.1736895 ],
       [ 0.24968362, -0.25740346,  0.41230008, ...,  1.5465808 ,
         1.0309792 , -0.29657176],
       [ 0.7175324 , -0.23930131,  0.04430327, ..., -1.0132493 ,
        -0.41407427, -0.08227058]], dtype=float32), 'num_nodes': 2449029}


## **Preprocessing and Preparing the Graph Data**

In [7]:
# Wrap the NumPy arrays as tensors. torch.as_tensor reuses the existing buffer when
# the dtype already matches (node_feat is float32), whereas torch.tensor always
# copies, which would duplicate the very large edge and feature arrays in host memory.
edge_index = torch.as_tensor(graph['edge_index'], dtype=torch.long)
x = torch.as_tensor(graph['node_feat'], dtype=torch.float)
# Drop only the trailing singleton dimension (labels presumably has shape
# (num_nodes, 1); verify); a bare squeeze() would also collapse a length-1 node axis.
y = torch.as_tensor(labels, dtype=torch.long).squeeze(-1)

data = Data(x=x, edge_index=edge_index, y=y)

# Node indices for each split, as returned by dataset.get_idx_split().
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

print(data)
print(f"# Train samples: {train_idx.shape[0]}")
print(f"# Valid samples: {valid_idx.shape[0]}")
print(f"# Test samples: {test_idx.shape[0]}")

Data(x=[2449029, 100], edge_index=[2, 123718280], y=[2449029])
# Train samples: 196615
# Valid samples: 39323
# Test samples: 2213091


## **GCN and GraphSAGE Models**

In [14]:
# Model GCN
class GCN(torch.nn.Module):
    """Two-layer network built from ``GCNConv`` layers with a ReLU in between.

    Args:
        in_channels: Input size handed to the first ``GCNConv``.
        hidden_channels: Output size of the first layer / input size of the second.
        out_channels: Output size of the second ``GCNConv``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``conv2(relu(conv1(x)))``; no activation is applied to the output."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# Model GraphSAGE
class GraphSAGE(torch.nn.Module):
    """Two-layer network built from ``SAGEConv`` layers with a ReLU in between.

    Args:
        in_channels: Input size handed to the first ``SAGEConv``.
        hidden_channels: Output size of the first layer / input size of the second.
        out_channels: Output size of the second ``SAGEConv``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``conv2(relu(conv1(x)))``; no activation is applied to the output."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

## **Evaluate Model**

In [15]:

@torch.no_grad()
def evaluate_model(model, data, train_idx, valid_idx, test_idx):
    """Score ``model`` on the train, validation and test splits.

    Puts the model in eval mode, runs a single forward pass over ``data.x`` and
    ``data.edge_index`` (gradients are disabled by the decorator) and uses the
    arg-max along dim 1 as the predicted class.

    Args:
        model: Callable module invoked as ``model(x, edge_index)``.
        data: Object exposing ``x``, ``edge_index`` and ``y``.
        train_idx, valid_idx, test_idx: Index collections selecting each split;
            ``shape[0]`` of each is used as the split size.

    Returns:
        ``(accs, f1s)``: two lists ordered [train, valid, test] holding the
        accuracy and the macro-averaged F1 score of each split.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=1)

    accs, f1s = [], []
    for split in (train_idx, valid_idx, test_idx):
        split_pred = pred[split]
        split_true = data.y[split]
        n_correct = (split_pred == split_true).sum().item()
        accs.append(n_correct / split.shape[0])
        # f1_score is computed on CPU copies of the labels and predictions.
        f1s.append(f1_score(split_true.cpu(), split_pred.cpu(), average='macro'))
    return accs, f1s

## **Train Model**

In [16]:
def train_model(model, data, train_idx, valid_idx, test_idx, epochs=20, device=None):
    """Train ``model`` full-batch with Adam and print metrics after every epoch.

    Args:
        model: Module invoked as ``model(x, edge_index)`` that returns per-node logits.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``to(device)``.
        train_idx, valid_idx, test_idx: Node indices of the three splits.
        epochs: Number of epochs; each epoch is one full-batch optimisation step.
        device: Device to train on. When omitted, a notebook-level ``device``
            variable is used if one is defined; otherwise CUDA is selected when
            available and CPU if not.

    Returns:
        ``(accs, f1s)`` as returned by ``evaluate_model`` after the last epoch, or
        for the untrained model when ``epochs`` is smaller than 1.
    """
    if device is None:
        # Keep honouring a global `device` defined elsewhere in the notebook.
        device = globals().get("device")
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer so the optimizer is created
    # over the parameters that live on the target device.
    model.to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    # Transfer the split indices once instead of on every indexing operation.
    train_idx, valid_idx, test_idx = (
        torch.as_tensor(idx, device=device)
        for idx in (train_idx, valid_idx, test_idx)
    )

    accs = f1s = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        # The loss is computed on the training nodes only.
        loss = F.cross_entropy(out[train_idx], data.y[train_idx])
        loss.backward()
        optimizer.step()

        accs, f1s = evaluate_model(model, data, train_idx, valid_idx, test_idx)
        print(f"[Epoch {epoch:02d}] Loss: {loss.item():.4f} | "
              f"Train Acc: {accs[0]:.4f} | Val Acc: {accs[1]:.4f} | Test Acc: {accs[2]:.4f} || "
              f"Val F1: {f1s[1]:.4f} | Test F1: {f1s[2]:.4f}")

    if accs is None:
        # No epoch ran: report the metrics of the model as given instead of
        # failing on unbound result variables.
        accs, f1s = evaluate_model(model, data, train_idx, valid_idx, test_idx)

    return accs, f1s  # return final accuracy and F1