In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric 
import torch_geometric.nn as gnn
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import os   
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [2]:
from ogb.nodeproppred import PygNodePropPredDataset,Evaluator

# Root folder handed to the OGB loader for the dataset files.
data_path = "Dataset/"
dataset = PygNodePropPredDataset(name="ogbn-products", root=data_path)

In [3]:
# Split indices provided by the dataset, looked up by split name.
split_idx = dataset.get_idx_split()
train_idx, val_idx, test_idx = (split_idx[part] for part in ("train", "valid", "test"))

# Report how many nodes fall into each split.
print(f"Nodes of train Data : {len(train_idx)}")
print(f"Nodes of valid Data :{len(val_idx)}")
print(f"Nodes of test Data :{len(test_idx)}")

Nodes of train Data : 196615
Nodes of valid Data :39323
Nodes of test Data :2213091


In [4]:
# Take the first graph object of the dataset, then move it onto the selected device.
data = dataset[0]
data = data.to(device)
print(data)



Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])


In [5]:
# Headline sizes of the graph: node count, edge count (columns of edge_index)
# and per-node feature width (columns of x).
print(f"Num of nodes in graph : {data.num_nodes}")
print(f"Num of edges in graph : {data.edge_index.size(1)}")
print(f"Num of features of node in graph  : {data.x.size(1)}")

Num of nodes in graph : 2449029
Num of edges in graph : 123718280
Num of features of node in graph  : 100


In [6]:
# Distinct class ids present in the graph's label tensor.
labels = data.y.unique()

# Lookup table read from the dataset folder: label index -> product category name.
df_labels = pd.read_csv("Dataset/ogbn_products/mapping/labelidx2productcategory.csv.gz")
keys = df_labels["label idx"].to_numpy()
values = df_labels["product category"].to_numpy()
labels_mapping = {idx: name for idx, name in zip(keys, values)}

print(f"the labels are :{labels}")
print(f"label name by idx :{labels_mapping}")
# Display every row of the mapping table as the cell output.
df_labels.head(len(df_labels))



the labels are :tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46], device='cuda:0')
label name by idx :{0: 'Home & Kitchen', 1: 'Health & Personal Care', 2: 'Beauty', 3: 'Sports & Outdoors', 4: 'Books', 5: 'Patio, Lawn & Garden', 6: 'Toys & Games', 7: 'CDs & Vinyl', 8: 'Cell Phones & Accessories', 9: 'Grocery & Gourmet Food', 10: 'Arts, Crafts & Sewing', 11: 'Clothing, Shoes & Jewelry', 12: 'Electronics', 13: 'Movies & TV', 14: 'Software', 15: 'Video Games', 16: 'Automotive', 17: 'Pet Supplies', 18: 'Office Products', 19: 'Industrial & Scientific', 20: 'Musical Instruments', 21: 'Tools & Home Improvement', 22: 'Magazine Subscriptions', 23: 'Baby Products', 24: nan, 25: 'Appliances', 26: 'Kitchen & Dining', 27: 'Collectibles & Fine Art', 28: 'All Beauty', 29: 'Luxury Beauty', 30: 'Amazon Fashion', 31: 'Computers', 32: 'All Elec

Unnamed: 0,label idx,product category
0,0,Home & Kitchen
1,1,Health & Personal Care
2,2,Beauty
3,3,Sports & Outdoors
4,4,Books
5,5,"Patio, Lawn & Garden"
6,6,Toys & Games
7,7,CDs & Vinyl
8,8,Cell Phones & Accessories
9,9,Grocery & Gourmet Food


In [7]:
from pandas.core.common import flatten
from collections import Counter

# Count how many nodes carry each label, working on a flat CPU/NumPy copy of data.y.
y = data.y.cpu().numpy().flatten()
labels_count = Counter(y)
# NOTE: `i` remains bound at notebook scope after this loop finishes.
for i, n_nodes in labels_count.items():
    print(f"{labels_mapping[i]} :", n_nodes)

Home & Kitchen : 114294
Health & Personal Care : 109832
Beauty : 116043
Sports & Outdoors : 151061
Books : 668950
Patio, Lawn & Garden : 40715
Toys & Games : 158771
CDs & Vinyl : 172199
Cell Phones & Accessories : 110796
Grocery & Gourmet Food : 67358
Arts, Crafts & Sewing : 52345
Clothing, Shoes & Jewelry : 32937
Electronics : 131886
Movies & TV : 101541
Software : 3079
Video Games : 26911
Automotive : 83594
Pet Supplies : 42337
Office Products : 49019
Industrial & Scientific : 17438
Musical Instruments : 22575
Tools & Home Improvement : 80795
Magazine Subscriptions : 879
Baby Products : 3653
nan : 45406
Appliances : 3024
Kitchen & Dining : 553
Collectibles & Fine Art : 259
All Beauty : 1969
Luxury Beauty : 1561
Amazon Fashion : 277
Computers : 418
All Electronics : 513
Purchase Circles : 29
MP3 Players & Accessories : 154
Gift Cards : 44
Office & School Supplies : 630
Home Improvement : 514
Camera & Photo : 91
GPS & Navigation : 37
Digital Music : 6
Car Electronics : 61
Baby : 32500


In [8]:
# Node features and labels on the compute device; the labels have their
# size-1 dimensions squeezed away.
X = data.x.to(device)
y = data.y.to(device).squeeze()
X.device

device(type='cuda', index=0)

In [9]:
from torch_geometric.loader import NeighborSampler,NeighborLoader

# Mini-batch sampler over the training nodes: 256 seed nodes per batch,
# three neighbour-sampling sizes, reshuffled on every pass.
train_loader = NeighborSampler(
    data.edge_index,
    node_idx=train_idx,
    sizes=[20, 15, 10],
    batch_size=256,
    shuffle=True,
)



In [10]:
def test_data(train_loader):
    for i,(batch_size,n_id,adjs) in enumerate(train_loader):
        print(f"batch size = {batch_size} | n_id.shape = {n_id.shape[0]}")
        for k,(edge_index,_,size) in enumerate(adjs):
            print(f"\tk:{k}")
            print(f"\t\tedge index :{edge_index.shape}")
            print(f"\t\tsize :{size}")
        if i ==1 :
            break
# Sanity-check the sampler: print the shapes of the first two mini-batches.
test_data(train_loader)


batch size = 256 | n_id.shape = 346551
	k:0
		edge index :torch.Size([2, 615783])
		size :(346551, 62300)
	k:1
		edge index :torch.Size([2, 76814])
		size :(62300, 5211)
	k:2
		edge index :torch.Size([2, 4999])
		size :(5211, 256)
batch size = 256 | n_id.shape = 353341
	k:0
		edge index :torch.Size([2, 624842])
		size :(353341, 63153)
	k:1
		edge index :torch.Size([2, 77588])
		size :(63153, 5255)
	k:2
		edge index :torch.Size([2, 5038])
		size :(5255, 256)


# Build Model

In [11]:
class Sage(nn.Module):
    """GraphSAGE node classifier driven by per-layer sampled adjacencies.

    Args:
        in_dim: Width of the input node features.
        out_dim: Width of the output layer (one score per class).
        hidden_dim: Width of every intermediate layer.
        num_layers: Number of ``SAGEConv`` layers (must be >= 1). ``forward``
            consumes one sampled adjacency per layer.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self,in_dim,out_dim,hidden_dim,num_layers):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.num_layers = num_layers

        # Layer widths: in -> hidden -> ... -> hidden -> out. Building from this
        # list also yields a single in->out layer when num_layers == 1.
        dims = [in_dim] + [hidden_dim] * (num_layers - 1) + [out_dim]
        self.convs = nn.ModuleList(
            [gnn.SAGEConv(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])]
        )

    def reset_parameters(self):
        """Re-initialise the parameters of every convolution layer."""
        for conv in self.convs:
            conv.reset_parameters()

    def forward(self,x,adj):
        """Apply one convolution per entry of ``adj`` and return log-probabilities.

        Args:
            x: Feature rows for the sampled nodes of the batch.
            adj: Iterable of ``(edge_index, _, size)`` triples, one per layer.

        Returns:
            Log-softmax over the last dimension of the final layer's output.
        """
        for k,(edge_index,_,size) in enumerate(adj):
            x_targets = x[:size[1]] # Target Nodes are first always
            x = self.convs[k]((x,x_targets),edge_index)
            # Non-linearity and dropout belong on hidden layers only. The check
            # must use the loop index `k`; the output layer's logits are passed
            # to log_softmax untouched.
            if k != self.num_layers-1:
                x = F.relu(x)
                x = F.dropout(x,p=0.5,training=self.training)
        return x.log_softmax(dim=-1)
                



# Resolve the compute device again, then build a 3-layer GraphSAGE with
# 256 hidden units and place it on that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = Sage(
    in_dim=dataset.x.shape[1],
    out_dim=dataset.num_classes,
    hidden_dim=256,
    num_layers=3,
).to(device)


In [12]:
def trainstep(epoch):
    """Run one training epoch over ``train_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``train_loader``,
    ``X``, ``y``, ``train_idx`` and ``device``.

    Args:
        epoch: Epoch number; only used to label the progress bar.

    Returns:
        ``(loss, acc)``: the mean of the per-batch losses, and the number of
        correctly predicted target nodes divided by ``train_idx.shape[0]``.
    """
    model.train()

    total_loss = 0.0
    total_correct = 0

    # The context manager closes the bar even when the loop is interrupted or
    # raises, so no stale progress bar is left behind.
    with tqdm(total=train_idx.shape[0]) as pbar:
        # tqdm appends ": " itself, so the description carries no trailing colon.
        pbar.set_description(f"Epoch {epoch}")

        for batch_size,n_id,adjs in train_loader:
            adjs = [adj.to(device) for adj in adjs]
            # The first `batch_size` entries of n_id are this batch's target nodes.
            targets = y[n_id[:batch_size]]

            # Forward
            optimizer.zero_grad()
            out = model(X[n_id],adjs)

            # Backward
            loss = F.nll_loss(out, targets)
            loss.backward()
            optimizer.step()

            total_loss += float(loss)
            total_correct += int(out.argmax(dim=-1).eq(targets).sum())

            pbar.update(batch_size)

    loss = total_loss/len(train_loader)
    acc = total_correct/train_idx.shape[0]

    return loss,acc







# Optimisation hyper-parameters.
learning_rate=3e-3
num_epochs = 20
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

# Train for num_epochs epochs, reporting mean loss and training accuracy after each.
for epoch in range(num_epochs):
    loss,acc = trainstep(epoch)
    print(f"Epoch {epoch}, Loss: {loss:.4f}, Train_acc: {acc:.4f}")



Epoch 0::   4%|▍         | 8192/196615 [01:12<27:08, 115.70it/s]

KeyboardInterrupt: 

Epoch 0::   4%|▍         | 8192/196615 [01:30<27:08, 115.70it/s]