# **Heterogeneous Node Classification on Restaurant-User Graph**
DeepSNAP implementation built on a NetworkX graph.

Resources: 
- Introduction to DeepSNAP Heterogeneous Graph: https://colab.research.google.com/drive/1wVGUfUno5Kgs2H-jEGFcm0EogN7DEd-w?usp=sharing#scrollTo=tWPKqipn-Jwj
- Heterogeneous Node Classification with DeepSNAP: https://colab.research.google.com/drive/1L-0kaLqeiT6lHhjHxAzP5sHIcb4b4e7G?usp=sharing
- DeepSNAP documentation: https://snap.stanford.edu/deepsnap/notes/introduction.html#deepsnap-heterogeneous-graph


## Installation

In [1]:
# Install required packages.
# torch-scatter and torch-sparse are fetched from the PyG wheel index for
# torch 1.8.0 + CUDA 10.1 (see the -f URL); presumably the runtime's torch
# build has to match that tag -- TODO confirm before re-running.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
# NOTE(review): torch-geometric is installed without a version pin and
# deepsnap is installed from the repository's default branch, so a fresh run
# may resolve different versions than the ones this notebook was written for.
!pip install -q torch-geometric
!pip install -q git+https://github.com/snap-stanford/deepsnap.git

  Building wheel for deepsnap (setup.py) ... [?25l[?25hdone


In [2]:
import copy
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torch_geometric.nn as pyg_nn


import networkx as nx
from deepsnap.graph import Graph
from deepsnap.hetero_graph import HeteroGraph
from deepsnap.batch import Batch
from deepsnap.dataset import GraphDataset

from sklearn.metrics import f1_score
from deepsnap.hetero_graph import HeteroGraph
from torch_sparse import SparseTensor, matmul
from deepsnap.hetero_gnn import HeteroConv, HeteroSAGEConv, HeteroConv, forward_op, loss_op
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be opened through the regular filesystem in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Creating the network graph datasets

In [32]:
# Load the pickled NetworkX graph from the mounted Drive folder.
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; since
# networkx is not pinned in this notebook, use the documented replacement
# (plain pickle.load), which works on old and new NetworkX versions alike.
# SECURITY: unpickling executes arbitrary code -- only load files you created
# yourself or otherwise trust.
with open("./drive/MyDrive/Colab Notebooks/hetero_graph/hetero_graph3.gpickle", "rb") as graph_file:
    G = pickle.load(graph_file)

In [33]:
# Sanity check: report the overall size of the loaded NetworkX graph.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 609175
Number of edges: 1864204


In [34]:
# Wrap the NetworkX graph in a DeepSNAP HeteroGraph and print the node and
# edge counts it reports (the recorded output shows them broken down per
# node type and per message type).
H = HeteroGraph(G)
print(f"Heterogeneous graph: {H.num_nodes()} nodes, {H.num_edges()} edges")

Heterogeneous graph: {'restaurant': 29767, 'user': 579408} nodes, {('restaurant', 'rr', 'restaurant'): 483976, ('restaurant', 'ur', 'user'): 1380228} edges


In [35]:
# Summarise the heterogeneous graph: its node/edge/message types and how many
# nodes and edges each of them contributes.
print("Node types: {}".format(H.node_types))
print("Edge types: {}".format(H.edge_types))
print("Message types: {}".format(H.message_types))
for node_type in H.node_types:
    print("Node type {} has {} nodes".format(node_type, H.num_nodes(node_type)))
for message_type in H.message_types:
    print("Message type {} has {} edges".format(message_type, H.num_edges(message_type)))
print(H.node_feature)

# Store the restaurant labels as an int64 tensor (the model's loss is computed
# with F.cross_entropy). torch.as_tensor replaces the legacy torch.LongTensor
# constructor: it accepts lists, arrays and tensors of any dtype, and leaves
# an existing int64 tensor as is, so re-running this cell is harmless.
print(H.node_label)
H.node_label["restaurant"] = torch.as_tensor(H.node_label["restaurant"], dtype=torch.long)
print(H.node_label)

# Feature and label dimensions per node type; generate_convs reads the same
# values to size the convolution layers.
print(H.num_node_features("restaurant"))
print(H.num_node_features("user"))
print(type(H.node_label["restaurant"]))
print(H.num_node_labels("restaurant"))
print(H.num_node_labels("user"))

Node types: ['restaurant', 'user']
Edge types: ['rr', 'ur']
Message types: [('restaurant', 'rr', 'restaurant'), ('restaurant', 'ur', 'user')]
Node type restaurant has 29767 nodes
Node type user has 579408 nodes
Message type ('restaurant', 'rr', 'restaurant') has 483976 edges
Message type ('restaurant', 'ur', 'user') has 1380228 edges
{'restaurant': tensor([[1.0000, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.7400],
        [1.0000, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.8300],
        [1.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.6950],
        ...,
        [1.0000, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.7900],
        [1.0000, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.8800],
        [1.0000, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.6890]]), 'user': tensor([[4.7813e-03, 6.7032e-04, 5.7544e-04,  ..., 0.0000e+00, 6.4742e-05,
         0.0000e+00],
        [5.7376e-04, 1.9571e-05, 5.8126e-06,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [6.3751e-04, 4.8928e-05, 1.1625e-05,  ..

In [21]:
# Wrap the single heterogeneous graph in a DeepSNAP dataset configured for a
# node-level task, then print its only element as a quick structural check.
dataset = GraphDataset([H], task='node')

print(dataset[0])

HeteroGraph(G=[], edge_index=[], edge_label_index=[], edge_to_graph_mapping=[], edge_to_tensor_mapping=[1864204], edge_type=[], node_feature=[], node_label=[], node_label_index=[], node_to_graph_mapping=[], node_to_tensor_mapping=[609175], node_type=[], task=[])


In [22]:
# Transductive 80/10/10 split, applied to the "restaurant" node type.
dataset_train, dataset_val, dataset_test = dataset.split(
    transductive=True,
    split_ratio=[0.8, 0.1, 0.1],
    split_types="restaurant",
)

# One loader per split, kept in [train, val, test] order because test()
# indexes its results by position.
loaders = [
    DataLoader(split_dataset, collate_fn=Batch.collate(), batch_size=16)
    for split_dataset in (dataset_train, dataset_val, dataset_test)
]
train_loader, val_loader, test_loader = loaders

## Heterogeneous GNN Layer

In [23]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device: {}".format(device))

# Module-level bookkeeping updated by test(): the best validation accuracy
# seen so far and a copy of the model that achieved it.
best_val = 0
best_model = None

Device: cpu


In [24]:
def generate_convs(hete, conv, hidden_size, task='node'):
    """Build the per-message-type layers for a two-layer heterogeneous GNN.

    For every message type in ``hete.message_types`` two layers are created
    by calling ``conv`` with three positional size arguments:

    * layer 1: ``conv(n_feat_dim, hidden_size, s_feat_dim)`` where
      ``n_feat_dim`` / ``s_feat_dim`` are the feature sizes of the node types
      stored in the first / last position of the message type;
    * layer 2: ``conv(hidden_size, label_dim, hidden_size)``.

    Args:
        hete: Object exposing ``message_types``, ``num_node_features(type)``
            and ``num_node_labels(type)``.
        conv: Callable (layer class) invoked as described above.
        hidden_size: Width of the hidden representation between the layers.
        task: ``'node'`` sizes the output by the number of labels of the
            message type's last node type; ``'link_pred'`` uses 2 outputs.

    Returns:
        Tuple ``(convs1, convs2)`` of dicts keyed by message type.

    Raises:
        ValueError: If ``task`` is neither ``'node'`` nor ``'link_pred'``.
    """
    # Fail early with a clear message; previously an unknown task surfaced as
    # an UnboundLocalError on ``label_dim`` inside the loop.
    if task not in ('node', 'link_pred'):
        raise ValueError(
            "Unsupported task {!r}; expected 'node' or 'link_pred'".format(task)
        )

    convs1 = {}
    convs2 = {}
    for message_type in hete.message_types:
        n_type = message_type[0]
        s_type = message_type[2]
        n_feat_dim = hete.num_node_features(n_type)
        s_feat_dim = hete.num_node_features(s_type)
        if task == 'node':
            label_dim = hete.num_node_labels(s_type)
        else:  # task == 'link_pred'
            label_dim = 2
        convs1[message_type] = conv(n_feat_dim, hidden_size, s_feat_dim)
        convs2[message_type] = conv(hidden_size, label_dim, hidden_size)
    return convs1, convs2

In [25]:
class HeteroNet(torch.nn.Module):
    """Two-layer heterogeneous GNN assembled from HeteroSAGEConv layers.

    The per-message-type layers come from ``generate_convs`` and are wrapped
    in two ``HeteroConv`` modules. Each node type gets its own Dropout and
    LeakyReLU module for each of the two stages.

    Args:
        hete: Heterogeneous graph; its ``node_types`` are read here and it is
            passed to ``generate_convs`` to size the layers.
        hidden_size: Width of the hidden representation between the layers.
        dropout: Dropout probability used by every Dropout module.
    """

    def __init__(self, hete, hidden_size, dropout):
        super(HeteroNet, self).__init__()
        conv1, conv2 = generate_convs(hete, HeteroSAGEConv, hidden_size)
        self.conv1 = HeteroConv(conv1)
        self.conv2 = HeteroConv(conv2)
        # ModuleDicts keyed by node type so forward_op can be handed one
        # module per node type.
        self.relus1 = nn.ModuleDict()
        self.relus2 = nn.ModuleDict()
        self.dropouts1 = nn.ModuleDict()
        self.dropouts2 = nn.ModuleDict()
        for node_type in hete.node_types:
            self.relus1[node_type] = nn.LeakyReLU()
            self.relus2[node_type] = nn.LeakyReLU()
            self.dropouts1[node_type] = nn.Dropout(p=dropout)
            self.dropouts2[node_type] = nn.Dropout(p=dropout)

    def forward(self, data):
        """Return the output of the second HeteroConv for ``data``.

        ``data`` must expose ``node_feature`` and ``edge_index``.
        """
        # NOTE(review): dropout and LeakyReLU are applied *before* each
        # convolution, including on the raw input features -- confirm that
        # this ordering is intended.
        x = forward_op(data.node_feature, self.dropouts1)
        x = forward_op(x, self.relus1)
        x = self.conv1(x, data.edge_index)
        x = forward_op(x, self.dropouts2)
        x = forward_op(x, self.relus2)
        x = self.conv2(x, data.edge_index)
        return x

    def loss(self, pred, y, node_label_index):
        """Return the cross-entropy loss computed by ``loss_op``.

        Args:
            pred: Model output as returned by ``forward``.
            y: Ground-truth labels.
            node_label_index: Indices of the labelled nodes.
        """
        # Bug fix: the computed loss was previously dropped by a bare
        # ``return``, so callers received None and ``loss.backward()`` failed.
        return loss_op(pred, y, node_label_index, F.cross_entropy)

In [26]:
def train(model, optimizer, train_loader):
    """Run one optimisation step over all batches of ``train_loader``.

    Gradients are cleared once, accumulated over every batch, and applied
    with a single ``optimizer.step()``. Batches are moved to the module-level
    ``device`` before the forward pass.

    Returns:
        The Python scalar (``.item()``) of the last batch's loss.
    """
    model.train()
    optimizer.zero_grad()
    for graph_batch in train_loader:
        graph_batch.to(device)
        loss = model.loss(
            model(graph_batch),
            graph_batch.node_label,
            graph_batch.node_label_index,
        )
        # Accumulates into the gradients of the previous batches.
        loss.backward()
    optimizer.step()
    return loss.item()

In [27]:
def test(model, loaders):
    global best_model
    global best_val
    model.eval()
    accs = []
    for loader in loaders:
        for batch in loader:
            batch.to(device)
            logits = model(batch)
            total = 0
            acc = 0
            for node_type in logits:
                node_idx = batch.node_label_index[node_type].to(device)
                pred = logits[node_type][node_idx]
                pred = pred.max(1)[1]
                acc += pred.eq(
                    batch.node_label[node_type].to(device)
                ).sum().item()
                total += pred.size(0)
            acc /= total
            accs.append(acc)
    if accs[1] > best_val:
        best_val = accs[1]
        best_model = copy.deepcopy(model)
    return accs

In [28]:
# Model and optimiser configuration.
hidden_size = 32
model = HeteroNet(H, hidden_size, 0.5).to(device)
optimizer = torch.optim.Adam(
    model.parameters(), lr=0.01, weight_decay=5e-3
)
num_epochs = 100

# Per-epoch accuracy history for each split.
train_accs, valid_accs, test_accs = [], [], []

for epoch in range(num_epochs):
    loss = train(model, optimizer, train_loader)
    # test() returns accuracies in loader order: [train, val, test].
    accs = test(model, loaders)
    log = "Epoch {}: Train: {:.4f}, Validation: {:.4f}. Test: {:.4f}"
    print(log.format(epoch + 1, accs[0], accs[1], accs[2]))
    train_accs.append(accs[0])
    valid_accs.append(accs[1])
    test_accs.append(accs[2])

# best_model is only assigned by test() once validation accuracy exceeds
# best_val (initially 0). If that never happens it is still None, so fall
# back to the final model instead of crashing on test(None, ...).
final_model = best_model if best_model is not None else model
accs = test(final_model, loaders)
log = "Best model: Train: {:.4f}, Validation: {:.4f}. Test: {:.4f}"
print(log.format(accs[0], accs[1], accs[2]))

tensor([    0,     0,     0,  ..., 29705, 29732, 29752]) -2
<built-in method index_select of Tensor object at 0x7f88db530280>
tensor([  465,   681,  1049,  ..., 29703, 29724, 29728]) -2
<built-in method index_select of Tensor object at 0x7f88db530280>
tensor([     0,      0,      0,  ..., 237393, 126474,  56643]) -2
<built-in method index_select of Tensor object at 0x7f88db530280>


IndexError: ignored