In [1]:
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-geometric
!pip install -q git+https://github.com/snap-stanford/deepsnap.git

[K     |████████████████████████████████| 2.6MB 6.7MB/s 
[K     |████████████████████████████████| 1.5MB 7.5MB/s 
[K     |████████████████████████████████| 215kB 8.6MB/s 
[K     |████████████████████████████████| 235kB 14.7MB/s 
[K     |████████████████████████████████| 2.2MB 10.4MB/s 
[K     |████████████████████████████████| 51kB 7.4MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Building wheel for deepsnap (setup.py) ... [?25l[?25hdone


In [2]:
import copy
import torch
import torch.nn as nn
import networkx as nx
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim

from deepsnap.graph import Graph
from deepsnap.batch import Batch
from deepsnap.dataset import GraphDataset
from torch.utils.data import DataLoader
import torch_geometric.nn as pyg_nn

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [52]:
# Read NetworkX graph of restaurants
# No features: restaurants_no_features.gpickle
# Basic features: restaurants_basic_features.gpickle

G = nx.read_gpickle("./drive/MyDrive/Colab Notebooks/restaurants_node_degree.gpickle")

In [53]:
G.number_of_nodes()

29963

In [54]:
G.number_of_edges()

491464

In [55]:
class GNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, args):
        super(GNN, self).__init__()
        self.num_layers = args["num_layers"]

        conv_model = self.build_conv_model(args["model"])
        self.convs = nn.ModuleList()
        self.convs.append(conv_model(input_size, hidden_size))

        for l in range(self.num_layers - 1):
            self.convs.append(conv_model(hidden_size, hidden_size))
        self.post_mp = nn.Linear(hidden_size, output_size)

    def forward(self, data):
        x, edge_index, batch = data.node_feature, data.edge_index, data.batch

        for i in range(len(self.convs) - 1):
            x = self.convs[i](x, edge_index)
            x = F.leaky_relu(x)
        x = self.convs[-1](x, edge_index)
        x = F.log_softmax(x, dim=1)
        return x

    def loss(self, pred, label):
        return F.nll_loss(pred, label)

    def build_conv_model(self, model_type):
        if model_type == 'GCN':
            return pyg_nn.GCNConv
        elif model_type == 'GAT':
            return pyg_nn.GATConv
        elif model_type == "GraphSage":
            return pyg_nn.SAGEConv
        else:
            raise ValueError(
                "Model {} unavailable, please add it to GNN.build_conv_model.".format(model_type))

In [56]:
def train(train_loader, val_loader, test_loader, args, num_node_features, num_classes,
          device="cpu"):
    model = GNN(num_node_features, args['hidden_size'], num_classes, args).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=5e-4)

    for epoch in range(args['epochs']):
        total_loss = 0
        model.train()
        for batch in train_loader:
            batch.to(device)
            optimizer.zero_grad()
            pred = model(batch)
            label = batch.node_label
            loss = model.loss(pred[batch.node_label_index], label)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        train_acc = round(test(train_loader, model, device), 4)
        val_acc = round(test(val_loader, model, device), 4)
        test_acc = round(test(test_loader, model, device), 4)
        print(f"Epoch {epoch + 1}: Train: {train_acc}, Validation: {val_acc}. Test: {test_acc}, Loss: {round(total_loss, 4)}")

def test(loader, model, device='cuda'):
    model.eval()
    for batch in loader:
        batch.to(device)
        logits = model(batch)
        pred = logits[batch.node_label_index].max(1)[1]
        acc = pred.eq(batch.node_label).sum().item()
        total = batch.node_label_index.shape[0]
        acc /= total
    return acc

In [60]:
args = {
    "device" : 'cuda' if torch.cuda.is_available() else 'cpu',
    "hidden_size" : 128,
    "epochs" : 250,
    "lr" : 0.01,
    "num_layers": 3,
    "model": "GraphSage" # [GraphSage, GAT, GCN]
}

H = Graph(G)
dataset = GraphDataset(graphs=[H], task='node')

dataset_train, dataset_val, dataset_test = dataset.split(transductive=True, split_ratio=[0.8, 0.1, 0.1])

train_loader = DataLoader(dataset_train, collate_fn=Batch.collate(), batch_size=10)
val_loader = DataLoader(dataset_val, collate_fn=Batch.collate(), batch_size=10)
test_loader = DataLoader(dataset_test, collate_fn=Batch.collate(), batch_size=10)

num_node_features = dataset_train.num_node_features
num_classes = dataset_train.num_node_labels
print(f"There are {num_node_features} features and {num_classes} labels.")

train(train_loader, val_loader, test_loader, args, num_node_features, num_classes, args["device"])

There are 8 features and 3 labels.
GNN(
  (convs): ModuleList(
    (0): SAGEConv(8, 128)
    (1): SAGEConv(128, 128)
    (2): SAGEConv(128, 128)
  )
  (post_mp): Linear(in_features=128, out_features=3, bias=True)
)
Epoch 1: Train: 0.338, Validation: 0.3281. Test: 0.3397, Loss: 11.2614
Epoch 2: Train: 0.341, Validation: 0.3418. Test: 0.331, Loss: 6.5609
Epoch 3: Train: 0.3272, Validation: 0.3328. Test: 0.3373, Loss: 15.0303
Epoch 4: Train: 0.3246, Validation: 0.3281. Test: 0.337, Loss: 18.0754
Epoch 5: Train: 0.3629, Validation: 0.3585. Test: 0.3664, Loss: 9.4994
Epoch 6: Train: 0.3469, Validation: 0.3351. Test: 0.3427, Loss: 3.4237
Epoch 7: Train: 0.3413, Validation: 0.3415. Test: 0.331, Loss: 4.662
Epoch 8: Train: 0.3413, Validation: 0.3431. Test: 0.3293, Loss: 5.8553
Epoch 9: Train: 0.364, Validation: 0.3585. Test: 0.356, Loss: 5.3081
Epoch 10: Train: 0.361, Validation: 0.3581. Test: 0.361, Loss: 4.4751
Epoch 11: Train: 0.3719, Validation: 0.3702. Test: 0.3737, Loss: 4.1595
Epoch 12: