In [2]:
import os
import urllib.request

import dgl
import dgl.function as fn
import numpy as np
import pygraphviz as pgv
import scipy.io
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import Image

### Citation graph

References:
https://arxiv.org/pdf/1511.04854.pdf

Given: the citation graph below

In [2]:
# Show the example ACM citation-graph picture referenced by its remote URL.
Image(url='https://data.dgl.ai/tutorial/hetero/acm-example.png')

Let's try to visualize the citation graph above as a DGL heterogeneous graph.

In [3]:
# Edge lists of the toy graph as (source id, destination id) pairs.
# Writing edges go author -> paper, citing edges go paper -> paper and
# publishing edges go venue -> paper (see the type names passed below).
writing_edges = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 1), (3, 2)]
citing_edges = [(2, 0)]
publishing_edges = [(0,0), (1,1), (1,2)]

# One single-relation graph per edge type ...
writting_g = dgl.bipartite(writing_edges, 'author', 'writing', 'paper')
citting_g = dgl.graph(citing_edges, 'paper', 'citing')
publishing_g = dgl.bipartite(publishing_edges, 'venue', 'publishing', 'paper')
# ... merged into one heterogeneous graph.
citatation_graph = dgl.hetero_from_relations([writting_g, citting_g, publishing_g])

# Last expression of the cell: display the graph summary.
citatation_graph

Graph(num_nodes={'author': 4, 'paper': 3, 'venue': 2},
      num_edges={('author', 'writing', 'paper'): 8, ('paper', 'citing', 'paper'): 1, ('venue', 'publishing', 'paper'): 3},
      metagraph=[('author', 'paper'), ('paper', 'paper'), ('venue', 'paper')])

In [4]:
def plot_heterograph(nxg, filepath):
    """Draw a multigraph with Graphviz and write the picture to ``filepath``.

    Parameters
    ----------
    nxg
        Object whose ``edges(keys=True)`` yields ``(u, v, key)`` triples.
        Each triple becomes one directed edge labelled with ``key``.
    filepath : str
        Destination image path. Missing parent directories are created so the
        call also works on a fresh checkout.
    """
    # Create the output directory first so that draw() does not fail when the
    # directory has not been created yet.
    out_dir = os.path.dirname(filepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # strict=False keeps parallel edges and self-loops.
    ag = pgv.AGraph(strict=False, directed=True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)

    ag.layout('dot')
    ag.draw(filepath)

# Render the toy graph's metagraph to an image file.
plot_heterograph(citatation_graph.metagraph, 'img/citation_graph.png')

In [3]:
# Display the metagraph picture written to img/citation_graph.png.
Image(url='img/citation_graph.png')

In [6]:
# Draw the toy graph instance by instance: one Graphviz edge per relation edge.
ag1 = pgv.AGraph(strict=False, directed=True)

# (edge list, source-node prefix, destination-node prefix, edge label)
edge_specs = (
    (writing_edges, 'a', 'p', 'writing'),
    (citing_edges, 'p', 'p', 'citing'),
    (publishing_edges, 'v', 'p', 'publishing'),
)
for edge_list, src_prefix, dst_prefix, relation in edge_specs:
    for src_id, dst_id in edge_list:
        # Node names are the type prefix followed by the numeric id, e.g. 'a0'.
        ag1.add_edge(src_prefix + str(src_id), dst_prefix + str(dst_id), label=relation)

ag1.layout('dot')
ag1.draw('img/citation_heteroneusgraph.png')

In [4]:
# Display the instance-level picture written to img/citation_heteroneusgraph.png.
Image(url='img/citation_heteroneusgraph.png')

To create a more realistic heterograph, we use the ACM dataset, which contains information about paper citations.

In [8]:
data_url = 'https://data.dgl.ai/dataset/ACM.mat'
data_file_path = './data/ACM.mat'

# The target directory may not exist on a fresh checkout; urlretrieve would
# then fail when opening the destination file.
os.makedirs(os.path.dirname(data_file_path), exist_ok=True)

# Download only once so that "Restart & Run All" does not fetch the file again.
# The download goes to a temporary name first, so an interrupted transfer is
# never mistaken for a complete cached file.
if not os.path.exists(data_file_path):
    partial_path = data_file_path + '.part'
    urllib.request.urlretrieve(data_url, partial_path)
    os.replace(partial_path, data_file_path)

data = scipy.io.loadmat(data_file_path)
# List the variables stored in the .mat file.
print(list(data.keys()))

['__header__', '__version__', '__globals__', 'TvsP', 'PvsA', 'PvsV', 'AvsF', 'VvsC', 'PvsL', 'PvsC', 'A', 'C', 'F', 'L', 'P', 'T', 'V', 'PvsT', 'CNormPvsA', 'RNormPvsA', 'CNormPvsC', 'RNormPvsC', 'CNormPvsT', 'RNormPvsT', 'CNormPvsV', 'RNormPvsV', 'CNormVvsC', 'RNormVvsC', 'CNormAvsF', 'RNormAvsF', 'CNormPvsL', 'RNormPvsL', 'stopwords', 'nPvsT', 'nT', 'CNormnPvsT', 'RNormnPvsT', 'nnPvsT', 'nnT', 'CNormnnPvsT', 'RNormnnPvsT', 'PvsP', 'CNormPvsP', 'RNormPvsP']


In [9]:
# First dimension of the PvsA matrix, reported here as the paper count.
print('Num of papers:')
data['PvsA'].shape[0]

Num of papers:


12499

In [10]:
# Second dimension of the PvsA matrix, reported here as the author count.
print('Num of authors:')
data['PvsA'].shape[1]

Num of authors:


17431

In [11]:
# Number of stored (non-zero) entries of PvsA, i.e. paper-author links.
print('Links beetwen authors and papers:')
data['PvsA'].nnz

Links beetwen authors and papers:


37055

First, we analyze the paper-author relationship.

In [12]:
# Build a single-relation bipartite graph (paper -[written-by]-> author)
# directly from the sparse PvsA matrix, then display its summary.
paper_vs_author_g = dgl.bipartite(data['PvsA'], 'paper', 'written-by', 'author')
paper_vs_author_g

Graph(num_nodes={'paper': 12499, 'author': 17431},
      num_edges={('paper', 'written-by', 'author'): 37055},
      metagraph=[('paper', 'author')])

In [13]:
# Sparse matrices taken from the loaded .mat file.
paper_author = data['PvsA']
paper_paper = data['PvsP']
paper_subject = data['PvsL']

# Each relation is registered together with its transpose, so every
# connection can be traversed in both directions.
acm_relations = {
    ('paper', 'written-by', 'author'): paper_author,
    ('author', 'writing', 'paper'): paper_author.transpose(),
    ('paper', 'citing', 'paper'): paper_paper,
    ('paper', 'cited', 'paper'): paper_paper.transpose(),
    ('paper', 'is-about', 'subject'): paper_subject,
    ('subject', 'has', 'paper'): paper_subject.transpose(),
}
G = dgl.heterograph(acm_relations)


In [14]:
# Render the metagraph of the ACM heterograph to an image file.
plot_heterograph(G.metagraph, 'img/acm_graph.png')

In [5]:
# Display the ACM metagraph picture written to img/acm_graph.png.
Image(url='img/acm_graph.png')

### Node classification and regression to predict the class of each node or estimate a value associated with it

Goal: predict the publishing conference of a paper.

Description:
The dataset contains 14 different conferences. To make the classification semi-supervised, we label only the papers from the last four of them.

References:
https://docs.dgl.ai/tutorials/basics/5_hetero.html


In [16]:
# Paper-by-conference matrix converted to CSR so its columns can be sliced.
pvc = data['PvsC'].tocsr()
# Column ids of the four conferences whose papers are used for classification.
selected_conferences = list(range(10, 14))
# COO view of just those columns; its `.row` array is used later as the
# list of selected paper ids.
selected_papers = pvc[:, selected_conferences].tocoo()

Relabel the last four conferences (ids 10-13) as classes 0-3.

In [17]:
# `pvc.indices` holds the column (conference) id of every stored entry of the
# CSR matrix. Work on a copy: relabelling the array in place would silently
# rewrite the column structure of `pvc` itself.
# NOTE(review): indexing this array by paper id assumes every paper has exactly
# one conference entry - confirm against the dataset.
conference_ids = pvc.indices.copy()
label_array = conference_ids.copy()

# Map the selected conferences to consecutive class ids 0..3. The comparison
# uses the untouched `conference_ids`, so a freshly written class id can never
# be remapped a second time.
for class_id, conference_id in enumerate(selected_conferences):
    label_array[conference_ids == conference_id] = class_id

labels = torch.tensor(label_array).long()

Split the dataset into train, validation and test subsets.

In [18]:
# Row indices of the selected-conference matrix, i.e. the ids of the papers
# that take part in the classification task.
pid = selected_papers.row

# Seeded generator so the split (and every number reported below) is the same
# after "Restart & Run All".
split_rng = np.random.RandomState(42)
shuffle = split_rng.permutation(pid)

# The fixed slice boundaries below need more than 900 papers; otherwise the
# test subset would be empty and its accuracy undefined.
assert len(shuffle) > 900, 'need more than 900 selected papers for an 800/100/rest split'

train = torch.tensor(shuffle[0:800]).long()
val = torch.tensor(shuffle[800:900]).long()
test = torch.tensor(shuffle[900:]).long()

Learning the representation of nodes in the graph using Relational-GCN requires two steps:

1. Message computation and aggregation within each relation
2. Reduction that merges the results from multiple relationships.

In `HeteroRGCNLayer`, we keep a separate weight for each relation. We also have to define the forward function, which performs the following steps:

- Compute W_r (relation weight) * h,
- Save that value,
- Specify per-relation message passing functions,
- Trigger message passing of multiple types,
- Return the updated node feature dictionary.

In [19]:
class HeteroRGCNLayer(nn.Module):
    """Relational graph-convolution layer with one linear map per edge type.

    Parameters
    ----------
    in_size : int
        Size of the incoming node features.
    out_size : int
        Size of the features produced by the layer.
    etypes : iterable of str
        Edge-type names; each one gets its own ``nn.Linear(in_size, out_size)``.
    """

    def __init__(self, in_size, out_size, etypes):
        super().__init__()
        # One independent weight W_r per relation name.
        per_relation = {}
        for relation in etypes:
            per_relation[relation] = nn.Linear(in_size, out_size)
        self.weight = nn.ModuleDict(per_relation)

    def forward(self, G, feat_dict):
        """Run one round of message passing over every relation of ``G``.

        ``feat_dict`` maps a node-type name to that type's feature tensor.
        Returns a dict mapping every node type of ``G`` to the ``'h'`` field
        found on its nodes after the update.
        """
        relation_funcs = {}
        for src_type, relation, _dst_type in G.canonical_etypes:
            field = 'Wh_%s' % relation
            # Project the source features with this relation's weight and
            # store the result on the source nodes for message passing.
            G.nodes[src_type].data[field] = self.weight[relation](feat_dict[src_type])
            # Copy the projected features along the edges as message 'm' and
            # average them into 'h'.
            relation_funcs[relation] = (fn.copy_u(field, 'm'), fn.mean('m', 'h'))

        # Second argument: how results from different relations are combined.
        G.multi_update_all(relation_funcs, 'mean')

        updated = {}
        for ntype in G.ntypes:
            updated[ntype] = G.nodes[ntype].data['h']
        return updated

Then, we create a simple GNN by stacking two layers.

In [20]:
class HeteroRGCN(nn.Module):
    """Two stacked ``HeteroRGCNLayer`` modules on top of learnable node embeddings.

    Parameters
    ----------
    G
        Graph providing ``ntypes``, ``etypes`` and ``number_of_nodes(ntype)``.
    in_size : int
        Width of the learnable embedding created for every node.
    hidden_size : int
        Output width of the first layer.
    out_size : int
        Output width of the second layer.
    """

    def __init__(self, G, in_size, hidden_size, out_size):
        super().__init__()

        # The nodes carry no input features here, so each node type gets a
        # trainable, Xavier-initialised embedding table.
        node_embeddings = {}
        for ntype in G.ntypes:
            table = nn.Parameter(torch.empty(G.number_of_nodes(ntype), in_size))
            nn.init.xavier_uniform_(table)
            node_embeddings[ntype] = table
        self.embed = nn.ParameterDict(node_embeddings)

        self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
        self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)

    def forward(self, G):
        """Return the second-layer output for the ``'paper'`` node type."""
        hidden = self.layer1(G, self.embed)

        # Non-linearity between the two layers, applied per node type.
        activated = {}
        for ntype, feats in hidden.items():
            activated[ntype] = F.leaky_relu(feats)

        return self.layer2(G, activated)['paper']

Finally, train the model and evaluate the results.

In [21]:
# Seed PyTorch so the weight initialisation, and therefore the training log,
# is repeatable across kernel restarts.
torch.manual_seed(0)

# Embedding size 10, hidden size 10, 4 output classes.
model = HeteroRGCN(G, 10, 10, 4)

opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Kept as plain Python floats: the previous int/tensor mix made the
# `.item()` call in the log line fail whenever the validation accuracy
# never rose above the initial 0.
best_val_acc = 0.0
best_test_acc = 0.0

for epoch in range(100):
    # Full-graph forward pass; the loss uses the training papers only.
    logits = model(G)
    loss = F.cross_entropy(logits[train], labels[train])

    # Accuracies are reporting-only, so they are computed without autograd
    # and converted to Python floats straight away.
    with torch.no_grad():
        pred = logits.argmax(1)
        train_acc = (pred[train] == labels[train]).float().mean().item()
        val_acc = (pred[val] == labels[val]).float().mean().item()
        test_acc = (pred[test] == labels[test]).float().mean().item()

    # Model selection on validation accuracy: remember the test accuracy
    # observed at the best validation epoch.
    if best_val_acc < val_acc:
        best_val_acc = val_acc
        best_test_acc = test_acc

    opt.zero_grad()
    loss.backward()
    opt.step()

    if epoch % 5 == 0:
        print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
            loss.item(),
            train_acc,
            val_acc,
            best_val_acc,
            test_acc,
            best_test_acc,
        ))

Loss 1.3580, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 1.3194, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 1.2778, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 1.2256, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 1.1562, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 1.0697, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 0.9777, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 0.8930, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 0.8093, Train Acc 0.5400, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5435 (Best 0.5435)
Loss 0.7269, Train Acc 0.5725, Val Acc 0.5200 (Best 0.5200), Test Acc 0.5493 (Best 0.5435)
Loss 0.6438, Train Acc 0.6812, Val Acc 0.6500 (Best 0.6500), Test Acc 0.6519 (Best 0.6519)

We obtained a test accuracy of around 88.14%, which is quite good for such a small dataset.