In [1]:
pip install dgl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl
  Downloading dgl-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 4.9 MB/s 
[?25hCollecting psutil>=5.8.0
  Downloading psutil-5.9.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 29.8 MB/s 
Installing collected packages: psutil, dgl
  Attempting uninstall: psutil
    Found existing installation: psutil 5.4.8
    Uninstalling psutil-5.4.8:
      Successfully uninstalled psutil-5.4.8
Successfully installed dgl-0.9.0 psutil-5.9.2


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl

np.random.seed(1)


DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [3]:
def leaky_relu(z, negative_slope=0.01):
    """Element-wise leaky ReLU.

    Args:
        z: scalar or array-like of numbers.
        negative_slope: factor applied to the non-positive entries. The
            default (0.01) is the value that was previously hard-coded.

    Returns:
        A NumPy array with the same shape as ``z`` in which positive entries
        are kept and the remaining ones are multiplied by ``negative_slope``.
    """
    # asarray lets plain Python lists/tuples be compared with 0 element-wise.
    z = np.asarray(z)
    return np.where(z > 0, z, z * negative_slope)

def softmax(z):
    """Numerically stable softmax along the first axis.

    A 1-D input is normalised as a single vector; for a matrix every column
    is normalised independently (the reduction runs over axis 0).

    Args:
        z: array-like of numbers with at least one dimension.

    Returns:
        A NumPy array with the same shape as ``z`` whose entries along
        axis 0 are positive and sum to 1.
    """
    z = np.asarray(z)
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    a = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

    assert a.shape == z.shape

    return a


### Graph and Weight Matrix Generation

In [4]:
# Node features: the rows of a 5x5 identity matrix, shuffled in place.
print('\n\n----- One-hot vector representation of nodes. Shape(n,n)\n')
X = np.eye(5)
np.random.shuffle(X)
n = len(X)
print(X)

# Size of the embedding produced by the linear transformation.
print('\n\n----- Embedding dimension\n')
emb = 3
print(emb)

# Weights drawn uniformly from [-sqrt(1/emb), sqrt(1/emb)).
print('\n\n----- Weight Matrix. Shape(emb, n)\n')
w_bound = np.sqrt(1. / emb)
W = np.random.uniform(low=-w_bound, high=w_bound, size=(emb, n))
print(W)

# Random 0/1 matrix with self-loops, symmetrised so that the graph is
# undirected; the sum with the transpose is capped at 1 to stay binary.
print('\n\n----- Adjacency Matrix (undirected graph). Shape(n,n)\n')
A = np.random.randint(2, size=(n, n))
np.fill_diagonal(A, 1)
A = np.minimum(A + A.T, 1)
print(A)



----- One-hot vector representation of nodes. Shape(n,n)

[[0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


----- Embedding dimension

3


----- Weight Matrix. Shape(emb, n)

[[-0.4294049   0.57624235 -0.3047382  -0.11941829 -0.12942953]
 [ 0.19600584  0.5029172   0.3998854  -0.21561317  0.02834577]
 [-0.06529497 -0.31225734  0.03973776  0.47800217 -0.04941563]]


----- Adjacency Matrix (undirected graph). Shape(n,n)

[[1 1 1 0 1]
 [1 1 1 1 1]
 [1 1 1 1 0]
 [0 1 1 1 1]
 [1 1 0 1 1]]


### Linear Transformation

In [5]:
# equation (1): project every node feature row with the weight matrix,
# one embedding row per node.
print('\n\n----- Linear Transformation. Shape(n, emb)\n')
z1 = np.matmul(X, W.T)
print(z1)



----- Linear Transformation. Shape(n, emb)

[[-0.3047382   0.3998854   0.03973776]
 [ 0.57624235  0.5029172  -0.31225734]
 [-0.12942953  0.02834577 -0.04941563]
 [-0.4294049   0.19600584 -0.06529497]
 [-0.11941829 -0.21561317  0.47800217]]


### Transformer: Additive Attention Mechanism

In [6]:
# equation (2)
print('\n\n----- Concat hidden features to represent edges. Shape(len(emb.concat(emb)), number of edges)\n')
# np.where lists the non-zero cells of A row by row, so edge k runs from
# node edge_coords[0][k] (source) to node edge_coords[1][k] (destination).
edge_coords = np.where(A==1)
h_src_nodes = z1[edge_coords[0]]
h_dst_nodes = z1[edge_coords[1]]
# One row per edge holding [z_src || z_dst], i.e. (number of edges, 2 * emb).
z2 = np.concatenate((h_src_nodes, h_dst_nodes), axis=1)

# Concatenation tests
assert len(edge_coords[1]) == z2.shape[0], "The number of edges in A is not equal to the number of concat edges"
# The expected row is rebuilt from z1 instead of being hard-coded: comparing
# the return values of two list.sort() calls (both None) can never fail, and
# literal numbers go stale as soon as the random inputs change.
edge_idx = 4
expected_row = np.concatenate((z1[edge_coords[0][edge_idx]], z1[edge_coords[1][edge_idx]]))
assert np.array_equal(z2[edge_idx], expected_row), "Something went wrong in the concat process"
print(z2)

print('\n\n----- Attention coefficients. Shape(1, len(emb.concat(emb)))\n')
att = np.random.rand(1, z2.shape[1])
print(att)

print('\n\n----- Edge representations combined with the attention coefficients. Shape(1, number of edges)\n')
# Dot product of every edge row with the attention vector: one raw score per
# edge, stored as a column of shape (number of edges, 1).
z2_att = z2.dot(att.T)
print(z2_att)

print('\n\n----- Leaky Relu. Shape(1, number of edges)')
e = leaky_relu(z2_att)
print(e)



----- Concat hidden features to represent edges. Shape(len(emb.concat(emb)), number of edges)

[[-0.3047382   0.3998854   0.03973776 -0.3047382   0.3998854   0.03973776]
 [-0.3047382   0.3998854   0.03973776  0.57624235  0.5029172  -0.31225734]
 [-0.3047382   0.3998854   0.03973776 -0.12942953  0.02834577 -0.04941563]
 [-0.3047382   0.3998854   0.03973776 -0.11941829 -0.21561317  0.47800217]
 [ 0.57624235  0.5029172  -0.31225734 -0.3047382   0.3998854   0.03973776]
 [ 0.57624235  0.5029172  -0.31225734  0.57624235  0.5029172  -0.31225734]
 [ 0.57624235  0.5029172  -0.31225734 -0.12942953  0.02834577 -0.04941563]
 [ 0.57624235  0.5029172  -0.31225734 -0.4294049   0.19600584 -0.06529497]
 [ 0.57624235  0.5029172  -0.31225734 -0.11941829 -0.21561317  0.47800217]
 [-0.12942953  0.02834577 -0.04941563 -0.3047382   0.3998854   0.03973776]
 [-0.12942953  0.02834577 -0.04941563  0.57624235  0.5029172  -0.31225734]
 [-0.12942953  0.02834577 -0.04941563 -0.12942953  0.02834577 -0.04941563]
 [-

### Normalize the Attention Scores

In [7]:
# equation (3)
print('\n\n----- Edge scores as matrix. Shape(n,n)\n')
# e_matr[src, dst] holds the raw score of the edge src -> dst.
e_matr = np.zeros(A.shape)
e_matr[edge_coords[0], edge_coords[1]] = e.reshape(-1,)
print(e_matr)

print('\n\n----- For each node, normalize the edge (or neighbor) contributions using softmax\n')
# Incoming neighbours are selected through the adjacency matrix rather than
# through "score != 0": an edge whose score happens to be exactly 0 must still
# take part in the softmax. Looping over the columns also removes the
# hard-coded assumption of exactly five nodes.
in_neighbors = [np.flatnonzero(A[:, dst] == 1) for dst in range(A.shape[0])]
alpha_per_node = [softmax(e_matr[src, dst]) for dst, src in enumerate(in_neighbors)]
alpha = np.concatenate(alpha_per_node)
print(alpha)

print('\n\n----- Normalized edge score matrix. Shape(n,n)\n')
# Row `dst` stores the weights of the edges entering `dst`, so that
# A_scaled.dot(z) aggregates, for every node, its incoming neighbours.
# Writing per destination makes this explicit instead of relying on A being
# symmetric; for the symmetric A used here the matrix is the same as before.
A_scaled = np.zeros(A.shape)
for dst, (src, weights) in enumerate(zip(in_neighbors, alpha_per_node)):
    A_scaled[dst, src] = weights
assert np.allclose(A_scaled.sum(axis=1), 1.0), "Attention weights of a node must sum to 1"
print(A_scaled)



----- Edge scores as matrix. Shape(n,n)

[[ 3.03222751e-01  7.33156386e-01  1.11502195e-01  0.00000000e+00
   1.14458791e-01]
 [ 9.60794571e-02  5.26013092e-01 -9.56410988e-04 -1.44587571e-03
  -9.26845030e-04]
 [ 7.86065337e-02  5.08540169e-01 -1.13114022e-03 -1.62060495e-03
   0.00000000e+00]
 [ 0.00000000e+00  5.34430817e-01 -8.72233739e-04 -1.36169846e-03
  -8.42667781e-04]
 [ 4.82066128e-01  9.11999763e-01  0.00000000e+00  2.41399100e-01
   2.93302168e-01]]


----- For each node, normalize the edge (or neighbor) contributions using softmax

[0.26263543 0.21349717 0.20979916 0.31406823 0.21610715 0.17567419
 0.1726313  0.1771592  0.25842816 0.27167844 0.24278118 0.24273876
 0.24280162 0.23393014 0.23388927 0.23394984 0.29823075 0.25138555
 0.22399017 0.22400903 0.30061525]


----- Normalized edge score matrix. Shape(n,n)

[[0.26263543 0.21349717 0.20979916 0.         0.31406823]
 [0.21610715 0.17567419 0.1726313  0.1771592  0.25842816]
 [0.27167844 0.24278118 0.24273876 0.2428016

### Neighborhood Diffusion (GCN) Scaled by the Attention Scores (GAT)

In [8]:
# equation (4): every node becomes the attention-weighted sum of the
# embeddings selected by its row of A_scaled.
print('\n\nNeighborhood aggregation (GCN) scaled with attention scores (GAT). Shape(n, emb)\n')
ND_GAT = np.matmul(A_scaled, z1)
print(ND_GAT)



Neighborhood aggregation (GCN) scaled with attention scores (GAT). Shape(n, emb)

[[-0.02166863  0.15062515  0.08352843]
 [-0.09390287  0.15866476  0.05716299]
 [-0.07856777  0.28521023 -0.09286313]
 [-0.03154513  0.10583032  0.04267501]
 [-0.07962369  0.19226439  0.069115  ]]


## GAT Layer - DGL Test
Original layer implementation: https://docs.dgl.ai/en/0.4.x/tutorials/models/1_gnn/9_gat.html  

In [9]:
class GATTestLayer(nn.Module):
    """Single-head GAT layer whose weights are copied from the NumPy example.

    The layer is only meant to cross-check the NumPy computation: instead of
    a random initialisation it loads the module-level arrays ``W`` (linear
    transformation) and ``att`` (attention vector).

    Args:
        g: DGL graph the layer operates on.
        in_dim: size of the input node features.
        out_dim: size of the node embedding produced by the layer.
    """

    def __init__(self, g, in_dim, out_dim):
        super(GATTestLayer, self).__init__()
        self.g = g
        # equation (1)
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialization modified for testing: load ``W`` and ``att``."""
        # copy_ writes into the existing parameters (casting the float64
        # NumPy values to the parameter dtype); no_grad keeps the overwrite
        # out of the autograd graph.
        with torch.no_grad():
            self.fc.weight.copy_(torch.from_numpy(W))
            self.attn_fc.weight.copy_(torch.from_numpy(att))

    def edge_attention(self, edges):
        """Edge UDF for equation (2): raw attention score of every edge."""
        # edge UDF for equation (2)
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e': F.leaky_relu(a)}

    def message_func(self, edges):
        """Message UDF: send the source embedding and the edge score."""
        # message UDF for equation (3) & (4)
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        """Reduce UDF: softmax the received scores and sum the messages."""
        # reduce UDF for equation (3) & (4)
        # equation (3)
        alpha = F.softmax(nodes.mailbox['e'], dim=1)
        # equation (4)
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
        return {'h': h}

    def forward(self, h):
        """Run the layer on the node features ``h`` and return the new ones."""
        # equation (1)
        z = self.fc(h)
        self.g.ndata['z'] = z
        # equation (2)
        self.g.apply_edges(self.edge_attention)
        # equation (3) & (4)
        self.g.update_all(self.message_func, self.reduce_func)
        # pop() removes the result from the graph so it is not left behind
        # as node data after the call.
        return self.g.ndata.pop('h')

In [10]:
print('\n\n----- Create a new DGL graph using the NumPy graph\n')
# Same edge list as the NumPy walkthrough: rows of A are the sources and
# columns are the destinations.
src_ids = torch.tensor(edge_coords[0])
dst_ids = torch.tensor(edge_coords[1])
g = dgl.graph((src_ids, dst_ids))
print(g)

print('\n\n----- Create a DGL instance of the GAT test layer\n')
# The output size is taken from `emb` so that it cannot drift from the
# shape of W, which the test layer copies into its weights.
net = GATTestLayer(g,
          in_dim=n,
          out_dim=emb)
# Call the module itself rather than .forward() so that hooks are honoured.
gat_out = net(torch.Tensor(X))
print(gat_out)

print('\n\n----- Recap of the NumPy GAT layer')
print(np.round(ND_GAT, decimals=4))

# Make the comparison explicit instead of relying on reading the two
# printouts; the tolerance covers float32 (torch) vs float64 (NumPy).
np.testing.assert_allclose(gat_out.detach().numpy(), ND_GAT, rtol=1e-4, atol=1e-6)





----- Create a new DGL graph using the NumPy graph

Graph(num_nodes=5, num_edges=21,
      ndata_schemes={}
      edata_schemes={})


----- Create a DGL instance of the GAT test layer

tensor([[-0.0217,  0.1506,  0.0835],
        [-0.0939,  0.1587,  0.0572],
        [-0.0786,  0.2852, -0.0929],
        [-0.0315,  0.1058,  0.0427],
        [-0.0796,  0.1923,  0.0691]], grad_fn=<IndexCopyBackward0>)


----- Recap of the NumPy GAT layer
[[-0.0217  0.1506  0.0835]
 [-0.0939  0.1587  0.0572]
 [-0.0786  0.2852 -0.0929]
 [-0.0315  0.1058  0.0427]
 [-0.0796  0.1923  0.0691]]


The resulting matrices from the NumPy implementation and the DGL implementation are equal \o/.

## Multi Head GAT Layer Implementation with NumPy
Multi-head attention is obtained by running several independent GAT layers (one per head) on the same input and then merging their outputs.

In [11]:
print('\n\n----- Recap on the output of the GAT layer')
# Both heads reuse the single-head result, so the two "layers" are identical.
# Each one has a row per node and a column per embedding dimension.
print('\nLayer 1. Shape(emb,n)')
layer1 = ND_GAT
print(layer1)

print('\nLayer 2. Shape(emb,n)')
layer2 = ND_GAT
print(layer2)

print('\n\n----- Concatenate multiple attentions. Shape(num_layers*emb, n)\n')
# The head outputs are placed side by side along the feature axis.
concat = np.concatenate((layer1, layer2), axis=1)
print(concat)

print('\n\n----- Average multiple attentions.\n')
# Mean over every entry of every head. np.mean derives the divisor from the
# data, so the value no longer depends on a hand-computed element count.
# NOTE(review): this is a single scalar; a per-node average over the heads
# would be np.mean((layer1, layer2), axis=0).
average = np.mean((layer1, layer2))
print(average)



----- Recap on the output of the GAT layer

Layer 1. Shape(emb,n)
[[-0.02166863  0.15062515  0.08352843]
 [-0.09390287  0.15866476  0.05716299]
 [-0.07856777  0.28521023 -0.09286313]
 [-0.03154513  0.10583032  0.04267501]
 [-0.07962369  0.19226439  0.069115  ]]

Layer 2. Shape(emb,n)
[[-0.02166863  0.15062515  0.08352843]
 [-0.09390287  0.15866476  0.05716299]
 [-0.07856777  0.28521023 -0.09286313]
 [-0.03154513  0.10583032  0.04267501]
 [-0.07962369  0.19226439  0.069115  ]]


----- Concatenate multiple attentions. Shape(num_layers*emb, n)

[[-0.02166863  0.15062515  0.08352843 -0.02166863  0.15062515  0.08352843]
 [-0.09390287  0.15866476  0.05716299 -0.09390287  0.15866476  0.05716299]
 [-0.07856777  0.28521023 -0.09286313 -0.07856777  0.28521023 -0.09286313]
 [-0.03154513  0.10583032  0.04267501 -0.03154513  0.10583032  0.04267501]
 [-0.07962369  0.19226439  0.069115   -0.07962369  0.19226439  0.069115  ]]


----- Average multiple attentions.

0.04979367027023359


## Multi Head GAT Layer - DGL Test
Original layer implementation: https://docs.dgl.ai/en/0.4.x/tutorials/models/1_gnn/9_gat.html  

In [12]:
class MultiHeadGATTestLayer(nn.Module):
    """Multi-head wrapper around ``GATTestLayer`` used to check the NumPy code.

    Args:
        g: DGL graph shared by all heads.
        in_dim: size of the input node features.
        out_dim: size of the embedding produced by each head.
        num_heads: number of attention heads.
        merge: ``'cat'`` concatenates the head outputs along the feature
            axis; any other value averages them over the heads.
    """

    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATTestLayer, self).__init__()
        # Use the test layer for consistency with the NumPy implementation
        self.heads = nn.ModuleList(
            [GATTestLayer(g, in_dim, out_dim) for _ in range(num_heads)])
        self.merge = merge

    def forward(self, h):
        """Apply every head to ``h`` and merge the results."""
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        # merge using average: reduce only over the stacked head axis so
        # that every node keeps its own embedding. Without `dim`, torch.mean
        # collapses all heads and nodes into one scalar.
        return torch.mean(torch.stack(head_outs), dim=0)

# Shared input for both networks (torch.Tensor converts X to float32).
node_features = torch.Tensor(X)

print('\n\n----- Multi head GAT layer (concat operation). Shape(num_layers*emb, n)\n')
concat_net = MultiHeadGATTestLayer(g, in_dim=n, out_dim=emb, num_heads=2)
print(concat_net)
print('\n----- DGL concat output\n')
concat_out = concat_net(node_features)
print(concat_out)

print('\n----- Recap of the NumPy concatenation\n')
print(np.round(concat, decimals=4))
np.testing.assert_allclose(concat_out.detach().numpy(), concat, rtol=1e-4, atol=1e-6)

print('\n\n----- Multi head GAT Layer (average operation). Shape(emb, n)\n')
mean_net = MultiHeadGATTestLayer(g, in_dim=n, out_dim=emb, num_heads=2, merge='mean')
print(mean_net)
print('\n----- DGL average output\n')
mean_out = mean_net(node_features)
print(mean_out)

print('\n----- Recap of the NumPy average\n')
# `average` from the NumPy section is one scalar over all entries; the
# head-wise mean is recomputed here so that the recap has the same
# per-node shape as the DGL output it is compared with.
numpy_head_mean = np.mean((layer1, layer2), axis=0)
print(np.round(numpy_head_mean, decimals=4))
np.testing.assert_allclose(mean_out.detach().numpy(), numpy_head_mean, rtol=1e-4, atol=1e-6)



----- Multi head GAT layer (concat operation). Shape(num_layers*emb, n)

MultiHeadGATTestLayer(
  (heads): ModuleList(
    (0): GATTestLayer(
      (fc): Linear(in_features=5, out_features=3, bias=False)
      (attn_fc): Linear(in_features=6, out_features=1, bias=False)
    )
    (1): GATTestLayer(
      (fc): Linear(in_features=5, out_features=3, bias=False)
      (attn_fc): Linear(in_features=6, out_features=1, bias=False)
    )
  )
)

----- DGL concat output

tensor([[-0.0217,  0.1506,  0.0835, -0.0217,  0.1506,  0.0835],
        [-0.0939,  0.1587,  0.0572, -0.0939,  0.1587,  0.0572],
        [-0.0786,  0.2852, -0.0929, -0.0786,  0.2852, -0.0929],
        [-0.0315,  0.1058,  0.0427, -0.0315,  0.1058,  0.0427],
        [-0.0796,  0.1923,  0.0691, -0.0796,  0.1923,  0.0691]],
       grad_fn=<CatBackward0>)

----- Recap of the NumPy concatenation

[[-0.0217  0.1506  0.0835 -0.0217  0.1506  0.0835]
 [-0.0939  0.1587  0.0572 -0.0939  0.1587  0.0572]
 [-0.0786  0.2852 -0.0929 -0.0786  0

The resulting matrices from the NumPy implementation and the DGL implementation are equal \o/.

# From Theory to Practice
Now that we understand the math and have implemented the GAT building blocks, we can run some experiments like those reported in the original paper. Let's recap the DGL modules, this time using a fair (random) parameter initialization. The following implementation is based on the example available here: https://docs.dgl.ai/en/0.4.x/tutorials/models/1_gnn/9_gat.html.

## New Imports

In [13]:
import time
from dgl import DGLGraph
from dgl.data import citation_graph as citegrh
import networkx as nx

## GAT Implementation with DGL

In [14]:
class GATLayer(nn.Module):
    """Single-head graph attention layer with Xavier-initialised weights.

    Args:
        g: graph the layer runs on; it is stored and reused by ``forward``.
        in_dim: size of the input node features.
        out_dim: size of the node embedding produced by the layer.
    """

    def __init__(self, g, in_dim, out_dim):
        super().__init__()
        self.g = g
        # equation (1): bias-free projection of the node features
        self.fc = nn.Linear(in_features=in_dim, out_features=out_dim, bias=False)
        # equation (2): scores a concatenated (source, destination) pair
        self.attn_fc = nn.Linear(in_features=2 * out_dim, out_features=1, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw both weight matrices again (Xavier normal, ReLU gain)."""
        relu_gain = nn.init.calculate_gain('relu')
        for linear in (self.fc, self.attn_fc):
            nn.init.xavier_normal_(linear.weight, gain=relu_gain)

    def edge_attention(self, edges):
        """Edge UDF for equation (2): one raw attention score per edge."""
        endpoints = torch.cat((edges.src['z'], edges.dst['z']), dim=1)
        return {'e': F.leaky_relu(self.attn_fc(endpoints))}

    def message_func(self, edges):
        """Message UDF for equations (3) & (4): forward embedding and score."""
        return dict(z=edges.src['z'], e=edges.data['e'])

    def reduce_func(self, nodes):
        """Reduce UDF for equations (3) & (4)."""
        received = nodes.mailbox
        # equation (3): normalise the scores of the received messages
        weights = F.softmax(received['e'], dim=1)
        # equation (4): attention-weighted sum of the received embeddings
        return {'h': (weights * received['z']).sum(dim=1)}

    def forward(self, h):
        """Compute the new node features from the input features ``h``."""
        graph = self.g
        # equation (1)
        graph.ndata['z'] = self.fc(h)
        # equation (2)
        graph.apply_edges(self.edge_attention)
        # equation (3) & (4)
        graph.update_all(self.message_func, self.reduce_func)
        return graph.ndata.pop('h')

In [15]:
class MultiHeadGATLayer(nn.Module):
    """Runs several ``GATLayer`` heads on the same graph and merges them.

    Args:
        g: graph shared by all heads.
        in_dim: size of the input node features.
        out_dim: size of the embedding produced by each head.
        num_heads: number of attention heads.
        merge: ``'cat'`` concatenates the head outputs along the feature
            axis; any other value averages them over the heads.
    """

    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList(
            [GATLayer(g, in_dim, out_dim) for _ in range(num_heads)])
        self.merge = merge

    def forward(self, h):
        """Apply every head to ``h`` and merge the results.

        Returns:
            For ``merge='cat'`` the head outputs joined along dim 1;
            otherwise their element-wise mean, which has the shape of a
            single head output.
        """
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        # merge using average: reduce only over the stacked head axis.
        # Without `dim`, torch.mean collapses all heads and nodes into a
        # single scalar instead of one embedding per node.
        return torch.mean(torch.stack(head_outs), dim=0)

In [16]:
class GAT(nn.Module):
    """Two-layer GAT: a multi-head hidden layer and a single-head output layer.

    Args:
        g: graph passed to both layers.
        in_dim: size of the input node features.
        hidden_dim: embedding size of each hidden head.
        out_dim: size of the output produced for every node.
        num_heads: number of heads in the hidden layer.
    """

    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super().__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # The hidden heads are concatenated, so the second layer receives
        # hidden_dim * num_heads features per node. The output layer uses a
        # single attention head.
        concat_dim = hidden_dim * num_heads
        self.layer2 = MultiHeadGATLayer(g, concat_dim, out_dim, 1)

    def forward(self, h):
        """Hidden layer, ELU activation, then the output layer."""
        hidden = F.elu(self.layer1(h))
        return self.layer2(hidden)

## Evaluation Functions

In [17]:
def accuracy(logits, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        logits: 2-D tensor with one row of class scores per sample.
        labels: 1-D tensor with the expected class index of every sample.

    Returns:
        The share of correct predictions as a Python float.
    """
    predicted = torch.max(logits, dim=1).indices
    num_correct = (predicted == labels).sum().item()
    return float(num_correct) / len(labels)

def evaluate(model, features, labels, mask):
    """Accuracy of ``model`` on the samples selected by ``mask``.

    The model is switched to evaluation mode and run without gradient
    tracking; it is left in evaluation mode when the function returns.

    Args:
        model: callable module mapping ``features`` to per-sample logits.
        features: input passed to the model.
        labels: expected class index of every sample.
        mask: index/boolean selector applied to both logits and labels.

    Returns:
        The accuracy on the selected samples as a Python float.
    """
    model.eval()
    with torch.no_grad():
        selected_logits = model(features)[mask]
        selected_labels = labels[mask]
        return accuracy(selected_logits, selected_labels)

## Load Cora Dataset

In [18]:
def load_cora_data():
    """Load the Cora citation graph together with its node-level tensors.

    Everything is read from the node data of the graph itself (keys 'feat',
    'label', 'train_mask', 'val_mask', 'test_mask') instead of the
    dataset-level attributes.
    NOTE(review): the dataset-level `features` / `labels` / `*_mask`
    accessors are, as far as I know, deprecated aliases of these node-data
    entries in recent DGL releases; confirm against the installed version.

    Returns:
        A tuple ``(g, features, labels, train_mask, val_mask, test_mask)``:
        the DGL graph, float features, integer (long) labels and three
        boolean masks.
    """
    dataset = citegrh.load_cora()
    print(dataset)
    # The dataset holds a single graph.
    g = dataset[0]
    # The casts are no-ops when the stored dtype already matches; they pin
    # the dtypes that the callers rely on.
    features = g.ndata['feat'].float()
    labels = g.ndata['label'].long()
    train_mask = g.ndata['train_mask'].bool()
    val_mask = g.ndata['val_mask'].bool()
    test_mask = g.ndata['test_mask'].bool()
    return g, features, labels, train_mask, val_mask, test_mask

g, features, labels, train_mask, val_mask, test_mask = load_cora_data()
print('\n\n----- Features of CORA dataset')

print('\n----- Graph:')
print(g)

print('\n----- Features:')
print(features)
# Column index of every non-zero feature entry.
print(features.nonzero(as_tuple=True)[1])

print('\n----- Labels:')
print(labels)
print(labels.size())
output = torch.unique(labels)
occs = torch.bincount(labels)
print('----- Number of unique labels:')
print(output)
print('----- Number of label occurrences:')
print(occs)

# Masks as 0/1 integers so that bincount can count excluded/included nodes.
train_long = train_mask.long()
val_long = val_mask.long()
test_long = test_mask.long()

# One loop replaces three copy-pasted sections. Each section now prints the
# values found in the mask (0 = excluded, 1 = included) above their counts;
# the unique label ids printed before had no relation to the mask counts.
for split_name, split_long in (('Training', train_long),
                               ('Validation', val_long),
                               ('Testing', test_long)):
    print('\n----- {} mask:'.format(split_name))
    occs = torch.bincount(split_long)
    print(torch.unique(split_long))
    print(occs)


Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /root/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Dataset("cora_v2", num_graphs=1, save_path=/root/.dgl/cora_v2)


----- Features of CORA dataset

----- Graph:
Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'label': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(1433,), dtype=torch.float32)}
      edata_schemes={})

----- Features:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0



Analyzing the cora dataset, you can get the following information:

1. Every node has a 1433-dimensional feature vector (`NumFeats: 1433`); the vectors are sparse (mostly zeros) rather than one-hot node identifiers
2. Node labels are uniformly distributed


## Training Loop

In [19]:
# Tunable settings of the experiment.
NUM_EPOCHS = 300
EVAL_EVERY = 30  # run the validation set every EVAL_EVERY epochs

# Seed torch as well (NumPy is seeded at the top of the notebook) so that
# the random weight initialisation is reproducible across runs.
torch.manual_seed(1)

# One output unit per class; labels are consecutive integers starting at 0.
num_classes = int(labels.max().item()) + 1

# create the model, 2 heads, each head has hidden size 8
model = GAT(g,
          in_dim=features.size()[1],
          hidden_dim=8,
          out_dim=num_classes,
          num_heads=2)

# create optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# main loop
dur = []
for epoch in range(NUM_EPOCHS):
    # evaluate() switches the model to eval mode; switch back explicitly so
    # that every training step runs in training mode.
    model.train()
    t0 = time.time()

    logits = model(features)
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp[train_mask], labels[train_mask])

    # Training accuracy of the current weights, measured before the update.
    train_acc = accuracy(logp[train_mask], labels[train_mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    dur.append(time.time() - t0)

    print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Training Accuracy {:.4f}".format(
        epoch, np.mean(dur), loss.item(), train_acc))

    if epoch % EVAL_EVERY == 0:
        print("\nEval on validation dataset...")
        val_acc = evaluate(model, features, labels, val_mask)
        print("Validation Accuracy: {:.4f}\n".format(val_acc))

print()
acc = evaluate(model, features, labels, test_mask)
print("Test Accuracy {:.4f}".format(acc))


Epoch 00000 | Time(s) 0.3260 | Loss 1.9456 | Training Accuracy 0.1357

Eval on validation dataset...
Validation Accuracy: 0.1760

Epoch 00001 | Time(s) 0.2356 | Loss 1.9437 | Training Accuracy 0.1857
Epoch 00002 | Time(s) 0.2043 | Loss 1.9418 | Training Accuracy 0.2500
Epoch 00003 | Time(s) 0.1967 | Loss 1.9398 | Training Accuracy 0.3214
Epoch 00004 | Time(s) 0.1867 | Loss 1.9379 | Training Accuracy 0.3929
Epoch 00005 | Time(s) 0.1817 | Loss 1.9359 | Training Accuracy 0.4286
Epoch 00006 | Time(s) 0.1936 | Loss 1.9340 | Training Accuracy 0.4857
Epoch 00007 | Time(s) 0.1954 | Loss 1.9320 | Training Accuracy 0.5500
Epoch 00008 | Time(s) 0.1886 | Loss 1.9301 | Training Accuracy 0.6143
Epoch 00009 | Time(s) 0.1832 | Loss 1.9281 | Training Accuracy 0.6571
Epoch 00010 | Time(s) 0.1795 | Loss 1.9262 | Training Accuracy 0.6857
Epoch 00011 | Time(s) 0.1758 | Loss 1.9242 | Training Accuracy 0.7643
Epoch 00012 | Time(s) 0.1737 | Loss 1.9222 | Training Accuracy 0.7929
Epoch 00013 | Time(s) 0.1722 |