In [2]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

1.10.1


In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

## Structure

In [4]:
class GATLayer(nn.Module):
    """
        Simple PyTorch Implementation of the Graph Attention layer
    """
    
    def __init__(self):
        super().__init__()
    
    def forward(self, input, adj):
        pass

## Linear Transformation

$$
\bar{h'}_i = \textbf{W}\cdot \bar{h}_i
$$
with $\textbf{W}\in\mathbb R^{F'\times F}$ and $\bar{h}_i\in\mathbb R^{F}$.

$$
\bar{h'}_i \in \mathbb{R}^{F'}
$$

In [5]:
in_features = 5
out_features = 2
nb_nodes = 3

W = nn.Parameter(torch.zeros(size=(in_features, out_features))) # Xavier Parameter Initialization
# nn.init.xavier_uniform(W.data, gain=1.414)
nn.init.xavier_uniform_(W.data, gain=1.414)

input = torch.rand(nb_nodes, in_features)

# Linear Transforamtion
h = torch.mm(input, W)
N = h.size()[0]

print(h.shape) # Output after linear trainsformation
print(N) # number of nodes

torch.Size([3, 2])
3


## Attention Mechanism

![title](./Images/AttentionMechanism.png)

In [6]:
a = nn.Parameter(torch.zeros(size=(2 * out_features, 1))) # Xavier Parameter Initialization
nn.init.xavier_uniform_(a.data, gain=1.44)
print(a.shape)

leakyrelu = nn.LeakyReLU(0.2) # LeakyReLU

torch.Size([4, 1])


In [7]:
a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)],dim=1).view(N, -1, 2 * out_features) 
print(a_input.shape)

torch.Size([3, 3, 4])


In [8]:
# Demonstration how above is derrived....
print(h, 
      h.size())
print("========")
print(h.repeat(1, N), 
      h.repeat(1, N).size()) # N = 3
print("========")
print(h.repeat(1, N).view(N * N, -1), 
      h.repeat(1, N).view(N * N, -1).size())
print("========")
print(h.repeat(N, 1), 
      h.repeat(N, 1).size())
print("========")
print(torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1), 
      torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).size())
print("========")
print(torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * out_features), 
      torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * out_features).size())

tensor([[-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885]], grad_fn=<MmBackward0>) torch.Size([3, 2])
tensor([[-0.2666, -0.1805, -0.2666, -0.1805, -0.2666, -0.1805],
        [ 0.9001, -0.7813,  0.9001, -0.7813,  0.9001, -0.7813],
        [ 0.5976,  0.7885,  0.5976,  0.7885,  0.5976,  0.7885]],
       grad_fn=<RepeatBackward0>) torch.Size([3, 6])
tensor([[-0.2666, -0.1805],
        [-0.2666, -0.1805],
        [-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.9001, -0.7813],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885],
        [ 0.5976,  0.7885],
        [ 0.5976,  0.7885]], grad_fn=<ViewBackward0>) torch.Size([9, 2])
tensor([[-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885],
        [-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885],
        [-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885]], grad_fn=<RepeatBackward0>) torch.Size([9, 2])
tensor([[-0.2666, -0.1805, -0.2

![title](./Images/a_input.png)

In [9]:
e = leakyrelu(torch.matmul(a_input, a).squeeze(2))

In [10]:
# Understanding the how thing are built...
print(a_input.shape, a.shape)
print("")
print(torch.matmul(a_input, a).shape)
print("")
print(torch.matmul(a_input, a).squeeze(2).shape)

torch.Size([3, 3, 4]) torch.Size([4, 1])

torch.Size([3, 3, 1])

torch.Size([3, 3])


## Mask Attention

In [11]:
adj = torch.randint(2, (3, 3))

zero_vec = -9e15 * torch.ones_like(e)
print(zero_vec.shape)

torch.Size([3, 3])


In [21]:
attention = torch.where(adj > 0, e, zero_vec)
print(adj, "\n \n", e, "\n \n", zero_vec, "\n \n")
# print(np.abs(attention.detach().numpy()))
print("attention: ", attention, attention.shape)

tensor([[1, 0, 1],
        [1, 1, 0],
        [1, 0, 0]]) 
 
 tensor([[ 0.0413,  0.3608,  0.5604],
        [-0.4482, -0.3843, -0.3444],
        [-0.0239,  0.1999,  0.3995]], grad_fn=<LeakyReluBackward0>) 
 
 tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15]]) 
 

attention:  tensor([[ 4.1342e-02, -9.0000e+15,  5.6045e-01],
        [-4.4824e-01, -3.8435e-01, -9.0000e+15],
        [-2.3920e-02, -9.0000e+15, -9.0000e+15]], grad_fn=<SWhereBackward0>) torch.Size([3, 3])


In [13]:
# Applying softmax function
attention = F.softmax(attention, dim=1)
print(attention)

tensor([[0.3731, 0.0000, 0.6269],
        [0.4840, 0.5160, 0.0000],
        [1.0000, 0.0000, 0.0000]], grad_fn=<SoftmaxBackward0>)


In [14]:
h_prime = torch.matmul(attention, h)

print(attention)
print("")
print(h_prime)

tensor([[0.3731, 0.0000, 0.6269],
        [0.4840, 0.5160, 0.0000],
        [1.0000, 0.0000, 0.0000]], grad_fn=<SoftmaxBackward0>)

tensor([[ 0.2752,  0.4270],
        [ 0.3353, -0.4905],
        [-0.2666, -0.1805]], grad_fn=<MmBackward0>)


In [15]:
# h_prime Vs h
print(h_prime, "\n", h)

tensor([[ 0.2752,  0.4270],
        [ 0.3353, -0.4905],
        [-0.2666, -0.1805]], grad_fn=<MmBackward0>) 
 tensor([[-0.2666, -0.1805],
        [ 0.9001, -0.7813],
        [ 0.5976,  0.7885]], grad_fn=<MmBackward0>)


## Build the GAT Layer 

In [16]:
class GATLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super().__init__()
        self.dropout = dropout # drop prob = 0.6
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha # LeakyReLU with negative slope, alpha = 0.2
        self.concat = concat # concat = True for all the layers except the output layer
        
        # Xavier Initialization of Weights
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        
        self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        
        # LeakyReLU
        self.leakyrelu = nn.LeakyReLU(self.alpha)
        
    def forward(self, input, adj):
        # Linear Transformation
        h = torch.mm(input, self.W) # matrix multiplication
        N = h.size()[0]
        print(N)
        
        # Attention Mechanism
        a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)
        e       = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
        print(a_input.shape, e.shape)
        
        # Masked Attetion
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, h)
        
        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

### Using the built layer

In [17]:
from torch_geometric.data import Data
# from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

dataset_name = 'Cora'
dataset = Planetoid(root='./data' + dataset_name, name=dataset_name)
dataset.transform = T.NormalizeFeatures()

print(f"Number of Classes in {dataset_name}:", dataset.num_classes)
print(f"Number of Node Features in {dataset_name}:", dataset.num_node_features)
# print(dataset[0])
data = dataset[0]
print(data)

Number of Classes in Cora: 7
Number of Node Features in Cora: 1433
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [18]:
class GAT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.gat1 = GATLayer(dataset.num_node_features, 16, dropout=0.6, alpha=0.2)
        self.gat2 = GATLayer(16, dataset.num_classes, dropout=0.6, alpha=0.2)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        
        return F.log_softmax(x, dim=1)

In [19]:
model = GAT()
print("Model info: ")
print(model)
data = dataset[0]
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(10):
    optimizer.zero_grad()
    output = model(data)
    loss = F.nll_loss(output[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

Model info: 
GAT(
  (gat1): GATLayer(
    (leakyrelu): LeakyReLU(negative_slope=0.2)
  )
  (gat2): GATLayer(
    (leakyrelu): LeakyReLU(negative_slope=0.2)
  )
)
2708
torch.Size([2708, 2708, 32]) torch.Size([2708, 2708])


RuntimeError: The size of tensor a (10556) must match the size of tensor b (2708) at non-singleton dimension 1