In [1]:
import os
print(os.environ['PATH'])

/Users/nilsagor/opt/anaconda3/bin:/Users/nilsagor/opt/anaconda3/condabin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin


**Question:**

How much features of node "c" are important to node "i"?

Can we learn such importance, in an automatic manner?

## Graph  Attention Network

**input:** a set of node features $h =\{ \bar{h_{1}}, \bar{h_{2}}, \dots \bar{h_{n}}\} \qquad h_{i} \in R^F  $

Here $n$ is the number of nodes and $F$ is the number of features in each node.

**output:** a new set of node features $h^{\prime} =\{ \bar{h_{1}^{\prime}}, \bar{h_{2}^{\prime}} \dots \bar{h_{i}^{\prime}}\} \qquad h_{i}^{\prime} \in R^F  $

initial step, apply a **parameterized linear transformation** to every node 

$ W \cdot \bar{h_{i}} \qquad W \in R^{F^{\prime} \times F}  $

$(F^{\prime} \times F) \cdot F$

$F^{\prime}$

**Self attention:**

$ a : R^{F^{\prime}} \times R^{F^{\prime}} \rightarrow R $


$e_{i,j} = a(W \cdot \bar{h_{i}}, W \cdot \bar{h_{j}}) $

here $e_{i,j}$ specify the importance of node $j$'s features to node $i$

The model allows every node to attend on every other node, dropping all structural information.

by inject the graph structure into the mechanism by performing masked attention compute $e_{i,j}$ for nodes $j \in N_{i}$ wher $N_{i}$ some neighborhood of node $i$ in the graph.


**Normalizeation**

To make coefficients easily comparable across diffirent nodes, normalize them across all choices of $j$ using the softmax function

$a_{i,j} = softmax_{j}(e_{i,j}) = \frac{exp(e_{i,j})}{\sum_{k \in N(i)}exp(e_{i,j})}$

$\alpha$ is a single-layer feed forward neural network, parametrized by a weight vector $a \in R6{2F^{\prime}}$ and applying LeakyReLU nonlinearity (with negative input slope $\alpha = 0.2$). The coefficient computed by the attention mechanism

$a_{i,j} = \frac{exp(LeakyReLU(a^{T}[W\bar{h_{i}}W\bar{h_{j}}]))}{\sum_{k \in N_{i}}exp(LeakyReLU(a^{T}[W\bar{h_{i}}W\bar{h_{j}}]))}$




**Multi-head attention**

Multi-head attention is a module for attention mechanism which runs through an attention mechanism several times in parallel. The independent attention outputs are then concatenated and linearly transformed into the expected dimension. Intuitively, multiple attenttion heads allows for attending to parts of the sequence differently 

To stabilize the learning process of self-attention, employing multi-head attention would beneficial, Specifically, K independent attention mechanism execute the transformation and then their features are concatenated 

**Concatention**

$h_{i}^{\prime} = ||_{k=1}^{K} \sigma(\sum_{j \in N(i)} \alpha_{i,j}{k} W^{k}h_{j})$


Here $|| $represents concatenation, $\alpha_{i,j}^{k}$ are normalized attention coefficient computed by the k-th attention mechanism $W^{k}$ input linear transformation's weight matrix



if we perform multi-head attention on the final (prediction) layer of the network, concatenation is no longer sensible, instead employ average and delay applying the final nonlinearity (usually a softmax or logistic sigmoid for classification)

**Average**

$h_{i}^{\prime} = \sigma(\frac{1}{K}\sum_{k = 1}^{K} \sum_{j \in N(i)} \alpha_{i,j}{k} W^{k}h_{j})$


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
class GATLayer(nn.Module):
    def __init__(self):
        super(GATLayer, self).__init__()
        
    def forward(self, input, adj):
        print("")

In [4]:
in_features = 5
out_features = 2
nb_nodes = 3

W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
nn.init.xavier_uniform_(W.data, gain=1.414)

input = torch.rand(nb_nodes, in_features)

h = torch.mm(input, W)
N = h.size()[0]
print(h.shape)

torch.Size([3, 2])


In [5]:
a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
nn.init.xavier_uniform_(a.data, gain=1.414)
print(a.shape)

leakyrelu = nn.LeakyReLU(0.2)

torch.Size([4, 1])


In [6]:
a_input = torch.cat([h.repeat(1, N).view(N*N, -1), h.repeat(N,1)], dim=1).view(N, -1, 2*out_features)

In [7]:
e = leakyrelu(torch.matmul(a_input, a).squeeze(2))

In [8]:
print(a_input.shape, a.shape)
print("")
print(torch.matmul(a_input, a).shape)
print("")
print(torch.matmul(a_input, a).squeeze(2).shape)

torch.Size([3, 3, 4]) torch.Size([4, 1])

torch.Size([3, 3, 1])

torch.Size([3, 3])


In [9]:
# Masked Attention
adj = torch.randint(2, (3,3))
zero_vec = -9315*torch.ones_like(e)
print(zero_vec.shape)

torch.Size([3, 3])


In [10]:
attention = torch.where(adj>0, e, zero_vec)
print(adj, "\n", e, "\n", zero_vec)
attention

tensor([[1, 1, 0],
        [0, 0, 0],
        [0, 1, 0]]) 
 tensor([[ 0.8454, -0.0866, -0.0323],
        [-0.0710, -0.3267, -0.2724],
        [-0.0323, -0.2880, -0.2337]], grad_fn=<LeakyReluBackward0>) 
 tensor([[-9315., -9315., -9315.],
        [-9315., -9315., -9315.],
        [-9315., -9315., -9315.]])


tensor([[ 8.4536e-01, -8.6592e-02, -9.3150e+03],
        [-9.3150e+03, -9.3150e+03, -9.3150e+03],
        [-9.3150e+03, -2.8795e-01, -9.3150e+03]], grad_fn=<SWhereBackward>)

In [11]:
attention = F.softmax(attention, dim=1)
h_prime = torch.matmul(attention, h)

In [12]:
attention

tensor([[0.7175, 0.2825, 0.0000],
        [0.3333, 0.3333, 0.3333],
        [0.0000, 1.0000, 0.0000]], grad_fn=<SoftmaxBackward>)

In [13]:
h_prime

tensor([[1.0181, 0.6678],
        [1.5702, 0.6649],
        [1.8825, 0.4424]], grad_fn=<MmBackward>)

In [14]:
print(h_prime, "\n", h)

tensor([[1.0181, 0.6678],
        [1.5702, 0.6649],
        [1.8825, 0.4424]], grad_fn=<MmBackward>) 
 tensor([[0.6777, 0.7566],
        [1.8825, 0.4424],
        [2.1503, 0.7958]], grad_fn=<MmBackward>)


In [15]:
# Build the layer
class GATLayer(nn.Module):
    def __init__(self):
        super(GATLayer, self).__init__()
        
    def forward(self):
          # Linear transformation
            h = torch.mm(input, self.W)
            N = h.size()[0]
            
            #Attention machanism
            a_input = torch.cat([h.repeat(1, N).view(N*N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2*self.out_features)
            e = self.leakyrelu(torch.matmul(a_input, self.a).sequeeze(2))
            
            
            #masked Attention
            zero_vec = -9e15*torch.ones_likes(e)
            attention = torch.where(adj > 0, e, zero_vec)
            
            attention = F.softmax(attention, dim=1)
            attention = F.dropout(attention, self.dropout, training = self.training)
            h_prime = torch.matmul(attention, h)
            
            
            if self.concat:
                return F.relu(h_prime)
            else:
                return h_prime
        

In [16]:
class GATLayer(nn.Module):
    def __init__(self):
        super(GATLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat
        
        
        
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform(self.W.data, gain=1.414)
        
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform(self.a.data, gain=1.414)
        
        
        # leakyRelu
        self.leakyrelu = self.LeakyReLU(self.alpha)
        
        
    def forward(self, input, adj):
        # linear transformation
        h = torch.mm(input, self.W) #matrix multiplication
        N = h.size()[0]
        print(N)
        
        # Attention Mechnasim
        a_input = torch.cat([h.repeat(1, N).view(N*N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2*self.out_features)
        e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
        
        # Masked Attention
        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        
        attention = F.softmax(attention, dim = 1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = troch.matmul(attention, h)
               
             
        
        if self.concat:
            return f.elu(h_prime)
        else:
            return h_prime

In [17]:
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

name_data = 'Cora'
dataset = Planetoid(root='/tmp/'+name_data, name=name_data)
dataset.transform = T.NormalizeFeatures()

print(f"Number of Classes in {name_data}", dataset.num_classes)
print(f"Number of Node Features in {name_data}", dataset.num_node_features)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!
Number of Classes in Cora 7
Number of Node Features in Cora 1433


In [18]:
class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        
        self.conv1 = GATConv(dataset.num_features, self.hid, heads=self.in_head, dropout=0.6)
        self.conv2 = GATConv(self.hid*self.in_head, dataset.num_classes, concat = False, heads =self.out_head, dropout = 0.6)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p = 0.6, training = self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
model = GAT().to(device)
data = dataset[0].to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay = 5e-4)

In [21]:
model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    
    if epoch % 200:
        print(loss)
        
    loss.backward()
    optimizer.step()

tensor(1.9433, grad_fn=<NllLossBackward>)
tensor(1.9428, grad_fn=<NllLossBackward>)
tensor(1.9372, grad_fn=<NllLossBackward>)
tensor(1.9346, grad_fn=<NllLossBackward>)
tensor(1.9334, grad_fn=<NllLossBackward>)
tensor(1.9274, grad_fn=<NllLossBackward>)
tensor(1.9235, grad_fn=<NllLossBackward>)
tensor(1.9233, grad_fn=<NllLossBackward>)
tensor(1.9144, grad_fn=<NllLossBackward>)
tensor(1.9056, grad_fn=<NllLossBackward>)
tensor(1.9022, grad_fn=<NllLossBackward>)
tensor(1.8971, grad_fn=<NllLossBackward>)
tensor(1.8885, grad_fn=<NllLossBackward>)
tensor(1.8814, grad_fn=<NllLossBackward>)
tensor(1.8802, grad_fn=<NllLossBackward>)
tensor(1.8609, grad_fn=<NllLossBackward>)
tensor(1.8674, grad_fn=<NllLossBackward>)
tensor(1.8534, grad_fn=<NllLossBackward>)
tensor(1.8561, grad_fn=<NllLossBackward>)
tensor(1.8365, grad_fn=<NllLossBackward>)
tensor(1.8349, grad_fn=<NllLossBackward>)
tensor(1.8274, grad_fn=<NllLossBackward>)
tensor(1.8164, grad_fn=<NllLossBackward>)
tensor(1.7880, grad_fn=<NllLossBac

tensor(0.9306, grad_fn=<NllLossBackward>)
tensor(0.8221, grad_fn=<NllLossBackward>)
tensor(0.7886, grad_fn=<NllLossBackward>)
tensor(0.9086, grad_fn=<NllLossBackward>)
tensor(0.8354, grad_fn=<NllLossBackward>)
tensor(0.8562, grad_fn=<NllLossBackward>)
tensor(0.8680, grad_fn=<NllLossBackward>)
tensor(0.8204, grad_fn=<NllLossBackward>)
tensor(0.7579, grad_fn=<NllLossBackward>)
tensor(0.8907, grad_fn=<NllLossBackward>)
tensor(0.8427, grad_fn=<NllLossBackward>)
tensor(0.6655, grad_fn=<NllLossBackward>)
tensor(0.8312, grad_fn=<NllLossBackward>)
tensor(0.9067, grad_fn=<NllLossBackward>)
tensor(0.8689, grad_fn=<NllLossBackward>)
tensor(0.8384, grad_fn=<NllLossBackward>)
tensor(0.8404, grad_fn=<NllLossBackward>)
tensor(0.8320, grad_fn=<NllLossBackward>)
tensor(0.8323, grad_fn=<NllLossBackward>)
tensor(0.7583, grad_fn=<NllLossBackward>)
tensor(0.8463, grad_fn=<NllLossBackward>)
tensor(0.8718, grad_fn=<NllLossBackward>)
tensor(0.7868, grad_fn=<NllLossBackward>)
tensor(0.8923, grad_fn=<NllLossBac

tensor(0.6630, grad_fn=<NllLossBackward>)
tensor(0.7714, grad_fn=<NllLossBackward>)
tensor(0.7875, grad_fn=<NllLossBackward>)
tensor(0.7627, grad_fn=<NllLossBackward>)
tensor(0.8190, grad_fn=<NllLossBackward>)
tensor(0.6243, grad_fn=<NllLossBackward>)
tensor(0.6963, grad_fn=<NllLossBackward>)
tensor(0.7315, grad_fn=<NllLossBackward>)
tensor(0.7862, grad_fn=<NllLossBackward>)
tensor(0.6778, grad_fn=<NllLossBackward>)
tensor(0.7611, grad_fn=<NllLossBackward>)
tensor(0.8037, grad_fn=<NllLossBackward>)
tensor(0.7114, grad_fn=<NllLossBackward>)
tensor(0.7184, grad_fn=<NllLossBackward>)
tensor(0.7419, grad_fn=<NllLossBackward>)
tensor(0.7488, grad_fn=<NllLossBackward>)
tensor(0.6120, grad_fn=<NllLossBackward>)
tensor(0.7013, grad_fn=<NllLossBackward>)
tensor(0.6991, grad_fn=<NllLossBackward>)
tensor(0.7229, grad_fn=<NllLossBackward>)
tensor(0.7364, grad_fn=<NllLossBackward>)
tensor(0.7362, grad_fn=<NllLossBackward>)
tensor(0.6540, grad_fn=<NllLossBackward>)
tensor(0.7583, grad_fn=<NllLossBac

tensor(0.7053, grad_fn=<NllLossBackward>)
tensor(0.7033, grad_fn=<NllLossBackward>)
tensor(0.7288, grad_fn=<NllLossBackward>)
tensor(0.6817, grad_fn=<NllLossBackward>)
tensor(0.6704, grad_fn=<NllLossBackward>)
tensor(0.6563, grad_fn=<NllLossBackward>)
tensor(0.6953, grad_fn=<NllLossBackward>)
tensor(0.5756, grad_fn=<NllLossBackward>)
tensor(0.6899, grad_fn=<NllLossBackward>)
tensor(0.6494, grad_fn=<NllLossBackward>)
tensor(0.7898, grad_fn=<NllLossBackward>)
tensor(0.7529, grad_fn=<NllLossBackward>)
tensor(0.6102, grad_fn=<NllLossBackward>)
tensor(0.6128, grad_fn=<NllLossBackward>)
tensor(0.6479, grad_fn=<NllLossBackward>)
tensor(0.6626, grad_fn=<NllLossBackward>)
tensor(0.5412, grad_fn=<NllLossBackward>)
tensor(0.6623, grad_fn=<NllLossBackward>)
tensor(0.6565, grad_fn=<NllLossBackward>)
tensor(0.6646, grad_fn=<NllLossBackward>)
tensor(0.6979, grad_fn=<NllLossBackward>)
tensor(0.6828, grad_fn=<NllLossBackward>)
tensor(0.6955, grad_fn=<NllLossBackward>)
tensor(0.7381, grad_fn=<NllLossBac

tensor(0.6811, grad_fn=<NllLossBackward>)
tensor(0.6536, grad_fn=<NllLossBackward>)
tensor(0.7435, grad_fn=<NllLossBackward>)
tensor(0.5676, grad_fn=<NllLossBackward>)
tensor(0.7285, grad_fn=<NllLossBackward>)
tensor(0.7194, grad_fn=<NllLossBackward>)
tensor(0.5810, grad_fn=<NllLossBackward>)
tensor(0.5841, grad_fn=<NllLossBackward>)
tensor(0.7163, grad_fn=<NllLossBackward>)
tensor(0.6591, grad_fn=<NllLossBackward>)
tensor(0.6602, grad_fn=<NllLossBackward>)
tensor(0.6920, grad_fn=<NllLossBackward>)
tensor(0.6882, grad_fn=<NllLossBackward>)
tensor(0.6573, grad_fn=<NllLossBackward>)
tensor(0.6495, grad_fn=<NllLossBackward>)
tensor(0.6162, grad_fn=<NllLossBackward>)
tensor(0.6337, grad_fn=<NllLossBackward>)
tensor(0.6339, grad_fn=<NllLossBackward>)
tensor(0.7024, grad_fn=<NllLossBackward>)
tensor(0.6336, grad_fn=<NllLossBackward>)
tensor(0.6485, grad_fn=<NllLossBackward>)
tensor(0.6571, grad_fn=<NllLossBackward>)
tensor(0.7323, grad_fn=<NllLossBackward>)
tensor(0.7134, grad_fn=<NllLossBac

tensor(0.6674, grad_fn=<NllLossBackward>)
tensor(0.6389, grad_fn=<NllLossBackward>)
tensor(0.7564, grad_fn=<NllLossBackward>)
tensor(0.6223, grad_fn=<NllLossBackward>)
tensor(0.6704, grad_fn=<NllLossBackward>)
tensor(0.6645, grad_fn=<NllLossBackward>)


In [22]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc*100))

Accuracy: 81.9000
