In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv 
import torch_geometric.transforms as T

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [8]:
# linear transformation

the_in_feats = 5
out_feats = 2
nb_nodes = 3

W = nn.Parameter(torch.zeros(size=(the_in_feats, out_feats))) # xavier paramiter inizialation
nn.init.xavier_uniform_(W.data, gain =1.414)

input = torch.rand(nb_nodes, the_in_feats)
# tensor of 3 nodes, with 5 features with a random value between 0 and 1

h = torch.mm(input, W)
N = h.size()[0]

print(input)
print(N)
print(h.shape)

tensor([[0.3749, 0.3713, 0.8901, 0.7779, 0.8807],
        [0.8543, 0.8180, 0.9878, 0.1403, 0.3032],
        [0.2311, 0.5651, 0.1418, 0.4426, 0.6663]])
3
torch.Size([3, 2])


In [9]:
# Attention Mechanism 

a = nn.Parameter(torch.zeros(size=(2*out_feats, 1)))
nn.init.xavier_uniform_(a.data, gain=1.414)
print(a) # allows for back propagation for the flag. 
print(a.shape)

leakyrelu = nn.LeakyReLU(0.2)
print(leakyrelu)

Parameter containing:
tensor([[-1.4843],
        [ 0.6496],
        [-0.8846],
        [ 0.6198]], requires_grad=True)
torch.Size([4, 1])
LeakyReLU(negative_slope=0.2)


In [11]:
a_input = torch.cat([h.repeat(1,N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * out_feats)
print(a_input)
print(a_input.shape) # 3-d tensor with 3 channles, 3 nodes each wtih 4 features. 

tensor([[[ 2.5500,  0.0788,  2.5500,  0.0788],
         [ 2.5500,  0.0788,  2.2703, -1.2548],
         [ 2.5500,  0.0788,  1.4414,  0.5385]],

        [[ 2.2703, -1.2548,  2.5500,  0.0788],
         [ 2.2703, -1.2548,  2.2703, -1.2548],
         [ 2.2703, -1.2548,  1.4414,  0.5385]],

        [[ 1.4414,  0.5385,  2.5500,  0.0788],
         [ 1.4414,  0.5385,  2.2703, -1.2548],
         [ 1.4414,  0.5385,  1.4414,  0.5385]]], grad_fn=<ViewBackward0>)
torch.Size([3, 3, 4])


In [13]:
e = leakyrelu(torch.matmul(a_input, a).squeeze(2))
print(e)
print(e.shape)

tensor([[-1.1881, -1.3040, -0.9350],
        [-1.2784, -1.3942, -1.0252],
        [-0.7993, -0.9151, -0.5462]], grad_fn=<LeakyReluBackward0>)
torch.Size([3, 3])


In [15]:
print(a_input.shape, a.shape)
print('')
print(torch.matmul(a_input, a).shape)
print('')
print(torch.matmul(a_input, a).squeeze(2).shape)

torch.Size([3, 3, 4]) torch.Size([4, 1])

torch.Size([3, 3, 1])

torch.Size([3, 3])


In [16]:
# Masked Attention

adj = torch.randint(2, (3,3))
print(adj)

zero_vec = -9e15*torch.ones_like(e)
print(zero_vec)
print(zero_vec.shape)

tensor([[1, 1, 0],
        [1, 0, 1],
        [0, 0, 1]])
tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15]])
torch.Size([3, 3])


In [17]:
attention = torch.where(adj > 0, e, zero_vec)
print(adj)
print('')
print(e)
print('')
print(zero_vec)
print('')
print(attention)

tensor([[1, 1, 0],
        [1, 0, 1],
        [0, 0, 1]])

tensor([[-1.1881, -1.3040, -0.9350],
        [-1.2784, -1.3942, -1.0252],
        [-0.7993, -0.9151, -0.5462]], grad_fn=<LeakyReluBackward0>)

tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15]])

tensor([[-1.1881e+00, -1.3040e+00, -9.0000e+15],
        [-1.2784e+00, -9.0000e+15, -1.0252e+00],
        [-9.0000e+15, -9.0000e+15, -5.4615e-01]], grad_fn=<WhereBackward0>)


In [20]:
attention = F.softmax(attention, dim=1)
h_prime = torch.matmul(attention, h)
print(attention)
print('')
print(h_prime)

tensor([[0.5289, 0.4711, 0.0000],
        [0.4371, 0.0000, 0.5629],
        [0.0000, 0.0000, 1.0000]], grad_fn=<SoftmaxBackward0>)

tensor([[ 2.4182, -0.5495],
        [ 1.9259,  0.3376],
        [ 1.4414,  0.5385]], grad_fn=<MmBackward0>)


In [21]:
# h_prime vs h

print(h_prime)
print('')
print(h)

tensor([[ 2.4182, -0.5495],
        [ 1.9259,  0.3376],
        [ 1.4414,  0.5385]], grad_fn=<MmBackward0>)

tensor([[ 2.5500,  0.0788],
        [ 2.2703, -1.2548],
        [ 1.4414,  0.5385]], grad_fn=<MmBackward0>)


In [28]:
class GATLayer( nn.Module ):

    def __init__(self, in_features, out_features, dropout = 0.6, alpha= 0.2, concat = True):
        super().__init__()
        self.dropout = dropout
        self.in_features = in_features # dataset.num_features
        self.out_features = out_features # dataset.num_classes
        self.alpha = alpha
        self.concat = concat 

        # create the weight matrix that will learn during the training process
        # this creates a trainable paramter with a matrix of size in_features x out_features 
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))

        # set up weights of the layer to promote even propogation 
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # learnable parameter that is used to see to compute the attention 
        # scores of nodes around the neighborhod. 
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        # set up the activation function leakyRelu, similar to Relu, 
        # but allows for negatives which prevents the model from stoppping to learn
        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):

        # linear Transformation
        h = torch.mm(input, self.W)
        N = h.size()[0]
        print(N)

        # Attention Mechanism 
        a_input = torch.cat([h.repeat(1,N).view(N*N, -1), h.repeat(N,1)], dim=1).view(N, -1, 2 * self.out_features)
        e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))

        # Masked Attention 
        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)


        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime 


In [24]:
from load_data import dataset

dataset.transform = T.NormalizeFeatures()


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
class GAT(nn.Module):
    def __init__(self):
        super().__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1

        self.conv1 = GATLayer(dataset.num_features, self.hid)
        self.conv2 = GATLayer(self.in_head *self.hid, dataset.num_classes)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
    

model = GAT().to(device)
data = dataset[0].to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    if epoch%200 == 0:
        print(loss)

    loss.backward()
    optimizer.step()

2708


RuntimeError: The size of tensor a (10556) must match the size of tensor b (2708) at non-singleton dimension 1

In [32]:
class GAT(nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1

        self.conv1 = GATConv(dataset.num_features, self.hid, heads=self.in_head, dropout=0.6)
        self.conv2 = GATConv(self.in_head *self.hid, dataset.num_classes, concat = False, heads= self.out_head, dropout=0.6)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


tensor(1.9436, grad_fn=<NllLossBackward0>)
tensor(0.7244, grad_fn=<NllLossBackward0>)
tensor(0.5033, grad_fn=<NllLossBackward0>)
tensor(0.5200, grad_fn=<NllLossBackward0>)
tensor(0.5288, grad_fn=<NllLossBackward0>)


In [34]:
def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def validate(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        val_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask]).item()
        pred = out[data.val_mask].max(1)[1]
        correct = pred.eq(data.y[data.val_mask]).sum().item()
        val_acc = correct / data.val_mask.sum().item()
    return val_loss, val_acc

def test(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out[data.test_mask].max(1)[1]
        correct = pred.eq(data.y[data.test_mask]).sum().item()
        test_acc = correct / data.test_mask.sum().item()
    return test_acc


In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAT().to(device)
data = dataset[0].to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

best_val_acc = 0
patience = 100  # Early stopping patience
patience_counter = 0

for epoch in range(1000):
    train_loss = train(model, data, optimizer)
    val_loss, val_acc = validate(model, data)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1

    if epoch % 200 == 0:
        print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    if patience_counter >= patience:
        print("Early stopping")
        break

# Load the best model for testing
model.load_state_dict(torch.load('best_model.pth'))
test_acc = test(model, data)
print(f'Test Accuracy: {test_acc:.4f}')


Epoch 0, Train Loss: 1.9425, Val Loss: 1.9396, Val Acc: 0.3300
Early stopping
Test Accuracy: 0.8160


In [None]:


model = GAT().to(device)
data = dataset[0].to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)


In [None]:

model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    if epoch%200 == 0:
        print(loss)

    loss.backward()
    optimizer.step()

In [31]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.8180
