# Implementing GAT using Numpy

In [34]:
from pprint import pprint

import numpy as np

np.random.seed(10)

# Adjacency matrix
A = np.array([
    [1, 1, 1, 1],
    [1, 1, 0, 0],
    [1, 0, 1, 1],
    [1, 0, 1, 1],
])

# Generate random matrix of node features
X = np.random.uniform(-1, 1, (4, 4))

# Generate random edge features
X_edge = np.random.uniform(-1, 1, (12, 2))

# Regular weight matrix
# (no_of_hidden_dims, no_of_nodes)
W = np.random.uniform(-1, 1, (2, 4))

# Attention matrix
# size: (1, dim_h x 2)
W_att = np.random.uniform(-1, 1, (1, 4))

# Connections from source nodes to destination nodes
connections = np.where(A > 0)

# Concatenate hidden vectors of source and destination nodes
# Then apply linear transformation: W_att
a = W_att @ np.concatenate([(X @ W.T)[connections[0]], (X @ W.T)[connections[1]]], axis=1).T


# print((X @ W.T).shape)
# print(((X @ W.T)[connections[0]]).shape)
# print((np.concatenate([(X @ W.T)[connections[0]], (X @ W.T)[connections[1]]], axis=1)).shape)

# Applying Leaky ReLU to the previous result
def leaky_relu(x, alpha=0.2):
    return np.maximum(alpha * x, x)


e = leaky_relu(a)
# print(e.shape)

# Place these values in a matrix: shape = A.shape
# Means it should look like a adjacency matrix
E = np.zeros(A.shape)  # (no_of_nodes, no_of_nodes)
E[connections[0], connections[1]] = e[0]
# print(E)


# Normalize every row of attention scores.
def softmax2D(x, axis):
    e = np.exp(x - np.expand_dims(np.max(x, axis=axis), axis))
    sum = np.expand_dims(np.sum(e, axis=axis), axis)
    # print(np.sum(e/sum, axis=1))
    return e / sum


# Attention weights
W_alpha = softmax2D(E, 1)


# Calculate new matrix of embeddings H
H = A.T @ W_alpha @ X @ W.T



print(f"Adjacency Matrix: \n{A}")
print(f"\nNode features: \n{X}")
print(f"\nEdge features: \n{X_edge}")
print(f"\nRegular weight matrix: \n{W}")
print(f"\nAttention matrix: \n{W_att}")
print(f"\nEdges in COO format: \n{connections}")
print(f"\nAttention score: \n{a}")
print(f"\nAfter applying LeakyReLU: \n{e[0]}")
print(f"\nFinal Attention weights: \n{W_alpha}")
print('\nHidden embeddings for node features:')
pprint(H)


Adjacency Matrix: 
[[1 1 1 1]
 [1 1 0 0]
 [1 0 1 1]
 [1 0 1 1]]

Node features: 
[[ 0.54264129 -0.9584961   0.26729647  0.49760777]
 [-0.00298598 -0.55040671 -0.60387427  0.52106142]
 [-0.66177833 -0.82332037  0.37071964  0.90678669]
 [-0.99210347  0.02438453  0.62524192  0.22505213]]

Edge features: 
[[ 0.44351063 -0.41624786]
 [ 0.83554825  0.42915157]
 [ 0.08508874 -0.7156599 ]
 [-0.25331848  0.34826723]
 [-0.11633365 -0.13197201]
 [ 0.23553396  0.02627649]
 [ 0.30079436  0.20207791]
 [ 0.61044639  0.0432943 ]
 [ 0.81729776 -0.36152782]
 [-0.8190813  -0.39859989]
 [-0.77203128  0.65736265]
 [-0.90620736  0.2525743 ]]

Regular weight matrix: 
[[ 0.09517231  0.63857399 -0.60210492  0.7137006 ]
 [-0.29669472  0.50929538 -0.40807659  0.76787296]]

Attention matrix: 
[[-0.34897672 -0.6699682  -0.21494151 -0.81307925]]

Edges in COO format: 
(array([0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]), array([0, 1, 2, 3, 0, 1, 0, 2, 3, 0, 2, 3]))

Attention score: 
[[ 0.76434534 -0.00115851  0.15336751  

# Influence Mechanism

- W_inf: Linear Transformation that will be applied to scaled edge features
- reshaped_weights: Obtained attetion weights for the actual edges defined in the edge_index of the graph


In [32]:
print('\nAlpha_ones\n', W_alpha)

# Obtaining attetion weights for the actual edges: defined in the edge_index
mask = W_alpha[connections[0], connections[1]]
print('\nMask\n', mask)
print(mask.shape)

# Calculating scaled version of edge fetures 
# attention weights * edge_features
# we need to do this for actual edges for given a node in the graph
# Element-wise multiplication is used here to calculate scaled version of the edges
reshaped_weights = mask.reshape(12, 1) * X_edge  # reshape need to be done!
print('\nreshaped_weights\n', reshaped_weights)
print('\nreshaped_weights_shape', reshaped_weights.shape)


# Influence matrix: Linear Transformation
W_inf = np.random.uniform(-1, 1, (10, 2))
print('\nW_inf', W_inf.shape)

# Calculating the influence for the whole graph
result = reshaped_weights @ W_inf.T
print('\nInfluence: ', result.shape)
print(result)


Alpha_ones
 [[0.38275322 0.17818341 0.20776604 0.23129734]
 [0.26004579 0.22229523 0.25882949 0.25882949]
 [0.30391639 0.2423709  0.22442247 0.22929024]
 [0.32465237 0.23176774 0.21941042 0.22416947]]

Mask
 [0.38275322 0.17818341 0.20776604 0.23129734 0.26004579 0.22229523
 0.30391639 0.22442247 0.22929024 0.32465237 0.21941042 0.22416947]
(12,)

reshaped_weights
 [[ 0.16975512 -0.15932021]
 [ 0.14888083  0.07646769]
 [ 0.01767855 -0.14868982]
 [-0.05859189  0.08055328]
 [-0.03025208 -0.03431877]
 [ 0.05235808  0.00584114]
 [ 0.09141634  0.06141479]
 [ 0.13699789  0.00971621]
 [ 0.1873984  -0.0828948 ]
 [-0.26591669 -0.1294064 ]
 [-0.16939171  0.14423222]
 [-0.20314402  0.05661945]]

reshaped_weights_shape (12, 2)

W_inf (10, 2)

Influence:  (12, 10)
[[-0.30051236 -0.05284312  0.18084565 -0.06800636 -0.08624948 -0.13533348
   0.02606892  0.17670577 -0.07083879 -0.07192392]
 [-0.05652094 -0.0484861   0.13299857 -0.05108373  0.07739503 -0.13402857
   0.11738574 -0.04518059 -0.03480471 

Calculated influence contains all the edge based inluence that comes from the particular edges for the paritcular nodes. Here we have done it for all the nodes in the graph by applying matrix multiplication.
- Shape: (no_of_edges, dim_hidden)
- dim_hidden comes from the W_inf: Linear Transformation for the influence
- W_inf shape: (dim_hidden, no_of_edge_features)

In [26]:
connections #[2, 12]

(array([0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]),
 array([0, 1, 2, 3, 0, 1, 0, 2, 3, 0, 2, 3]))

Edges are encoded in COO format: Source nodes, and Target nodes respectively.

In [25]:
X_edge #[12, 2]

array([[ 0.44351063, -0.41624786],
       [ 0.83554825,  0.42915157],
       [ 0.08508874, -0.7156599 ],
       [-0.25331848,  0.34826723],
       [-0.11633365, -0.13197201],
       [ 0.23553396,  0.02627649],
       [ 0.30079436,  0.20207791],
       [ 0.61044639,  0.0432943 ],
       [ 0.81729776, -0.36152782],
       [-0.8190813 , -0.39859989],
       [-0.77203128,  0.65736265],
       [-0.90620736,  0.2525743 ]])

In [52]:
I = np.zeros((4, 4, 10))  # (no_of_nodes, no_of_nodes, dim_hidden) dim_hidden after W_inf...
I[connections[0], connections[1]] = result
print(I)
I.shape

[[[-0.30051236 -0.05284312  0.18084565 -0.06800636 -0.08624948
   -0.13533348  0.02606892  0.17670577 -0.07083879 -0.07192392]
  [-0.05652094 -0.0484861   0.13299857 -0.05108373  0.07739503
   -0.13402857  0.11738574 -0.04518059 -0.03480471 -0.05774644]
  [-0.15779788 -0.00419503  0.03448085 -0.01231257 -0.10249001
   -0.00472301 -0.05503912  0.14070019 -0.02407198 -0.0107489 ]
  [ 0.12820362  0.01798594 -0.06544785  0.0244849   0.04786473
    0.04489767  0.00217848 -0.08465751  0.02768108  0.02545552]]

 [[-0.00650034  0.01003819 -0.02480023  0.00963643 -0.02902072
    0.02856643 -0.03206345  0.02656801  0.00469866  0.01127059]
  [-0.04003626 -0.01684302  0.04926616 -0.0187985   0.0123169
   -0.04564155  0.0320784   0.0036001  -0.01490048 -0.02082743]
  [ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]

(4, 4, 10)

In [63]:
# Taking the aggreated influence as node-wise
# Each node now has aggregated data
I.sum(axis=1).shape == (4, 10)
I.sum(axis=1)

array([[-3.86627576e-01, -8.75383089e-02,  2.82877224e-01,
        -1.06917760e-01, -6.34797407e-02, -2.29187390e-01,
         9.05940283e-02,  1.87567857e-01, -1.02034400e-01,
        -1.14963732e-01],
       [-4.65365998e-02, -6.80482998e-03,  2.44659342e-02,
        -9.16206481e-03, -1.67038208e-02, -1.70751254e-02,
         1.49541273e-05,  3.01681113e-02, -1.02018211e-02,
        -9.55683737e-03],
       [-3.73645281e-01, -1.33186510e-01,  3.98145826e-01,
        -1.51594698e-01,  5.66530187e-02, -3.58346707e-01,
         2.29332808e-01,  8.24291045e-02, -1.25684640e-01,
        -1.66839887e-01],
       [ 6.24828645e-01,  2.03970325e-01, -6.17649542e-01,
         2.34877134e-01, -4.91990044e-02,  5.46430465e-01,
        -3.28785920e-01, -1.75986705e-01,  1.99726925e-01,
         2.57488327e-01]])

In [72]:
# Concatenation of both node features and calculated influence messages...
temp = I.sum(axis=1)
print(np.concatenate((H, temp), axis=1))
print(np.concatenate((H, temp), axis=1).shape)

[[-5.58078115e-01  3.47844879e-01 -3.86627576e-01 -8.75383089e-02
   2.82877224e-01 -1.06917760e-01 -6.34797407e-02 -2.29187390e-01
   9.05940283e-02  1.87567857e-01 -1.02034400e-01 -1.14963732e-01]
 [-3.03055171e-01  1.65505852e-01 -4.65365998e-02 -6.80482998e-03
   2.44659342e-02 -9.16206481e-03 -1.67038208e-02 -1.70751254e-02
   1.49541273e-05  3.01681113e-02 -1.02018211e-02 -9.55683737e-03]
 [-4.29219323e-01  2.22605905e-01 -3.73645281e-01 -1.33186510e-01
   3.98145826e-01 -1.51594698e-01  5.66530187e-02 -3.58346707e-01
   2.29332808e-01  8.24291045e-02 -1.25684640e-01 -1.66839887e-01]
 [-4.29219323e-01  2.22605905e-01  6.24828645e-01  2.03970325e-01
  -6.17649542e-01  2.34877134e-01 -4.91990044e-02  5.46430465e-01
  -3.28785920e-01 -1.75986705e-01  1.99726925e-01  2.57488327e-01]]
(4, 12)


Concatenation of node features and edge features might add noise to the overoll representation as both of them have different features that represent entirely different contextual data. For an example, node represent atom's charge and it might not be subjective the edge feature like interaction between two atoms.