-------------

Relative Positional Encoding Example

In [23]:
from torch import nn
import torch

# Embedding dimension of the model
d_model = 100

# Largest relative distance (in either direction) that gets its own embedding
max_relative_position = 150

# Lookup table with one row per relative distance in
# [-max_relative_position, +max_relative_position], hence 2 * max + 1 rows.
# (Inside a model it must be registered as weights, i.e. an nn.Parameter.)
# torch.empty allocates uninitialized storage and replaces the legacy
# torch.Tensor(rows, cols) constructor; the values are filled in just below.
embeddings_table = nn.Parameter(torch.empty(2 * max_relative_position + 1, d_model))

# Fill the lookup table in place with Xavier-uniform values
nn.init.xavier_uniform_(embeddings_table)

Parameter containing:
tensor([[ 0.1043, -0.0500, -0.0120,  ..., -0.0818, -0.0159,  0.0040],
        [ 0.0261,  0.0660, -0.1183,  ...,  0.0991, -0.1208, -0.1169],
        [ 0.0271,  0.1220, -0.0060,  ..., -0.0501, -0.0651,  0.0510],
        ...,
        [-0.0042,  0.0151, -0.0370,  ..., -0.0921, -0.0884,  0.0806],
        [ 0.1129,  0.0979, -0.1043,  ..., -0.0752,  0.0542,  0.0857],
        [ 0.0499,  0.0409, -0.0933,  ...,  0.0630,  0.0718,  0.0561]],
       requires_grad=True)

In [24]:
embeddings_table.shape  # (number of relative positions, embedding dimension)

torch.Size([301, 100])

In [25]:
# Lengths of the query and key sequences (equal here, but named separately)
query_len = 200
key_len = 200

# Positions 0 .. query_len - 1 of the query sequence
range_vec_q = torch.arange(query_len)

# Positions 0 .. key_len - 1 of the key sequence
range_vec_k = torch.arange(key_len)

# Signed distance (key position - query position) for every (query, key) pair;
# broadcasting yields a matrix of shape (query_len, key_len)
distance_mat = range_vec_k[None, :] - range_vec_q[:, None]

# Clip the distances to [-max_relative_position, +max_relative_position] so
# they stay within the range covered by the lookup table
distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)

# Shift by max_relative_position to obtain non-negative row indices in
# [0, 2 * max_relative_position]
final_mat = distance_mat_clipped + max_relative_position

# Make the index dtype explicit. torch.arange already produces int64 here, so
# this does not change the values; it replaces the legacy torch.LongTensor(...)
# constructor call.
final_mat = final_mat.long()

# Look up the positional embedding of every (query, key) pair
embeddings = embeddings_table[final_mat]

In [26]:
embeddings.shape  # [query sequence length, key sequence length, embedding dim]

torch.Size([200, 200, 100])

**Note:** we use the following formula from [relative_position](https://arxiv.org/pdf/1803.02155v2):

$$
E = \frac{Attention + Additional\_attention}{\sqrt{d\_model}}
$$

where $Additional\_attention = Linear(Q) \times position\_embeddings$.

Because we are doing machine translation, we do not need to compute a separate relative positional embedding to add to the linear transformation of the values. The key sequence length used in `range_vec_k` can therefore be taken to be the query sequence length; we nevertheless keep the two names separate to make the process easier to follow.

In [29]:
# Number of sequences in the example batch
batch_size = 4

# Random query of size (batch, query sequence length, d_model). The last two
# sizes are read from the positional embeddings (query length, key length,
# embedding dim) so the matrix product below always has matching shapes.
query = torch.randn(batch_size, embeddings.size(0), embeddings.size(-1))

# Additional attention, computed as a batched matrix product over query positions:
#   query.transpose(0, 1)       -> (query length, batch, embedding dim)
#   embeddings.transpose(1, 2)  -> (query length, embedding dim, key length)
#   matmul                      -> (query length, batch, key length)
#   final transpose(0, 1)       -> (batch, query length, key length)
add_attention = query.transpose(0, 1).matmul(embeddings.transpose(1, 2)).transpose(0, 1)

Notice that we swap the batch dimension with the sequence dimension of the query tensor, and that we also swap the key-sequence dimension with the embedding dimension of the position embeddings, so that the query and the position embeddings can be matrix-multiplied. Afterwards, we move the batch dimension back to its original position.

In [30]:
add_attention.shape  # [batch, query sequence length, key sequence length]

torch.Size([4, 200, 200])

The result is then added to the original attention, and the sum is divided by $\sqrt{d\_model}$.