# Positional Encoding

While the Transformer does model sequences, its lack of recurrence means it has no built-in notion of ordering within a sequence. More specifically, it has no way of knowing which timestep or element in a sequence comes before or after another.

This is problematic for NLP tasks: sentences depend on word order, and jumbled words barely make sense. The solution is to add a positional encoding, which tells the Transformer that $x$ is the token at position $i$ and $y$ is the token at position $j$.

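In the original Transformer implementation, a fixed positional encoding is used: for every position, the even embedding dimensions are given by a sine signal and the odd dimensions by a cosine, i.e. $PE_{(pos,\,2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right)$ and $PE_{(pos,\,2i+1)} = \cos\left(pos / 10000^{2i/d_{model}}\right)$.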

In [1]:
import torch
import torch.nn.functional as F
from einops import rearrange

In [10]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [17]:
d_model: int = 32  # embedding width; passed as d_model to positional_encoding below
dropout: float = 0.1  # dropout probability (note: the demo call below passes dropout=0 instead)
max_seq_len: int = 64  # number of positions the encoding table is built for

def positional_encoding(x, d_model, dropout, max_seq_len, device=None):
    """Add fixed sinusoidal positional encodings to ``x`` and apply dropout.

    For position ``pos`` and frequency index ``i`` the table is::

        PE[pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        PE[pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))

    Args:
        x: tensor whose dim 1 is the sequence axis and whose last dim has
            size ``d_model`` (the code slices the table with ``x.size(1)``
            and broadcasts over dim 0).
        d_model: size of the embedding dimension. Odd values are supported.
        dropout: dropout probability applied to the sum.
        max_seq_len: number of positions to build the table for; must be at
            least ``x.size(1)``.
        device: device on which the table is built. Defaults to ``x.device``
            so the addition never mixes devices.

    Returns:
        ``x`` plus the first ``x.size(1)`` rows of the encoding table, passed
        through ``F.dropout``. ``F.dropout`` is called with its default
        ``training=True``, so dropout is active whenever ``dropout > 0``.
    """
    if device is None:
        device = x.device

    # table of shape (max_seq_len, d_model)
    # (d_model comes from the model embedding layer)
    encodings = torch.zeros(max_seq_len, d_model, device=device)

    # positions as a column vector: (max_seq_len,) -> (max_seq_len, 1)
    position = torch.arange(0, max_seq_len, device=device).unsqueeze(1)

    # 1 / 10000 ** (2i / d_model), computed in log space; one entry per
    # sin/cos pair, i.e. ceil(d_model / 2) entries
    factor = torch.exp(
        torch.arange(0, d_model, 2, device=device)
        * (-torch.log(torch.tensor(10000.0, device=device)) / d_model)
    )
    angles = position * factor  # (max_seq_len, ceil(d_model / 2))

    # even dimensions (0::2) carry the sine signal
    encodings[:, 0::2] = torch.sin(angles)
    # odd dimensions (1::2) carry the cosine signal; when d_model is odd there
    # is one fewer odd slot than even slot, so drop the last frequency
    encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # add a leading axis of size 1 so the table broadcasts over dim 0 of x
    encodings = encodings.unsqueeze(0)

    # encodings is a plain tensor, not a model parameter, so nothing is
    # learned for it during backprop
    encoded_x = x + encodings[:, : x.size(1)]

    return F.dropout(encoded_x, p=dropout)


# ===========
# Stand-in for the output of a model's embedding layer: one sequence of
# max_seq_len tokens, each a d_model-sized vector of ones.
x = torch.ones((1, max_seq_len, d_model), device=device)
# dropout=0 here, so the result is exactly x plus the positional encodings
pe = positional_encoding(
    x,
    d_model=d_model,
    dropout=0,
    max_seq_len=max_seq_len,
)

In [18]:
# using the visualisation shown in annotated transformer notebook
import pandas as pd
import altair as alt


def example_positional(y):
    """Build a line chart of a few embedding dimensions against position.

    Args:
        y: tensor indexed as ``y[0, position, dimension]``; only the first
            entry along dim 0 is plotted. The last dim must have more than
            10 entries because dimensions 2, 5, 7 and 10 are plotted.

    Returns:
        An interactive Altair line chart with one line per plotted dimension.
    """
    # Take the sequence length from y itself instead of a module-level
    # constant, so tensors of any length can be plotted.
    positions = list(range(y.shape[1]))
    # detach() so tensors that track gradients can still be converted
    values = y[0].detach().cpu().numpy()

    frames = [
        pd.DataFrame(
            {
                "embedding": values[:, dim],
                "dimension": dim,
                "position": positions,
            }
        )
        for dim in [2, 5, 7, 10]
    ]
    data = pd.concat(frames)

    return (
        alt.Chart(data)
        .mark_line()
        .properties(width=800)
        .encode(x="position", y="embedding", color="dimension:N")
        .interactive()
    )
    
# Chart the encoded demo tensor; as the cell's last expression, the returned
# chart is presumably what the notebook renders.
example_positional(y=pe)