## Why do we need positional encodings?

In recurrent networks such as RNNs and LSTMs, the model knows the location of each word because the words are fed in one at a time, in order. In Transformer-like networks, by contrast, all the words are passed in parallel, so the model has no information about the actual sequence, i.e. the order in which the words occur. Positional encodings are added to the input to give the model this sense of position. :)

In [51]:
import torch 
import torch.nn as nn
# Seed PyTorch's default random number generator with a fixed value.
torch.manual_seed(41)

<torch._C.Generator at 0x1b3dfd25d10>

In [52]:
# Toy sizes for the walkthrough: number of positions and embedding width.
max_seq_len, d_model = 10, 20

In [53]:
# Even (0, 2, 4, ...) and odd (1, 3, 5, ...) embedding-dimension indices,
# converted to float32 so they can be used in the exponent below.
e_d_p = torch.arange(0, d_model, 2).to(torch.float32)
o_d_p = torch.arange(1, d_model, 2).to(torch.float32)
e_d_p, o_d_p

(tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.]),
 tensor([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19.]))

In [54]:
# Denominators 10000^(k / d_model). Subtracting 1 from each odd index maps it
# onto its even partner, so both tensors end up with identical values.
e_d = 10_000 ** (e_d_p / d_model)
o_d = 10_000 ** ((o_d_p - 1) / d_model)
e_d, o_d

(tensor([1.0000e+00, 2.5119e+00, 6.3096e+00, 1.5849e+01, 3.9811e+01, 1.0000e+02,
         2.5119e+02, 6.3096e+02, 1.5849e+03, 3.9811e+03]),
 tensor([1.0000e+00, 2.5119e+00, 6.3096e+00, 1.5849e+01, 3.9811e+01, 1.0000e+02,
         2.5119e+02, 6.3096e+02, 1.5849e+03, 3.9811e+03]))

In [55]:
# Positions as a column vector; broadcasting against the row of denominators
# gives one angle per (position, dimension pair).
pos = torch.arange(max_seq_len).unsqueeze(1)
even_pos = (pos / e_d.unsqueeze(0)).sin()
odd_pos = (pos / o_d.unsqueeze(0)).cos()

In [56]:
# Display the sin table's shape followed by the sin and cos tables.
even_pos.shape, even_pos, odd_pos

(torch.Size([10, 10]),
 tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 8.4147e-01,  3.8767e-01,  1.5783e-01,  6.3054e-02,  2.5116e-02,
           9.9998e-03,  3.9811e-03,  1.5849e-03,  6.3096e-04,  2.5119e-04],
         [ 9.0930e-01,  7.1471e-01,  3.1170e-01,  1.2586e-01,  5.0217e-02,
           1.9999e-02,  7.9621e-03,  3.1698e-03,  1.2619e-03,  5.0238e-04],
         [ 1.4112e-01,  9.2997e-01,  4.5775e-01,  1.8816e-01,  7.5285e-02,
           2.9995e-02,  1.1943e-02,  4.7547e-03,  1.8929e-03,  7.5357e-04],
         [-7.5680e-01,  9.9977e-01,  5.9234e-01,  2.4971e-01,  1.0031e-01,
           3.9989e-02,  1.5924e-02,  6.3395e-03,  2.5238e-03,  1.0048e-03],
         [-9.5892e-01,  9.1320e-01,  7.1207e-01,  3.1027e-01,  1.2526e-01,
           4.9979e-02,  1.9904e-02,  7.9244e-03,  3.1548e-03,  1.2559e-03],
         [-2.7942e-01,  6.8379e-01,  8.1396e-01,  3.6960e-01,  1.5014e-

In [63]:
# Pair each sin column with its cos column along a new last axis, then merge
# the last two axes so the columns alternate sin, cos, sin, cos, ...
PE = torch.flatten(torch.stack((even_pos, odd_pos), dim=-1), start_dim=1)

In [65]:
# Display the shape of the interleaved table.
PE.shape

torch.Size([10, 20])

In [74]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    For position ``pos`` and pair index ``i`` the table holds
    ``sin(pos / 10000 ** (2i / d_model))`` in column ``2i`` and
    ``cos(pos / 10000 ** (2i / d_model))`` in column ``2i + 1``.
    The module defines no learnable parameters.
    """

    def __init__(self,
                 d_model: int,
                 max_seq_len: int):
        """Store the table dimensions.

        Args:
            d_model: Width of each positional vector (number of columns).
            max_seq_len: Number of positions to encode (number of rows).
        """
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

    def forward(self) -> torch.Tensor:
        """Build the positional-encoding table.

        Returns:
            A floating-point tensor of shape ``(max_seq_len, d_model)``.
        """
        # Positions as a column vector: (max_seq_len, 1).
        pos = torch.arange(0, self.max_seq_len).reshape(-1, 1)

        # One denominator per sin/cos pair, as a row: (1, ceil(d_model / 2)).
        denominator = torch.arange(0, self.d_model, 2)
        denominator = torch.pow(10_000, denominator / self.d_model)
        denominator = denominator.reshape(1, -1)

        # Broadcasting yields one angle per (position, pair).
        angles = pos / denominator
        even_pos = torch.sin(angles)
        odd_pos = torch.cos(angles)

        # Stack on a new last axis and merge it away so that even columns
        # hold sin values and odd columns hold cos values.
        PE = torch.stack([even_pos, odd_pos], dim=2)
        PE = torch.flatten(PE, start_dim=1, end_dim=2)

        # With an odd d_model the pairing produces d_model + 1 columns; drop
        # the surplus trailing cos column so the width is always d_model.
        return PE[:, :self.d_model].contiguous()
    
# Instantiate the module with the same toy sizes used above, then call it to
# build the encoding table.
x = PositionalEncoding(max_seq_len=10, d_model=20)
PE = x()

In [75]:
# Display the shape of the table returned by the module.
PE.shape

torch.Size([10, 20])