### Sequences

To make sure the shape of the input matrix is always the same, we define d_input (the dimension of a word vector) and seq_len (the length of the sequence). If a sentence is shorter than the sequence length, we simply pad it with zeros.

In [10]:
"""
'say hello to world'

-> One hot vectors:
  say                 hello               to                  world               empty...
[[0, 0, 1, 0 ... 0], [0, 1, 0, 0 ... 0], [0, 0, 0, 1 ... 0], [1, 0, 0, 0 ... 0], [0...0], ...]

-> Embedding to seq_len x d_model
                1024    x 512

-> Position encoding
PE (pos, 2i)   = sin(pos / 10000^(2i/d_model))
PE (pos, 2i+1) = cos(pos / 10000^(2i/d_model))
x = x + PE(x)

-> Linear transform to make Q V K
512 > 3x512

-> Multi Headed Attention
...
"""

"\n'say hello to world'\n\n-> One hot vectors:\n  say                 hello               to                  world               empty...\n[[0, 0, 1, 0 ... 0], [0, 1, 0, 0 ... 0], [0, 0, 0, 1 ... 0], [1, 0, 0, 0 ... 0], [0...0], ...]\n\n-> Embedding to seq_len x d_model\n                1024    x 512\n\n-> Position encoding\nPE (pos, 2i)   = sin(pos / 10000^(2i/d_model))\nPE (pos, 2i+1) = cos(pos / 10000^(2i/d_model))\nx = x + PE(x)\n\n-> Linear transform to make Q V K\n512 > 3x512\n\n-> Multi Headed Attention\n...\n"

In [11]:
import torch
import torch.nn as nn

# Toy sizes keep the printed tensors small enough to read.
max_sequence_length = 10  # usually 1024
d_model = 6               # usually 512

# Embedding indices 0 .. d_model-1, split into the even and the odd ones.
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
print(even_i, odd_i, sep="\n")

tensor([0., 2., 4.])
tensor([1., 3., 5.])


In [12]:
# PE(pos, 2i) and PE(pos, 2i+1) share the exponent 2i / d_model, so the odd
# indices are shifted down by one before being scaled.
even_denominator = torch.pow(10000, even_i / d_model)
odd_denominator = torch.pow(10000, (odd_i - 1) / d_model)
print(even_denominator, odd_denominator, sep="\n")

# Both tensors hold the same values, so a single denominator is kept.
denominator = even_denominator

tensor([  1.0000,  21.5443, 464.1590])
tensor([  1.0000,  21.5443, 464.1590])


In [13]:
# Token positions 0 .. max_sequence_length-1 laid out as a single column,
# i.e. a tensor of shape (max_sequence_length, 1).
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [17]:
# Dividing the position column by the denominator row broadcasts to one angle
# per (position, frequency) pair; sine fills the even embedding indices and
# cosine the odd ones.
even_PE = (position / denominator).sin()
odd_PE = (position / denominator).cos()
print(even_PE.shape, "\n", even_PE, "\n")
print(odd_PE.shape, "\n", odd_PE)

torch.Size([10, 3]) 
 tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0464,  0.0022],
        [ 0.9093,  0.0927,  0.0043],
        [ 0.1411,  0.1388,  0.0065],
        [-0.7568,  0.1846,  0.0086],
        [-0.9589,  0.2300,  0.0108],
        [-0.2794,  0.2749,  0.0129],
        [ 0.6570,  0.3192,  0.0151],
        [ 0.9894,  0.3629,  0.0172],
        [ 0.4121,  0.4057,  0.0194]]) 

torch.Size([10, 3]) 
 tensor([[ 1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9989,  1.0000],
        [-0.4161,  0.9957,  1.0000],
        [-0.9900,  0.9903,  1.0000],
        [-0.6536,  0.9828,  1.0000],
        [ 0.2837,  0.9732,  0.9999],
        [ 0.9602,  0.9615,  0.9999],
        [ 0.7539,  0.9477,  0.9999],
        [-0.1455,  0.9318,  0.9999],
        [-0.9111,  0.9140,  0.9998]])


In [24]:
# Pair every sine value with its cosine partner along a new trailing axis.
stacked = torch.stack((even_PE, odd_PE), dim=-1)
print(stacked.shape)
print(stacked[0])

# Merge the last two axes so each row reads sin, cos, sin, cos, ...
PE = stacked.flatten(start_dim=1, end_dim=2)
print(PE.shape)
print(PE[0])

torch.Size([10, 3, 2])
tensor([[0., 1.],
        [0., 1.],
        [0., 1.]])
torch.Size([10, 6])
tensor([0., 1., 0., 1., 0., 1.])


In [32]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    For position ``pos`` and pair index ``i`` the table holds

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Number of columns (embedding dimension) of the table.
        max_sequence_length: Number of rows (positions) of the table.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the encoding table.

        Returns:
            A float tensor of shape ``(max_sequence_length, d_model)`` whose
            even columns hold sines and whose odd columns hold cosines.
        """
        even_i = torch.arange(0, self.d_model, 2, dtype=torch.float)
        denominator = torch.pow(10000, even_i / self.d_model)
        # Positions as a float column so the division below broadcasts to
        # one angle per (position, frequency) pair.
        position = torch.arange(
            self.max_sequence_length, dtype=torch.float
        ).reshape(self.max_sequence_length, 1)

        angles = position / denominator
        even_PE = torch.sin(angles)
        odd_PE = torch.cos(angles)
        # Interleave as sin, cos, sin, cos, ... along the last dimension.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # An odd d_model yields one sine/cosine pair too many columns after
        # interleaving; drop the surplus trailing cosine so the width is
        # exactly d_model. For an even d_model this keeps every column.
        return PE[:, :self.d_model]
    
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pos_encode = pe.forward()
print(pos_encode)


# Dot product of the encoding at position 0 with the encodings at
# positions 1 through 7, printed one per line.
print(
    *[(pos_encode[0] * pos_encode[k]).sum() for k in range(1, 8)],
    sep="\n",
)



tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])
tensor(2.5392)
tensor(1.5795)
tensor(1.0003)
tensor(1.3291)
tensor(2.2568)
tensor(2.9216)
tensor(2.7015)
