In [1]:
# Postional Embedding
# since we feed all tokens of sequence to transformer at once, we need to add some positional information to the tokens
# positional embedding maybe explicilty learned like token embedding or it can be hardcoded
# we will use hardcoded positional embedding as both generate similar results and harcoding reduces the number of parameters

In [2]:
# PE(pos, 2i) = sin(pos/10000^(2i/d_model))
# PE(pos, 2i+1) = cos(pos/10000^(2i/d_model))

# rewritten as
# PE(pos, i) = sin(pos/10000^(i/d_model))      # for i:even
# PE(pos, i) = cos(pos/10000^(i-1/d_model))    # for i:odd

# pos: position of token in sequence
# i: dimension of positional embedding

In [4]:
import torch

In [9]:
T = 4                  # sequence length  
d_model = 16            # model dimension

In [10]:
even_i = torch.arange(0, d_model, 2).float()
odd_i = torch.arange(1, d_model, 2).float()
even_i, odd_i

(tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.]),
 tensor([ 1.,  3.,  5.,  7.,  9., 11., 13., 15.]))

In [12]:
even_den = torch.pow(10000,(even_i/d_model))
even_den

tensor([1.0000e+00, 3.1623e+00, 1.0000e+01, 3.1623e+01, 1.0000e+02, 3.1623e+02,
        1.0000e+03, 3.1623e+03])

In [15]:
odd_den = torch.pow(10000,((odd_i-1)/d_model))
odd_den

tensor([1.0000e+00, 3.1623e+00, 1.0000e+01, 3.1623e+01, 1.0000e+02, 3.1623e+02,
        1.0000e+03, 3.1623e+03])

In [16]:
# both den are same
denominator = even_den

In [18]:
pos = torch.arange(0, T).float().unsqueeze(1)
pos

tensor([[0.],
        [1.],
        [2.],
        [3.]])

In [23]:
odd_pos = torch.cos(pos/denominator)
odd_pos

tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9504,  0.9950,  0.9995,  0.9999,  1.0000,  1.0000,  1.0000],
        [-0.4161,  0.8066,  0.9801,  0.9980,  0.9998,  1.0000,  1.0000,  1.0000],
        [-0.9900,  0.5828,  0.9553,  0.9955,  0.9996,  1.0000,  1.0000,  1.0000]])

In [25]:
even_pos = torch.sin(pos/denominator)
even_pos

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [8.4147e-01, 3.1098e-01, 9.9833e-02, 3.1618e-02, 9.9998e-03, 3.1623e-03,
         1.0000e-03, 3.1623e-04],
        [9.0930e-01, 5.9113e-01, 1.9867e-01, 6.3203e-02, 1.9999e-02, 6.3245e-03,
         2.0000e-03, 6.3246e-04],
        [1.4112e-01, 8.1265e-01, 2.9552e-01, 9.4726e-02, 2.9995e-02, 9.4867e-03,
         3.0000e-03, 9.4868e-04]])

In [26]:
# we need to interlevae even and odd pos
# 0 1 2 3 ,.. 
# even pos: 0,2,4,... odd pos:1,3,5,..
stacked = torch.stack([even_pos, odd_pos], dim=2)        # it will stack even and odd pos along 2nd dimension (interleaving)
PE = torch.flatten(stacked, start_dim=1, end_dim=2)
PE

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  3.1098e-01,  9.5042e-01,  9.9833e-02,
          9.9500e-01,  3.1618e-02,  9.9950e-01,  9.9998e-03,  9.9995e-01,
          3.1623e-03,  9.9999e-01,  1.0000e-03,  1.0000e+00,  3.1623e-04,
          1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  5.9113e-01,  8.0658e-01,  1.9867e-01,
          9.8007e-01,  6.3203e-02,  9.9800e-01,  1.9999e-02,  9.9980e-01,
          6.3245e-03,  9.9998e-01,  2.0000e-03,  1.0000e+00,  6.3246e-04,
          1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  8.1265e-01,  5.8275e-01,  2.9552e-01,
          9.5534e-01,  9.4726e-02,  9.9550e-01,  2.9995e-02,  9.9955e-01,
          9.4867e-03,  9.9995e-01,  3.0000e-03,  1.0000e+00,  9.4868e-04,
          1.0000e+00]])

In [27]:
import torch.nn as nn

class PositionalEmbedding(nn.Module):
    def __init__(self,seq_len, d_model):
        super().__init__()
        self.T = seq_len
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        odd_i = torch.arange(1, self.d_model, 2).float() 
        denominator = torch.pow(10000,(even_i/self.d_model))
        pos = torch.arange(0, self.T).float().unsqueeze(1)
        odd_pos = torch.cos(pos/denominator)
        even_pos = torch.sin(pos/denominator)
        stacked = torch.stack([even_pos, odd_pos], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)

        return PE

In [28]:
pe = PositionalEmbedding(4,16)
pe()

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  3.1098e-01,  9.5042e-01,  9.9833e-02,
          9.9500e-01,  3.1618e-02,  9.9950e-01,  9.9998e-03,  9.9995e-01,
          3.1623e-03,  9.9999e-01,  1.0000e-03,  1.0000e+00,  3.1623e-04,
          1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  5.9113e-01,  8.0658e-01,  1.9867e-01,
          9.8007e-01,  6.3203e-02,  9.9800e-01,  1.9999e-02,  9.9980e-01,
          6.3245e-03,  9.9998e-01,  2.0000e-03,  1.0000e+00,  6.3246e-04,
          1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  8.1265e-01,  5.8275e-01,  2.9552e-01,
          9.5534e-01,  9.4726e-02,  9.9550e-01,  2.9995e-02,  9.9955e-01,
          9.4867e-03,  9.9995e-01,  3.0000e-03,  1.0000e+00,  9.4868e-04,
          1.0000e+00]])

In [None]:
# token embeding is a simple embedding layer
# vocab size x embedding size table
# given a token, it will return its embedding (row)

In [34]:
# tokensiation
# convert text to tokens
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TransformerEmbedding(nn.Module):
    '''
        It tokenise the sentence and then add token, positional emebedding to it
    '''

    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN, dropout_ratio = 0.1):
        super().__init__()
        self.vocab_size = len(language_to_index)             # language_to_index is a dictionary
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEmbedding(max_sequence_length, d_model)
        self.dropout = nn.Dropout(dropout_ratio)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
    
    def batch_tokenize(self, batch, start_token=True, end_token=True):

        def tokenize(sentence, start_token=True, end_token=True):
            sentence_word_indicies = [self.language_to_index[token] for token in list(sentence)]
            # start token
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            # end token
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            # padding token
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indicies)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized)
        return tokenized.to(device)
    
    def forward(self, x,start_token = True, end_token=True): 
        # x: batch of sentences
        x = self.batch_tokenize(x ,start_token, end_token)
        print(x)
        x = self.embedding(x)
        pos = self.position_encoder().to(device)
        x = self.dropout(x + pos)
        return x


In [39]:
lan_dict = {chr(i):i-96 for i in range(97,123)}
lan_dict[' '] = 0
lan_dict['<sos>'] = 26
lan_dict['<eos>'] = 27
lan_dict['<pad>'] = 28
lan_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 ' ': 0,
 '<sos>': 26,
 '<eos>': 27,
 '<pad>': 28}

In [40]:
sent = ["hello i am billy russo",
        "hello i am frank castle"]

In [41]:
layer = TransformerEmbedding(30,12,lan_dict,'<sos>','<eos>','<pad>')

In [42]:
layer(sent)

tensor([[26,  8,  5, 12, 12, 15,  0,  9,  0,  1, 13,  0,  2,  9, 12, 12, 25,  0,
         18, 21, 19, 19, 15, 27, 28, 28, 28, 28, 28, 28],
        [26,  8,  5, 12, 12, 15,  0,  9,  0,  1, 13,  0,  6, 18,  1, 14, 11,  0,
          3,  1, 19, 20, 12,  5, 27, 28, 28, 28, 28, 28]])


tensor([[[-0.5624,  3.9419, -1.6502,  1.8174,  0.0120,  1.8498,  0.6854,
           2.4412, -0.9975,  1.7850, -0.4718,  0.0000],
         [ 0.0000,  1.1149,  0.0000,  0.4208,  0.3114,  1.6620, -0.5500,
           1.5107,  1.7444,  2.3122,  0.1903,  2.0085],
         [ 1.0338, -1.6904,  0.3723, -0.3224,  0.8514,  2.2137, -0.0000,
           1.4107, -0.8872,  1.6738,  0.1138,  0.7569],
         [ 1.0515, -0.3677, -1.1151,  1.5340, -0.1522, -0.1699, -1.6942,
           0.0000, -1.4106,  1.5418, -0.8397,  0.9029],
         [ 0.0538,  0.0060, -0.9409,  1.3704, -0.1014, -0.1782, -1.6831,
           0.8122, -1.4082,  1.5418, -0.8392,  0.9029],
         [-1.8316,  0.3362,  1.0790, -0.6102,  0.3570,  0.0000,  1.0911,
           0.0000,  1.3630, -0.4152, -1.0151,  2.4800],
         [-0.1023, -0.1490,  1.1582, -0.2568,  0.5652, -0.6881,  1.0307,
           0.4207, -0.1828,  0.0000,  0.6767,  3.2753],
         [ 0.0280,  1.9001,  1.2704,  0.3626, -0.2348,  0.0000,  0.0000,
           3.4706, -0.06