# Decoder

<img src="https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png"  width="400" height="600">

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math

## Most of the steps below are the same as encoder

In [2]:
input_ids = torch.randint(high=1000, size=(1, 10,)) # The extra dimension at the start is for batch size.
input_ids

tensor([[ 45, 583, 100, 714,  50, 763,  90,   5, 890, 965]])

In [3]:
vocab_size = 1000 # This is how many tokens do we have in our vocabulary
embed_dim = 4 # This is how many dimensions do we have
seq_len = 10
batch_size = 1
token_embedding = nn.Embedding(vocab_size, embed_dim)
position_embedding = nn.Embedding(seq_len, embed_dim)

In [4]:
positions = torch.arange(0, seq_len, dtype=torch.long)
pos_embed = position_embedding(positions)
print(pos_embed)
print('='*20)
print(pos_embed.size())

tensor([[ 0.1858, -0.8513, -0.8483,  0.3233],
        [-1.1369,  1.3385, -1.3545, -0.1748],
        [-1.1121, -2.0867, -0.3455,  0.6971],
        [ 1.5121, -0.5488,  0.3287,  0.1512],
        [ 1.3869,  0.2724,  0.3173,  1.2025],
        [ 0.4041, -1.6461,  0.6674, -0.1767],
        [-1.5414, -1.3549,  0.6292, -1.5743],
        [-0.6163,  0.6783,  0.4672,  0.9271],
        [ 1.2818, -0.4587,  1.4878,  0.3366],
        [-1.3596,  1.6736,  0.9903, -0.1597]], grad_fn=<EmbeddingBackward0>)
torch.Size([10, 4])


In [5]:
input_embed = token_embedding(input_ids)
print(input_embed)
print('='*20)
print(input_embed.size())

tensor([[[-0.7674, -2.2268, -0.6793, -0.8814],
         [-0.5230, -1.3035,  0.6975,  0.9871],
         [ 1.8228, -0.5018,  0.6378,  0.9297],
         [-0.1088, -0.2072, -1.6853,  0.2031],
         [ 1.9600,  0.2219, -0.8571,  1.3081],
         [-0.1358, -0.7490, -1.2744, -0.4434],
         [ 0.7796,  1.1134, -0.2814, -1.5582],
         [-0.2071, -0.0496,  0.8892, -0.0980],
         [-0.9709,  0.3850, -0.6061,  0.3085],
         [-0.5543,  0.1082,  1.4128,  0.7548]]], grad_fn=<EmbeddingBackward0>)
torch.Size([1, 10, 4])


In [6]:
input_embed = input_embed + pos_embed
print(input_embed)
print('='*20)
print(input_embed.size())

tensor([[[-0.5816, -3.0781, -1.5276, -0.5581],
         [-1.6599,  0.0350, -0.6570,  0.8123],
         [ 0.7107, -2.5884,  0.2923,  1.6268],
         [ 1.4034, -0.7560, -1.3567,  0.3543],
         [ 3.3469,  0.4942, -0.5398,  2.5106],
         [ 0.2684, -2.3951, -0.6070, -0.6201],
         [-0.7619, -0.2415,  0.3479, -3.1324],
         [-0.8235,  0.6287,  1.3564,  0.8291],
         [ 0.3110, -0.0737,  0.8817,  0.6451],
         [-1.9139,  1.7818,  2.4031,  0.5951]]], grad_fn=<AddBackward0>)
torch.Size([1, 10, 4])


## Causal Self Attention ‼️

In [7]:
# This will be used in the addition step after we perform attention
residual_embeddings_1 = input_embed.clone() 

In [8]:
# For normalization we actually have to use trainable parameters
epsilon = nn.Parameter(torch.ones(embed_dim))
gamma = nn.Parameter(torch.zeros(embed_dim))

input_embed = F.layer_norm(input_embed, epsilon.shape, epsilon, gamma, 1e-5)

In [9]:
input_embed.size()

torch.Size([1, 10, 4])

In [10]:
# This Linear layer will help us split our embeddings into Query, Key, and Value
qkv_layer = nn.Linear(embed_dim, 3 * embed_dim, bias=False)

q, k, v = qkv_layer(input_embed).split(embed_dim, dim=2)
print(q.size(), k.size(), v.size())

torch.Size([1, 10, 4]) torch.Size([1, 10, 4]) torch.Size([1, 10, 4])


In [11]:
# Let's say we have 3 heads in out multi head attention
n_head = 2 # IMP: Embedding dimension should be divisible by number of heads

print(q)

k = k.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)
q = q.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)
v = v.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)

print(q)

tensor([[[ 0.4452, -0.5987, -0.0388, -0.1329],
         [ 0.3837, -0.2841,  0.6227, -0.9738],
         [ 0.4625, -0.4140,  0.1296, -0.2104],
         [ 0.3667, -1.1854, -0.4079, -0.1858],
         [ 0.4568, -1.2830, -0.3102, -0.3649],
         [ 0.2107, -0.2645, -0.2229,  0.2975],
         [-0.6930,  1.1213, -0.2938,  0.9906],
         [-0.0481,  0.8567,  0.6346, -0.2328],
         [ 0.1515,  0.5416,  0.3539,  0.0995],
         [-0.2398,  1.0069,  0.5121, -0.0524]]], grad_fn=<SplitBackward0>)
tensor([[[[ 0.4452, -0.5987],
          [ 0.3837, -0.2841],
          [ 0.4625, -0.4140],
          [ 0.3667, -1.1854],
          [ 0.4568, -1.2830],
          [ 0.2107, -0.2645],
          [-0.6930,  1.1213],
          [-0.0481,  0.8567],
          [ 0.1515,  0.5416],
          [-0.2398,  1.0069]],

         [[-0.0388, -0.1329],
          [ 0.6227, -0.9738],
          [ 0.1296, -0.2104],
          [-0.4079, -0.1858],
          [-0.3102, -0.3649],
          [-0.2229,  0.2975],
          [-0.2938, 

### Let's create the causal attention mask first

In [12]:
raw_mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len)
raw_mask

tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])

In [13]:
attention = (q @ k.transpose(-2, -1)) / math.sqrt(embed_dim // n_head) # In multi-head attention, Dk becomes the dimension of embeddings per head

print(attention.size())

# Here we should insert our Alibi mask
attention = attention.masked_fill(raw_mask[:,:,:seq_len,:seq_len] == 0, float('-inf'))
#                                                ^ This is not max_seq_len but rather the length of current sequence

print(attention)
print('=' * 20)

attention = F.softmax(attention, dim=-1)

print(attention)
print('=' * 20)

new_embeddings = attention @ v
new_embeddings = new_embeddings.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim) # re-assemble all head outputs side by side

print(new_embeddings)
print(new_embeddings.size())

torch.Size([1, 2, 10, 10])
tensor([[[[-0.0278,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
              -inf,    -inf,    -inf],
          [ 0.0692, -0.2711,    -inf,    -inf,    -inf,    -inf,    -inf,
              -inf,    -inf,    -inf],
          [ 0.0547, -0.3689,  0.0170,    -inf,    -inf,    -inf,    -inf,
              -inf,    -inf,    -inf],
          [-0.3012, -0.7969, -0.4466,  0.0391,    -inf,    -inf,    -inf,
              -inf,    -inf,    -inf],
          [-0.2974, -0.8787, -0.4524,  0.0446, -0.1173,    -inf,    -inf,
              -inf,    -inf,    -inf],
          [-0.0056, -0.2127, -0.0330,  0.0136, -0.0235,  0.0786,    -inf,
              -inf,    -inf,    -inf],
          [ 0.1195,  0.8475,  0.2435, -0.0500,  0.1007, -0.2208, -0.7444,
              -inf,    -inf,    -inf],
          [ 0.3214,  0.5171,  0.4349, -0.0201,  0.0799,  0.1012, -0.5470,
            0.3458,    -inf,    -inf],
          [ 0.2901,  0.2776,  0.3690, -0.0059,  0.0516,  0.1660, -0.3

In [14]:
# Now let's add

new_embeddings = new_embeddings + residual_embeddings_1

print(new_embeddings)

tensor([[[ 0.0895, -3.6472, -0.8655, -0.3501],
         [-1.1605, -0.6040, -0.4485,  0.8981],
         [ 1.2547, -3.2896,  0.6044,  1.7858],
         [ 2.0294, -1.1006, -0.9491,  0.4096],
         [ 4.0302,  0.2687, -0.0506,  2.4968],
         [ 0.8701, -2.7126, -0.0924, -0.5735],
         [-0.3022, -0.6009,  0.7209, -3.0595],
         [-0.4824,  0.2287,  1.7308,  0.8694],
         [ 0.6027, -0.5183,  1.1748,  0.7596],
         [-1.7055,  1.2976,  2.6416,  0.7050]]], grad_fn=<AddBackward0>)


## Multi Head Cross Attention ‼️

In [15]:
# Saved to add after the attention step
cross_residual = new_embeddings.clone()

**👇🏼 Pre Normalization**

In [16]:
# For normalization we actually have to use trainable parameters
epsilon = nn.Parameter(torch.ones(embed_dim))
gamma = nn.Parameter(torch.zeros(embed_dim))

input_embed = F.layer_norm(input_embed, epsilon.shape, epsilon, gamma, 1e-5)

In [17]:
# Assume these to be embeddings that come from the encoder
cross_embeddings = torch.rand(1, 10, 4)
cross_embeddings

tensor([[[0.1133, 0.5196, 0.1891, 0.5066],
         [0.5957, 0.7000, 0.6619, 0.1435],
         [0.4222, 0.3409, 0.8789, 0.5092],
         [0.1700, 0.0887, 0.6071, 0.2530],
         [0.9986, 0.9972, 0.7730, 0.6266],
         [0.9470, 0.9322, 0.8488, 0.1389],
         [0.0336, 0.3528, 0.9591, 0.1084],
         [0.7983, 0.5002, 0.2624, 0.9730],
         [0.0630, 0.9560, 0.6581, 0.3984],
         [0.6781, 0.1505, 0.9560, 0.0680]]])

In [18]:
# This Linear layer will help us split our embeddings into Query, Key, and Value
kv_layer = nn.Linear(embed_dim, 2 * embed_dim, bias=False)
q_layer = nn.Linear(embed_dim, embed_dim, bias=False)

k, v = kv_layer(cross_embeddings).split(embed_dim, dim=2)
q = q_layer(new_embeddings)
print(q.size(), k.size(), v.size())

torch.Size([1, 10, 4]) torch.Size([1, 10, 4]) torch.Size([1, 10, 4])


In [19]:
print(q)

k = k.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)
q = q.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)
v = v.view(batch_size, seq_len, n_head, embed_dim // n_head).transpose(1, 2) # (Batch size, num_heads, sequence length, embeddings per head)

print(q)

tensor([[[ 0.9206,  0.3820, -0.2387, -1.1162],
         [-0.3493,  0.1047,  0.7024,  0.0202],
         [ 1.8250,  0.1949, -0.6655,  0.1835],
         [ 1.2321,  0.2170, -0.2109,  0.0787],
         [ 2.1834,  0.0294, -0.4852,  1.6570],
         [ 1.1106,  0.2156, -0.7566, -0.7907],
         [-0.2466, -0.0755, -1.1325, -1.3882],
         [ 0.0185, -0.2302, -0.4079,  0.5411],
         [ 0.6773, -0.0992, -0.6065,  0.4018],
         [-0.7860, -0.4336, -0.3595,  0.6637]]], grad_fn=<UnsafeViewBackward0>)
tensor([[[[ 0.9206,  0.3820],
          [-0.3493,  0.1047],
          [ 1.8250,  0.1949],
          [ 1.2321,  0.2170],
          [ 2.1834,  0.0294],
          [ 1.1106,  0.2156],
          [-0.2466, -0.0755],
          [ 0.0185, -0.2302],
          [ 0.6773, -0.0992],
          [-0.7860, -0.4336]],

         [[-0.2387, -1.1162],
          [ 0.7024,  0.0202],
          [-0.6655,  0.1835],
          [-0.2109,  0.0787],
          [-0.4852,  1.6570],
          [-0.7566, -0.7907],
          [-1.1

In [20]:
attention = (q @ k.transpose(-2, -1)) / math.sqrt(embed_dim // n_head) # In multi-head attention, Dk becomes the dimension of embeddings per head

print(attention.size())

# Cross Attention does not need a causal mask

attention = F.softmax(attention, dim=-1)

print(attention.size())

new_embeddings = attention @ v
new_embeddings = new_embeddings.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim) # re-assemble all head outputs side by side

print(new_embeddings)
print(new_embeddings.size())

torch.Size([1, 2, 10, 10])
torch.Size([1, 2, 10, 10])
tensor([[[-0.5236,  0.6212, -0.4255, -0.0544],
         [-0.4938,  0.5861, -0.4240, -0.0541],
         [-0.5406,  0.6432, -0.4012, -0.0636],
         [-0.5287,  0.6280, -0.4094, -0.0604],
         [-0.5464,  0.6511, -0.3836, -0.0690],
         [-0.5262,  0.6248, -0.4133, -0.0594],
         [-0.4945,  0.5872, -0.4160, -0.0588],
         [-0.4990,  0.5928, -0.4001, -0.0638],
         [-0.5145,  0.6110, -0.3991, -0.0642],
         [-0.4794,  0.5706, -0.3991, -0.0640]]], grad_fn=<ViewBackward0>)
torch.Size([1, 10, 4])


In [21]:
# Now let's add

new_embeddings = new_embeddings + cross_residual

print(new_embeddings)

tensor([[[-0.4341, -3.0259, -1.2910, -0.4045],
         [-1.6543, -0.0179, -0.8725,  0.8440],
         [ 0.7140, -2.6464,  0.2031,  1.7223],
         [ 1.5007, -0.4726, -1.3585,  0.3493],
         [ 3.4838,  0.9198, -0.4342,  2.4278],
         [ 0.3439, -2.0877, -0.5058, -0.6329],
         [-0.7967, -0.0137,  0.3050, -3.1183],
         [-0.9814,  0.8215,  1.3307,  0.8056],
         [ 0.0882,  0.0927,  0.7757,  0.6954],
         [-2.1849,  1.8682,  2.2425,  0.6410]]], grad_fn=<AddBackward0>)


## Feed Forward part

In [22]:
residual_embeddings_2 = new_embeddings.clone() # Again we keep track inorder to add them later

In [23]:
# For normalization we actually have to use trainable parameters
epsilon = nn.Parameter(torch.ones(embed_dim))
gamma = nn.Parameter(torch.zeros(embed_dim))

new_embeddings = F.layer_norm(new_embeddings, epsilon.shape, epsilon, gamma, 1e-5)

In [24]:
# The Feed-forward part is responsible to help the model learn more

make_big = nn.Linear(embed_dim, 4 * embed_dim, bias=False)
gelu = nn.GELU()
make_small = nn.Linear(4 * embed_dim, embed_dim, bias=False)

new_embeddings = make_big(new_embeddings)
new_embeddings = gelu(new_embeddings)
new_embeddings = make_small(new_embeddings)

In [25]:
new_embeddings = new_embeddings + residual_embeddings_2

print(new_embeddings)

tensor([[[-0.5406, -3.0757, -1.2003, -0.2461],
         [-1.4723,  0.2819, -0.9948,  0.8720],
         [ 0.6188, -2.6957,  0.2549,  1.8696],
         [ 1.5030, -0.2592, -1.2548,  0.3540],
         [ 3.5007,  1.1581, -0.3409,  2.4188],
         [ 0.2270, -2.1522, -0.3466, -0.3734],
         [-0.7870, -0.2191,  0.3569, -3.0667],
         [-0.9544,  0.8948,  1.1473,  0.8998],
         [ 0.0308,  0.1222,  0.6844,  0.9148],
         [-2.1406,  1.8915,  2.0670,  0.6925]]], grad_fn=<AddBackward0>)


# And we are done!! This was the decoder process 🎉