##### We'll build Transformer architechure from scratch. To make things easier for us, Let's try to break down the problem into sub problems and try solving these sub problems first. If you are aware of Transformer architechure then you would know that it has 4 components.


1.   Embedding Layer (input layer/target input layer)
2.   Encoder Layer
3.   Decoder Layer
4.   Output layer



So let's try to solve each sub problem first

![Alt Text](https://images.datacamp.com/image/upload/v1704797298/image_7b08f474e7.png)

# Basic Blocks of the Transformer

### Embedding Layer

It has two parts


1.   Word encoding
2.   Positional encoding (adding positional information of each word to the word encoding in the input text)



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
class InputEmbedding(nn.Module):

  def __init__(self,source_vocab_size:int,d_model:int):
    super().__init__()
    self.d_model = d_model
    self.vocab_size = source_vocab_size
    self.embedding = nn.Embedding(source_vocab_size,d_model)

  def forward(self,x):
    return self.embedding(x)*np.sqrt(self.d_model)

#### Positional Encoding

In [3]:
torch.zeros([10,10,5],dtype=torch.float64)

tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
  

In [4]:
seq_len = 10
torch.arange(seq_len,dtype=torch.float64).view(seq_len,1)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]], dtype=torch.float64)

In [5]:
torch.zeros([1,4]) + 10000

tensor([[10000., 10000., 10000., 10000.]])

In [6]:
indices = torch.arange(0,4,2, dtype=torch.float64)
ct = torch.zeros([1,2]) + 10000
torch.pow(ct,indices/4)

tensor([[  1., 100.]], dtype=torch.float64)

In [7]:
class PositionalEncoding(nn.Module):

  def __init__(self,seq_len,d_model):

    super().__init__()

    PE = torch.zeros([seq_len,d_model],dtype=torch.float64)         ## dim : [seq_len,d_model]
    pos = torch.arange(seq_len,dtype=torch.float64).view(seq_len,1) ## dim : [seq_len,1]

    indices = torch.arange(0,d_model,2, dtype=torch.float64)
    den = torch.pow(10000,indices/d_model)

    PE[:,0::2] = torch.sin(pos/den)
    PE[:,1::2] = torch.cos(pos/den)

    self.register_buffer('PE', PE.unsqueeze(0))


  def forward(self,X):

    return X + self.PE[:,X.size(1),:]




In [8]:
pe = PositionalEncoding(10,512)

In [9]:
x = torch.randn([2,5,512])
pe.forward(x)

tensor([[[-2.0126,  0.6561, -0.6410,  ...,  0.0515, -1.3291,  1.8826],
         [ 0.1997,  2.1964, -0.1028,  ...,  0.2910, -1.1956,  0.9919],
         [-1.1606, -1.1618, -0.0578,  ...,  2.1948, -0.0598,  0.7719],
         [-1.8819,  0.4361, -1.4111,  ...,  0.5100, -0.2765,  1.5825],
         [-0.9097,  0.6576, -1.1641,  ...,  0.9307, -0.2033,  2.4023]],

        [[-0.8413, -0.2362, -0.1425,  ...,  0.8740, -2.3125,  0.8214],
         [-0.6868,  0.1160,  0.9647,  ...,  1.7587,  0.8765, -0.3766],
         [-0.6648,  1.2403, -1.2622,  ...,  1.9908, -1.8640,  0.6998],
         [-2.2293,  1.9843, -2.3635,  ...,  0.1400, -0.6205,  0.7977],
         [-1.3378,  1.8689,  0.1437,  ...,  0.1159,  0.7912,  0.2840]]],
       dtype=torch.float64)

### Layer Normalization

In [10]:
x = torch.randn(2,10)
mu = x.mean(dim=-1,keepdim=True)
sigma = x.std(dim=-1, keepdim=True)

print('input: ',x)
print('mean: ', mu)
print('std: ', sigma)

print('output: ', (x-mu)/sigma)

input:  tensor([[-0.2085, -1.7421, -2.7194,  0.1361, -0.7113, -0.4613,  1.2713,  2.0897,
         -1.7922,  0.1273],
        [-1.3243,  0.4628,  0.5659, -0.2282,  0.1579,  0.3181,  0.1855, -0.5091,
         -0.8930, -0.4898]])
mean:  tensor([[-0.4010],
        [-0.1754]])
std:  tensor([[1.4458],
        [0.6227]])
output:  tensor([[ 0.1332, -0.9275, -1.6035,  0.3715, -0.2146, -0.0417,  1.1567,  1.7227,
         -0.9622,  0.3654],
        [-1.8450,  1.0249,  1.1906, -0.0848,  0.5352,  0.7925,  0.5797, -0.5359,
         -1.1524, -0.5048]])


In [11]:
(1.1379-(0.0440))/0.8895

1.2297920179876334

In [12]:
class LayerNormalization(nn.Module):

  def __init__(self, eps:float=10**-6):
    super().__init__()
    self.eps = eps

    ## Learned during training
    self.alpha = nn.Parameter(torch.ones(1))   ## Multiplied with normalized input
    self.beta =  nn.Parameter(torch.zeros(1))  ## Added (bias)

  def forward(self,x):

    mu = x.mean(dim=-1, keepdim=True)    ## mean separately for each sample
    sigma = x.std(dim=-1, keepdim=True)  ## Std separately for each sample

    return self.alpha*((x-mu)/(sigma+self.eps)) + self.beta

### Multi headed Attention

In [13]:
class MultiHeadAttention(nn.Module):

  def __init__(self,d_model : int,h : int, dropout: float):
    super().__init__()

    self.d_model = d_model
    self.h = h

    assert self.d_model % self.h == 0, "Input dimension is not divisble by no of heads"
    self.d_k = self.d_model // self.h

    self.w_q = nn.Linear(self.d_model,self.d_model)    ## wq shape : (d_model, d_model)
    self.w_k = nn.Linear(self.d_model,self.d_model)    ## wk
    self.w_v = nn.Linear(self.d_model,self.d_model)    ## wv

    self.w_o = nn.Linear(self.d_model,self.d_model)    ## wo
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(query, key, values, mask, dropout:nn.Dropout):
    d_k = query.shape[3]

    attention_scores = torch.matmul(query,key.transpose(-2,-1))/ np.sqrt(d_k)  ## (batch_size, h, seq_len, d_k) -> (batch_size, h, seq_len, seq_len)

    if mask is not None:
      attention_scores.masked_fill(mask == 0, -1e9)

    attention_scores = attention_scores.softmax(dim=-1)

    if dropout is not None:
      attention_scores = dropout(attention_scores)

    return torch.matmul(attention_scores, values), attention_scores


  def forward(self, q, k, v, mask):

    query = self.w_q(q) ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    key = self.w_k(k)   ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)
    values = self.w_v(v) ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)

    query = query.view(query.shape[0],query.shape[1],self.h,self.d_k).transpose(1,2)     ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, h, d_k) -> (batch_size, h, seq_len, d_k)
    key = key.view(key.shape[0],key.shape[1],self.h,self.d_k).transpose(1,2)             ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, h, d_k) -> (batch_size, h, seq_len, d_k)
    values = values.view(values.shape[0],values.shape[1],self.h,self.d_k).transpose(1,2) ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, h, d_k) -> (batch_size, h, seq_len, d_k)

    x, self.attention_scores = MultiHeadAttention.attention(query, key, values, mask, self.dropout)

    x = x.transpose(1,2).contiguous().view(x.shape[0], -1, self.h*self.d_k)  ## (batch_size, num_heads, seq_len, h) -> (batch_size, seq_len, d_model)

    return self.w_o(x)   ## (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_model)


In [14]:
attention = MultiHeadAttention(16,4,0.0)
x = torch.randn([1,10,16])
print(x)

tensor([[[-1.8003,  1.3479, -0.4519, -0.1117,  0.6905,  0.4351,  0.5779,
          -0.5061, -0.2013,  0.1791,  0.2344,  1.7502, -0.9007, -0.4884,
          -0.5131, -0.8985],
         [ 0.3860,  1.2616,  0.0962, -0.1063, -0.2013,  0.5360, -1.8349,
           1.1832,  1.1993, -0.4601, -0.1860,  0.1926,  1.2660, -1.7652,
          -0.6127, -1.5354],
         [ 0.6061,  0.6053, -0.9909, -1.6071, -1.6250,  2.4917,  0.8783,
          -0.8135, -0.3035,  0.3632,  1.7293, -1.8015, -0.5320, -0.9973,
           1.6437, -2.2311],
         [ 0.4926,  1.9545, -1.2315, -0.1473, -0.8703,  2.1518, -0.6912,
           0.1429, -1.5533,  0.1572, -0.3270, -1.0155,  0.8278, -0.3608,
          -2.3322,  0.7624],
         [ 0.0512, -0.0764, -0.8885,  0.5825, -1.3498, -1.5043,  1.7731,
           1.7117, -1.2215, -2.6605,  0.1961, -0.8728,  0.5006,  0.7561,
          -1.0037,  1.1572],
         [ 2.3132,  1.3605,  0.3851, -1.3749, -1.2864, -1.5665, -0.7109,
          -1.0166,  0.3458, -0.4048,  0.1444,  0.790

In [15]:
y = attention.forward(x,x,x,None)

In [16]:
attention.attention_scores

tensor([[[[0.0895, 0.1320, 0.0924, 0.0931, 0.1002, 0.0937, 0.0869, 0.1033,
           0.0865, 0.1224],
          [0.0677, 0.0300, 0.1271, 0.0932, 0.1759, 0.1067, 0.1515, 0.1001,
           0.0902, 0.0577],
          [0.0909, 0.0580, 0.0610, 0.1305, 0.1847, 0.0916, 0.1121, 0.1032,
           0.0681, 0.0998],
          [0.0794, 0.0701, 0.0700, 0.0951, 0.1787, 0.1172, 0.1441, 0.1073,
           0.0594, 0.0788],
          [0.2061, 0.1576, 0.0391, 0.1434, 0.0442, 0.0707, 0.0567, 0.0670,
           0.1067, 0.1084],
          [0.0841, 0.0616, 0.1384, 0.0874, 0.1118, 0.1102, 0.1258, 0.0998,
           0.1090, 0.0719],
          [0.1593, 0.1845, 0.0573, 0.1212, 0.0411, 0.0636, 0.0466, 0.0715,
           0.1183, 0.1365],
          [0.1456, 0.0619, 0.0644, 0.1584, 0.1031, 0.0839, 0.0926, 0.0861,
           0.1113, 0.0927],
          [0.0950, 0.0915, 0.0650, 0.1171, 0.1460, 0.0944, 0.1015, 0.1046,
           0.0710, 0.1138],
          [0.0801, 0.0809, 0.0800, 0.1034, 0.1600, 0.0992, 0.1110, 0.1092

### Feed Forward layer

In [17]:
class FeedForward(nn.Module):

  def __init__(self, d_model:int, d_ff:int ,dropout:float):

    super().__init__()
    self.linear1 = nn.Linear(d_model, d_ff)   ## First linear layer
    self.linear2 = nn.Linear(d_ff, d_model)   ## Second linear layer
    self.dropout = nn.Dropout(dropout)        ## Dropout layer

  def forward(self, x):

    ## (batch_size, seq_len, d_model) --> (batch_size, seq_len, d_ff) --> (batch_size, seq_len, d_model)

    return self.linear2(self.dropout(torch.relu(self.linear1(x))))



### Residual Connection

In [28]:
class ResidualConnection(nn.Module):

  def __init__(self, dropout: float):

    super().__init__()

    self.dropout = nn.Dropout(dropout)
    self.norm = LayerNormalization()

  def forward(self, x, sublayer):

    return x + self.dropout(self.norm(sublayer(x)))


## Encoder

In [19]:
class EncoderBlock(nn.Module):

  def __init__(self, self_attention: MultiHeadAttention, feed_forward: FeedForward, dropout: float):
    super().__init__()

    self.self_attention = self_attention
    self.feed_forward = feed_forward

    self.residualconnections = nn.ModuleList([ResidualConnection(dropout) for j in range(2)])

  def forward(self, x, src_mask):

    x = self.residualconnections[0](x, lambda x: self.self_attention(x,x,x,src_mask))
    x = self.residualconnections[0](x, self.feed_forward)

    return x

In [20]:
class Encoder(nn.Module):

  def __init__(self, layers: nn.ModuleList):
    super().__init__()

    self.layers = layers
    self.norm = LayerNormalization()

  def forward(self, x, mask):

    for layer in self.layers:
      x = layer(x, mask)

    return self.norm(x)

## Decoder

In [21]:
class DecoderBlock(nn.Module):

  def __init__(self, self_attention: MultiHeadAttention, cross_attention: MultiHeadAttention, feed_forward: FeedForward, dropout: float):

    super().__init__()

    self.self_attention = self_attention
    self.cross_attention = cross_attention
    self.feed_forward = feed_forward

    self.residualconnections = nn.ModuleList([ResidualConnection(dropout) for j in range(3)])

  def forward(self, x, encoder_output, src_mask, tgt_mask):

    x = self.residualconnections[0](x, lambda x: self.self_attention(x, x, x, tgt_mask))
    x = self.residualconnections[1](x, lambda x: self.cross_attention(x, encoder_output, encoder_output, src_mask))
    x = self.residualconnections[2](x, self.feed_forward)

    return x

In [22]:
class Decoder(nn.Module):

  def __init__(self, layers: nn.ModuleList):
    super().__init__()

    self.layers = layers
    self.norm = LayerNormalization()

  def forward(self, x, encoder_output, src_mask, tgt_mask):

    for layer in self.layers:
      x = layer(x, encoder_output, src_mask, tgt_mask)

    return self.norm(x)

## Output Layer

In [23]:
class OutputLayer(nn.Module):

  def __init__(self, d_model: int, vocab_size: int):

    super().__init__()

    self.linear = nn.Linear(d_model, vocab_size)  ## maps each token to a word in the vocabulary

  def forward(self, x):

    return torch.log_softmax(self.linear(x), dim=-1)   ## Applying softmax along the last dimension

# Transformer

In [24]:
class Transformer(nn.Module):

  def __init__(self, Input_Embedding: InputEmbedding, encoder: Encoder, Output_Embedding: InputEmbedding, decoder: Decoder, Input_pos: PositionalEncoding, Output_pos: PositionalEncoding, Output_layer: OutputLayer):

    super().__init__()
    self.Input_Embedding = Input_Embedding
    self.encoder = encoder
    self.Output_Embedding = Output_Embedding
    self.decoder = decoder
    self.Input_pos = Input_pos
    self.Output_pos = Output_pos
    self.Output_layer = Output_layer

  def encode(self, x_src, src_mask):

    x_src = self.Input_Embedding(x_src)
    x_src = self.Input_pos(x_src)

    return self.encoder(x_src,src_mask)

  def decode(self, x_tgt, enoder_output, src_mask, tgt_mask):

    x_tgt = self.Output_Embedding(x_tgt)
    x_tgt = self.Output_pos(x_tgt)

    return self.decoder(x_tgt, enoder_output, src_mask, tgt_mask)

  def Ouput(self, x):

    return self.Output_layer(x)


#### We have completed building basic blocks of a Transformer architechure. Now Let's try to combine all these. So we are gonna write a function which takes set of parameters required and builds the transformer for us

# Build Transformer

In [31]:
def BuildTransformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, N: int = 6, d_model: int = 512, h: int = 8, d_ff: int = 2048, dropout: float = 0.1):



  Input_Embedding = InputEmbedding(src_vocab_size, d_model)   ## Create embedding layer for source text
  Input_pos = PositionalEncoding(src_seq_len, d_model)        ## Create Postional Encoding for source text

  Output_Embedding = InputEmbedding(tgt_vocab_size, d_model)  ## Create embedding layer for target text
  Output_pos = PositionalEncoding(tgt_seq_len, d_model)       ## Create Postional Encoding for target text

  encoder_blocks = []

  for e in range(N):

    encoder_self_attention = MultiHeadAttention(d_model, h, dropout)
    encoder_feed_forward = FeedForward(d_model, d_ff, dropout)

    encoder_blocks.append(EncoderBlock(encoder_self_attention, encoder_feed_forward, dropout))

  encoder_layers = nn.ModuleList(encoder_blocks)

  encoder = Encoder(encoder_layers)   ## Encoder

  decoder_blocks = []

  for d in range(N):

    decoder_self_attention = MultiHeadAttention(d_model, h, dropout)
    decoder_cross_attention = MultiHeadAttention(d_model, h, dropout)
    decoder_feed_forward = FeedForward(d_model, d_ff, dropout)

    decoder_blocks.append(DecoderBlock(decoder_self_attention, decoder_cross_attention,decoder_feed_forward, dropout))

  decoder_layers = nn.ModuleList(decoder_blocks)

  decoder = Decoder(decoder_layers)  ## Decoder

  Output_layer = OutputLayer(d_model, tgt_vocab_size)  ## Output layer

  transformer = Transformer(Input_Embedding, encoder, Output_Embedding, decoder, Input_pos, Output_pos, Output_layer)  ## create a transformer instance

  ## Initialize the parameters of the network

  for p in transformer.parameters():

    if p.dim() > 1:

       nn.init.xavier_uniform(p)

  print(transformer)

  return transformer








In [32]:
Tf = BuildTransformer(10000, 8000, 20, 12)

  nn.init.xavier_uniform(p)


Transformer(
  (Input_Embedding): InputEmbedding(
    (embedding): Embedding(10000, 512)
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention): MultiHeadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_o): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (residualconnections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
   