# In [6]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# In [20]:
## We first define the Input embeddings as mentioned in the paper

class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embeddings for ``x`` and scale them.

        Args:
            x: Integer tensor of token indices (as required by
                ``nn.Embedding``); presumably (batch, seq_len) -- confirm
                against the caller.

        Returns:
            The embedded tensor with one extra trailing dimension of size
            ``d_model``, multiplied by ``sqrt(d_model)``.
        """
        # The embedding output is multiplied by sqrt(d_model). `math` must be
        # imported at file level; the original file used it without importing.
        return self.embedding(x) * math.sqrt(self.d_model)

class PositionalEmbeddings(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout.

    The encoding table is computed once in the constructor and stored as a
    buffer named ``pe``, so it is saved with the module state but is not a
    learned parameter.

    Args:
        d_model: Size of the embedding (last) dimension.
        seq_len: Number of positions precomputed in the table.
        dropout: Dropout probability applied to the sum.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        # The original code stored this under the misspelled name `se_len`;
        # the alias is kept so any existing reader of that name still works.
        self.se_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos pair: exp(-2i * ln(10000) / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))

        # Sine goes in the even columns, cosine in the odd columns.
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angles to avoid a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a leading batch dimension: (1, seq_len, d_model).
        pe = pe.unsqueeze(0)

        # Registered as a buffer: kept in the module's state (and saved with
        # it) without being a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.shape[1], :])``.

        Args:
            x: Tensor whose dimension 1 is the sequence position and whose
                last dimension is ``d_model``; presumably
                (batch, seq_len, d_model) -- confirm against the caller.
        """
        # Slice the table to the length of the incoming sequence; the
        # encoding is fixed, so it is explicitly excluded from autograd.
        x = x + self.pe[:, : x.shape[1], :].requires_grad_(False)
        return self.dropout(x)

#########################################  BUILDING THE ENCODER ######################
# Next we build the individual components of the encoder: the multi-head attention, the layer normalization,
# the feed-forward network and the skip connection.

# WE BEGIN WITH THE LAYER NORMALIZATION 

class LayerNormalization(nn.Module):
    """Normalise the last dimension of the input to zero mean and unit std.

    A single learnable scale (``alpha``) and shift (``bias``), each of shape
    ``(1,)``, are applied to the normalised values.

    Args:
        eps: Small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # Multiplied
        self.bias = nn.Parameter(torch.zeros(1))   # Added

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias``.

        Both statistics are taken over the last dimension with
        ``keepdim=True`` so they broadcast back against ``x``.
        """
        # The mean must be taken over the same (last) axis as the std; the
        # original used dim=1, which normalised against a different axis.
        mean = x.mean(dim=-1, keepdim=True)
        # torch.std uses the unbiased (n - 1) estimator by default.
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

# THE FEED FORWARD NETWORK

class FeedforwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (W1, b1): d_model -> d_ff
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Projection layer (W2, b2): d_ff -> d_model
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Compute ``linear_2(dropout(relu(linear_1(x))))``."""
        # (..., d_model) -> (..., d_ff)
        hidden = self.linear_1(x)
        activated = torch.relu(hidden)
        regularised = self.dropout(activated)
        # (..., d_ff) -> (..., d_model)
        return self.linear_2(regularised)


# Multi-head attention block

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with a ``d_model -> d_model``
    linear layer, split into ``h`` heads of size ``d_k = d_model // h``,
    attended per head, concatenated, and projected once more with ``w_o``.

    After each ``forward`` call the post-softmax (and post-dropout) attention
    weights are available as ``self.attention_scores``.

    Args:
        d_model: Feature size of the inputs and outputs.
        h: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"
        self.d_k = d_model // h
        # Lower-case names so they match what forward() reads and the naming
        # of `w_o`; the original defined W_q / W_k / W_v but used w_q / w_k /
        # w_v, so forward() raised AttributeError.
        self.w_q = nn.Linear(d_model, d_model)  # Wq
        self.w_k = nn.Linear(d_model, d_model)  # Wk
        self.w_v = nn.Linear(d_model, d_model)  # Wv

        self.w_o = nn.Linear(d_model, d_model)  # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last dimension is the per-head
                size and whose second-to-last dimension is the sequence
                position; forward() passes (batch, h, seq_len, d_k).
            mask: Optional tensor broadcastable to the score matrix;
                positions where ``mask == 0`` are hidden. ``None`` disables
                masking.
            dropout: Optional dropout module applied to the attention
                weights. ``None`` disables it.

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` is
            ``attention_weights @ value``.
        """
        d_k = query.shape[-1]

        # (batch, h, seq_len, d_k) -> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        # Masked positions get a very large negative score so that softmax
        # assigns them (numerically) zero weight.
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim=-1)  # (batch, h, seq_len, seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        """Apply multi-head attention.

        Args:
            q, k, v: Tensors of shape (batch, seq_len, d_model); the ``view``
                calls below require the last dimension to equal
                ``h * d_k``.
            mask: Passed through to :meth:`attention`; may be ``None``.

        Returns:
            Tensor of shape (batch, seq_len_of_q, d_model).
        """
        query = self.w_q(q)  # (batch, seq_len, d_model) -> (batch, seq_len, d_model)
        key = self.w_k(k)    # (batch, seq_len, d_model) -> (batch, seq_len, d_model)
        value = self.w_v(v)  # (batch, seq_len, d_model) -> (batch, seq_len, d_model)

        # Split the embedding dimension (not the sequence) across heads:
        # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout
        )

        # Merge the heads back together:
        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        # transpose() returns a non-contiguous view, so contiguous() is
        # required before view().
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Final output projection: (batch, seq_len, d_model) -> (batch, seq_len, d_model)
        return self.w_o(x)
         

         
         

         
    


    