# Attention Is All You Need
This Jupyter notebook contains an implementation of the `Transformer` model.

## Importing the necessary libraries

In [1]:
import torch
from torchtext.datasets import Multi30k
import spacy
from torch import nn, optim
import numpy as np
import torch.nn.functional as F

## Attention Layer Module
Given an input X with dimensions ($N \times d_{dim}$), we project it into Queries Q ($N \times d_{q}$), Keys K ($N \times d_{k}$) and Values V ($N \times d_{v}$).
Three different types of attention layers are found in the Transformer:
 - Encoder Self-Attention layer
 - Encoder-Decoder Attention layer
 - Decoder Self-Attention with Mask
 
Encoding is calculated as follows:
$$Z = softmax(\frac{Q * K^{\top}}{\sqrt{d_{K}}})*V$$

With Mask:
$$Z = softmax(\frac{Q * K^{\top}}{\sqrt{d_{K}}} + M)*V$$
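where M $\rightarrow$ mask (a look-ahead mask is implemented here: $M_{ij} = 0$ when position $j \le i$ and $-\infty$ otherwise, so a position cannot attend to later positions)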

For Encoder Decoder attention layer, we have the Query coming from the Decoder and keys & values coming from the Encoder

In [2]:
class Attention_layer(nn.Module):
    """Single-head scaled dot-product attention.

    params is a mapping with the keys:
        'dim'  -> feature size of the inputs
        'dq'   -> query size (must equal 'dk' for Q @ K^T to be defined)
        'dk'   -> key size
        'dv'   -> value size
        'mask' -> if truthy, a look-ahead mask is added to the scores
    """
    def __init__(self, params):
        super().__init__()
        
        #Parameters
        self.dq = params['dq']
        self.dk = params['dk']
        self.dv = params['dv']
        self.dim = params['dim']
        self.mask = params['mask']
        
        # Making Fully connected layers for calculating Queries, Values and Keys
        self.Query = nn.Linear(self.dim, self.dq, bias = False)
        self.Key = nn.Linear(self.dim, self.dk, bias = False)
        self.Value = nn.Linear(self.dim, self.dv, bias = False)
        
    # Look ahead mask function
    def Mask(self, nt, ns, batch_size):
        """Build an additive look-ahead mask.

           nt -> number of query positions (rows of the score matrix)
           ns -> number of key positions (columns of the score matrix)

           Returns a (batch_size, nt, ns) tensor holding 0 where key
           position j <= query position i and -inf where j > i."""
        
        # diagonal=1 keeps the diagonal attendable; filling (instead of
        # multiplying by -inf) avoids the 0 * -inf = NaN trap.
        blocked = torch.triu(torch.ones((nt, ns), dtype = torch.bool), diagonal = 1)
        mask = torch.zeros((nt, ns)).masked_fill(blocked, float("-inf"))
        return mask.expand(batch_size, nt, ns)
    
    # Function to calculate the Q, K, V values
    def QKV(self, X):
        """X -> (batch_size, seq_len, dim)
           Q -> (batch_size, seq_len, dq)
           K -> (batch_size, seq_len, dk)
           V -> (batch_size, seq_len, dv)"""
        
        Q = self.Query(X)
        K = self.Key(X)
        V = self.Value(X)
        
        return Q, K, V

    # Forward function
    def forward(self, Xe, Xd = None):
        """Compute softmax(Q K^T / sqrt(dk) [+ M]) V.

           Xe -> (batch_size, ns, dim); always provides the keys and values,
                 and also the queries when Xd is None (self-attention)
           Xd -> optional (batch_size, nt, dim); provides the queries when given

           Returns a (batch_size, nt, dv) tensor (nt == ns when Xd is None)."""
        Q, K, V = self.QKV(Xe)
        if Xd is not None:
            # Only the queries come from Xd; keys and values stay with Xe
            Q = self.Query(Xd)
        
        scores = (Q @ K.transpose(1, 2)) / self.dk ** (1/2)
        
        # Adding mask
        if self.mask:
            batch_size, nt, ns = scores.shape
            look_ahead = self.Mask(nt, ns, batch_size)
            scores = scores + look_ahead.to(device = scores.device, dtype = scores.dtype)
        
        # Calculating Score for the next layer
        Z = torch.softmax(scores, dim = 2) @ V
        return Z

##  Feed Forward Layer

In [3]:
class FeedForward(nn.Module):
    """Stack of Linear layers with a ReLU after every hidden layer.

    params is a mapping with the keys:
        'input_size'  -> feature size of the input
        'output_size' -> feature size of the output
        'hlayers'     -> iterable of hidden layer widths, in order
    """
    def __init__(self, params):
        super().__init__()
        
        # Sizes describing the network
        self.in_size = params['input_size']
        self.out_size = params['output_size']
        self.hlayers = params['hlayers']
        
        # Consecutive widths: input followed by every hidden width
        widths = [self.in_size, *self.hlayers]
        
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        
        # The output projection has no activation after it
        stack.append(nn.Linear(widths[-1], self.out_size))
        self.layers = nn.ModuleList(stack)
        
    def forward(self, X):
        """Apply every stored layer to X in order and return the result."""
        out = X
        for stage in self.layers:
            out = stage(out)
        return out

## MultiHead Attention

We run several self-attention heads in parallel and concatenate their outputs to get the final encoding.

In [4]:
class MultiHead_Attention(nn.Module):
    """Multi-head attention built from `h` independent Attention_layer heads.

    The feature axis of the input is cut into `h` contiguous slices of
    `dim // h` features; head i only sees slice i. The head outputs are
    concatenated and projected back to `dim` features.

    params is a mapping with the keys:
        'dim'        -> feature size of the inputs (must be divisible by 'h')
        'dq', 'dk', 'dv' -> per-head query / key / value sizes
        'h'          -> number of heads
        'mask'       -> passed through to every head
        'batch_size' -> optional; stored for reference only, the batch size
                        actually used is read from the input tensor
    """
    def __init__(self, params):
        super().__init__()
        
        # parameters for multi head attention
        self.dim = params['dim']
        self.dq = params['dq']
        self.dk = params['dk']
        self.dv = params['dv']
        self.h = params['h']
        self.batch_size = params.get('batch_size')
        self.mask = params['mask']
        
        if self.dim % self.h != 0:
            raise ValueError(
                "dim ({}) must be divisible by the number of heads ({})".format(self.dim, self.h)
            )
        
        # calculating the number of dimensions per head
        self.head_dim = self.dim // self.h
        
        # Parameters for individual head
        self_attn_params = {
            'dim' : self.head_dim,
            'dq'  : self.dq,
            'dk'  : self.dk,
            'dv'  : self.dv,
            'mask' : self.mask
        }
        
        # Creating multiple heads
        self.layers = nn.ModuleList(
            Attention_layer(self_attn_params) for _ in range(self.h)
        )
        
        # Layer to transform into a single output
        self.WO = nn.Linear(self.dv * self.h, self.dim, bias = False)
        
    def _split_heads(self, X):
        """(N, seq_len, dim) -> (heads, N, seq_len, head_dim).

        The feature axis is split first and the head axis is then moved to
        the front, so every head keeps the original (sample, position)
        layout. Viewing the tensor directly as (heads, N, ...) would only
        reinterpret memory and mix different samples into one head."""
        n, seq_len = X.shape[0], X.shape[1]
        return X.reshape(n, seq_len, self.h, self.head_dim).permute(2, 0, 1, 3)
        
    def forward(self, Xe, Xd = None):
        """Xe -> (N, ns, dim), passed to every head as its first argument
           Xd -> optional (N, nt, dim), passed as the heads' second argument

           Returns a (N, seq_len, dim) tensor, where seq_len is whatever
           sequence length the heads return."""
        # Splitting the input
        Xe_heads = self._split_heads(Xe)
        
        if Xd is not None:
            Xd_heads = self._split_heads(Xd)
            Z = [head(Xe_heads[i], Xd_heads[i]) for i, head in enumerate(self.layers)]
        else:
            Z = [head(Xe_heads[i], None) for i, head in enumerate(self.layers)]
        
        # Concatenating the outputs
        Z = torch.cat(Z, dim = 2) #dimension --> (N, seq_len, h * dv)
        # Final output
        Z = self.WO(Z)
        return Z

# Encoder Block

An encoder block contains the following:
- Multi-Head Attention Layer
- Add Norm layer(Residual Connection)
- Feed Forward Layer
- Add Norm Layer

<img src="./Encoder.png" align="center" width=200>


In [5]:
class EncoderBlock(nn.Module):
    """One encoder block: unmasked multi-head attention and a feed-forward
    network, each followed by a residual addition and LayerNorm(dim).

    params is a mapping with the keys 'dim', 'dq', 'dk', 'dv', 'h',
    'batch_size' (forwarded to MultiHead_Attention) and 'output_size',
    'hlayers' (forwarded to FeedForward).
    """
    def __init__(self, params):
        super().__init__()
        
        # Attention hyper-parameters
        self.dim = params['dim']
        self.dq = params['dq']
        self.dk = params['dk']
        self.dv = params['dv']
        self.h = params['h']
        self.batch_size = params['batch_size']
        
        # Feed-forward hyper-parameters
        self.out_size = params['output_size']
        self.hlayers = params['hlayers']
        
        # Sub-layer 1: attention without a mask, then normalization
        self.mult_heads = MultiHead_Attention(dict(
            dim = self.dim,
            dq = self.dq,
            dk = self.dk,
            dv = self.dv,
            h = self.h,
            batch_size = self.batch_size,
            mask = False,
        ))
        self.norm1 = nn.LayerNorm(self.dim)
        
        # Sub-layer 2: feed-forward network, then normalization
        self.ffn = FeedForward(dict(
            input_size = self.dim,
            output_size = self.out_size,
            hlayers = self.hlayers,
        ))
        self.norm2 = nn.LayerNorm(self.dim)
        
    def forward(self, X):
        """Run X through attention + norm, then feed-forward + norm."""
        # Attention output is added to the block input before normalizing
        attended = self.norm1(self.mult_heads(X, None) + X)
        
        # Feed-forward output is added to its own input before normalizing
        return self.norm2(attended + self.ffn(attended))

# Decoder Block
Block consists of the following:
- Masked MultiHead Attention
- Encoder Decoder Attention
- Feed Forward Network

<img src="./Decoder.png" align="center" width=200>
Everything is associated with a residual connection and normalization

In [6]:
class DecoderBlock(nn.Module):
    """One decoder block with three sub-layers, each followed by a residual
    addition and LayerNorm(dim): masked multi-head attention,
    encoder-decoder multi-head attention, and a feed-forward network.

    params is a mapping with the keys 'dim', 'dq', 'dk', 'dv',
    'batch_size', 'h' (forwarded to both MultiHead_Attention modules) and
    'output_size', 'hlayers' (forwarded to FeedForward).
    """
    def __init__(self, params):
        super().__init__()
        
        # Attention hyper-parameters
        self.dim = params['dim']
        self.dq = params['dq']
        self.dk = params['dk']
        self.dv = params['dv']
        self.batch_size = params['batch_size']
        self.h = params['h']
        
        # Both attention modules share everything except the mask flag
        attn_common = {
            'dim' : self.dim,
            'dq' : self.dq,
            'dk' : self.dk,
            'dv' : self.dv,
            'h' : self.h,
            'batch_size' : self.batch_size,
        }
        masked_mult_head = {**attn_common, 'mask' : True}
        enc_dec_mult_head = {**attn_common, 'mask' : False}
        
        # Feed-forward hyper-parameters
        self.out_size = params['output_size']
        self.hlayers = params['hlayers']
        ffn_params = dict(
            input_size = self.dim,
            output_size = self.out_size,
            hlayers = self.hlayers,
        )
        
        # The three sub-layers, each paired with its own LayerNorm
        self.mask_mult = MultiHead_Attention(masked_mult_head)
        self.norm1 = nn.LayerNorm(self.dim)
        self.encdec_mult = MultiHead_Attention(enc_dec_mult_head)
        self.norm2 = nn.LayerNorm(self.dim)
        self.ffn = FeedForward(ffn_params)
        self.norm3 = nn.LayerNorm(self.dim)
        
    def forward(self, Xd, Xe):
        """Xd is fed to the masked attention; Xe is handed to the
        encoder-decoder attention as its first argument, together with the
        normalized output of the masked attention as its second."""
        # Masked attention over Xd with residual + norm
        Z = self.norm1(Xd + self.mask_mult(Xd, None))
        
        # Encoder-decoder attention with residual + norm
        f = self.norm2(Z + self.encdec_mult(Xe, Z))
        
        # Feed-forward network with residual + norm
        return self.norm3(f + self.ffn(f))

# Positional Encoding
Positional Encoding given in the paper:
$$PE_{(pos, 2i)} = sin(pos/10000^{2i/d_{model}})$$
$$PE_{(pos, 2i+1)} = cos(pos/10000^{2i/d_{model}})$$
where pos $\rightarrow$ position and i $\rightarrow$ dimension

In [7]:
def Positional_Encoding(X, embed_dim):
    """Return the fixed sinusoidal positional encoding for a batch.

    PE[pos, 2i]   = sin(pos / 10000^(2i / embed_dim))
    PE[pos, 2i+1] = cos(pos / 10000^(2i / embed_dim))

    X         -> tensor whose first two axes are (batch, seq_len); both a
                 (batch, seq_len) tensor of token ids and a
                 (batch, seq_len, dim) tensor of embeddings are accepted.
                 Only its shape, device and (if floating) dtype are used.
    embed_dim -> size of the encoding's last axis

    Returns a (batch, seq_len, embed_dim) tensor. Every batch entry is the
    same encoding (the result is an expanded view, so do not write to it
    in place). Raises ValueError if X has fewer than two axes.
    """
    if X.dim() < 2:
        raise ValueError("X must have at least 2 dimensions (batch, seq_len)")
    n, seq_len = X.shape[0], X.shape[1]
    
    # Integer inputs (token ids) get the default float dtype
    out_dtype = X.dtype if torch.is_floating_point(X) else torch.get_default_dtype()
    
    positions = torch.arange(seq_len, dtype = torch.float32, device = X.device).unsqueeze(1)
    even_idx = torch.arange(0, embed_dim, 2, dtype = torch.float32, device = X.device)
    angles = positions / (10000.0 ** (even_idx / embed_dim)) # (seq_len, ceil(embed_dim / 2))
    
    PE = torch.zeros(seq_len, embed_dim, dtype = torch.float32, device = X.device)
    PE[:, 0::2] = torch.sin(angles)
    # An odd embed_dim has one fewer cosine column than sine columns
    PE[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
    
    return PE.to(out_dtype).unsqueeze(0).expand(n, seq_len, embed_dim)

# Transformer
Here comes the transformer as a whole.
We are building the transformer using the building blocks we created.

<img src="./Transformer.png" align="center">

In [8]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer assembled from EncoderBlock / DecoderBlock.

    params is a mapping with the keys:
        'N'               -> number of encoder blocks and of decoder blocks
        'dim'             -> embedding / model feature size
        'dq', 'dk', 'dv'  -> per-head query / key / value sizes
        'h'               -> number of attention heads
        'batch_size'      -> forwarded to the blocks
        'src_vocab_size'  -> number of rows of the source embedding
        'trg_vocab_size'  -> number of rows of the target embedding and
                             size of the output distribution
        'output_size_e', 'hlayers_e' -> encoder feed-forward settings
        'output_size_d', 'hlayers_d' -> decoder feed-forward settings
    """
    
    def __init__(self, params):
        super().__init__()
        
        # Number of Encoder and Decoder blocks
        self.N = params['N']
        
        # Params for Encoder Decoder
        self.dim = params['dim']
        self.dq = params['dq']
        self.dk = params['dk']
        self.dv = params['dv']
        self.h = params['h']
        self.batch_size = params['batch_size']
        
        # Params for embedding
        self.src_vocab_size = params['src_vocab_size']
        self.trg_vocab_size = params['trg_vocab_size']
        
        # Params for FFN encoder
        self.out_size1 = params['output_size_e']
        self.hlayers1 = params['hlayers_e']
        # Params for FFN decoder
        self.out_size2 = params['output_size_d']
        self.hlayers2 = params['hlayers_d']
        
        # Settings shared by encoder and decoder blocks
        attn_params = {
            'dim' : self.dim,
            'dq' : self.dq,
            'dk' : self.dk,
            'dv' : self.dv,
            'h' : self.h,
            'batch_size' : self.batch_size,
        }
        enc_params = {**attn_params, 'output_size' : self.out_size1, 'hlayers' : self.hlayers1}
        dec_params = {**attn_params, 'output_size' : self.out_size2, 'hlayers' : self.hlayers2}
        
        # Embedding
        self.src_embedding = nn.Embedding(self.src_vocab_size, self.dim)
        self.trg_embedding = nn.Embedding(self.trg_vocab_size, self.dim)
        
        # Encoder Blocks & Decoder Blocks:
        self.EncoderBlocks = nn.ModuleList()
        self.DecoderBlocks = nn.ModuleList()
        
        # Creating Encoder And Decoder Blocks
        for n in range(self.N):
            self.EncoderBlocks.append(EncoderBlock(enc_params))
            self.DecoderBlocks.append(DecoderBlock(dec_params))
        
        # Final output layer: one score per target-vocabulary entry
        self.output_layer = nn.Linear(self.dim, self.trg_vocab_size)
        
    def forward(self, src, trg):
        """src -> (batch, src_len) tensor of source token ids
           trg -> (batch, trg_len) tensor of target token ids

           Returns a (batch, trg_len, trg_vocab_size) tensor whose last
           axis is softmax-normalized (it sums to 1)."""
        # X,Z -> Input Embedding, Output Embedding
        X = self.src_embedding(src)
        Z = self.trg_embedding(trg)
        
        # Adding positional Encoding. The 2-D id tensors are passed because
        # only (batch, seq_len) is needed; the sum is out of place so the
        # embedding outputs are not modified in place.
        X = X + Positional_Encoding(src, self.dim).to(X.device)
        Z = Z + Positional_Encoding(trg, self.dim).to(Z.device)
        
        # Passing through Encoders
        for block in self.EncoderBlocks:
            X = block(X)
        
        # Passing through Decoders; every decoder block sees the final encoder output
        for block in self.DecoderBlocks:
            Z = block(Z, X)
        
        # Passing through final Linear Layer
        Z = self.output_layer(Z)
        # Normalize over the vocabulary axis; without an explicit dim the
        # implicit choice for a 3-D tensor is the batch axis.
        Z = F.softmax(Z, dim = -1)
        
        return Z

In [9]:
# Hyper-parameters consumed by Transformer.__init__
trans_params = {
    'dim' : 512,
    'dq' : 64,
    'dk' : 64,
    'dv' : 64,
    'h' : 8,
    'N' : 6,
    'batch_size' : 32,
    
    # Vocabulary sizes are placeholders.
    # TODO: replace with the sizes of the real source / target vocabularies.
    'src_vocab_size' : 10000,
    'trg_vocab_size' : 10000,
    
    # Feed-forward networks: the output size equals 'dim' so the residual
    # additions inside the blocks line up.
    'output_size_e' : 512,
    'hlayers_e' : [2048],
    'output_size_d' : 512,
    'hlayers_d' : [2048],
}

In [None]:
# Transformer.__init__ requires the hyper-parameter mapping
trans = Transformer(trans_params)