**Initialization**
- I put these three lines of code at the top of each of my notebooks. The first two reload imported modules automatically, which prevents problems when re-running the same project after its source files change. The third line renders visualizations inside the notebook.

In [1]:
#@ INITIALIZATION:
# IPython magics (not plain Python): (re)load the autoreload extension, switch it to
# mode 2, and select the inline matplotlib backend so figures render in the notebook.
# Comments are kept on their own lines because trailing text can be parsed as magic arguments.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

**Downloading Libraries and Dependencies**
- All the libraries and dependencies required for the project are imported in one cell. Uncomment the `pip install` lines if the packages are not installed yet.

In [3]:
#@ IMPORTING MODULES: UNCOMMENT BELOW:
# !pip install transformers[sentencepiece]
# !pip install bertviz
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModel
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show
from bertviz import head_view
import torch
from torch import nn
import torch.nn.functional as F
from math import sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#@ IGNORING WARNINGS:
# Installs a blanket "ignore" filter: no category or module is given, so every warning
# raised after this point is suppressed for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

**Note:**
- In an **encoder**-only transformer architecture, the numerical representation computed for a given token depends on both the left context (the tokens before it) and the right context (the tokens after it). This is called **bidirectional attention**.
- In a **decoder**-only transformer architecture, the numerical representation computed for a given token depends only on the left context. This is called **autoregressive attention**.

**The Encoder**

In [5]:
#@ VISUALIZING SCALED DOT PRODUCT ATTENTION: UNCOMMENT BELOW:
# Defines the globals reused by later cells: `model_ckpt`, `tokenizer`, `model`, `text`.
model_ckpt = "bert-base-uncased"                            # Checkpoint name shared by tokenizer, model and config.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)       # Tokenizer matching the checkpoint.
model = BertModel.from_pretrained(model_ckpt)               # BertModel imported from bertviz.transformers_neuron_view.
text = "time flies like an arrow."                          # Example sentence tokenized in the following cells.
# Optional neuron-view visualization for layer 0, head 8 (left disabled):
# show(model, "bert", tokenizer,text,display_mode="light",
#      layer=0, head=8)                                     # Inspecting bert model.

In [6]:
#@ INITIALIZING TOKENIZATION:
# Tokenize `text` into PyTorch tensors with add_special_tokens=False; the cell output
# shows the resulting batch of one sequence with six token ids.
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)         # Encoded inputs for `text`.
inputs.input_ids                                                                # Display the token ids.

tensor([[ 2051, 10029,  2066,  2019,  8612,  1012]])

In [7]:
#@ INITIALIZING EMBEDDINGS:
# Only the configuration of the checkpoint is loaded here; the embedding table itself is a
# newly constructed nn.Embedding (not the checkpoint's weights) sized from that configuration.
config = AutoConfig.from_pretrained(model_ckpt)                                 # Configuration of the checkpoint.
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)                 # Lookup table: vocab_size rows of hidden_size values.
token_emb                                                                       # Display the layer.

Embedding(30522, 768)

In [8]:
#@ INITIALIZING TOKEN EMBEDDINGS:
# Look up one embedding vector per token id; the output shows shape [1, 6, 768].
input_embeds = token_emb(inputs.input_ids)                                      # Embeddings for every input token.
input_embeds.size()                                                             # Display the shape.

torch.Size([1, 6, 768])

In [9]:
#@ INITIALIZING QKV VECTORS:
# For this demonstration query, key and value are all the same tensor (no projections yet).
query = key = value = input_embeds                                  # Three names bound to one tensor.
dim_k = key.size(-1)                                                # Size of the last (feature) dimension.
scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)         # Pairwise dot products, scaled by sqrt(dim_k).
scores.size() 

torch.Size([1, 6, 6])

In [10]:
#@ IMPLEMENTATION OF SOFTMAX LAYER:
weights = F.softmax(scores, dim=-1)                                 # Normalize the scores over the last dimension.
# NOTE(review): the next expression is a sanity check (sums over the softmax dimension);
# its value is discarded because only the last expression of a cell is displayed.
weights.sum(dim=-1)                                                 # Sums of the attention weights along the last dimension.
attn_outputs = torch.bmm(weights, value)                            # Weighted combination of the value vectors.
attn_outputs.shape                                                  # Display the shape.

torch.Size([1, 6, 768])

**Scaled Dot Product Attention**

In [11]:
#@ INITIALIZING SCALED DOT-PRODUCT ATTENTION:
def scaled_dot_product_attention(query, key, value):
    """Apply scaled dot-product attention to batched 3-D tensors.

    Args:
        query: Tensor passed as the first operand of ``torch.bmm``; its last
            dimension defines the scaling factor.
        key: Tensor whose dimensions 1 and 2 are swapped before the product.
        value: Tensor combined using the attention weights.

    Returns:
        ``softmax(query @ key^T / sqrt(d)) @ value`` where ``d`` is
        ``query.size(-1)`` and the softmax runs over the last dimension.
    """
    scale = sqrt(query.size(-1))                                    # Scaling factor from the query feature size.
    attn_logits = torch.bmm(query, key.transpose(1, 2)) / scale     # Scaled pairwise dot products.
    attn_probs = F.softmax(attn_logits, dim=-1)                     # Normalize over the last dimension.
    return torch.bmm(attn_probs, value)                             # Weighted combination of the values.

**Multi-Headed Attention**

In [12]:
#@ IMPLEMENTATION OF SINGLE ATTENTION HEAD: 
class AttentionHead(nn.Module):
    """One attention head: three linear projections followed by attention.

    Args:
        embed_dim: Input feature size of each projection.
        head_dim: Output feature size of each projection.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Creation order (q, k, v) is kept so parameter initialization is unchanged.
        self.q = nn.Linear(embed_dim, head_dim)                             # Query projection.
        self.k = nn.Linear(embed_dim, head_dim)                             # Key projection.
        self.v = nn.Linear(embed_dim, head_dim)                             # Value projection.

    def forward(self, hidden_state):
        """Project ``hidden_state`` to query/key/value and return the attention output."""
        projected_q = self.q(hidden_state)
        projected_k = self.k(hidden_state)
        projected_v = self.v(hidden_state)
        return scaled_dot_product_attention(projected_q, projected_k, projected_v)

In [13]:
#@ INITIALIZING OF MULTI-HEADED ATTENTION LAYERS:
class MultiHeadAttention(nn.Module):
    """Run several ``AttentionHead`` modules and merge their outputs.

    Args:
        config: Object providing ``hidden_size`` and ``num_attention_heads``.
            Each head projects to ``hidden_size // num_attention_heads`` features.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size                                         # Embedding dimension.
        n_heads = config.num_attention_heads                                # Number of heads.
        per_head = hidden // n_heads                                        # Feature size of each head.
        head_modules = []
        for _ in range(n_heads):
            head_modules.append(AttentionHead(hidden, per_head))
        self.heads = nn.ModuleList(head_modules)
        self.output_linear = nn.Linear(hidden, hidden)                      # Final projection of the merged heads.

    def forward(self, hidden_state):
        """Concatenate every head's output on the last dimension, then project it."""
        per_head_outputs = [head(hidden_state) for head in self.heads]
        merged = torch.cat(per_head_outputs, dim=-1)
        return self.output_linear(merged)

In [14]:
#@ IMPLEMENTATION OF MULTI HEADED ATTENTION LAYER:
# Build the layer from the checkpoint configuration and run it on the token embeddings;
# the output shows the shape is preserved ([1, 6, 768]).
multihead_attn = MultiHeadAttention(config)                                 # Layer with newly created weights.
attn_output = multihead_attn(input_embeds)                                  # Multi-head attention over the embeddings.
attn_output.size()                                                          # Display the shape.

torch.Size([1, 6, 768])

In [16]:
#@ VISUALIZING ATTENTION: UNCOMMENT BELOW:
# NOTE: this rebinds `model`, which an earlier cell bound to the bertviz BertModel.
model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)       # Pretrained model configured to return attentions.
sentence_a = "time flies like an arrow"                                     # First sentence of the pair.
sentence_b = "fruit flies like a banana"                                    # Second sentence of the pair.
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors="pt")         # Encode both sentences as one pair.
attention = model(**viz_inputs).attentions                                  # Attention outputs of the forward pass.
# Count of positions whose token_type_ids equal 0; passed to head_view as the index where
# sentence_b starts (presumably segment 0 covers sentence_a and its special tokens - confirm).
sentence_b_start = (viz_inputs.token_type_ids==0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])           # Token strings for the first sequence.
# Optional head-view visualization restricted to head 8 (left disabled):
# head_view(attention, tokens, sentence_b_start, heads=[8])                 # Visualization.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Feed-Forward Layer**
- Skip connections pass a tensor to the next layer of the model without processing and add it to the processed tensor. 

In [17]:
#@ INITIALIZING FEED-FORWARD LAYER: 
class FeedForward(nn.Module):
    """Two linear layers with a GELU in between, followed by dropout.

    Args:
        config: Object providing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.linear_1 = nn.Linear(hidden, inner)                                    # Expand to the intermediate size.
        self.linear_2 = nn.Linear(inner, hidden)                                    # Project back to the hidden size.
        self.gelu = nn.GELU()                                                       # Activation between the two layers.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)                       # Dropout on the block output.

    def forward(self, x):
        """Return ``dropout(linear_2(gelu(linear_1(x))))``."""
        expanded = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(expanded))

In [18]:
#@ IMPLEMENTATION OF FEED FORWARD LAYER:
feed_forward = FeedForward(config)                                                  # Layer with newly created weights.
# NOTE(review): the input is `attn_outputs` (the plain scaled dot-product result computed
# earlier), not `attn_output` from the multi-head layer. Both are [1, 6, 768], so the shape
# check passes either way - confirm which one was intended.
ff_outputs = feed_forward(attn_outputs)                                             # Apply the feed-forward block.
ff_outputs.size()                                                                   # Display the shape.

torch.Size([1, 6, 768])

**Layer Normalization**
- Layer normalization normalizes each input in the batch to have zero mean and unit variance.

In [19]:
#@ INITIALIZING LAYER NORMALIZATION:
class TransformerEncoderLayer(nn.Module):
    """Encoder block: attention and feed-forward sub-layers with skip connections.

    Layer normalization is applied to the input of each sub-layer, and the
    sub-layer output is added back to that (un-normalized) input.

    Args:
        config: Object providing ``hidden_size`` plus whatever
            ``MultiHeadAttention`` and ``FeedForward`` read from it.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.layer_norm_1 = nn.LayerNorm(width)                             # Normalizes the attention input.
        self.layer_norm_2 = nn.LayerNorm(width)                             # Normalizes the feed-forward input.
        self.attention = MultiHeadAttention(config)                         # Self-attention sub-layer.
        self.feed_forward = FeedForward(config)                             # Feed-forward sub-layer.

    def forward(self, x):
        """Return ``x`` after both residual sub-layers."""
        after_attention = x + self.attention(self.layer_norm_1(x))          # Skip connection around attention.
        normalized = self.layer_norm_2(after_attention)
        return after_attention + self.feed_forward(normalized)              # Skip connection around feed-forward.

In [20]:
#@ IMPLEMENTATION OF ENCODER LAYER ON INPUT EMBEDDINGS:
# Run one encoder layer on the token embeddings and show input and output shapes side by side.
encoder_layer = TransformerEncoderLayer(config)                             # Layer with newly created weights.
input_embeds.shape, encoder_layer(input_embeds).size()                      # Display (input shape, output shape).

(torch.Size([1, 6, 768]), torch.Size([1, 6, 768]))

**Positional Embeddings**

In [21]:
#@ INITIALIZING EMBEDDINGS MODULE: 
class Embeddings(nn.Module):
    """Token plus learned position embeddings, then layer norm and dropout.

    Args:
        config: Object providing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)                    # One vector per vocabulary id.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)                 # One vector per position index.
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)               # Normalizes the summed embeddings.
        # Use the configured probability (as FeedForward and the classifier do) instead of
        # nn.Dropout()'s implicit default of 0.5.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        """Embed ``input_ids``.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence).

        Returns:
            Tensor with one ``hidden_size`` vector per input id.
        """
        seq_length = input_ids.size(1)                                              # Number of positions in the sequence.
        # Create the position ids on the same device as the inputs so the module also works
        # when the model and its inputs have been moved off the CPU.
        position_ids = torch.arange(seq_length, dtype=torch.long,
                                    device=input_ids.device).unsqueeze(0)           # Shape (1, seq_length); broadcasts over the batch.
        token_embeddings = self.token_embeddings(input_ids)                         # Token embeddings.
        position_embeddings = self.position_embeddings(position_ids)                # Position embeddings.
        embeddings = token_embeddings + position_embeddings                         # Combine token and position information.
        embeddings = self.layer_norm(embeddings)                                    # Layer normalization.
        embeddings = self.dropout(embeddings)                                       # Dropout (inactive in eval mode).
        return embeddings

#@ IMPLEMENTATION OF EMBEDDINGS:
# Build the embeddings module from the configuration and apply it to the token ids.
embedding_layer = Embeddings(config)                                                # Module with newly created weights.
embedding_layer(inputs.input_ids).size()                                            # Display the output shape.

torch.Size([1, 6, 768])

In [22]:
#@ INITIALIZING TRANSFORMER ENCODER: 
class TransformerEncoder(nn.Module):
    """Embeddings followed by a stack of ``TransformerEncoderLayer`` blocks.

    Args:
        config: Object providing ``num_hidden_layers`` plus whatever
            ``Embeddings`` and ``TransformerEncoderLayer`` read from it.
    """

    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)                                        # Maps ids to embeddings.
        stack = [TransformerEncoderLayer(config)
                 for _ in range(config.num_hidden_layers)]                          # One block per hidden layer.
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Embed ``x`` and pass the result through every layer in order."""
        hidden = self.embeddings(x)
        for block in self.layers:
            hidden = block(hidden)
        return hidden

#@ IMPLEMENTATION OF ENCODER TRANSFORMER:
# Build the full encoder from the configuration and run it on the token ids.
encoder = TransformerEncoder(config)                                                # Encoder with newly created weights.
encoder(inputs.input_ids).size()                                                    # Display the output shape.

torch.Size([1, 6, 768])

**Classification Head**

In [23]:
#@ ADDING CLASSIFICATION HEAD: 
class TransformerForSequenceClassification(nn.Module):
    """Encoder with a dropout + linear classification head on the first position.

    Args:
        config: Object providing ``hidden_size``, ``hidden_dropout_prob`` and
            ``num_labels`` plus whatever ``TransformerEncoder`` reads from it.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)                                   # Produces one hidden state per position.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)                       # Dropout before the classifier.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)          # Maps hidden state to label scores.

    def forward(self, x):
        """Return classifier outputs computed from the hidden state at index 0 of dimension 1."""
        first_position = self.encoder(x)[:, 0, :]                                   # Hidden state of the first position.
        return self.classifier(self.dropout(first_position))

#@ INITIALIZING CLASSIFICATION MODEL:
# NOTE: this mutates the shared `config` object in place; num_labels must be set before the
# classifier is constructed because its output layer is sized from it.
config.num_labels = 3                                                               # Number of output classes.
encoder_classifier = TransformerForSequenceClassification(config)                   # Classifier with newly created weights.
encoder_classifier(inputs.input_ids).size()                                         # Display the output shape.

torch.Size([1, 3])

**The Decoder**

In [24]:
#@ INITIALIZING MASKED MULTI HEAD ATTENTION LAYER:
# Build a lower-triangular mask of ones: row i has ones in columns 0..i and zeros after.
seq_len = inputs.input_ids.size(-1)                                     # Length of the tokenized sequence.
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0)            # Lower-triangular matrix with a leading dimension of 1.
mask[0]                                                                 # Display the 2-D mask.

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [25]:
#@ APPLYING THE MASK TO THE ATTENTION SCORES:
# Show the scores with every position where the mask is 0 replaced by -inf. The result is
# only displayed; it is not assigned, so `scores` keeps its original binding.
scores.masked_fill(mask==0, -float("inf"))                              # Display the masked scores.

tensor([[[30.0134,    -inf,    -inf,    -inf,    -inf,    -inf],
         [-0.3075, 26.9884,    -inf,    -inf,    -inf,    -inf],
         [ 1.1714, -0.2755, 26.6385,    -inf,    -inf,    -inf],
         [ 0.7308, -0.5113,  0.4295, 28.6890,    -inf,    -inf],
         [-0.6842,  0.5326, -0.6896, -2.2311, 27.9228,    -inf],
         [ 0.2730, -1.1449,  1.3761, -0.4889, -0.3024, 27.3423]]],
       grad_fn=<MaskedFillBackward0>)

In [27]:
#@ INITIALIZING SCALED DOT-PRODUCT ATTENTION:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Apply scaled dot-product attention to batched 3-D tensors, optionally masked.

    Args:
        query: Tensor passed as the first operand of ``torch.bmm``; its last
            dimension defines the scaling factor.
        key: Tensor whose dimensions 1 and 2 are swapped before the product.
        value: Tensor combined using the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where it equals 0 are set to ``-inf`` before the softmax and so
            receive zero attention weight. ``None`` (the default) applies no
            masking.

    Returns:
        ``softmax(masked(query @ key^T / sqrt(d))) @ value`` where ``d`` is
        ``query.size(-1)``.
    """
    dim_k = query.size(-1)                                          # Feature size used for scaling.
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)    # Scaled pairwise dot products.
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))       # Block masked positions.
    weights = F.softmax(scores, dim=-1)                             # Normalize over the last dimension.
    # torch.bmm needs both operands; the original call passed only `value`, which raised a
    # TypeError instead of applying the attention weights.
    return torch.bmm(weights, value)