In [None]:
import math
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
def scaled_dot_product(q, k, v, mask=None):
  """Scaled dot-product attention.

  Args:
    q, k, v: tensors whose last two axes are (sequence, head_dim); leading
      axes must be matmul-compatible. The notebook calls this with
      (30, 8, 200, 64) for each of them.
    mask: optional additive mask, broadcastable to the score tensor
      (..., seq_q, seq_k). Use ``-inf`` at positions that must get zero weight.

  Returns:
    (values, attention): ``values`` has the shape of ``q`` with the last axis
    taken from ``v``; ``attention`` is (..., seq_q, seq_k) and each row along
    the last axis sums to 1.
  """
  d_k = q.size()[-1]  # size of the last axis, e.g. 64
  # (..., seq_q, d_k) @ (..., d_k, seq_k) -> (..., seq_q, seq_k)
  scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
  print(f"scaled.size() : {scaled.size()}")
  if mask is not None:
    print(f"-- ADDING MASK of shape {mask.size()} --")
    # Out-of-place add: works for any broadcastable mask and leaves the raw scores untouched.
    scaled = scaled + mask
  # Normalise over the key axis (last axis) so every query's weights sum to 1.
  # dim=1 would normalise across heads on a 4-D input, which is not attention.
  attention = F.softmax(scaled, dim=-1)
  # (..., seq_q, seq_k) @ (..., seq_k, d_v) -> (..., seq_q, d_v)
  values = torch.matmul(attention, v)
  return values, attention

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention over an input of shape (batch, sequence, d_model).

    A single linear layer produces q, k and v for every head, attention is
    computed per head by ``scaled_dot_product`` (defined elsewhere in this
    notebook), the heads are concatenated back to ``d_model`` features and
    passed through an output projection.

    Args:
        d_model: feature size of the input and output (e.g. 512).
        num_heads: number of attention heads (e.g. 8); must divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Otherwise the reshape in forward() fails later with a less obvious error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model  # 512
        self.num_heads = num_heads  # 8
        self.head_dim = d_model // num_heads  # 512 / 8 = 64
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)  # 512 -> 1536
        self.linear_layer = nn.Linear(d_model, d_model)  # 512 -> 512

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: tensor of shape (batch, sequence, d_model).
            mask: optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch, sequence, d_model).
        """
        batch_size, sequence_length, _ = x.size()  # 30, 200, 512
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)  # 30, 200, 1536
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)  # 30, 200, 8, 192
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.permute(0, 2, 1, 3)  # 30, 8, 200, 192
        print(f"qkv.size(): {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)  # each 30, 8, 200, 64
        print(f"q size: {q.size()}, k size: {k.size()}, v size: {v.size()}, ")
        values, attention = scaled_dot_product(q, k, v, mask)  # values: 30, 8, 200, 64; attention: 30, 8, 200, 200
        print(f"values.size(): {values.size()}, attention.size:{ attention.size()} ")
        # Move heads back next to head_dim BEFORE flattening: (30, 8, 200, 64) -> (30, 200, 8, 64).
        # Reshaping without this permute would interleave different tokens and heads.
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)  # 30, 200, 512
        print(f"values.size(): {values.size()}")
        out = self.linear_layer(values)  # 30, 200, 512 (same shape as the input)
        print(f"out.size(): {out.size()}")
        return out

In [None]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameters_shape)`` axes.

    Args:
        parameters_shape: sized sequence giving the shape of the normalised
            axes, e.g. ``[512]``; also the shape of ``gamma`` and ``beta``.
        eps: small constant added to the variance so the division never hits zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape  # e.g. [512]
        self.eps = eps
        # Learnable scale (starts at 1) and shift (starts at 0), one per normalised feature.
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):  # e.g. (30, 200, 512)
        """Normalise ``inputs`` to zero mean / unit variance, then scale and shift."""
        # Axes to reduce over, counted from the end: [-1] for a 1-D parameters_shape.
        trailing_axes = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = torch.mean(inputs, dim=trailing_axes, keepdim=True)  # e.g. (30, 200, 1)
        print(f"Mean ({mean.size()})")
        centered = inputs - mean
        # Biased variance: mean of squared deviations over the same axes.
        variance = torch.mean(centered ** 2, dim=trailing_axes, keepdim=True)
        std = torch.sqrt(variance + self.eps)  # e.g. (30, 200, 1)
        print(f"Standard Deviation  ({std.size()})")
        y = centered / std  # same shape as inputs
        print(f"y: {y.size()}")
        out = self.gamma * y + self.beta
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        print(f"out: {out.size()}")
        return out

In [None]:
class PositionwiseFeedForward(nn.Module):
  """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: input and output feature size (e.g. 512).
    hidden: width of the intermediate layer (e.g. 2048).
    drop_prob: dropout probability applied after the activation.
  """

  def __init__(self, d_model, hidden, drop_prob=0.1):
    super().__init__()
    self.linear1 = nn.Linear(d_model, hidden)  # expand: 512 -> 2048
    self.linear2 = nn.Linear(hidden, d_model)  # project back: 2048 -> 512
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_prob)

  def forward(self, x):  # e.g. (30, 200, 512)
    """Run the block on ``x`` and return a tensor with the same trailing size as the input."""
    expanded = self.linear1(x)  # (..., hidden)
    print(f"x after first linear layer: {expanded.size()}")
    activated = self.relu(expanded)
    print(f"x after activation: {activated.size()}")
    regularised = self.dropout(activated)
    print(f"x after dropout: {regularised.size()}")
    projected = self.linear2(regularised)  # (..., d_model)
    print(f"x after 2nd linear layer: {projected.size()}")
    return projected

In [None]:
# from os import setgid
#Encoder_layer
class EncoderLayer(nn.Module):
  """One encoder block: self-attention and feed-forward, each followed by
  dropout, a residual add and layer normalisation.

  Args:
    d_model: feature size of the input and output.
    ffn_hidden: hidden width of the feed-forward sub-layer.
    num_heads: number of attention heads.
    drop_prob: dropout probability used by both dropout layers and the feed-forward block.
  """

  def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
    # Initialise nn.Module first so sub-modules assigned below are registered.
    super().__init__()
    self.attention = MultiheadAttention(d_model=d_model, num_heads=num_heads)
    self.norm1 = LayerNormalization(parameters_shape=[d_model])
    self.dropout1 = nn.Dropout(p=drop_prob)
    self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
    self.norm2 = LayerNormalization(parameters_shape=[d_model])
    self.dropout2 = nn.Dropout(p=drop_prob)

  def forward(self, x, mask=None):
    """Run the block on ``x`` (e.g. (30, 200, 512)) and return a tensor of the same shape.

    Args:
      x: input tensor of shape (batch, sequence, d_model).
      mask: optional mask passed through to the attention sub-layer.
        Defaults to None, which matches calling the layer with ``x`` alone.
    """
    residual_x = x
    print("------- ATTENTION 1 ------")
    x = self.attention(x, mask=mask)
    print("------- DROPOUT 1 ------")
    x = self.dropout1(x)
    print("------- ADD AND LAYER NORMALIZATION 1 ------")
    x = self.norm1(x + residual_x)
    residual_x = x
    # This sub-layer is the feed-forward network, not a second attention.
    print("------- FEED FORWARD ------")
    x = self.ffn(x)
    print("------- DROPOUT 2 ------")
    x = self.dropout2(x)
    print("------- ADD AND LAYER NORMALIZATION 2 ------")
    x = self.norm2(x + residual_x)
    return x

In [None]:
# this is whole Encoder

class Encoder(nn.Module):  # nn.Module is the base class provided by the torch.nn package
  """Stack of ``num_layers`` EncoderLayer blocks applied one after another."""

  def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
    super().__init__()
    # Build the blocks first, then chain them so each one feeds the next.
    blocks = []
    for _ in range(num_layers):
      blocks.append(EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob))
    self.layers = nn.Sequential(*blocks)

  def forward(self, x):
    """Pass ``x`` through every encoder layer in order and return the result."""
    return self.layers(x)

In [None]:
# Size of the vector that represents each token.
d_model = 512
# Number of attention heads; each head works on d_model // num_heads = 512 // 8 = 64 features.
num_heads = 8
# Dropout probability handed to the encoder; it adds randomness to reduce overfitting.
drop_prob = 0.1
# Number of sequences processed together in one forward pass (a mini-batch).
batch_size = 30
# Fixed length of every sequence in the batch; shorter sentences would need padding up to this length.
max_sequence_length = 200
# Width of the hidden layer inside each feed-forward block; it expands d_model and then projects back.
ffn_hidden = 2048
# Number of encoder layers stacked on top of each other.
num_layers = 5


encoder = Encoder(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
)

In [None]:
# Random stand-in for a batch of token embeddings that would already include positional encoding.
x = torch.randn(batch_size, max_sequence_length, d_model)
out = encoder(x)

------- ATTENTION 1 ------
x.size(): torch.Size([30, 200, 512])
qkv.size(): torch.Size([30, 200, 1536])
qkv.size(): torch.Size([30, 200, 8, 192])
qkv.size(): torch.Size([30, 8, 200, 192])
q size: torch.Size([30, 8, 200, 64]), k size: torch.Size([30, 8, 200, 64]), v size: torch.Size([30, 8, 200, 64]), 
scaled.size() : torch.Size([30, 8, 200, 200])
values.size(): torch.Size([30, 8, 200, 64]), attention.size:torch.Size([30, 8, 200, 200]) 
values.size(): torch.Size([30, 200, 512])
out.size(): torch.Size([30, 200, 512])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([30, 200, 1]))
Standard Deviation  (torch.Size([30, 200, 1]))
y: torch.Size([30, 200, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([30, 200, 512])
------- ATTENTION 2 ------
x after first linear layer: torch.Size([30, 200, 2048])
x after activation: torch.Size([30, 200, 2048])
x after dropout: torch.Size([30, 200, 2048])
x after 2nd linear layer: torch.