# Full Transformer Encoder

In [5]:
# importing libraries
import torch 
import math
from torch import nn
import torch.nn.functional as F
import numpy as np

In [2]:
# self attention 
def scaled_dot_product(q, k, v, mask=None):
    """Compute scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Query tensor whose last two dimensions are (query_len, d_k),
            e.g. 30 x 8 x 200 x 64 when called from MultiHeadAttention.
        k: Key tensor whose last two dimensions are (key_len, d_k).
        v: Value tensor whose last two dimensions are (key_len, d_v).
        mask: Optional tensor that is *added* to the scaled scores before the
            softmax (0 keeps a position, -inf removes it). It only has to be
            broadcast-compatible with the scores.

    Returns:
        Tuple ``(values, attention)``: ``attention`` holds the softmax weights
        over the key positions and ``values`` is ``attention @ v``.
    """
    d_k = q.size()[-1]  # size of each query/key vector, e.g. 64
    # Only the last two dimensions of k are swapped (unlike k.T, which would
    # reverse every dimension), so the result is (..., query_len, key_len).
    # Dividing by sqrt(d_k) keeps the variance of the scores close to 1.
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    print(f"scaled.size() : {scaled.size()}")
    if mask is not None:
        print(f"-- ADDING MASK of shape {mask.size()} --")
        # Out-of-place add: an in-place `+=` raises when broadcasting the mask
        # would produce a tensor larger than `scaled` (e.g. a per-batch mask
        # applied to unbatched scores).
        scaled = scaled + mask
    # Softmax over the key dimension turns scores into attention weights that
    # sum to 1 for every query position.
    attention = F.softmax(scaled, dim=-1)
    # Weighted sum of the value vectors: one context-aware vector per query.
    values = torch.matmul(attention, v)
    return values, attention

In [6]:
# Why do we need sqrt(d_k) in the denominator?
# Randomly initialise small query, key and value matrices to experiment with:
# l rows each, with d_k columns for q and k and d_v columns for v.
# NOTE: no random seed is set, so the numbers differ on every run.
l,d_k,d_v=4,8,8
q=np.random.randn(l,d_k)
k=np.random.randn(l,d_k)
v=np.random.randn(l,d_v)

In [7]:
# Show the randomly generated query, key and value matrices.
print("Q\n",q)
print("K\n",k)
print("V\n",v)

Q
 [[-1.2409909   0.90763738 -0.37162313 -0.72170716  0.37804335 -2.0983542
  -0.04451034 -1.77467943]
 [ 0.4444631  -0.32316202 -0.59587402 -0.86820023 -0.76033911  0.32929205
   1.73225249  2.12433348]
 [-1.73270032 -0.87297009 -2.28696211  2.44771049 -0.5760314  -1.1468115
   0.48249086 -0.0833044 ]
 [ 1.90143069  2.20785939  0.65371143 -1.08809488  0.23084203  0.25644221
  -0.99442266 -1.17051757]]
K
 [[-0.38516766  1.08147623 -0.90402383  0.55450851 -0.66211389 -1.60420253
  -0.41567107  1.42711086]
 [-0.30283242  1.21629387  1.0837513  -0.51003737  1.51546112  0.79794109
   0.40142596 -1.4081618 ]
 [ 0.22654161  0.81043765 -0.43465046 -0.3022559  -0.73327951  0.43979501
  -0.7688607  -0.74311775]
 [ 0.55306727  1.75043203 -1.26142238 -0.67548748 -0.20949446 -0.43540319
   0.41595423  0.2218635 ]]
V
 [[-0.70733232 -0.01089761 -0.7059525   0.57680686  2.24565716  0.00877778
   0.77378761 -1.55469493]
 [-1.61675356  0.40391201 -1.14248596  1.70354848 -0.21934609 -0.07261548
   1.411

In [8]:
# Unscaled scores: every query row dotted with every key row (an l x l matrix).
np.matmul(q,k.T)

array([[ 1.99705594,  2.82483175,  0.98707472,  2.28086526],
       [ 1.82336713, -3.91616533, -1.84792643,  2.22601388],
       [ 5.04970247, -5.74104311, -1.23685558, -0.4527346 ],
       [-1.36028532,  5.17656962,  3.84274783,  3.9933698 ]])

In [11]:
q.var(),k.var(),np.matmul(q,k.T).var() # the variance of the query-key product is well above 1, i.e. much larger than that of q or k

(1.534563270249921, 0.7645613512636953, 9.339336850140846)

In [13]:
# Now look at the mean of q, k and the unscaled query-key product.
q.mean(),k.mean(),np.matmul(q,k.T).mean()

(-0.14546082888851875, 0.02312118336159208, 0.9779117511276983)

In [18]:
# Once we divide the query-key product by sqrt(d_k), its variance is back on the order of 1
# (about 1.17 in the run below, versus about 9.34 unscaled), i.e. comparable to the variances of q and k.
# That is why sqrt(d_k) goes in the denominator: it keeps the scores close to mean 0 and standard deviation 1, which makes training easier and more stable.
# Without this scaling, the values can become very large or very small during forward and backward propagation, and the gradients are affected:
# this causes the vanishing or exploding gradient problem, and the model then fails to learn anything useful.
# To mitigate this problem we divide by sqrt(d_k).

In [17]:
# Divide the scores by sqrt(d_k) and compare the variances again.
scaled=np.matmul(q,k.T)/math.sqrt(d_k)
q.var(),k.var(),scaled.var()

(1.534563270249921, 0.7645613512636953, 1.1674171062676058)

In [3]:
# Multihead attention
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over a (batch, sequence, d_model) tensor.

    The input is projected to queries, keys and values with a single linear
    layer, split into ``num_heads`` heads of size ``d_model // num_heads``,
    passed through ``scaled_dot_product`` and finally recombined and projected
    back to ``d_model``.

    Args:
        d_model: Size of every input/output vector (e.g. 512).
        num_heads: Number of attention heads (e.g. 8); must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Otherwise the per-head reshape in forward() cannot succeed.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model  # e.g. 512
        self.num_heads = num_heads  # e.g. 8
        self.head_dim = d_model // num_heads  # e.g. 512 / 8 = 64
        # One linear layer produces q, k and v together: d_model -> 3 * d_model.
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        # Output projection applied after the heads are concatenated.
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch_size, sequence_length, d_model),
                e.g. 30 x 200 x 512.
            mask: Optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, max_sequence_length, d_model = x.size()  # 30 x 200 x 512
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)  # 30 x 200 x 1536
        print(f"qkv.size(): {qkv.size()}")
        # Split the projection into heads: 30 x 200 x 8 x 192.
        qkv = qkv.reshape(batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim)
        print(f"qkv.size(): {qkv.size()}")
        # Move the head dimension before the sequence: 30 x 8 x 200 x 192.
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size(): {qkv.size()}")
        # Cut the last dimension into three equal parts, each 30 x 8 x 200 x 64.
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q size: {q.size()}, k size: {k.size()}, v size: {v.size()}, ")
        # attention: 30 x 8 x 200 x 200, values: 30 x 8 x 200 x 64
        values, attention = scaled_dot_product(q, k, v, mask)
        print(f"values.size(): {values.size()}, attention.size:{ attention.size()} ")
        # Bring the sequence dimension back in front of the heads BEFORE
        # flattening. Reshaping the (batch, heads, seq, head_dim) tensor
        # directly would mix different heads and sequence positions into the
        # same output row. Result: 30 x 200 x 512.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, max_sequence_length, self.num_heads * self.head_dim
        )
        print(f"values.size(): {values.size()}")
        out = self.linear_layer(values)  # 30 x 200 x 512
        print(f"out.size(): {out.size()}")
        return out

In [4]:
# Layer Normalization
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of the input.

    Each sample is normalized to zero mean and unit variance over its last
    ``len(parameters_shape)`` dimensions, then rescaled by the learnable
    ``gamma`` and shifted by the learnable ``beta``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over,
            typically ``[d_model]``. A plain int is treated as ``[int]``.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        if isinstance(parameters_shape, int):
            # forward() calls len() on the shape, so wrap a bare int.
            parameters_shape = [parameters_shape]
        self.parameters_shape = parameters_shape  # e.g. [512]
        self.eps = eps
        # Learnable per-feature scale, initialised to 1 (no rescaling).
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        # Learnable per-feature shift, initialised to 0 (no shift).
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalize ``inputs`` (e.g. 30 x 200 x 512) and return the same shape."""
        # Negative indices of the dimensions to normalize over, e.g. [-1].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)  # e.g. 30 x 200 x 1
        print(f"Mean ({mean.size()})")
        # Biased variance (divides by N) over the same dimensions.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)  # e.g. 30 x 200 x 1
        # eps keeps the division below finite when the variance is 0.
        std = (var + self.eps).sqrt()  # e.g. 30 x 200 x 1
        print(f"Standard Deviation  ({std.size()})")
        # Standardize every sample on its own: mean 0, std 1.
        y = (inputs - mean) / std  # e.g. 30 x 200 x 512
        print(f"y: {y.size()}")
        # gamma and beta let the network learn the scale and offset that suit
        # the following layers; they broadcast over the leading dimensions.
        out = self.gamma * y + self.beta  # e.g. 30 x 200 x 512
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        print(f"out: {out.size()}")
        return out

In [5]:
# position-wise feed-forward network
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of the input.

    Pipeline: Linear(d_model -> hidden) -> ReLU -> Dropout -> Linear(hidden -> d_model).

    Args:
        d_model: Size of the input and output vectors (e.g. 512).
        hidden: Size of the intermediate layer (e.g. 2048).
        drop_prob: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        # Expansion first, projection second (same creation order as before so
        # parameter initialisation is unchanged under a fixed seed).
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden)
        self.linear2 = nn.Linear(in_features=hidden, out_features=d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x):
        """Map ``x`` (e.g. 30 x 200 x 512) to a tensor of the same shape."""
        expanded = self.linear1(x)  # e.g. 30 x 200 x 2048
        print(f"x after first linear layer: {expanded.size()}")

        activated = self.relu(expanded)  # e.g. 30 x 200 x 2048
        print(f"x after activation: {activated.size()}")

        regularised = self.dropout(activated)  # e.g. 30 x 200 x 2048
        print(f"x after dropout: {regularised.size()}")

        projected = self.linear2(regularised)  # e.g. 30 x 200 x 512
        print(f"x after 2nd linear layer: {projected.size()}")
        return projected

In [7]:
# Encoder layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        d_model: Size of every input/output vector.
        ffn_hidden: Hidden size of the feed-forward network.
        num_heads: Number of attention heads.
        drop_prob: Dropout probability used in both sub-layers.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        # Sub-layer 1: multi-head self-attention.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNormalization([d_model])
        self.dropout1 = nn.Dropout(drop_prob)
        # Sub-layer 2: position-wise feed-forward network.
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm2 = LayerNormalization([d_model])
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x):
        """Run ``x`` (e.g. 30 x 200 x 512) through the block; shape is preserved."""
        print("------- ATTENTION 1 ------")
        # No mask is passed: this layer lets every position attend to every
        # other position.
        attended = self.attention(x, mask=None)  # e.g. 30 x 200 x 512
        print("------- DROPOUT 1 ------")
        attended = self.dropout1(attended)
        print("------- ADD AND LAYER NORMALIZATION 1 ------")
        # Residual connection around the attention sub-layer.
        normed = self.norm1(attended + x)  # e.g. 30 x 200 x 512

        # The label below says "ATTENTION 2" but this step is the feed-forward
        # network; the string is kept as-is because it is runtime output.
        print("------- ATTENTION 2 ------")
        transformed = self.ffn(normed)  # e.g. 30 x 200 x 512
        print("------- DROPOUT 2 ------")
        transformed = self.dropout2(transformed)
        print("------- ADD AND LAYER NORMALIZATION 2 ------")
        # Residual connection around the feed-forward sub-layer.
        return self.norm2(transformed + normed)  # e.g. 30 x 200 x 512

# The Encoder class inherits from nn.Module, the base class for all neural-network modules in PyTorch,
# so it gets everything that is needed for model learning (for example, parameter registration).
# A subclass of nn.Module has to provide its own forward method, which we implement below.

# self.layers uses nn.Sequential, which passes the input through its modules one after another. We use * because it takes the list and unpacks it into separate arguments, one per layer.
# The list is built with a loop that runs num_layers times (5 here, because num_layers is 5), so the Sequential contains a sequence of 5 encoder layers.
    class Encoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        super().__init__()
        self.layers = nn.Sequential(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                     for _ in range(num_layers)])

    def forward(self, x):
        x = self.layers(x)# it is overriding the forward method because it is necessary to do and it will take x and pass this x inside the 5 encoder layer and then we are returing the layer
        return x

# All parameter details:
* d_model:- the size of every single vector throughout the encoder architecture.
* num_heads:- used in the multi-head attention mechanism to define the number of heads that process the word vectors in parallel.
* drop_prob:- the dropout probability. Dropout randomly switches off one or more neurons during training, which pushes the network to capture different aspects of the input. With this we can build a neural network that generalises better.
* batch_size:- the number of sequences processed together in one pass; batching is used for faster and more stable training.
* max_sequence_length:- the largest number of words that can be passed into the encoder.
* ffn_hidden:- the number of hidden-layer neurons in the feed-forward network.
* num_layers:- the number of encoder layers (transformer blocks) that we want to stack in the encoder.

In [9]:
# Prepare the hyperparameters for the encoder
d_model = 512  # size of every vector inside the encoder
num_heads = 8  # attention heads per layer (512 / 8 = 64 dimensions per head)
drop_prob = 0.1  # dropout probability
batch_size = 30  # sequences per batch
max_sequence_length = 200  # positions per sequence
ffn_hidden = 2048  # hidden size of the feed-forward network
num_layers = 5  # number of stacked encoder layers

encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers)

In [10]:
# Run the encoder on a random input batch.
# x is random noise standing in for token embeddings with positional encoding already added.
x = torch.randn( (batch_size, max_sequence_length, d_model) ) # shape: batch_size x max_sequence_length x d_model
out = encoder(x)

------- ATTENTION 1 ------
x.size(): torch.Size([30, 200, 512])
qkv.size(): torch.Size([30, 200, 1536])
qkv.size(): torch.Size([30, 200, 8, 192])
qkv.size(): torch.Size([30, 8, 200, 192])
q size: torch.Size([30, 8, 200, 64]), k size: torch.Size([30, 8, 200, 64]), v size: torch.Size([30, 8, 200, 64]), 
scaled.size() : torch.Size([30, 8, 200, 200])
values.size(): torch.Size([30, 8, 200, 64]), attention.size:torch.Size([30, 8, 200, 200]) 
values.size(): torch.Size([30, 200, 512])
out.size(): torch.Size([30, 200, 512])
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean (torch.Size([30, 200, 1]))
Standard Deviation  (torch.Size([30, 200, 1]))
y: torch.Size([30, 200, 512])
self.gamma: torch.Size([512]), self.beta: torch.Size([512])
out: torch.Size([30, 200, 512])
------- ATTENTION 2 ------
x after first linear layer: torch.Size([30, 200, 2048])
x after activation: torch.Size([30, 200, 2048])
x after dropout: torch.Size([30, 200, 2048])
x after 2nd linear layer: torch.