In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model:int, vocab_size: int ):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self,x):
        return self.embedding * math.sqrt(self.d_model)     

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        #Create a matrix of shape(seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        #Create a vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0,d_model,2).float() * (-math.log(10000.0)/d_model))

        #Apply the sine to even postions
        pe[: , 0::2] = torch.sin(position * div_term)
        pe[: , 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # (1,Seq_len, d_model)
        self.register_buffer('pe',pe)

    
    
    def forward(self, x):
        x = x+ (self.pe[:, :x.shape[1], :]).requires_grad(False)
        return self.dropout(x)

### Building Add and Norm Layer

__dfdfdfd__

_dfdf_ 
**dfdf** 

In [4]:
print(10**-3)

0.001


In [5]:
class LayerNormalization(nn.Module):

    # constructor
    def __init__(self, eps: float = 10**-6) -> None:
        super.__init__()
        self.eps = eps

        # Trainable parameters of neural net
        self.alpha = nn.Parameter(data= torch.ones(1)) #alpha = multiplicative
        self.bias = nn.Parameter(data= torch.ones(1)) # bias = additive

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        std = x.std(dim = -1, keepdim = True)
        return self.alpha * (x - mean)/(std + self.eps) + self.bias

In [6]:
a = torch.tensor([1,12])
b = a.mean(dtype=float, dim=0)

In [7]:
b

tensor(6.5000, dtype=torch.float64)

# 4.) Feed Forward Block
### Contains two Linear transformations and a ReLU in between

In [8]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: int) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2

    def forward(self, x):
        # (Batch, Seq_len, d_model) ---> (Batch, seq_len, d_ff) ---> (Batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

# 5) Multihead Attention

In [9]:
# Linear Transformation 
w_q = nn.Linear(3,4)

In [10]:
print(w_q.weight)
print(w_q.bias)

Parameter containing:
tensor([[ 0.0687,  0.0798, -0.3189],
        [-0.3182, -0.0675, -0.1275],
        [ 0.0954,  0.0717,  0.2598],
        [-0.0671, -0.5393, -0.3109]], requires_grad=True)
Parameter containing:
tensor([-0.4992,  0.1079,  0.1394,  0.2484], requires_grad=True)


In [11]:
input = torch.randn(4,3)
print(input)

tensor([[-0.1093,  0.0028, -0.5319],
        [-0.2542,  2.1631, -0.4545],
        [-0.0384, -0.6592,  0.9634],
        [ 0.9426,  1.0073, -0.7800]])


In [12]:
# print(input)

In [13]:
# w_q(input)
dropout = nn.Dropout()
print(dropout(input))

tensor([[-0.0000,  0.0000, -0.0000],
        [-0.0000,  0.0000, -0.0000],
        [-0.0767, -1.3184,  0.0000],
        [ 0.0000,  2.0147, -0.0000]])


In [14]:
print(dropout)

Dropout(p=0.5, inplace=False)


In [15]:
## Fair

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float ) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        
        assert d_model % h == 0, "d_model is not divisible by 4"
        
        ## Integer division //
        ## d_k = dimension of each head
        self.d_k = d_model // h

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    # A static method can be called without an instance
    @staticmethod
    def attention(query, key, value,  mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # (Batch,h, Seq_len, d_k) x (Batch,h, d_k, Seq_len) = (Batch, h, Seq_len, Seq_len)
        attention_scores = (query @ key.transpose(-2,-1))/ math.sqrt(d_k)

        ## before applying softmax, mask the values
        if mask is not None:
            attention_scores.masked_fill(mask == 0, -1e9)

        # Now apply softmax
        attention_scores =  attention_scores.softmax(dim = -1)  #(Batch, h, seq_len,seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        #(Batch, h, seq_len,seq_len) x (Batch,h, Seq_len, d_k) = (Batch, h, seq_len, d_k)
        return (attention_scores @ value), attention_scores    
    
        



    def forward(self, q,k,v, mask):

        ## final arrays for Multihead
        ## dimension of q = seq x d_model
        # w_q == > [d_mode, d_moel]
        # seq x d_model (MatMUl) ==>>  
        query  = self.w_q(q)   ## dimensions [seq x d_model]
        key  = self.w_k(k)   
        value  = self.w_v(v)

        ## dividing each matrix into number of heads

        #(Batch, Seq_len, d_model) --> (Batch, Seq_len, h, d_k) --> (Batch,h, Seq_len, d_k)
        query =  query.view(query.shape[0],query.shape[1], self.h, self.d_k).transpose(1,2)
        key =  key.view(key.shape[0],key.shape[1], self.h, self.d_k).transpose(1,2)
        value =  value.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        ## Applying attention on each head matrix
        # (Batch, h, seq_len, d_k)
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query,key, value,mask,dropout= self.dropout)

        #(Batch, h, seq_len, d_k) --> (Batch,  seq_len, h ,d_k) using transpose --> (Batch, seq_len, h*d_k = d_model)

        

        
        
    

In [17]:
# a = torch.randn(3,512)
# print(a.shape[1])
# a = a.view()

In [18]:
a.size()

torch.Size([3, 512])

In [19]:
query = torch.randn(1,2,1,2)
# (Batch, h, Seq_len, d_k)
h = query.shape[1]
d_k = query.shape[-1]
seq_len = query.shape[2]
## what I want -- (1, seq_len, d_model)

In [20]:
print(query)

tensor([[[[-0.0681,  1.5254]],

         [[ 0.7955,  1.3008]]]])


In [21]:
query=query.transpose(1,2)

In [22]:
query.shape

torch.Size([1, 1, 2, 2])

In [24]:
# a = MultiHeadAttentionBlock(d_model=512)
# a.forward()

### Self in Python
- represent instance of class
- 


## nn.Dropout ?


## Assert Keyword Python
### I can customize the assertion error

In [25]:
x  = 90

assert x == 90, "Not equal"

### query.view()
### to change the dimension of array

In [26]:
query = torch.randn(3,2)
query = query.view(size= [1,6])


In [27]:
print( query)

tensor([[-0.2488, -0.8772,  0.2910,  0.2125, -0.9777, -1.0864]])


In [28]:
query = query.transpose(1,0)


In [29]:
print(query)

tensor([[-0.2488],
        [-0.8772],
        [ 0.2910],
        [ 0.2125],
        [-0.9777],
        [-1.0864]])


In [30]:
query.shape[-2]

6

### Matrix Multiplication using @

In [31]:
a = torch.tensor([1,2,3])
b = torch.tensor([3,4,5])
b = b.view(3,1) 

In [33]:
# query = torch.randn(1,3,512)
# print(query.shape[2])
# query.view(query.shape[0], query.shape[1], )

In [35]:
# query.view()

## dim  = -1 ??

In [36]:
a = torch.tensor([[1,2,3,4],[5,6,7,8]])
a = a.float()
print(a.shape[-1])
# print(a.softmax(dim = -1))
# help(softmax)

4


## Softmax Function Pytorch ??


## torch.Tensor.Contiguous ??

In [47]:
#(Batch, h, seq_len, d_k) --> (Batch, seq_len, h, d_k) --> (Batch, seq_len, h*d_k = d_model)
torch.manual_seed(seed=0)
query = torch.randn(2,3,4,5)

# output required (2,4,3,5)
query = query.view(2,4, 15)

In [48]:
query

tensor([[[-1.1258e+00, -1.1524e+00, -2.5058e-01, -4.3388e-01,  8.4871e-01,
           6.9201e-01, -3.1601e-01, -2.1152e+00,  3.2227e-01, -1.2633e+00,
           3.4998e-01,  3.0813e-01,  1.1984e-01,  1.2377e+00,  1.1168e+00],
         [-2.4728e-01, -1.3527e+00, -1.6959e+00,  5.6665e-01,  7.9351e-01,
           5.9884e-01, -1.5551e+00, -3.4136e-01,  1.8530e+00,  7.5019e-01,
          -5.8550e-01, -1.7340e-01,  1.8348e-01,  1.3894e+00,  1.5863e+00],
         [ 9.4630e-01, -8.4368e-01, -6.1358e-01,  3.1593e-02, -4.9268e-01,
           2.4841e-01,  4.3970e-01,  1.1241e-01,  6.4079e-01,  4.4116e-01,
          -1.0231e-01,  7.9244e-01, -2.8967e-01,  5.2507e-02,  5.2286e-01],
         [ 2.3022e+00, -1.4689e+00, -1.5867e+00, -6.7309e-01,  8.7283e-01,
           1.0554e+00,  1.7784e-01, -2.3034e-01, -3.9175e-01,  5.4329e-01,
          -3.9516e-01, -4.4622e-01,  7.4402e-01,  1.5210e+00,  3.4105e+00]],

        [[-1.5312e+00, -1.2341e+00,  1.8197e+00, -5.5153e-01, -5.6925e-01,
           9.1997e-

In [46]:
torch.manual_seed(seed=0)
value = torch.randn(2,3,4,5)
print(value)
value = value.transpose(1,2).contiguous().view(value.sh)

tensor([[[[-1.1258e+00, -1.1524e+00, -2.5058e-01, -4.3388e-01,  8.4871e-01],
          [ 6.9201e-01, -3.1601e-01, -2.1152e+00,  3.2227e-01, -1.2633e+00],
          [ 3.4998e-01,  3.0813e-01,  1.1984e-01,  1.2377e+00,  1.1168e+00],
          [-2.4728e-01, -1.3527e+00, -1.6959e+00,  5.6665e-01,  7.9351e-01]],

         [[ 5.9884e-01, -1.5551e+00, -3.4136e-01,  1.8530e+00,  7.5019e-01],
          [-5.8550e-01, -1.7340e-01,  1.8348e-01,  1.3894e+00,  1.5863e+00],
          [ 9.4630e-01, -8.4368e-01, -6.1358e-01,  3.1593e-02, -4.9268e-01],
          [ 2.4841e-01,  4.3970e-01,  1.1241e-01,  6.4079e-01,  4.4116e-01]],

         [[-1.0231e-01,  7.9244e-01, -2.8967e-01,  5.2507e-02,  5.2286e-01],
          [ 2.3022e+00, -1.4689e+00, -1.5867e+00, -6.7309e-01,  8.7283e-01],
          [ 1.0554e+00,  1.7784e-01, -2.3034e-01, -3.9175e-01,  5.4329e-01],
          [-3.9516e-01, -4.4622e-01,  7.4402e-01,  1.5210e+00,  3.4105e+00]]],


        [[[-1.5312e+00, -1.2341e+00,  1.8197e+00, -5.5153e-01, -5.69