In [2]:
import torch
import torch.nn as nn
import math

In [3]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model:int, vocab_size: int ):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self,x):
        return self.embedding * math.sqrt(self.d_model)     

In [4]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        #Create a matrix of shape(seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        #Create a vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0,d_model,2).float() * (-math.log(10000.0)/d_model))

        #Apply the sine to even postions
        pe[: , 0::2] = torch.sin(position * div_term)
        pe[: , 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # (1,Seq_len, d_model)
        self.register_buffer('pe',pe)

    
    
    def forward(self, x):
        x = x+ (self.pe[:, :x.shape[1], :]).requires_grad(False)
        return self.dropout(x)

### Building Add and Norm Layer

__dfdfdfd__

_dfdf_ 
**dfdf** 

In [5]:
print(10**-3)

0.001


In [6]:
class LayerNormalization(nn.Module):

    # constructor
    def __init__(self, eps: float = 10**-6) -> None:
        super.__init__()
        self.eps = eps

        # Trainable parameters of neural net
        self.alpha = nn.Parameter(data= torch.ones(1)) #alpha = multiplicative
        self.bias = nn.Parameter(data= torch.ones(1)) # bias = additive

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        std = x.std(dim = -1, keepdim = True)
        return self.alpha * (x - mean)/(std + self.eps) + self.bias

In [7]:
a = torch.tensor([1,12])
b = a.mean(dtype=float, dim=0)

In [8]:
b

tensor(6.5000, dtype=torch.float64)

# 4.) Feed Forward Block
### Contains two Linear transformations and a ReLU in between

In [9]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: int) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2

    def forward(self, x):
        # (Batch, Seq_len, d_model) ---> (Batch, seq_len, d_ff) ---> (Batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

# 5) Multihead Attention

In [45]:
# Linear Transformation 
w_q = nn.Linear(3,4)

In [52]:
print(w_q.weight)
print(w_q.bias)

Parameter containing:
tensor([[ 0.3512, -0.4764,  0.4040],
        [-0.4498,  0.2830,  0.0951],
        [ 0.3542, -0.2401,  0.2020],
        [ 0.0870,  0.4940, -0.0263]], requires_grad=True)
Parameter containing:
tensor([ 0.5289, -0.2276, -0.2474,  0.1907], requires_grad=True)


In [70]:
input = torch.randn(4,3)
print(input)

tensor([[-0.1528, -1.8556,  1.8692],
        [-0.6580, -0.7270,  2.6306],
        [-0.0721, -1.3781, -0.7330],
        [-1.9977,  1.8410,  2.0114]])


In [71]:
# print(input)

In [72]:
# w_q(input)
dropout = nn.Dropout()
print(dropout(input))

tensor([[-0.3057, -0.0000,  0.0000],
        [-1.3159, -0.0000,  0.0000],
        [-0.0000, -2.7561, -1.4659],
        [-3.9955,  3.6821,  4.0228]])


In [65]:
print(dropout)

Dropout(p=0.5, inplace=False)


In [76]:
## Fair

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float ) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        
        assert d_model % h == 0, "d_model is not divisible by 4"
        
        ## Integer division //
        ## d_k = dimension of each head
        self.d_k = d_model // h

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    # A static method can be called without an instance
    @staticmethod
    def attention(query, key, value,  mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # (Batch,h, Seq_len, d_k) x (Batch,h, d_k, Seq_len) = (Batch, h, Seq_len, Seq_len)
        attention_scores = (query @ key.transpose(-2,-1))/ math.sqrt(d_k)

        ## before applying softmax, mask the values
        if mask is not None:
            attention_scores.masked_fill(mask == 0, -1e9)

        # Now apply softmax
        attention_scores =  attention_scores.softmax(dim = -1)  #(Batch, h, seq_len,seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        #(Batch, h, seq_len,seq_len) x (Batch,h, Seq_len, d_k) = (Batch, h, seq_len, d_k)
        return (attention_scores @ value), attention_scores    
    
        



    def forward(self, q,k,v, mask):

        ## final arrays for Multihead
        ## dimension of q = seq x d_model
        # w_q == > [d_mode, d_moel]
        # seq x d_model (MatMUl) ==>>  
        query  = self.w_q(q)   ## dimensions [seq x d_model]
        key  = self.w_k(k)   
        value  = self.w_v(v)

        ## dividing each matrix into number of heads

        #(Batch, Seq_len, d_model) --> (Batch, Seq_len, h, d_k) --> (Batch,h, Seq_len, d_k)
        query =  query.view(query.shape[0],query.shape[1], self.h, self.d_k).transpose(1,2)
        key =  key.view(key.shape[0],key.shape[1], self.h, self.d_k).transpose(1,2)
        value =  value.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        ## Applying attention on each head matrix
        # (Batch, h, seq_len, d_k)
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query,key, value,mask,dropout= self.dropout)

        #(Batch, h, seq_len, d_k) --> (Batch,  seq_len, h ,d_k) using transpose --> (Batch, seq_len, h*d_k = d_model)

        
        
    

In [102]:
a = torch.randn(3,512)
print(a.shape[1])
a = a.view()

512


In [104]:
a.size()

torch.Size([8, 128])

In [217]:
query = torch.randn(1,2,1,2)
# (Batch, h, Seq_len, d_k)
h = query.shape[1]
d_k = query.shape[-1]
seq_len = query.shape[2]
## what I want -- (1, seq_len, d_model)

In [219]:
print(query)

tensor([[[[ 1.6096,  0.1722]],

         [[-0.3086, -1.3135]]]])


In [220]:
query=query.transpose(1,2)

In [226]:
query.shape

torch.Size([1, 1, 2, 2])

In [61]:
a = MultiHeadAttentionBlock(d_model=512)
a.forward()

512


### Self in Python
- represent instance of class
- 


## nn.Dropout ?


## Assert Keyword Python
### I can customize the assertion error

In [75]:
x  = 90

assert x == 90, "Not equal"

### query.view()
### to change the dimension of array

In [121]:
query = torch.randn(3,2)
query = query.view(size= [1,6])


In [122]:
print( query)

tensor([[-0.5354, -0.1534,  0.3063,  2.7817,  0.4930,  1.7247]])


In [145]:
query = query.transpose(1,0)


In [146]:
print(query)

tensor([[-0.5354],
        [-0.1534],
        [ 0.3063],
        [ 2.7817],
        [ 0.4930],
        [ 1.7247]])


In [152]:
query.shape[-2]

6

### Matrix Multiplication using @

In [154]:
a = torch.tensor([1,2,3])
b = torch.tensor([3,4,5])
b = b.view(3,1) 

In [175]:
query = torch.randn(1,3,512)
print(query.shape[2])
query.view(query.shape[0], query.shape[1], )

512


In [172]:
query.view()

tensor([[[[ 0.8631,  0.7630, -1.6483,  0.3889,  1.3791,  0.4691, -0.3281,
            0.0448, -0.0949, -0.9514],
          [-0.6676,  1.7781, -0.6553,  0.9162, -0.0135,  1.1914,  1.7496,
           -0.7324, -0.8473, -0.8272]],

         [[-1.8201, -0.6571,  0.7106,  0.4080,  0.2826, -1.1668,  1.1877,
           -0.4599, -1.3485,  0.0875],
          [ 0.4184, -0.3341, -0.6875, -1.3446, -0.4840,  1.4499,  0.4163,
            0.1590, -0.1998,  0.3030]]]])

## dim  = -1 ??

In [199]:
a = torch.tensor([[1,2,3,4],[5,6,7,8]])
a = a.float()
print(a.shape[-1])
# print(a.softmax(dim = -1))
# help(softmax)

4


## Softmax Function Pytorch ??


## torch.Tensor.Contiguous ??