In [3]:
import torch
import torch.nn as nn
import math

# Using kernel : testenv (office)



In [4]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model:int, vocab_size: int ):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self,x):
        return self.embedding * math.sqrt(self.d_model)     

In [5]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        #Create a matrix of shape(seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        #Create a vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0,d_model,2).float() * (-math.log(10000.0)/d_model))

        #Apply the sine to even postions
        pe[: , 0::2] = torch.sin(position * div_term)
        pe[: , 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # (1,Seq_len, d_model)
        self.register_buffer('pe',pe)

    
    
    def forward(self, x):
        x = x+ (self.pe[:, :x.shape[1], :]).requires_grad(False)
        return self.dropout(x)

### Building Add and Norm Layer

__dfdfdfd__

_dfdf_ 
**dfdf** 

In [6]:
print(10**-3)

0.001


In [7]:
class LayerNormalization(nn.Module):

    # constructor
    def __init__(self, eps: float = 10**-6) -> None:
        super.__init__()
        self.eps = eps

        # Trainable parameters of neural net
        self.alpha = nn.Parameter(data= torch.ones(1)) #alpha = multiplicative
        self.bias = nn.Parameter(data= torch.ones(1)) # bias = additive

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        std = x.std(dim = -1, keepdim = True)
        return self.alpha * (x - mean)/(std + self.eps) + self.bias

In [8]:
a = torch.tensor([1,12])
b = a.mean(dtype=float, dim=0)

In [9]:
b

tensor(6.5000, dtype=torch.float64)

# 4.) Feed Forward Block
### Contains two Linear transformations and a ReLU in between

In [10]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: int) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2

    def forward(self, x):
        # (Batch, Seq_len, d_model) ---> (Batch, seq_len, d_ff) ---> (Batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

# 5) Multihead Attention

In [11]:
# Linear Transformation 
w_q = nn.Linear(3,4)

In [12]:
print(w_q.weight)
print(w_q.bias)

Parameter containing:
tensor([[-0.5440, -0.5345,  0.5400],
        [-0.5013,  0.0595,  0.3118],
        [ 0.1393,  0.3877, -0.0468],
        [ 0.5532, -0.4944,  0.1819]], requires_grad=True)
Parameter containing:
tensor([-0.1159,  0.0061, -0.3233,  0.4707], requires_grad=True)


In [13]:
input = torch.randn(4,3)
print(input)

tensor([[-1.0129, -0.3421,  2.0021],
        [ 0.9800, -0.7892,  0.9688],
        [ 1.5839,  0.5169,  0.1802],
        [-0.3647,  0.1959, -0.0209]])


In [14]:
# print(input)

In [15]:
# w_q(input)
dropout = nn.Dropout()
print(dropout(input))

tensor([[-2.0257, -0.6842,  4.0042],
        [ 1.9599, -1.5784,  0.0000],
        [ 3.1677,  0.0000,  0.0000],
        [-0.7294,  0.0000, -0.0000]])


In [16]:
print(dropout)

Dropout(p=0.5, inplace=False)


In [143]:
## Fair

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float ) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        
        assert d_model % h == 0, "d_model is not divisible by 4"
        
        ## Integer division //
        ## d_k = dimension of each head
        self.d_k = d_model // h

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    # A static method can be called without an instance
    @staticmethod
    def attention(query, key, value,  mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # (Batch,h, Seq_len, d_k) x (Batch,h, d_k, Seq_len) = (Batch, h, Seq_len, Seq_len)
        attention_scores = (query @ key.transpose(-2,-1))/ math.sqrt(d_k)

        ## before applying softmax, mask the values
        if mask is not None:
            attention_scores.masked_fill(mask == 0, -1e9)

        # Now apply softmax
        attention_scores =  attention_scores.softmax(dim = -1)  #(Batch, h, seq_len,seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        #(Batch, h, seq_len,seq_len) x (Batch,h, Seq_len, d_k) = (Batch, h, seq_len, d_k)
        return (attention_scores @ value), attention_scores    
    
        



    def forward(self, q,k,v, mask):
        print('When are you called ?')

        ## final arrays for Multihead
        ## dimension of q = seq x d_model
        # w_q == > [d_mode, d_moel]
        # seq x d_model (MatMUl) ==>>  
        query  = self.w_q(q)   ## dimensions [seq x d_model]
        key  = self.w_k(k)   
        value  = self.w_v(v)

        ## dividing each matrix into number of heads

        #(Batch, Seq_len, d_model) --> (Batch, Seq_len, h, d_k) --> (Batch,h, Seq_len, d_k)
        query =  query.view(query.shape[0],query.shape[1], self.h, self.d_k).transpose(1,2)
        key =  key.view(key.shape[0],key.shape[1], self.h, self.d_k).transpose(1,2)
        value =  value.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        ## Applying attention on each head matrix
        # (Batch, h, seq_len, d_k)
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query,key, value,mask,dropout= self.dropout)

        #(Batch, h, seq_len, d_k) --> (Batch,  seq_len, h ,d_k) using transpose --> (Batch, seq_len, h*d_k = d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        ## Linear transformation of value matrix
        # (Batch, seq_len, d_model) ----> (Batch, seq_len, d_model)
        return self.w_o(x)


        
        
    

In [18]:
# a = torch.randn(3,512)
# print(a.shape[1])
# a = a.view()

In [19]:
a.size()

torch.Size([2])

In [20]:
query = torch.randn(1,2,1,2)
# (Batch, h, Seq_len, d_k)
h = query.shape[1]
d_k = query.shape[-1]
seq_len = query.shape[2]
## what I want -- (1, seq_len, d_model)

In [21]:
print(query)

tensor([[[[ 1.0655,  0.5363]],

         [[-1.0024,  0.2427]]]])


In [22]:
query=query.transpose(1,2)

In [23]:
query.shape

torch.Size([1, 1, 2, 2])

In [24]:
# a = MultiHeadAttentionBlock(d_model=512)
# a.forward()

### Self in Python
- represent instance of class
- 


## nn.Dropout ?


## Assert Keyword Python
### I can customize the assertion error

In [25]:
x  = 90

assert x == 90, "Not equal"

### query.view()
### to change the dimension of array

In [26]:
query = torch.randn(3,2)
query = query.view(size= [1,6])


In [27]:
print( query)

tensor([[ 3.2773, -0.8508, -0.3612,  0.6094, -1.3985,  1.8431]])


In [28]:
query = query.transpose(1,0)


In [29]:
print(query)

tensor([[ 3.2773],
        [-0.8508],
        [-0.3612],
        [ 0.6094],
        [-1.3985],
        [ 1.8431]])


In [30]:
query.shape[-2]

6

### Matrix Multiplication using @

In [31]:
a = torch.tensor([1,2,3])
b = torch.tensor([3,4,5])
b = b.view(3,1) 

In [32]:
# query = torch.randn(1,3,512)
# print(query.shape[2])
# query.view(query.shape[0], query.shape[1], )

In [33]:
# query.view()

## dim  = -1 ??

In [34]:
a = torch.tensor([[1,2,3,4],[5,6,7,8]])
a = a.float()
print(a.shape[-1])
# print(a.softmax(dim = -1))
# help(softmax)

4


## Softmax Function Pytorch ??


## torch.Tensor.Contiguous ??

In [35]:
#(Batch, h, seq_len, d_k) --> (Batch,seq_len, h ,d_k) ---> (Batch, seq_len, d_k*h = d_model)

# Using contiguous

torch.manual_seed(seed=0)
key = torch.randn(1,2,3,4)
print(key)
print(key.shape)



key = key.transpose(1,2).contiguous().view(key.shape[0],-1, key.shape[-2]*key.shape[-1])
print(key)

print(key.shape)





tensor([[[[-1.1258, -1.1524, -0.2506, -0.4339],
          [ 0.8487,  0.6920, -0.3160, -2.1152],
          [ 0.4681, -0.1577,  1.4437,  0.2660]],

         [[ 0.1665,  0.8744, -0.1435, -0.1116],
          [ 0.9318,  1.2590,  2.0050,  0.0537],
          [ 0.6181, -0.4128, -0.8411, -2.3160]]]])
torch.Size([1, 2, 3, 4])
tensor([[[-1.1258, -1.1524, -0.2506, -0.4339,  0.1665,  0.8744, -0.1435,
          -0.1116,  0.8487,  0.6920, -0.3160, -2.1152],
         [ 0.9318,  1.2590,  2.0050,  0.0537,  0.4681, -0.1577,  1.4437,
           0.2660,  0.6181, -0.4128, -0.8411, -2.3160]]])
torch.Size([1, 2, 12])


In [36]:
key = key.transpose(1,2).contiguous().view(key.shape[0],-1, key.shape[-2]*key.shape[-1])
print(key)

print(key.shape)

tensor([[[-1.1258,  0.9318, -1.1524,  1.2590, -0.2506,  2.0050, -0.4339,
           0.0537,  0.1665,  0.4681,  0.8744, -0.1577, -0.1435,  1.4437,
          -0.1116,  0.2660,  0.8487,  0.6181,  0.6920, -0.4128, -0.3160,
          -0.8411, -2.1152, -2.3160]]])
torch.Size([1, 1, 24])


# 5.) Residual Connection
Still need to understand where is it applied?

In [38]:
class ResidualConnection(nn.Module):

    ## dropout to reduce overfitting
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

# 6.) Encoder Block

In [40]:
class EncoderBlock(nn.Module):

    def __init__(self,self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()

        #Layers needed in an encoder block
        # Multihead
        self.self_attention_block = self_attention_block

        # feed forward
        self.feed_forward_block = feed_forward_block
        
        # Residual layer block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])


    def forward(self, x, src_mask):

        # multihead    

        # 1st residual layer output
        x = self.residual_connections[0](x, lambda x : self.self_attention_block(x,x,x, src_mask))

        # 2nd residual layer output
        x = self.residual_connections[1](x, self.feed_forward_block)

        return x


# nn.ModuleList() ??

In [None]:
## indexed like a regulary Python list
## modules are properly registered
## visible by all Module methods




In [81]:
print(b)

tensor([[1., 2., 3.]], dtype=torch.float64)


In [84]:
a = nn.Linear(1,3,dtype=float)
print(a.weight.shape)
b = torch.tensor([[1,2,3]],dtype=float).transpose(0,1)
print(b.shape)

b = a(b)

torch.Size([3, 1])
torch.Size([3, 1])


In [85]:
b

tensor([[0.1719, 0.5601, 0.6506],
        [1.0706, 1.2657, 0.4044],
        [1.9693, 1.9713, 0.1582]], dtype=torch.float64,
       grad_fn=<AddmmBackward0>)

Linear(in_features=3, out_features=10, bias=True)


In [73]:
# b = a[0](b)
# print(b)

## ModuleListExample

In [87]:
## ModuleListExample

class MyModule(nn.Module):

    def __init__(self ) -> None:
        super().__init__()
        self.linears = nn.ModuleList([nn.Linear(10,10) for _ in range(10)])


    def forward(self, x):
            

IndentationError: expected an indented block (3457403226.py, line 11)

# enumerate() ?? why to use ? when to use ?

In [122]:
a = nn.ModuleList([nn.Linear(10,10) for _ in range(10)])
b = enumerate(a)
for c in b:
    print(c[1])

Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)


In [137]:
#  (Batch, seq_len, d_model)

g = torch.randn(1,2,3)
print(g)

tensor([[[ 1.8662, -0.6956,  0.1612],
         [ 0.7041,  0.4871,  0.9247]]])


In [138]:
a = (x ,lambda x : g)

In [142]:
print(a)

(90, <function <lambda> at 0x1394c29d0>)


In [149]:
fish = MultiHeadAttentionBlock(8,8,1)

## 7.) Encoder