In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model:int, vocab_size: int ):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Embed the indices in ``x`` and scale the result by ``sqrt(d_model)``.

        Args:
            x: Integer tensor of token indices, each in ``[0, vocab_size)``.

        Returns:
            Tensor of shape ``x.shape + (d_model,)``.
        """
        # The embedding module has to be *called* on the indices; multiplying
        # the module object itself by a float raises TypeError.
        return self.embedding(x) * math.sqrt(self.d_model)

In [3]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    The encoding table is computed once in ``__init__`` and stored as the
    buffer ``pe`` with shape ``(1, seq_len, d_model)``.

    Args:
        d_model: Size of the last dimension of the encoding table.
        seq_len: Number of positions the table covers.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        # Create a vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # One frequency per even column index: exp(-i * ln(10000) / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply the sine to even positions and the cosine to odd positions.
        pe[:, 0::2] = torch.sin(position * div_term)
        # When d_model is odd there is one fewer odd column than even columns,
        # so trim div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)
        # Registered as a buffer so it is stored on the module as `self.pe`.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions to ``x``.

        Args:
            x: Tensor whose dimension 1 indexes positions and whose last
                dimension matches ``d_model``.

        Returns:
            ``dropout(x + pe[:, :x.shape[1], :])``.
        """
        # `requires_grad_` (trailing underscore) is the in-place method;
        # `requires_grad` is a bool attribute and calling it raises TypeError.
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)

### Building Add and Norm Layer

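Layer normalization standardises each feature vector along its last dimension (subtract the mean, divide by the standard deviation plus a small `eps`), then applies a learnable scale (`alpha`) and shift (`bias`).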

In [4]:
# Scratch check: ten to the power of minus three, written as a float literal.
print(1e-3)

0.001


In [5]:
class LayerNormalization(nn.Module):
    """Normalises the last dimension of the input, then applies a learnable
    scalar scale (``alpha``) and scalar shift (``bias``).

    Args:
        eps: Small constant added to the standard deviation to avoid
            division by zero.
    """

    # constructor
    def __init__(self, eps: float = 10**-6) -> None:
        # `super()` must be called to obtain the proxy object; `super.__init__()`
        # invokes the descriptor on the `super` type itself and raises TypeError.
        super().__init__()
        self.eps = eps

        # Trainable parameters of neural net
        self.alpha = nn.Parameter(data=torch.ones(1))   # alpha = multiplicative, starts at 1
        # The additive term starts at 0 so a freshly built layer is a pure
        # normalisation (the same initialisation nn.LayerNorm uses).
        self.bias = nn.Parameter(data=torch.zeros(1))   # bias = additive, starts at 0

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim.

        ``mean`` and ``std`` are computed with ``keepdim=True`` so they
        broadcast against ``x``. ``x.std`` is called with its default
        arguments.
        """
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

In [6]:
# Scratch check: mean of a small integer tensor, accumulated as float64.
a = torch.tensor([1, 12])
b = torch.mean(a, dim=0, dtype=torch.float64)

In [7]:
# Display the mean computed in the previous cell.
b

tensor(6.5000, dtype=torch.float64)

# 4) Feed Forward Block
### Contains two Linear transformations and a ReLU in between

In [8]:
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU (a float, passed
            to ``nn.Dropout``).
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # W2 and B2

    def forward(self, x):
        """Apply the two linear maps with ReLU and dropout in between.

        The linear layers act on the last dimension of ``x``, which must have
        size ``d_model``; the output has the same shape as ``x``.
        """
        # (Batch, seq_len, d_model) ---> (Batch, seq_len, d_ff) ---> (Batch, seq_len, d_model)
        hidden = torch.relu(self.linear_1(x))
        return self.linear_2(self.dropout(hidden))

# 5) Multihead Attention