# 2. Input Embedding

In [2]:
import torch
import torch.nn as nn
import math

In [3]:
class EnputEmbedding(nn.Module):

    def __init__(self,
                 d_model:int,
                 vocab_size:int
                 )-> None:
        
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        

        # calculate the embeddings 
        self.embeddings = nn.Embedding(vocab_size, d_model)


    def forward(self,x):
        x = self.embeddings(x)  * math.sqrt(self.d_model)
        return x


In [4]:
d_model = 4
vocab_size = 20000

embedding_layer = EnputEmbedding(d_model, 
                                 vocab_size
                                 )
embedding_layer


EnputEmbedding(
  (embeddings): Embedding(20000, 4)
)


<style>
    .blue {
        background-color: #0074D9;
        border-radius:2px
    }
</style>

<div class="blue">
    Question: what is the difference  between vocab size and sequence length in transformer, let's give the example ?
</div>





Sure, let's consider the sentence "Ram eats mango."

**Vocabulary Size**: If we tokenize this sentence into words, our vocabulary (the set of unique tokens) would be ["Ram", "eats", "mango"]. The vocabulary size is 3 because there are three unique tokens.

**Sequence Length**: The sequence length of the sentence "Ram eats mango" is 3 because there are three tokens in this sequence when we process it.

In a transformer model, the embedding layer would have an embedding for each token in the vocabulary, and the model would process sequences of tokens up to a certain sequence length. For instance, if the model's maximum sequence length is 5, it can process sequences like "Ram eats mango" (length 3) without any issue, but for longer sequences like "Ram eats a ripe mango under the tree" (length 7), it would need to truncate or otherwise handle the extra tokens since the sequence length exceeds the model's maximum.

<style>
    .blue {
        background-color: #0074D9;
        border-radius:2px;
    }
</style>

<div class="blue">
    Question: suppose that sentence is [Ram eats mango give this mango by ram] then what is vocabulary size and sequence length of this sentence ?
</div>




For the sentence "Ram eats mango give this mango by ram":

**Vocabulary Size**: The unique words are ["Ram", "eats", "mango", "give", "this", "by"], noting that "Ram" and "ram" are considered the same if we ignore case sensitivity. So, the vocabulary size is 6.

**Sequence Length**: The total number of words in the sentence is 8, so the sequence length is 8.

# Positional Encoding

In [5]:
class PositionalEncoding(nn.Module):

    def __init__(self, 
                 d_model:int,
                 seq_len:int,
                 dropout:float
                 ) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len 
        self.dropout = nn.Dropout(dropout)

        # calculate the positional Encoding

        pe = torch.zeros(seq_len, d_model)

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))



        # calculate the sine indices 
        pe[:, 0::2] = torch.sin(position / div_term)

        # calculate the cos indices 
        pe[:, 1::2] = torch.cos(position / div_term)

        # unsqueeze the Encoding 
        pe.unsqueeze(0)


        # register the Encoding using register buffer 
        self.register_buffer('pe', pe)


    def forward(self, x):

        x = x + (self.pe[:, :x.shape[1], :]).requires_grad(False)
        x = self.dropout(x)
        return x


In [6]:
pe = PositionalEncoding(
    d_model, 
    seq_len=20,
    dropout=0.2
)
pe

PositionalEncoding(
  (dropout): Dropout(p=0.2, inplace=False)
)

- In the output `PositionalEncoding((dropout): Dropout(p=0.2, inplace=False))`, the `p` represents the probability of an element to be zeroed, which is the dropout rate. So, `p=0.2` means that there is a 20% chance that any given neuron (or element) will be set to zero during training at each update cycle. This helps prevent overfitting by providing a form of regularization.

In [7]:
d_model = 512

div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
div_term

tensor([1.0000e+00, 9.6466e-01, 9.3057e-01, 8.9769e-01, 8.6596e-01, 8.3536e-01,
        8.0584e-01, 7.7737e-01, 7.4989e-01, 7.2339e-01, 6.9783e-01, 6.7317e-01,
        6.4938e-01, 6.2643e-01, 6.0430e-01, 5.8294e-01, 5.6234e-01, 5.4247e-01,
        5.2330e-01, 5.0481e-01, 4.8697e-01, 4.6976e-01, 4.5316e-01, 4.3714e-01,
        4.2170e-01, 4.0679e-01, 3.9242e-01, 3.7855e-01, 3.6517e-01, 3.5227e-01,
        3.3982e-01, 3.2781e-01, 3.1623e-01, 3.0505e-01, 2.9427e-01, 2.8387e-01,
        2.7384e-01, 2.6416e-01, 2.5483e-01, 2.4582e-01, 2.3714e-01, 2.2876e-01,
        2.2067e-01, 2.1288e-01, 2.0535e-01, 1.9810e-01, 1.9110e-01, 1.8434e-01,
        1.7783e-01, 1.7154e-01, 1.6548e-01, 1.5963e-01, 1.5399e-01, 1.4855e-01,
        1.4330e-01, 1.3824e-01, 1.3335e-01, 1.2864e-01, 1.2409e-01, 1.1971e-01,
        1.1548e-01, 1.1140e-01, 1.0746e-01, 1.0366e-01, 1.0000e-01, 9.6466e-02,
        9.3057e-02, 8.9769e-02, 8.6596e-02, 8.3536e-02, 8.0584e-02, 7.7736e-02,
        7.4989e-02, 7.2339e-02, 6.9783e-

In [8]:
d_model = 512

div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/ d_model))
div_term

tensor([1.0000e+00, 9.6466e-01, 9.3057e-01, 8.9769e-01, 8.6596e-01, 8.3536e-01,
        8.0584e-01, 7.7737e-01, 7.4989e-01, 7.2339e-01, 6.9783e-01, 6.7317e-01,
        6.4938e-01, 6.2643e-01, 6.0430e-01, 5.8294e-01, 5.6234e-01, 5.4247e-01,
        5.2330e-01, 5.0481e-01, 4.8697e-01, 4.6976e-01, 4.5316e-01, 4.3714e-01,
        4.2170e-01, 4.0679e-01, 3.9242e-01, 3.7855e-01, 3.6517e-01, 3.5227e-01,
        3.3982e-01, 3.2781e-01, 3.1623e-01, 3.0505e-01, 2.9427e-01, 2.8387e-01,
        2.7384e-01, 2.6416e-01, 2.5483e-01, 2.4582e-01, 2.3714e-01, 2.2876e-01,
        2.2067e-01, 2.1288e-01, 2.0535e-01, 1.9810e-01, 1.9110e-01, 1.8434e-01,
        1.7783e-01, 1.7154e-01, 1.6548e-01, 1.5963e-01, 1.5399e-01, 1.4855e-01,
        1.4330e-01, 1.3824e-01, 1.3335e-01, 1.2864e-01, 1.2409e-01, 1.1971e-01,
        1.1548e-01, 1.1140e-01, 1.0746e-01, 1.0366e-01, 1.0000e-01, 9.6466e-02,
        9.3057e-02, 8.9769e-02, 8.6596e-02, 8.3536e-02, 8.0584e-02, 7.7736e-02,
        7.4989e-02, 7.2339e-02, 6.9783e-

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
   Question: suppose that sentence is [Ram eats mango give this mango by ram] then what is vocabulary size and sequence length of this sentence ?
</div>



- what is the difference  of output if i use the float in the code i am privide the code above

Using `.float()` in the code ensures that the tensor elements are floating-point numbers, which allows for proper floating-point arithmetic during the calculation of `div_term`. If you don't use `.float()`, the tensor elements would be integers, and the operation could result in an error because PyTorch expects tensors to be of the same type when performing element-wise operations. Even if it didn't result in an error, you would get incorrect results due to integer division truncating the decimal part of the results.

So, using `.float()` is crucial for obtaining the correct `div_term` values as intended for positional encoding in a transformer model.

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: why in positional Encoding using the register buffer ?
</div>




In PyTorch, `register_buffer` is used to register a tensor as a buffer within a module. Buffers are tensors that are not treated as model parameters; that is, they do not get updated during training when using optimizers. However, they are part of the model's state and are included when saving and loading the model.

Positional encodings in transformers are usually fixed and do not change during training, which makes them perfect candidates for buffers. By registering positional encodings as buffers, you ensure that they're saved and loaded with the model, and they're moved to the correct device along with the rest of the model. This is important for maintaining consistency in the model's behavior, especially when you're deploying it or resuming training from a checkpoint.




<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: if i am not used to the register buffer  then what is problem i phase ?
</div>


If you do not use `register_buffer` for positional encodings in a transformer model, you might face several issues:

1. **State Dict**: The positional encodings will not be part of the model's `state_dict`, which means they won't be saved or loaded correctly with the model's parameters. This can lead to inconsistencies when saving and loading models.

2. **Device Management**: When you move your model to a different device (e.g., from CPU to GPU), the positional encodings will not automatically move with it. This can cause device mismatches and result in runtime errors.

3. **Model Updates**: If the positional encodings are mistakenly treated as parameters rather than buffers, they might get updated during training, which is not desired since positional encodings should remain constant.

Using `register_buffer` ensures that the positional encodings are properly managed throughout the lifecycle of the model.

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: why i used the requuires_grad if i am not use then what happend ?
</div>


In PyTorch, `.requires_grad(False)` is used to tell the framework that gradients should not be calculated for this tensor during backpropagation. Positional encodings are constant values that do not need to be updated during training, so setting `.requires_grad` to `False` prevents them from being part of the gradient computation.

If you do not use `.requires_grad(False)`, then:

1. **Gradient Computation**: The model will unnecessarily compute gradients for the positional encodings during backpropagation, even though they should remain constant.

2. **Memory Usage**: This will increase memory usage because the computational graph will keep track of operations performed on the positional encodings for gradient calculation.

3. **Training Efficiency**: It can potentially slow down training because of the extra computation and memory used for something that doesn't contribute to model learning.

So, it's a good practice to set `.requires_grad` to `False` for tensors that are not supposed to change during training.

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: which num should take dropout in the positonal encoding ?
</div>


The dropout rate in the positional encoding module is a hyperparameter that you can tune based on your specific use case and the behavior of your model during training. Commonly used dropout rates range from **0.1 to 0.5**. A rate of **0.1** means that each unit or connection has a 10% chance of being dropped, while a rate of **0.5** means a 50% chance.

Choosing the right dropout rate often involves experimentation and validation. Start with a lower rate like **0.1** or **0.2**, and if you notice overfitting (where your model performs well on the training data but poorly on unseen data), you can try increasing the dropout rate to see if it improves generalization.

Remember, too much dropout can also lead to underfitting, where the model fails to learn the underlying patterns in the data. It's all about finding the right balance.

# Normalization

In [9]:
class LayerNormalization(nn.Module):

    def __init__(self,
                 features:int,
                 eps:float=10**-6
                 ) -> None:
        super().__init__()
        self.eps = eps 
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))


    def forward(self, x):

        # calculate the mean 
        mean = x.mean(dim=-1,
                      keepdim=True)
        

        std = x.std(dim = -1, deepdim=True)

        return self.alpha * (x - mean) / (std + self.eps) + self.bias


In [10]:
norm = LayerNormalization(
    features=512
)
norm.alpha
norm.bias


Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: what number should give feature in this layernormalization ?
</div>


The `features` parameter in the `LayerNormalization` class should match the number of features (or dimensions) in the last axis of the input tensor that you want to normalize. For example, if you're working with a transformer model and your input tensor has a shape of `[batch_size, seq_length, d_model]`, where `d_model` is the size of the model's embeddings or hidden states, then `features` should be set to `d_model`.

In other words, if your input tensor's last dimension has 512 features (i.e., each token in your sequence is represented by a 512-dimensional vector), then you would initialize your `LayerNormalization` module with `features=512`. This ensures that layer normalization is applied correctly across each feature dimension for every token in your sequence.

# FeedForward

In [11]:
class FeedForwardBlock(nn.Module):

    def __init__(self,
                 d_model:int,
                 d_ff:int,
                 dropout:float
                 ) -> None:
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)


    def forward(self, x):
        x = self.linear_2(self.dropout(torch.relu(self.linear_1(x))))
        return x 


In [12]:
feedforward = FeedForwardBlock(d_model=512,
                               d_ff=2048,
                               dropout=0.2
                               )
feedforward

FeedForwardBlock(
  (linear_1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear_2): Linear(in_features=2048, out_features=512, bias=True)
)

In [13]:
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self,
                 d_model:int,  # 512
                 h:int,        # 8
                 dropout:float  # 0.2
                 ) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h 
        assert d_model % h == 0

        self.d_k = d_model // h 
       
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)



    @staticmethod
    def attention(query, key, value, mask, dropout:nn.Dropout):

        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            attention_scores.masked_fill_(mask == 0,
                                          -1e9)
            
        attention_scores = attention_scores.softmax(dim=-1)

        if dropout is not None:
            attention_scores = dropout(attention_scores)
            return (attention_scores @ value), attention_scores




    def forward(self, q, k, v, mask):
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        query = query.view(
            query.shape[0], query.shape[1], self.h, self.d_k
        ).transpose(1, 2)

        key = key.view(
            key.shape[0], key.shape[1], self.h, self.d_k
        ).transpose(1, 2)

        value = value.view(
            value.shape[0], value.shape[1], self.h, self.d_k
        ).transpose(1, 2)

        # calculation of attention 
        x , self.attention_score = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(x)

In [14]:
multiHeadAttention = MultiHeadAttentionBlock(
    d_model=512,
    h=8,
    dropout=0.2
)
# multiHeadAttention.attention
# multiHeadAttention.w_q
# multiHeadAttention.w_k
# multiHeadAttention.w_v
# multiHeadAttention.w_o
# multiHeadAttention.d_model
# multiHeadAttention.h
# multiHeadAttention.d_k
multiHeadAttention


MultiHeadAttentionBlock(
  (w_q): Linear(in_features=512, out_features=512, bias=False)
  (w_k): Linear(in_features=512, out_features=512, bias=False)
  (w_v): Linear(in_features=512, out_features=512, bias=False)
  (w_o): Linear(in_features=512, out_features=512, bias=False)
  (dropout): Dropout(p=0.2, inplace=False)
)

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: what is the heads of multhhead attention ?
</div>


In multi-head attention, "heads" refer to the parallel attention mechanisms that run at the same time. Each head computes its own set of attention scores independently, allowing the model to capture different types of information from different parts of the input sequence. For example, one head might focus on the syntactic aspects while another might capture semantic information.

The number of heads is a hyperparameter that you can tune. More heads allow the model to capture a wider variety of information but also increase computational complexity. Common values for the number of heads are 8 or 16, but this can vary based on your specific task and model size.

Here's a simplified explanation:
- **Single Attention**: One set of attention scores, focusing on one aspect of the input.
- **Multi-Head Attention**: Multiple sets of attention scores (heads), each focusing on different aspects of the input simultaneously.

This design is part of what allows transformers to be so effective for tasks involving sequences, such as language modeling and machine translation.


# ResidualConnection

In [15]:
class ResidualConnection(nn.Module):

    def __init__(self,
                 features:int,
                 dropout:float) -> None:
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [16]:
residualconnection = ResidualConnection(
    features=512,
    dropout=0.2
)
residualconnection.dropout


Dropout(p=0.2, inplace=False)

# EncoderBlock

In [17]:
class EncoderBlock(nn.Module):

    def __init__(self,
                 features:int,   # 512
                 self_attention_block:MultiHeadAttentionBlock,
                 feed_forward_block:FeedForwardBlock,
                 dropout:float  # 0.2
                 ) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])


    def forward(self, x, src_mask):

        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))    
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x 

In [18]:
encoderblock = EncoderBlock(
    features=512,
    dropout=0.2,
    self_attention_block=MultiHeadAttentionBlock,
    feed_forward_block=FeedForwardBlock
)

encoderblock

EncoderBlock(
  (residual_connections): ModuleList(
    (0-1): 2 x ResidualConnection(
      (dropout): Dropout(p=0.2, inplace=False)
      (norm): LayerNormalization()
    )
  )
)

<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: (0-1): 2 x ResidualConnection   what is this ?
</div>


The notation "(0-1): 2 x ResidualConnection" indicates that there are two instances of the `ResidualConnection` module within the `ModuleList`, and they are indexed as 0 and 1. The "2 x" signifies that there are two copies or instances of the `ResidualConnection` module being used in sequence.

In PyTorch's `ModuleList`, each module is indexed starting from 0, similar to elements in a Python list. So, this means you have two `ResidualConnection` modules, one after the other, which can be accessed using their respective indices (0 and 1) in the `ModuleList`.

# Encoder

In [19]:
class Encoder(nn.Module):

    def __init__(self,
                 features:int,  #512
                 layers:nn.ModuleList
                 ) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)


    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [20]:
encoder = Encoder(
    features=512,
    layers=12
)
encoder

Encoder(
  (norm): LayerNormalization()
)

- In the `Encoder` class you've provided, the `layers` parameter in the constructor should take a `nn.ModuleList` containing the layers or blocks that make up the encoder. Each element in this `ModuleList` should be a module that has a forward method accepting two arguments: `x` (the input data) and `mask`.

For a transformer encoder, these layers are typically instances of an encoder block that might include:

- Multi-head self-attention mechanism
- Feed-forward neural network
- Residual connections
- Layer normalization

Here's an example of how you might initialize the `layers` parameter with encoder blocks:



In this example:
- `d_model` is the number of features (e.g., 512).
- `n_heads` is the number of attention heads.
- `d_ff` is the dimensionality of the feed-forward network.
- `dropout` is the dropout rate.
- `n_layers` is the number of encoder blocks you want to stack.

Each `EncoderBlock` would be defined to include multi-head attention, feed-forward network, and any necessary residual connections and layer normalizations. The exact structure would depend on your implementation of `EncoderBlock`.


<style>
    .blue {
        background-color: #0074D9;
    }
</style>

<div class="blue">
    Question: how many value take n_layers ?
</div>


The value of `n_layers` depends on how deep you want your encoder to be. It represents the number of encoder blocks you will stack in your `ModuleList`. In transformer models, it's common to have anywhere from 2 to 12 layers, but this can vary depending on the complexity of the task and the amount of data you have.

For example, if you're working with a smaller dataset or a less complex problem, you might choose a smaller number of layers, like 2 or 4. For larger datasets or more complex problems, you might use 6, 8, or even more layers.

Ultimately, the choice of how many layers to use is a hyperparameter that you can tune during the training process. You might start with a certain number and then increase or decrease it based on the performance of your model on validation data.


# Decoder

In [21]:
class DecoderBlock(nn.Module):

    def __init__(self,
                 features:int,
                 self_attention_block:MultiHeadAttentionBlock,
                 cross_attention_block:MultiHeadAttentionBlock,
                 feed_forward_block:FeedForwardBlock,
                 dropout
                 ) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block,
        self.feed_feedward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])


    def forward(self, x, encoder_output, src_mask, tgt_mask):

        x = self.residual_connections[0](x, lambda x:self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[0](x, lambda x:self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_feedward_block)
        
        return x

In [22]:
decoder = DecoderBlock(
    features=512,
    self_attention_block=MultiHeadAttentionBlock,
    cross_attention_block=MultiHeadAttentionBlock,
    feed_forward_block=FeedForwardBlock,
    dropout=0.2
)
decoder

DecoderBlock(
  (residual_connections): ModuleList(
    (0-2): 3 x ResidualConnection(
      (dropout): Dropout(p=0.2, inplace=False)
      (norm): LayerNormalization()
    )
  )
)

# Decoder

In [23]:
class Decoder(nn.Module):

    def __init__(self,
                 features:int,
                 layers:nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)


    def forward(self, 
                x, 
                encoder_output,
                src_mask, 
                tgt_mask):
        
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [24]:
decoder = Decoder(
    features=512,
    layers=12)

decoder

Decoder(
  (norm): LayerNormalization()
)

# Projection Layer

In [25]:
class ProjectionLayer(nn.Module):

    def __init__(self,
                 d_model,
                 vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):

        return self.proj(x)

In [26]:
linearLayer = ProjectionLayer(
    d_model=512,
    vocab_size=200
)
linearLayer

ProjectionLayer(
  (proj): Linear(in_features=512, out_features=200, bias=True)
)

# Transformer

In [27]:
class Transformer(nn.Module):

    def __init__(self,
                 encoder:Encoder,
                 decoder:Decoder,
                 src_embed = EnputEmbedding,
                 tgt_embed = EnputEmbedding,
                 src_pos = PositionalEncoding,
                 tgt_pos = PositionalEncoding,
                 projection_layer = ProjectionLayer
                 ) -> None:
        super().__init__()


        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer


    def encode(self, 
               src,  # source language
               src_mask
               ):
        
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    

    def decode(self, 
               encoder_output:torch.Tensor,
               src_mask:torch.Tensor,
               tgt:torch.Tensor,
               tgt_mask:torch.Tensor
               ):
        
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)

        return self.decoder(tgt, # value
                            encoder_output,  # query, key
                            src_mask, tgt_mask)
    

    def project(self, x):
        return self.projection_layer(x)
        

In [30]:
transformer = Transformer(
    encoder=Encoder,
    decoder=Decoder,
    src_embed=EnputEmbedding,
    tgt_embed = EnputEmbedding,
    src_pos = PositionalEncoding,
    tgt_pos = PositionalEncoding,
    projection_layer=ProjectionLayer
)
transformer

Transformer()

# Build Transformer

In [34]:
def build_transformer(
        src_vocab_size:int,  # source language
        tgt_vocab_size:int,  # target language
        src_seq_len:int,
        tgt_seq_len:int,
        d_model:int=512,
        N:int=6,             # Number of layers
        h:int=8,              # number of heads
        dropout:float=0.1,
        d_ff:int=2048) -> Transformer:
    

    src_embed = EnputEmbedding(d_model, src_vocab_size)
    tgt_embed = EnputEmbedding(d_model, tgt_vocab_size)

    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos= PositionalEncoding(d_model, tgt_seq_len, dropout)


    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, 
                                     decoder_self_attention_block,
                                     decoder_cross_attention_block,
                                     feed_forward_block,
                                     dropout)
        
        decoder_blocks.append(decoder_block)



    # create the encoder the decoder 
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))


    # create the projection layer 
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)


    # create transformer 
    transformer = Transformer(encoder,
                              decoder,
                              src_embed,
                              tgt_embed,
                              src_pos,
                              tgt_pos,
                              projection_layer)
    


    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)


    return transformer


In [37]:
a = build_transformer(
    src_vocab_size=22,
    tgt_vocab_size=22,
    tgt_seq_len=40,
    src_seq_len=40,
    d_model=512,
    N=6,
    h=8,
    dropout=0.2,
    d_ff=2048
)
a

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.2, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization