In [21]:
import math

import torch
import torch.nn as nn

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Embedding Layer
*The idea is to augment the token embeddings with a position-dependent pattern of values.*
##
*If the pattern is characteristic for each position, then other layers can learn to incorporate positional information into their transformations.*
##
***In Other words, if each position has a unique encoding, the model can infer order and distance between tokens.*** 

### nn.Embedding: maps discrete input tokens to dense vectors (embeddings); it is a learnable look-up table

*Open question: what if we did not use a learned embedding to transform tokens into a richer representation, and instead used the raw token ids as they are — could a linear layer be applied to them directly?*

In [7]:
# nn.Embedding demo: a trainable lookup table holding one dense row per token id
Num_Unique_Tokens = 1000        # number of rows in the table (num_embeddings)
Size_Of_Vector_to_map = 64      # width of each row (embedding_dim)
enmbedding_layer = nn.Embedding(Num_Unique_Tokens, Size_Of_Vector_to_map)

In [23]:
# Position ids 0..9 laid out as a single row, i.e. shape (1, 10)
positional_indices = torch.arange(10)[None, :]
print(positional_indices.shape)

torch.Size([1, 10])


In [29]:
# Select the odd-indexed columns (start=1, step=2) of every row
positional_indices[..., 1::2]

tensor([[1, 3, 5, 7, 9]])

*For positional information, `max_length` defines the longest input sequence the model can handle.*

In [32]:
# Adds positional information through a learned embedding of absolute position indices
# (this carries no explicit relative-position signal).
class EmbeddingLayer(nn.Module):
    """Token embedding plus a learned absolute position embedding, followed by LayerNorm.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_size: Size of every embedding vector.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence this layer accepts.
    """

    def __init__(self, vocab_size, embed_size, max_length):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layer_norm = nn.LayerNorm(embed_size, eps=1e-12)

    def forward(self, x):
        """Embed token ids and add the embedding of each token's position.

        Args:
            x: Integer token ids whose LAST dimension is the sequence,
                e.g. shape (batch, seq_len) or (seq_len,).

        Returns:
            Tensor of shape ``x.shape + (embed_size,)``.

        Raises:
            ValueError: If the sequence is longer than the position table.
        """
        # The sequence axis of an id tensor is the last one. Using size(-2) would read
        # the batch size, so e.g. a batch of one would give every token position 0.
        seq_len = x.size(-1)
        max_length = self.position_embedding.num_embeddings
        if seq_len > max_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_length {max_length}"
            )

        word_embedding = self.embedding(x)                                  # Shape: (..., seq_len, embed_size)

        positional_indices = torch.arange(seq_len, device=x.device)         # Shape: (seq_len,)
        positional_embeddings = self.position_embedding(positional_indices)  # Shape: (seq_len, embed_size)

        # (seq_len, embed_size) broadcasts over any leading batch dimensions.
        x = word_embedding + positional_embeddings
        x = self.layer_norm(x)
        return x

In [35]:
## Usage With Config
class Config:
    """Hyper-parameters for the embedding-layer example, kept in one place."""

    # Size of the token vocabulary
    vocab_size: int = 1000
    # Width of each embedding vector
    embed_size: int = 64
    # Maximum length of the input sequence
    max_length: int = 100

## Usage ??

In [33]:
# Adds positional information through fixed (non-learned) sinusoidal encodings
class SinusoidalEmbeddingLayer(nn.Module):
    """Token embedding plus a fixed sinusoidal positional encoding, followed by LayerNorm.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_size: Size of every embedding vector (odd sizes are supported).
        max_length: Number of positions precomputed in the encoding table, i.e.
            the longest sequence this layer accepts.
        device: Device on which the encoding table is created. ``None`` uses the
            PyTorch default device.
    """

    def __init__(self, vocab_size, embed_size, max_length, device=None):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)

        # register_buffer => tensor which is not a parameter, but is part of the module's state.
        self.register_buffer("positional_embedding", self._get_positional_encoding(max_length, embed_size, device))
        self.layer_norm = nn.LayerNorm(embed_size, eps=1e-12)

    def _get_positional_encoding(self, max_length, embed_size, device=None):
        """Build the sinusoidal table of shape (1, max_length, embed_size).

        Even columns hold ``sin(position * div_term)`` and odd columns hold
        ``cos(position * div_term)``.
        """
        # Every intermediate is created on `device` so the table is built in one place.
        position = torch.arange(max_length, dtype=torch.float, device=device).unsqueeze(1)   # Shape: (max_length, 1)
        div_term = torch.exp(
            torch.arange(0, embed_size, 2, dtype=torch.float, device=device)
            * (-math.log(10000.0) / embed_size)
        )                                                                                    # Shape: (ceil(embed_size / 2),)
        angles = position * div_term                                                         # Shape: (max_length, ceil(embed_size / 2))

        pe = torch.zeros(max_length, embed_size, device=device)
        pe[:, 0::2] = torch.sin(angles)                                                      # even columns, start=0, step=2
        # For an odd embed_size there is one fewer odd column than even columns,
        # so the cosine terms are truncated to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])                                # odd columns, start=1, step=2
        return pe.unsqueeze(0)                                                               # Shape: (1, max_length, embed_size)

    def forward(self, x):
        """Embed token ids and add the sinusoidal encoding of each token's position.

        Args:
            x: Integer token ids whose LAST dimension is the sequence,
                e.g. shape (batch, seq_len) or (seq_len,).

        Returns:
            Tensor of shape ``x.shape + (embed_size,)``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        # The sequence axis of an id tensor is the last one; size(-2) would read the batch size.
        seq_len = x.size(-1)
        max_length = self.positional_embedding.size(1)
        if seq_len > max_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_length {max_length}"
            )

        word_embedding = self.embedding(x)                                                   # Shape: (..., seq_len, embed_size)

        # (seq_len, embed_size) slice of the table; broadcasts over any leading batch dimensions.
        positional_embeddings = self.positional_embedding[0, :seq_len, :].to(word_embedding.device)
        x = word_embedding + positional_embeddings
        x = self.layer_norm(x)
        return x

In [36]:
class Config:
    """Hyper-parameters for the sinusoidal embedding-layer example."""

    # Size of the token vocabulary
    vocab_size = 1000
    # Width of each embedding vector
    embed_size = 64
    # Maximum length of the input sequence
    max_length = 100
    # Device to use (CPU or GPU); taken from the notebook-level `device`
    device = device

## Feed Forward Layer
*It processes each embedding separately instead of processing the whole sequence as a single vector, which is why it is also called the **Position-Wise Feed Forward Layer***

In [3]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    Args:
        model_dim: Size of the last dimension of both the input and the output.
        ff_dim: Width of the hidden layer (usually larger than ``model_dim``).
        droupout: Dropout probability applied to the block's output.
    """

    def __init__(self, model_dim, ff_dim, droupout=0.5):
        super().__init__()

        # Registration order (fc1, fc2, relu, droupout) is the order print(module) shows.
        self.fc1 = nn.Linear(in_features=model_dim, out_features=ff_dim)
        self.fc2 = nn.Linear(in_features=ff_dim, out_features=model_dim)
        self.relu = nn.ReLU()
        self.droupout = nn.Dropout(p=droupout)

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU, project back to ``model_dim``, then apply dropout."""
        hidden = self.relu(self.fc1(x))
        projected = self.fc2(hidden)
        # Dropout on the output of the second linear layer, to reduce overfitting.
        return self.droupout(projected)

In [4]:
class Config:
    """Hyper-parameters for the feed-forward example."""

    # Width of the model's embeddings
    model_dim: int = 512
    # Hidden width of the feed-forward block; usually 4 times model_dim
    ff_dim: int = 2048
    # Dropout probability
    dropout: float = 0.1

In [5]:
# Build the block from the Config hyper-parameters, then place it on the selected device.
feed_forward = FeedForward(
    model_dim=Config.model_dim,
    ff_dim=Config.ff_dim,
    droupout=Config.dropout,
)
feed_forward = feed_forward.to(device)
print(feed_forward)

FeedForward(
  (fc1): Linear(in_features=512, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=512, bias=True)
  (relu): ReLU()
  (droupout): Dropout(p=0.1, inplace=False)
)
