In [1]:
# Scratch arithmetic cell; evaluates to 10 (presumably a quick kernel sanity check).
5+5

10

In [2]:
import torch

In [14]:
# Toy sequence of four token IDs; each must be a valid row index of the embedding table.
in_ids = torch.tensor([5, 2, 8, 6])

In [15]:
# Echo the token-ID tensor so its contents are visible in the notebook output.
in_ids

tensor([5, 2, 8, 6])

In [16]:
# Toy configuration: a 10-token vocabulary embedded into 3 dimensions.
vocab_size, embed_dim = 10, 3

# Fix the RNG seed *before* constructing the layer so the randomly
# initialised weight matrix is the same on every run.
torch.manual_seed(123)

embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embed_dim,
)

In [17]:
# Inspect the learnable lookup table: one row per vocabulary entry
# (vocab_size rows), one column per embedding dimension (embed_dim columns).
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        [-1.1925,  0.6984, -1.4097],
        [ 0.1794,  1.8951,  1.3689],
        [-1.6033, -1.3250,  0.1784],
        [-2.1338,  1.0524, -0.3885],
        [-0.9343,  1.8319, -0.3378],
        [ 0.8805,  1.5542,  0.6266],
        [-0.1755,  0.0983, -0.0935]], requires_grad=True)

In [18]:
# Calling the layer on token IDs is a row lookup: the result stacks rows
# 5, 2, 8 and 6 of embedding_layer.weight, in that order.
embedding_layer(in_ids)

tensor([[-1.6033, -1.3250,  0.1784],
        [-0.2196, -0.3792,  0.7671],
        [ 0.8805,  1.5542,  0.6266],
        [-2.1338,  1.0524, -0.3885]], grad_fn=<EmbeddingBackward0>)

In [19]:
import torch
import torch.nn as nn
import math

class InputEmbedding(nn.Module):
    """
    Token-embedding front end of a Transformer.

    Maps integer token IDs to learnable dense vectors through a lookup table
    and multiplies the result by sqrt(d_model), the scaling suggested in the
    Transformer paper.

    Args:
        vocab_size (int): Number of distinct token IDs (rows in the table).
        d_model (int): Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, d_model: int):
        super().__init__()
        self.d_model = d_model
        # Lookup table of shape (vocab_size, d_model).
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """
        Embed a batch of token IDs and apply the sqrt(d_model) scaling.

        Args:
            x (Tensor): Token IDs of shape (batch_size, seq_length).

        Returns:
            Tensor: Scaled embeddings of shape (batch_size, seq_length, d_model).
        """
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale

# Positional embedding

In [None]:
import torch
import torch.nn as nn

class LearnedPositionalEncoding(nn.Module):
    """
    Learned Positional Encoding Layer for Transformers.

    Instead of using fixed sinusoidal functions (as in the original Transformer),
    this module assigns each position in the sequence a trainable embedding vector.
    This allows the model to learn positional patterns directly from data, which is
    useful for tasks like machine translation where sentence lengths are limited.

    Only positions 0 .. max_len - 1 have an embedding, so inputs longer than
    ``max_len`` are rejected in ``forward``.

    Args:
        d_model (int): Dimension of the model (embedding size).
        max_len (int): Maximum sequence length expected.
        dropout (float): Dropout probability applied after adding positional embeddings.
    """

    def __init__(self, d_model: int, max_len: int, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # Positional embeddings are trainable (shape: [max_len, d_model])
        self.pe = nn.Embedding(max_len, d_model)

        # Initialize embeddings with small random values
        self._init_weights()

    def _init_weights(self):
        """Re-initialise the positional table from N(0, 0.02^2)."""
        nn.init.normal_(self.pe.weight, mean=0, std=0.02)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for positional encoding.

        Args:
            x (Tensor): Input embeddings of shape (batch_size, seq_len, d_model).

        Returns:
            Tensor: Embeddings with positional encodings added, same shape as input.

        Raises:
            IndexError: If seq_len exceeds the number of learned positions (max_len).
        """
        seq_len = x.size(1)
        max_len = self.pe.num_embeddings

        # Fail early with an actionable message; without this guard the lookup
        # below fails with an opaque out-of-range error from inside nn.Embedding.
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {max_len} "
                f"supported by LearnedPositionalEncoding"
            )

        # Create position indices for the current sequence length
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)  # (1, seq_len)

        # Add positional embeddings to token embeddings (broadcast over the batch)
        x = x + self.pe(positions)

        # Apply dropout to improve generalization
        return self.dropout(x)