<a href="https://colab.research.google.com/github/PaulaAdelKamal/Self-Attention-mechanism/blob/main/self_attention_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Self Attention from scratch using PyTorch

In This notebook you will find the implementation for self attention mechanism from scrach

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [11]:
class SelfAttention(nn.Module):
    """
    Implements scaled dot-product self-attention mechanism.

    Args:
        d_model (int): Dimension of input token embeddings and output projections.
        row_dim (int): Dimension along which to compute similarity scores (default: 0).
                       Typically corresponds to the sequence length dimension.
        col_dim (int): Dimension along which to transpose keys for dot-product (default: 1).
                       Typically corresponds to the embedding dimension.
    """
    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()
        # Learnable projection matrices (no bias as in standard transformer architecture)
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)  # Query projection
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)  # Key projection
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)  # Value projection

        # Configuration for flexible dimension handling (supports non-standard input shapes)
        self.row_dim = row_dim  # Dimension for similarity score rows
        self.col_dim = col_dim  # Dimension for similarity score columns

    def forward(self, token_encoding):
        """
        Compute attention-weighted contextualized token representations.

        Args:
            token_encoding (torch.Tensor): Input tensor of shape [..., seq_len, d_model]

        Returns:
            torch.Tensor: Contextualized embeddings of shape [..., seq_len, d_model]

        Shape:
            Input: (..., S, E) where S is sequence length, E is embedding dimension
            Output: (..., S, E) with same shape as input
        """
        # Project input into query/key/value representations (same-dimensional space)
        q = self.W_q(token_encoding)  # [..., S, E]
        k = self.W_k(token_encoding)  # [..., S, E]
        v = self.W_v(token_encoding)  # [..., S, E]

        # Compute pairwise similarity scores between queries and keys
        # Uses Einstein sum notation: sum over embedding dimension (E)
        sims = torch.matmul(
            q,
            k.transpose(dim0=self.row_dim, dim1=self.col_dim)  # Swaps specified dimensions
        )  # [..., S, S]

        # Scale scores to prevent gradient saturation (sqrt(d_k) as in Transformer paper)
        scaling_factor = torch.sqrt(torch.tensor(k.size(self.col_dim), dtype=torch.float32))
        scaled_sim = sims / scaling_factor  # [..., S, S]

        # Compute attention probabilities using softmax along sequence dimension
        attention_percents = F.softmax(scaled_sim, dim=self.row_dim)  # [..., S, S]

        # Compute weighted sum of value vectors using attention probabilities
        attention_output = torch.matmul(attention_percents, v)  # [..., S, E]

        return attention_output

In [13]:
# ----------------------------
# Example Usage and Demonstration
# ----------------------------

# Synthetic input matrix representing 3 tokens with 2-dimensional embeddings
# Shape: (sequence_length=3, d_model=2)
encodings_matrix = torch.tensor(
    [[1.16, 0.23],
     [0.57, 1.36],
     [4.41, -2.16]],  # Extreme values for demonstration of attention dynamics
    dtype=torch.float32  # Explicit dtype for numerical stability
)

# Reproducibility setup for deterministic parameter initialization
torch.manual_seed(139)  # Ensures consistent weight matrices across runs

# Initialize self-attention module with configuration matching input dimensions
# Note: row_dim=0 and col_dim=1 creates sequence-length vs feature dim attention
selfAttention = SelfAttention(
    d_model=2,        # Match input embedding dimension
    row_dim=0,         # Sequence dimension for attention computation
    col_dim=1          # Feature dimension for key transposition
)

# ----------------------------
# Forward Pass Execution
# ----------------------------
attention_output = selfAttention(encodings_matrix)  # Shape preserved: (3, 2)

# The resulting tensor contains context-aware representations where:
# - Each row corresponds to a token's updated embedding
# - Columns maintain the original d_model dimension
# - Gradients are enabled for subsequent backpropagation

In [14]:
attention_output

tensor([[ 0.0129,  0.0187],
        [-0.1479, -0.1828],
        [ 0.3581,  0.4510]], grad_fn=<MmBackward0>)

In [6]:
# Run these commands in a Colab code cell
!gh repo clone PaulaAdelKamal/Self-Attention-mechanism

/bin/bash: line 1: gh: command not found


In [4]:
!mv /content/self attention from scratch.ipynb /content/your-repo/


mv: target '/content/your-repo/' is not a directory
