## Positional Encoding

This notebook will code positional encoding for Transformer neural networks with PyTorch.

In [None]:
import torch
import torch.nn as nn

# Width of each token's embedding vector: every word is represented by a
# dense vector with this many components.
# "Attention Is All You Need" uses 512; 6 keeps the tensors small enough
# to inspect by hand.
d_model = 6

# Upper bound on the number of tokens (words) in one input sequence.
# A real Transformer typically allows 512 or more; 10 keeps the printed
# matrices easy to visualize.
max_sequence_length = 10

$$
PE(\text{position}, 2i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

$$
PE(\text{position}, 2i+1) = \cos\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

We can rewrite these as

$$
PE(\text{position}, i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{position}, i) = \cos\bigg( \frac{ \text{position} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$

In [None]:
# Even dimension indices 0, 2, 4, ... (d_model itself is excluded).
# They play the role of the "2i" term in the positional encoding formula.
# The tensor is created as float right away because it is divided by
# d_model in the next step.
# With d_model=6 the result is tensor([0., 2., 4.]).
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_i

tensor([0., 2., 4.])

In [None]:
# Denominator used for the EVEN dimension indices:
#   denominator = 10000 ^ (i / d_model)   for i in even_i
# (even_i already holds the "2i" values of the original formula.)
#
# The denominator grows exponentially with the dimension index:
#   - low-index dimensions  -> small denominators -> high-frequency waves
#   - high-index dimensions -> large denominators -> low-frequency waves
# Having a spread of frequencies lets the model tell positions apart at
# many scales (neighbouring words as well as far-apart words).
even_denominator = 10000 ** (even_i / d_model)
even_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [None]:
# Odd dimension indices 1, 3, 5, ... below d_model.
# They play the role of the "2i+1" term in the formula, where cosine is
# used instead of sine.
# With d_model=6 the result is tensor([1., 3., 5.]).
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
odd_i

tensor([1., 3., 5.])

In [None]:
# Compute the denominator for the ODD-index formula:
#   denominator = 10000 ^ ((i-1) / d_model)
#
# Notice that (odd_i - 1) gives [0, 2, 4] — exactly the same as even_i!
# So the denominators for even and odd positions turn out to be identical.
# This is because dimension pair (2i, 2i+1) shares the same frequency;
# only the function changes (sin vs cos).
#
# The result is stored under its own name, odd_denominator, so that
# even_denominator is not overwritten and the two can be compared.
odd_denominator = torch.pow(10000, (odd_i - 1) / d_model)
odd_denominator

tensor([  1.0000,  21.5443, 464.1590])

`even_denominator` and `odd_denominator` are the same! So we only need to compute one of them, and we can call the resulting variable `denominator`.

In [None]:
# The even-index and odd-index denominators hold identical values,
# so a single variable is enough. We call it 'denominator' and reuse it
# for both the sin (even) and cos (odd) calculations.
# Note: this is a plain name binding — 'denominator' refers to the same
# tensor object as 'even_denominator'; no copy is made.
denominator = even_denominator

In [None]:
# Column vector of position indices: [[0], [1], [2], ..., [max_sequence_length-1]]
# Shape: (max_sequence_length, 1)  ->  (10, 1)
#
# Every row stands for the position of one token in the sequence.
# The trailing axis of size 1 is what makes the later division by the
# denominator (shape (3,)) broadcast into a (10, 3) matrix, i.e. one value
# per (position, frequency) pair.
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)

In [None]:
# Display the position column vector built in the previous cell.
# Expected: the values 0 through 9 as floats, one per row, i.e. shape (10, 1):
# tensor([[0.], [1.], [2.], ..., [9.]])
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [None]:
# ---- Core positional encoding calculation ----
#
# EVEN dimensions (0, 2, 4):  PE(pos, 2i)   = sin(pos / denominator)
# ODD  dimensions (1, 3, 5):  PE(pos, 2i+1) = cos(pos / denominator)
#
# Shapes involved in the division:
#   position    : (10, 1)
#   denominator : (3,)      -> broadcast result is (10, 3)
#
# Both results are (10, 3) matrices: 10 positions x 3 frequency channels.
#   even_PE -> sin values, destined for dimensions 0, 2, 4
#   odd_PE  -> cos values, destined for dimensions 1, 3, 5
even_PE = (position / denominator).sin()
odd_PE = (position / denominator).cos()

In [None]:
# Display the sine-based encodings (used for the even dimensions).
# Shape is (10, 3): one row per position, one column per even dimension.
# Column 0 has the smallest denominator, so it oscillates quickly (high
# frequency); column 2 has the largest denominator and changes slowly
# (low frequency).
even_PE

tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0464,  0.0022],
        [ 0.9093,  0.0927,  0.0043],
        [ 0.1411,  0.1388,  0.0065],
        [-0.7568,  0.1846,  0.0086],
        [-0.9589,  0.2300,  0.0108],
        [-0.2794,  0.2749,  0.0129],
        [ 0.6570,  0.3192,  0.0151],
        [ 0.9894,  0.3629,  0.0172],
        [ 0.4121,  0.4057,  0.0194]])

In [None]:
# Confirm the shape: one row per position and one column per even
# dimension index, i.e. (max_sequence_length, d_model/2) = (10, 3).
even_PE.shape

torch.Size([10, 3])

In [None]:
# Display the cosine-based encodings (used for the odd dimensions).
# Same shape as even_PE, (10, 3). Row 0 is all ones because cos(0) = 1.
# These columns will be interleaved with the even_PE columns next.
odd_PE

tensor([[ 1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9989,  1.0000],
        [-0.4161,  0.9957,  1.0000],
        [-0.9900,  0.9903,  1.0000],
        [-0.6536,  0.9828,  1.0000],
        [ 0.2837,  0.9732,  0.9999],
        [ 0.9602,  0.9615,  0.9999],
        [ 0.7539,  0.9477,  0.9999],
        [-0.1455,  0.9318,  0.9999],
        [-0.9111,  0.9140,  0.9998]])

In [None]:
# Confirm the shape: (10, 3) — it must match even_PE so the two tensors
# can be stacked together in the next step.
odd_PE.shape

torch.Size([10, 3])

In [None]:
# Join even_PE and odd_PE along a NEW trailing axis (the third dimension).
#
# Inputs:
#   even_PE : (10, 3)  - sin values for dims 0, 2, 4
#   odd_PE  : (10, 3)  - cos values for dims 1, 3, 5
#
# Output:
#   stacked : (10, 3, 2)
#   For every position and every frequency channel the last axis holds
#   the pair [sin_value, cos_value].
#
# This layout is what allows the next step to interleave the values as
# [sin0, cos0, sin1, cos1, sin2, cos2].
stacked = torch.stack((even_PE, odd_PE), dim=-1)
stacked.shape

torch.Size([10, 3, 2])

In [None]:
# Merge the last two axes so the sin/cos pairs end up interleaved.
#
# stacked: (10, 3, 2)  ->  PE: (10, 6)
#
# After the merge each row reads:
#   [sin_dim0, cos_dim1, sin_dim2, cos_dim3, sin_dim4, cos_dim5]
#
# PE is the positional encoding matrix, with shape
# (max_sequence_length, d_model) = (10, 6).
#
# In a real Transformer this matrix is ADDED element-wise to the word
# embedding matrix, so every token vector carries both semantic meaning
# (from the embedding) and position information.
PE = stacked.flatten(1, 2)
PE

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

## Class

Let's combine all the code above into a cute class

In [None]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """
    Generates the sinusoidal positional encoding matrix described in
    "Attention Is All You Need" (Vaswani et al., 2017).

    The encoding injects position information into token embeddings so the
    Transformer can distinguish token order (since it has no recurrence or
    convolution to capture sequence ordering on its own).

    Output shape: (max_sequence_length, d_model)
    Each row is the positional encoding vector for that position index.
    """

    def __init__(self, d_model, max_sequence_length):
        """
        Args:
            d_model (int): Dimensionality of the token embeddings.
                           Usually even, so that dimensions split evenly
                           into sin (even indices) and cos (odd indices).
                           An odd value is also accepted: the last
                           dimension then only has a sin component.
            max_sequence_length (int): The longest sequence the model will
                           handle. Determines the number of rows in the
                           output matrix.
        """
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """
        Compute and return the positional encoding matrix.

        Steps:
        1. Build even dimension indices [0, 2, 4, ...] to compute
           the shared denominator  10000^(2i / d_model).
        2. Create a column vector of positions [0, 1, ..., max_seq_len-1].
        3. Apply sin to even dimensions and cos to odd dimensions.
        4. Interleave the results so that the final tensor has the layout:
           [sin_dim0, cos_dim1, sin_dim2, cos_dim3, ...]
        5. Keep only the first d_model columns, so the documented shape
           also holds when d_model is odd.

        Returns:
            Tensor of shape (max_sequence_length, d_model) containing the
            positional encodings. In practice this tensor is added
            element-wise to the word embedding matrix before it enters
            the self-attention layers.
        """
        # Step 1: Even dimension indices → [0, 2, 4, ...] (all below d_model)
        even_i = torch.arange(0, self.d_model, 2, dtype=torch.float)

        # Denominator = 10000^(2i / d_model).
        # Low-index dims get small denominators → high-frequency waves.
        # High-index dims get large denominators → low-frequency waves.
        # This spread of frequencies allows the model to capture both
        # fine-grained (nearby tokens) and coarse (distant tokens) position info.
        denominator = torch.pow(10000, even_i / self.d_model)

        # Step 2: Position column vector — shape (max_seq_len, 1).
        # Created as float explicitly, and shaped as a column so that the
        # division below broadcasts to (max_seq_len, ceil(d_model / 2)).
        position = torch.arange(
            self.max_sequence_length, dtype=torch.float
        ).reshape(self.max_sequence_length, 1)

        # Step 3: sin and cos share the same argument, so compute it once.
        angles = position / denominator
        even_PE = torch.sin(angles)   # values for dims 0, 2, 4, ...
        odd_PE = torch.cos(angles)    # values for dims 1, 3, 5, ...

        # Step 4: Interleave sin and cos values.
        # stack → (max_seq_len, n_freq, 2), then flatten the last two dims
        # → (max_seq_len, 2 * n_freq).
        # Result layout per row: [sin0, cos0, sin1, cos1, sin2, cos2, ...]
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)

        # Step 5: For odd d_model, 2 * n_freq == d_model + 1; drop the
        # surplus trailing cos column. For even d_model this keeps every
        # column, so the result is unchanged.
        return PE[:, :self.d_model]

In [None]:
# Instantiate the PositionalEncoding class with the same small values
# we used above (d_model=6, max_sequence_length=10) and call the module
# to generate the (10, 6) encoding matrix.
#
# The module is invoked as pe() rather than pe.forward(): calling an
# nn.Module instance dispatches to forward() and also runs any registered
# hooks, which is the recommended way to use a module.
#
# In a full Transformer pipeline you would do:
#   encoded_input = word_embeddings + pe()
# so that each token vector now carries both meaning and position.
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pe()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

Happy Coding!