# Chapter 3: Implementing Large Language Models

In this chapter, we will code attention mechanism 

In [1]:
# Implement attention mechanism in the decoder "simple example"
import torch as nn 


In [4]:

# Define the sentence
sentence = "Your journey starts with one step"

# Tokenize the sentence into words
words = sentence.split()

# Define the size of the embedding (e.g., 10)
embedding_size = len(words)

# Create a random matrix tensor for the words
random_input_matrix = nn.randn(len(words), embedding_size)

print(random_input_matrix)

tensor([[-1.1776, -0.2477, -1.4939,  0.5932, -0.4045,  0.8290],
        [-0.0125,  0.4582, -2.3477,  0.2177, -0.1264, -0.0633],
        [-0.3668,  0.0942,  1.5271,  1.3618, -0.2063,  0.4015],
        [-0.2211, -0.5934,  1.7383,  0.5103, -1.6741, -0.0507],
        [ 0.7186, -0.3664,  1.6488,  0.9783,  0.8619,  0.7014],
        [ 0.9988,  0.6478,  0.9972,  0.2161,  1.3627,  1.6040]])


In [5]:
# Calculate intermediate attention scores 

# Select the query vector (second word in the sentence)
query = random_input_matrix[1]

# Initialize an empty tensor to store attention scores
attn_scores_2 = nn.empty(random_input_matrix.shape[0])

# Calculate attention scores by taking dot product of each word vector with the query vector
for i, x_i in enumerate(random_input_matrix):
    attn_scores_2[i] = nn.dot(x_i, query)

# Print the attention scores
print(attn_scores_2)

tensor([ 3.5364,  5.7893, -3.2404, -4.0244, -3.9883, -2.2836])


In [6]:
# Normalize the attention scores using softmax function
# We use dim=0 to apply softmax across the rows (i.e., across the different word vectors)
attn_scores_2 = nn.softmax(attn_scores_2, dim=0)

# Print the normalized attention scores
print(attn_scores_2)

tensor([9.5052e-02, 9.0446e-01, 1.0835e-04, 4.9471e-05, 5.1289e-05, 2.8207e-04])


In [7]:
print(nn.sum(attn_scores_2))

tensor(1.)


In [10]:
# Conpute the attention score for each word vector
attn_scores1 = random_input_matrix @ random_input_matrix.T
print(attn_scores1)

tensor([[ 4.8826,  3.5364, -0.6487, -1.2517, -2.4055, -1.9199],
        [ 3.5364,  5.7893, -3.2404, -4.0244, -3.9883, -2.2836],
        [-0.6487, -3.2404,  4.5338,  3.6998,  3.6558,  1.8747],
        [-1.2517, -4.0244,  3.6998,  6.4883,  1.9455, -1.1241],
        [-2.4055, -3.9883,  3.6558,  1.9455,  5.5610,  4.6355],
        [-1.9199, -2.2836,  1.8747, -1.1241,  4.6355,  6.8881]])


In [15]:
# Normalize the attention scores using softmax function
attn_scores2 = nn.softmax(attn_scores1, dim=-1)
print(attn_scores2)

tensor([[7.8854e-01, 2.0521e-01, 3.1235e-03, 1.7090e-03, 5.3907e-04, 8.7610e-04],
        [9.5052e-02, 9.0446e-01, 1.0835e-04, 4.9471e-05, 5.1289e-05, 2.8207e-04],
        [2.9149e-03, 2.1829e-04, 5.1923e-01, 2.2550e-01, 2.1579e-01, 3.6349e-02],
        [4.0543e-04, 2.5338e-05, 5.7325e-02, 9.3187e-01, 9.9183e-03, 4.6063e-04],
        [2.2060e-04, 4.5312e-05, 9.4624e-02, 1.7108e-02, 6.3595e-01, 2.5205e-01],
        [1.3443e-04, 9.3439e-05, 5.9765e-03, 2.9792e-04, 9.4508e-02, 8.9899e-01]])


In [17]:
# compute attention weight for each word vector
attn_weights = attn_scores2 @ random_input_matrix
print(attn_weights)

tensor([[-0.9314, -0.1017, -1.6503,  0.5183, -0.3467,  0.6436],
        [-0.1230,  0.3910, -2.2648,  0.2535, -0.1524,  0.0221],
        [-0.0524, -0.1411,  1.5721,  1.0429, -0.2503,  0.4091],
        [-0.2200, -0.5510,  1.7236,  0.5637, -1.5628, -0.0162],
        [ 0.6700, -0.0710,  1.4737,  0.8143,  0.8433,  0.8876],
        [ 0.9634,  0.5482,  1.0616,  0.2951,  1.3047,  1.5107]])


In [29]:
import torch 
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, d_in, d_out):
        super(SelfAttention, self).__init__()
        self.w_query = nn.Parameter(torch.randn(d_in, d_out))
        self.w_key = nn.Parameter(torch.randn(d_in, d_out))
        self.w_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, x):
        keys = x @ self.w_key
        queries = x @ self.w_query
        values = x @ self.w_value 
        attn_scores = queries @ keys.T 
        attn_weights = nn.functional.softmax(attn_scores, dim=-1)
        attn_output = attn_weights @ values
        return attn_output


In [37]:
torch.manual_seed(42)
sa_v1 = SelfAttention(embedding_size, embedding_size)
attention_score = sa_v1(random_input_matrix)
print(attention_score)

tensor([[-0.3083,  0.6986,  9.0132,  3.4046, -0.5365, -0.1862],
        [ 0.4245, -0.5881, -9.7885, -3.0913, -2.4793,  1.7715],
        [ 2.6022, -1.7643,  5.5865, -0.0712,  1.6634, -2.3016],
        [ 2.6025, -1.7646,  5.5862, -0.0715,  1.6635, -2.3018],
        [ 0.8760, -0.2813,  7.6022,  2.1714,  0.1237, -0.9518],
        [-3.3342,  2.2808,  7.0788,  4.1153, -0.6294,  2.1286]],
       grad_fn=<MmBackward0>)


In [38]:
# Hiding future words causal attention 
# Mask the future words by setting the attention scores to negative infinity
attn_scores_masked = torch.softmax(attention_score + torch.triu(torch.full(attention_score.shape, float('-inf')), diagonal=1), dim=-1)
print(attn_scores_masked)

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.3352e-01, 2.6648e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.8110e-02, 6.1078e-04, 9.5128e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.7980e-02, 6.0881e-04, 9.4810e-01, 3.3095e-03, 0.0000e+00, 0.0000e+00],
        [1.1913e-03, 3.7447e-04, 9.9352e-01, 4.3512e-03, 5.6143e-04, 0.0000e+00],
        [2.8141e-05, 7.7247e-03, 9.3682e-01, 4.8375e-02, 4.2074e-04, 6.6345e-03]],
       grad_fn=<SoftmaxBackward0>)


In [39]:
# verify that the attention scores are normalized
print(attn_scores_masked.sum(dim=-1))

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       grad_fn=<SumBackward1>)


In [40]:
# Make additional attentuion weights with dropout 
attn_weights_dropout = nn.functional.dropout(attn_scores_masked, p=0.5, training=True)
print(attn_weights_dropout)

tensor([[2.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 1.9026e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 1.8962e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1229e-03, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 1.8736e+00, 9.6749e-02, 8.4147e-04, 1.3269e-02]],
       grad_fn=<MulBackward0>)


In [41]:
# Pytprch class for casual attention
import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalSelfAttention(nn.Module):
    """
    Implements a causal attention mechanism where each token only attends to previous tokens.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.1):
        """
        Args:
            embed_dim (int): Dimension of the input embeddings.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout rate for regularization.
        """
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert self.embed_dim % self.num_heads == 0, "Embedding dim must be divisible by num_heads"

        # Query, Key, and Value projection layers
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        # Output projection
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Mask for causal attention
        self.register_buffer("mask", torch.tril(torch.ones(1, 1, 1000, 1000)))  # Assumes max seq length of 1000

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for causal attention.

        Args:
            x (torch.Tensor): Input tensor of shape (batch, seq_len, embed_dim)

        Returns:
            torch.Tensor: Output tensor after applying causal self-attention.
        """
        batch_size, seq_len, embed_dim = x.shape

        # Project input into Query, Key, and Value
        Q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.k_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.v_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Apply causal mask
        mask = self.mask[:, :, :seq_len, :seq_len]
        scores = scores.masked_fill(mask == 0, float('-inf'))

        # Compute attention weights
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Compute weighted sum of values
        attn_output = torch.matmul(attn_weights, V)

        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        return self.out_proj(attn_output), attn_weights


In [42]:
#Example Usage & Testing
# Initialize Causal Attention
embed_dim = 64   # Embedding size
num_heads = 8    # Multi-head attention
seq_len = 10     # Sequence length
batch_size = 2   # Batch size

In [43]:
causal_attention = CausalSelfAttention(embed_dim, num_heads)
print(causal_attention)

CausalSelfAttention(
  (q_proj): Linear(in_features=64, out_features=64, bias=True)
  (k_proj): Linear(in_features=64, out_features=64, bias=True)
  (v_proj): Linear(in_features=64, out_features=64, bias=True)
  (out_proj): Linear(in_features=64, out_features=64, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [44]:
# Generate dummy input
x = torch.rand(batch_size, seq_len, embed_dim)
print(x.shape)
print(x)

torch.Size([2, 10, 64])
tensor([[[0.1439, 0.5043, 0.1673,  ..., 0.0499, 0.6817, 0.8787],
         [0.8150, 0.0649, 0.1663,  ..., 0.1957, 0.2380, 0.6600],
         [0.5565, 0.7017, 0.9229,  ..., 0.2101, 0.6002, 0.9715],
         ...,
         [0.8724, 0.0510, 0.3880,  ..., 0.1878, 0.1430, 0.0822],
         [0.5730, 0.2669, 0.5206,  ..., 0.7782, 0.9385, 0.3942],
         [0.3370, 0.0276, 0.4640,  ..., 0.9877, 0.4345, 0.0163]],

        [[0.2058, 0.4842, 0.7821,  ..., 0.7823, 0.7696, 0.1237],
         [0.9298, 0.9635, 0.4529,  ..., 0.4137, 0.4539, 0.3953],
         [0.7257, 0.6575, 0.6200,  ..., 0.0690, 0.4895, 0.6925],
         ...,
         [0.8053, 0.3670, 0.2015,  ..., 0.2986, 0.3613, 0.7340],
         [0.3851, 0.4409, 0.9412,  ..., 0.3959, 0.9708, 0.0127],
         [0.8741, 0.7567, 0.7489,  ..., 0.3626, 0.3905, 0.9904]]])


In [45]:
# Forward pass
output, attn_weights = causal_attention(x)
print(output.shape)
print(output)

print(attn_weights.shape)
print(attn_weights)

torch.Size([2, 10, 64])
tensor([[[ 0.0388, -0.0762,  0.2145,  ..., -0.1133,  0.2355, -0.1140],
         [ 0.0296,  0.0880,  0.2732,  ..., -0.1395,  0.2499, -0.0477],
         [-0.0039,  0.1242,  0.2915,  ..., -0.1036,  0.2336, -0.1170],
         ...,
         [ 0.0970,  0.1154,  0.3041,  ..., -0.0982,  0.2300, -0.0981],
         [ 0.0661,  0.0643,  0.2958,  ..., -0.0584,  0.2754, -0.0442],
         [ 0.1122,  0.0993,  0.2972,  ..., -0.0535,  0.2828, -0.1044]],

        [[ 0.0521,  0.1877,  0.0901,  ..., -0.0352,  0.3724, -0.2702],
         [-0.0333,  0.1738,  0.2314,  ..., -0.1156,  0.2846, -0.1275],
         [-0.0142,  0.1713,  0.2203,  ..., -0.0401,  0.3044, -0.1009],
         ...,
         [ 0.0055,  0.0362,  0.2979,  ...,  0.0493,  0.2350, -0.1619],
         [ 0.0500,  0.0811,  0.3076,  ...,  0.0121,  0.2659, -0.1488],
         [ 0.0149,  0.1058,  0.2782,  ..., -0.0125,  0.2657, -0.1234]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 8, 10, 10])
tensor([[[[1.1111, 0.0000, 0.0000