In [None]:
#@title üéß Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="1_yOuaRupWcvvBB5tNnjVrtDllXqg6x4Q", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/03_00_intro.mp3"))


In [None]:
#@title 🎧 Code Walkthrough: Setup Run Cell
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_01_setup_run_cell.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title 🎧 What to Look For: Visualization It Example
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_04_visualization_it_example.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# üîß Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("‚ö†Ô∏è No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

print(f"\nüì¶ Python {sys.version.split()[0]}")
print(f"üî• PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"üé≤ Random seed set to {SEED}")

%matplotlib inline

In [None]:
#@title 🎧 Listen: Why It Matters
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_02_why_it_matters.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title 🎧 Listen: Building Intuition
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_03_building_intuition.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


# Self-Attention and the Transformer -- Vizuara

---

## 1. Why Does This Matter?

In 2017, a team at Google published a paper with one of the most confident titles in AI history: "Attention Is All You Need." They were right.

The Transformer architecture they introduced solved every limitation of RNNs in a single stroke: it processes all words **simultaneously** (no sequential bottleneck), it can attend to **any position** directly (no vanishing gradients), and it scales beautifully with modern GPU hardware.

Every modern LLM -- GPT, BERT, LLaMA, Claude, Gemini -- is built on this architecture. Understanding self-attention is not optional if you want to understand modern AI.

In this notebook, you will:
- Build the self-attention mechanism from scratch, step by step
- Implement scaled dot-product attention with actual matrix operations
- Build multi-head attention to capture multiple types of relationships
- Implement positional encodings so the model knows word order
- Assemble a complete Transformer block
- Visualize attention patterns to see what the model is "looking at"

Let us attend.

---

## 2. Building Intuition

Consider: "The cat sat on the mat because **it** was comfortable."

What does "it" refer to? The mat. How did you figure it out? You **looked back** at all the previous words and decided "mat" was most relevant.

Now: "The cat sat on the mat because **it** was hungry."

Now "it" refers to the cat. Same structure, different meaning -- and you figured it out by attending to a **different** word.

This selective, context-dependent backward glance is self-attention. Instead of processing words one at a time and hoping a hidden state remembers what matters, self-attention lets every word **directly look at every other word** and decide how much to pay attention.

Let us see this in action.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# The library analogy for Q, K, V:
# You walk into a library with a QUERY ("I want to learn about cats").
# Each book has a KEY on its spine (title, keywords).
# You compare your query to every key, find matches, and read the VALUES (contents).

# In self-attention, every word plays ALL THREE roles simultaneously.

# Let's demonstrate with a simple example
words = ["The", "cat", "sat", "on", "the", "mat"]

# Imagine the word "it" is trying to figure out what it refers to
query_word = "it"
print(f"Query word: '{query_word}'")
print(f"The word '{query_word}' asks: 'Who should I pay attention to?'\n")

# Simulated attention scores (hand-picked to illustrate what self-attention can learn)
attention_case1 = {"The": 0.05, "cat": 0.08, "sat": 0.05,
                   "on": 0.02, "the": 0.05, "mat": 0.75}
attention_case2 = {"The": 0.05, "cat": 0.72, "sat": 0.08,
                   "on": 0.02, "the": 0.05, "mat": 0.08}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One (weights, title) pair per panel; both panels are drawn by the same code.
panels = [
    (attention_case1, '"...because it was comfortable"\n→ "it" attends to "mat"'),
    (attention_case2, '"...because it was hungry"\n→ "it" attends to "cat"'),
]

for ax, (case, title) in zip(axes, panels):
    heights = [case[w] for w in words]
    # Highlight the dominant word in red, everything else in light blue.
    colors = ['#90CAF9' if h < 0.3 else '#E53935' for h in heights]
    ax.bar(words, heights, color=colors, edgecolor='white', linewidth=2)
    ax.set_ylabel('Attention Weight', fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_ylim(0, 1.0)

plt.suptitle('Self-Attention: Same Structure, Different Attention Pattern',
             fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("Key insight: The attention pattern changes based on CONTEXT.")
print("This is impossible for N-grams or fixed-window models.")

In [None]:
#@title 🎧 Listen: The Mathematics
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_05_the_mathematics.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 3. The Mathematics

The attention formula is:

$$\text{Attention}(Q, K, V) = \text{softmax}\!\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V$$

Where:
- $Q = X W_Q$ -- Queries: "what am I looking for?"
- $K = X W_K$ -- Keys: "what do I have to offer?"
- $V = X W_V$ -- Values: "what information do I carry?"
- $d_k$ -- dimension of keys (for scaling)

The $\sqrt{d_k}$ scaling prevents the dot products from growing too large, which would push softmax into regions with near-zero gradients.

Let us compute this step by step with actual numbers.

In [None]:
#@title 🎧 Code Walkthrough: Step By Step Computation
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_06_step_by_step_computation.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title 🎧 Wrap-Up: Closing
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_30_closing.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Step-by-step attention computation with actual numbers
#
# 3 words: "the", "cat", "sat"
# Each has a d_k = 2 dimensional representation after projection.
# Rows of Q, K, V correspond to the words in that order.

# After multiplying by W_Q, W_K, W_V, suppose we get:
Q = torch.tensor([
    [1.0, 0.0],   # "the" query
    [0.0, 1.0],   # "cat" query
    [1.0, 1.0],   # "sat" query
])

K = torch.tensor([
    [0.0, 1.0],   # "the" key
    [1.0, 0.0],   # "cat" key
    [1.0, 1.0],   # "sat" key
])

V = torch.tensor([
    [1.0, 0.0],   # "the" value
    [0.0, 1.0],   # "cat" value
    [0.5, 0.5],   # "sat" value
])

d_k = Q.shape[-1]  # 2

print("=" * 60)
print("STEP-BY-STEP ATTENTION COMPUTATION")
print("=" * 60)

# Step 1: Q @ K^T -- one raw similarity score per (query word, key word) pair
print("\nStep 1: Compute QK^T (dot product between queries and keys)")
scores = Q @ K.T
print("  Q @ K^T = ")
print(f"  {scores.numpy()}")
print("\n  Each entry (i,j) = how relevant word j's KEY is to word i's QUERY")
print(f"  e.g., scores[2,2] = {scores[2,2].item():.1f} → 'sat' is most relevant to itself")

# Step 2: Scale -- keeps score magnitudes independent of d_k
print(f"\nStep 2: Scale by sqrt(d_k) = sqrt({d_k}) = {d_k**0.5:.3f}")
scaled = scores / (d_k ** 0.5)
print("  Scaled scores = ")
print(f"  {scaled.numpy()}")
print("\n  Why scale? Without it, large d_k → large scores → near-one-hot softmax → tiny gradients")

# Step 3: Softmax -- turn each row of scores into a probability distribution
print("\nStep 3: Apply softmax row-wise (each row sums to 1)")
weights = F.softmax(scaled, dim=-1)
print("  Attention weights = ")
print(f"  {weights.numpy().round(3)}")
print(f"\n  Row sums: {weights.sum(dim=-1).numpy()}")

# Step 4: Multiply by V -- mix the value vectors according to the weights
print("\nStep 4: Multiply weights by V to get output")
output = weights @ V
print("  Output = Weights @ V = ")
print(f"  {output.numpy().round(3)}")
print("\n  Each word's output is a WEIGHTED COMBINATION of all V vectors")

In [None]:
#@title 🎧 What to Look For: Visualization Flow Diagram
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_07_visualization_flow_diagram.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Visualize the attention computation as a flow diagram
# (uses scores / scaled / weights / output from the step-by-step cell above)

fig, axes = plt.subplots(1, 4, figsize=(18, 5))
word_labels = ["the", "cat", "sat"]

# One entry per panel:
# (matrix, colormap, (vmin, vmax), title, number format, annotation font size,
#  column labels, threshold above which the annotation is drawn in white or None)
panels = [
    (scores,  'Blues',  (-0.5, 2.5), 'Step 1: QK^T\n(raw scores)',          '.1f', 13, word_labels,  None),
    (scaled,  'Blues',  (-0.5, 2.0), 'Step 2: Scale\n÷ sqrt(d_k)',           '.2f', 12, word_labels,  None),
    (weights, 'Reds',   (0, 0.6),    'Step 3: Softmax\n(attention weights)', '.3f', 12, word_labels,  0.4),
    (output,  'Greens', (0, 0.7),    'Step 4: Output\n= Weights × V',        '.3f', 13, ['d₁', 'd₂'], None),
]

for ax, (matrix, cmap, (vmin, vmax), title, fmt, text_size, col_labels, white_above) in zip(axes, panels):
    n_rows, n_cols = matrix.shape  # derive the grid size instead of hard-coding 3 / 2
    ax.imshow(matrix.numpy(), cmap=cmap, vmin=vmin, vmax=vmax)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xticks(range(n_cols)); ax.set_xticklabels(col_labels, fontsize=11)
    ax.set_yticks(range(n_rows)); ax.set_yticklabels(word_labels, fontsize=11)

    # Write the numeric value into every cell of the heatmap.
    for i in range(n_rows):
        for j in range(n_cols):
            text_kwargs = {}
            if white_above is not None:
                # Dark cells need white text to stay readable.
                text_kwargs['color'] = 'white' if matrix[i, j] > white_above else 'black'
            ax.text(j, i, format(matrix[i, j].item(), fmt), ha='center', va='center',
                    fontsize=text_size, fontweight='bold', **text_kwargs)

# Only the first panel carries axis labels; the others share the same layout.
axes[0].set_xlabel('Key (K)', fontsize=11); axes[0].set_ylabel('Query (Q)', fontsize=11)

plt.suptitle('Scaled Dot-Product Attention: Complete Computation',
             fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Transition: Building Components Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_08_building_components_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 4. Let's Build It -- Component by Component

Now let us implement self-attention as a proper PyTorch module.

In [None]:
#@title 🎧 Code Walkthrough: Self Attention Class
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_09_self_attention_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class SelfAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention.

    The input is projected three ways (queries, keys, values). Query/key
    similarities are scaled, optionally masked, and soft-maxed into weights
    that mix the value vectors.
    """

    def __init__(self, d_model, d_k):
        """
        Args:
            d_model: Size of each input embedding
            d_k: Size of the query, key, and value projections
        """
        super().__init__()
        self.d_k = d_k

        # Bias-free learned projections: d_model -> d_k
        self.W_q = nn.Linear(d_model, d_k, bias=False)
        self.W_k = nn.Linear(d_model, d_k, bias=False)
        self.W_v = nn.Linear(d_model, d_k, bias=False)

    def forward(self, x, mask=None):
        """
        Args:
            x: (batch_size, seq_len, d_model) input embeddings
            mask: optional boolean mask, (seq_len, seq_len); True entries are blocked
        Returns:
            output: (batch_size, seq_len, d_k) attention output
            weights: (batch_size, seq_len, seq_len) attention weights
        """
        queries, keys, values = self.W_q(x), self.W_k(x), self.W_v(x)

        # Query/key similarity, scaled by sqrt(d_k)
        scores = (queries @ keys.transpose(-2, -1)) / (self.d_k ** 0.5)

        # Blocked positions get -inf so softmax assigns them zero weight
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))

        weights = scores.softmax(dim=-1)
        return weights @ values, weights

# Smoke test: push one random sequence through a small SelfAttention layer
d_model, d_k, seq_len = 8, 4, 5

attn = SelfAttention(d_model, d_k)

# Random input (batch=1, seq_len=5, d_model=8)
x = torch.randn(1, seq_len, d_model)
output, weights = attn(x)

# Each row of attention weights is a softmax, so the row sums should all be 1.
print(
    f"Input shape:    {x.shape}",
    f"Output shape:   {output.shape}",
    f"Weights shape:  {weights.shape}",
    f"Weight row sum: {weights[0].sum(dim=-1).detach().numpy()}",
    sep="\n",
)

In [None]:
#@title 🎧 Listen: Multi Head Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_10_multi_head_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


Now let us build **multi-head attention** -- multiple attention heads running in parallel, each learning different types of relationships.

In [None]:
#@title 🎧 Code Walkthrough: Multi Head Attention Class
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_11_multi_head_attention_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention.

    d_model is split evenly across num_heads heads. Every head runs scaled
    dot-product attention on its own slice; the head outputs are concatenated
    and passed through a final linear projection.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Model dimension (must be divisible by num_heads)
            num_heads: Number of attention heads
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head width

        # One d_model -> d_model projection each; heads are carved out by reshaping
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        # Mixes the concatenated head outputs
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def _split_heads(self, t):
        """(batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)."""
        batch, length, _ = t.shape
        return t.view(batch, length, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """
        Args:
            x: (batch_size, seq_len, d_model) input embeddings
            mask: optional (seq_len, seq_len) boolean mask, True = blocked;
                the same mask is applied to every batch element and head
        Returns:
            output: (batch_size, seq_len, d_model)
            weights: (batch_size, num_heads, seq_len, seq_len)
        """
        batch_size, seq_len, d_model = x.shape

        # Project, then split into heads
        Q, K, V = (self._split_heads(proj(x)) for proj in (self.W_q, self.W_k, self.W_v))

        # Per-head scaled dot-product scores
        scores = (Q @ K.transpose(-2, -1)) / (self.d_k ** 0.5)

        if mask is not None:
            # Add leading batch and head axes so the 2-D mask broadcasts
            scores = scores.masked_fill(mask[None, None], float('-inf'))

        weights = scores.softmax(dim=-1)

        # (batch, heads, seq, d_k) -> (batch, seq, heads * d_k): concatenate heads
        merged = (weights @ V).transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)

        return self.W_o(merged), weights

# Smoke test: one random 6-token sequence through a 4-head attention layer
d_model, num_heads = 16, 4
mha = MultiHeadAttention(d_model, num_heads)

x = torch.randn(1, 6, d_model)
output, weights = mha(x)

# The weights carry an extra axis compared with single-head attention: one map per head.
print(
    f"Input shape:   {x.shape}",
    f"Output shape:  {output.shape}",
    f"Weights shape: {weights.shape}  (batch, heads, seq, seq)",
    f"\nEach of {num_heads} heads learns a DIFFERENT attention pattern.",
    sep="\n",
)

In [None]:
#@title 🎧 Listen: Positional Encoding Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_12_positional_encoding_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


Now let us implement **positional encoding** -- without it, the Transformer cannot distinguish word order.

In [None]:
#@title 🎧 What to Look For: Positional Encoding Class Vis
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_13_positional_encoding_class_vis.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding from "Attention Is All You Need."

    Adds position information to word embeddings so the model
    can distinguish "cat sat" from "sat cat."

    The table is precomputed once for max_len positions and stored as a
    non-trainable buffer `pe` of shape (1, max_len, d_model).
    """

    def __init__(self, d_model, max_len=512):
        """
        Args:
            d_model: Embedding dimension (even or odd)
            max_len: Number of positions to precompute
        """
        super().__init__()

        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per (sin, cos) pair: 10000^(-2i / d_model)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # Even dimensions
        # With an odd d_model there is one fewer odd column than even column,
        # so drop the last frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # Odd dimensions

        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """
        Add positional encoding to input embeddings.

        Args:
            x: (batch_size, seq_len, d_model) embeddings
        Returns:
            Tensor of the same shape as x with the first seq_len rows of the
            encoding table added (broadcast over the batch).
        """
        return x + self.pe[:, :x.size(1), :]


# Visualize positional encodings
pe = PositionalEncoding(d_model=64, max_len=100)
pe_matrix = pe.pe[0].numpy()  # (100, 64)

fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Heatmap of the first 50 positions (transposed: x = position, y = encoding dimension)
im = axes[0].imshow(pe_matrix[:50, :].T, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
axes[0].set_xlabel('Position in Sequence', fontsize=12)
axes[0].set_ylabel('Encoding Dimension', fontsize=12)
axes[0].set_title('Positional Encoding Heatmap\nEach position has a unique "fingerprint"',
                  fontsize=14, fontweight='bold')
plt.colorbar(im, ax=axes[0])

# Individual dimensions: adjacent (even, odd) pairs share a frequency (sin / cos)
for dim in [0, 1, 4, 5, 20, 21]:
    style = '-' if dim % 2 == 0 else '--'
    label = f'dim {dim} ({"sin" if dim % 2 == 0 else "cos"})'
    axes[1].plot(range(50), pe_matrix[:50, dim], style, linewidth=2, label=label, alpha=0.8)

axes[1].set_xlabel('Position', fontsize=12)
axes[1].set_ylabel('Encoding Value', fontsize=12)
axes[1].set_title('Individual Encoding Dimensions\nLow dims = high frequency, High dims = low frequency',
                  fontsize=14, fontweight='bold')
axes[1].legend(ncol=3, fontsize=9, loc='upper right')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Note: the difference PE[pos+k] - PE[pos] is NOT constant in pos. What is
# constant is the linear map (a rotation of each sin/cos pair) taking PE[pos] to PE[pos+k].
print("Each position gets a unique combination of sin/cos values.")
print("The key property: for a fixed offset k, PE[pos+k] is a linear function")
print("(a rotation) of PE[pos] that does not depend on pos,")
print("so the model can learn relative position patterns.")

In [None]:
#@title 🎧 Before You Start: Todo Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_14_todo_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 5. Your Turn

**TODO 1: Implement Causal (Masked) Attention**

In a decoder language model (like GPT), each word can only attend to itself and the words before it -- it cannot peek at the future. Implement the causal mask.

In [None]:
#@title 🎧 Before You Start: Todo Causal Mask
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_15_todo_1_causal_mask.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# TODO: Create and apply a causal attention mask
#
# Instructions:
# 1. Create a mask where mask[i, j] = True means position i CANNOT attend to position j
# 2. For causal attention: i cannot attend to j if j > i (no peeking at future);
#    the diagonal j == i stays unmasked, so every position can still attend to itself
# 3. Apply the mask in SelfAttention and verify the attention weights
# 4. Check: attention_weights[i, j] should be 0 for all j > i

# YOUR CODE HERE
seq_len = 5

# Create causal mask: upper triangle = True (blocked)
# causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
# print("Causal mask (True = blocked):")
# print(causal_mask.int())

# Test with SelfAttention
# attn_causal = SelfAttention(d_model=8, d_k=4)
# x_test = torch.randn(1, seq_len, 8)
# output_causal, weights_causal = attn_causal(x_test, mask=causal_mask)
#
# print("\nAttention weights with causal mask:")
# print(weights_causal[0].detach().numpy().round(3))
# print("\nVerify: all upper-triangle weights should be 0 ✓")

In [None]:
#@title 🎧 Before You Start: Todo Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_16_todo_2_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


**TODO 2: Build and Visualize 4-Head Attention Patterns**

Each attention head specializes in different relationships. Visualize what 4 different heads learn.

In [None]:
#@title 🎧 Before You Start: Todo Multi Head Vis
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_17_todo_2_multi_head_vis.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# TODO: Visualize attention patterns from all 4 heads
#
# Instructions:
# 1. Create a MultiHeadAttention with d_model=16, num_heads=4
# 2. Feed a sentence through it (use random embeddings for simplicity)
# 3. Extract the attention weights for each head
# 4. Create a 2x2 grid of heatmaps, one per head

# YOUR CODE HERE
# sentence = ["The", "cat", "sat", "on", "the", "mat"]
# mha_vis = MultiHeadAttention(d_model=16, num_heads=4)
# x_vis = torch.randn(1, len(sentence), 16)
# _, head_weights = mha_vis(x_vis)  # (1, 4, 6, 6)
#
# fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# for h in range(4):
#     ax = axes[h // 2][h % 2]
#     w = head_weights[0, h].detach().numpy()
#     ax.imshow(w, cmap='Blues')
#     ax.set_xticks(range(len(sentence)))
#     ax.set_xticklabels(sentence, rotation=45)
#     ax.set_yticks(range(len(sentence)))
#     ax.set_yticklabels(sentence)
#     ax.set_title(f'Head {h+1}')
# plt.suptitle('Multi-Head Attention Patterns')
# plt.tight_layout()
# plt.show()

In [None]:
#@title 🎧 Before You Start: Todo Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_18_todo_3_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


**TODO 3: Implement the Feed-Forward Network**

Each Transformer block has a position-wise feed-forward network after attention. It applies the same two-layer MLP to each position independently.

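$$\text{FFN}(x) = \max(0, x W_1 + b_1) W_2 + b_2$$

This is the ReLU form from the original paper. In the exercise and the implementation below we swap the ReLU, $\max(0, \cdot)$, for a GELU activation; the two-layer structure is otherwise identical.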

In [None]:
#@title 🎧 Before You Start: Todo Feed Forward
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_19_todo_3_feed_forward.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title 🎧 Code Walkthrough: Transformer Block Class
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_21_transformer_block_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# TODO: Implement the feed-forward network
#
# Instructions:
# 1. Two linear layers with a GELU activation between them
# 2. Typical expansion: inner dimension = 4 * d_model
# 3. This processes each position independently

# class FeedForward(nn.Module):
#     def __init__(self, d_model, d_ff=None):
#         super().__init__()
#         if d_ff is None:
#             d_ff = 4 * d_model
#         # TODO: Define two linear layers and activation
#         pass
#
#     def forward(self, x):
#         # TODO: x → Linear → GELU → Linear
#         pass

In [None]:
#@title 🎧 Transition: Putting It Together Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_20_putting_it_together_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 6. Putting It All Together

Now let us assemble a complete Transformer block from our components.

In [None]:
class FeedForward(nn.Module):
    """
    Position-wise feed-forward network: Linear -> GELU -> Linear.

    The same two-layer MLP is applied to the last dimension of the input,
    expanding d_model to d_ff and projecting back to d_model.
    """

    def __init__(self, d_model, d_ff=None):
        """
        Args:
            d_model: Input and output dimension
            d_ff: Hidden dimension; any falsy value (None, 0) selects 4 * d_model
        """
        super().__init__()
        if not d_ff:
            d_ff = 4 * d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.activation = nn.GELU()

    def forward(self, x):
        """Expand, apply GELU, and project back; output has the same shape as x."""
        hidden = self.linear1(x)
        hidden = self.activation(hidden)
        return self.linear2(hidden)


class TransformerBlock(nn.Module):
    """
    One post-norm Transformer block.

    Two sub-layers, each wrapped as LayerNorm(x + Dropout(sublayer(x))):
    1. Multi-head self-attention (masked when a mask is passed in)
    2. Position-wise feed-forward network
    """

    def __init__(self, d_model, num_heads, d_ff=None, dropout=0.1):
        """
        Args:
            d_model: Model dimension
            num_heads: Number of attention heads
            d_ff: Feed-forward hidden size, forwarded to FeedForward
            dropout: Dropout probability applied to each sub-layer's output
        """
        super().__init__()

        # Sub-layers
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)

        # One LayerNorm and one Dropout per sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: (batch_size, seq_len, d_model) input
            mask: optional attention mask, passed straight to the attention layer
        Returns:
            (block output with the same shape as x, attention weights from the attention layer)
        """
        # Attention sub-layer: residual add, then normalize
        attn_output, attn_weights = self.attention(x, mask)
        after_attn = self.norm1(x + self.dropout1(attn_output))

        # Feed-forward sub-layer: residual add, then normalize
        after_ffn = self.norm2(after_attn + self.dropout2(self.feed_forward(after_attn)))

        return after_ffn, attn_weights

# Test the complete Transformer block
d_model = 32
num_heads = 4
seq_len = 6

block = TransformerBlock(d_model, num_heads)
x = torch.randn(1, seq_len, d_model)

# Create causal mask (True above the diagonal = future positions are blocked)
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()

output, weights = block(x, mask=causal_mask)

print("Transformer Block:")
print(f"  Input shape:  {x.shape}")
print(f"  Output shape: {output.shape}")
print(f"  Attention:    {weights.shape}")
print("\n  Components:")
print(f"    Multi-Head Attention: {num_heads} heads × {d_model//num_heads} dim each")
# Read the hidden width from the module so the printout stays true if d_ff is changed.
print(f"    Feed-Forward: {d_model} → {block.feed_forward.linear1.out_features} → {d_model}")
print("    Layer Norm: 2 ×")
print("    Residual connections: 2 ×")

# Count parameters
total_params = sum(p.numel() for p in block.parameters())
print(f"\n  Total parameters: {total_params:,}")

In [None]:
#@title 🎧 Listen: Stacking Blocks Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_22_stacking_blocks_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


Now let us stack multiple blocks and see how representations evolve through layers.

In [None]:
#@title 🎧 Code Walkthrough: Transformer Decoder Class
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_23_transformer_decoder_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class TransformerDecoder(nn.Module):
    """
    Stack of Transformer blocks for language modeling.

    Pipeline: token embedding (scaled by sqrt(d_model)) -> sinusoidal
    positional encoding -> num_layers TransformerBlocks under a causal mask
    -> final LayerNorm -> linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len=512):
        """
        Args:
            vocab_size: Number of tokens in the vocabulary
            d_model: Model (embedding) dimension
            num_heads: Attention heads per block
            num_layers: Number of stacked Transformer blocks
            max_len: Number of positions the positional-encoding table covers
        """
        super().__init__()

        self.d_model = d_model
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, dropout=0.1)
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len) integer token ids
        Returns:
            logits: (batch_size, seq_len, vocab_size)
            all_weights: list with the attention weights returned by each block, in order
        """
        seq_len = x.shape[1]

        # Causal mask (True = blocked). Allocate it directly on the input's device
        # instead of building it on the CPU and copying it over on every call.
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()

        # Embed + positional encoding; scaling by sqrt(d_model) follows the original paper
        h = self.token_embedding(x) * (self.d_model ** 0.5)
        h = self.pos_encoding(h)

        # Pass through transformer blocks, keeping every layer's attention map
        all_weights = []
        for block in self.blocks:
            h, weights = block(h, mask)
            all_weights.append(weights)

        # Final layer norm + projection
        h = self.norm(h)
        logits = self.output_proj(h)

        return logits, all_weights


# Create a small Transformer
vocab_size = 20
d_model = 32
num_heads = 4
num_layers = 3

transformer = TransformerDecoder(vocab_size, d_model, num_heads, num_layers)

# Test: one batch of 8 random token ids
test_input = torch.randint(0, vocab_size, (1, 8))  # 8 tokens
logits, all_weights = transformer(test_input)

print("Transformer Decoder (GPT-style):")
print(f"  Vocab size:  {vocab_size}")
print(f"  d_model:     {d_model}")
print(f"  Heads:       {num_heads}")
print(f"  Layers:      {num_layers}")
print(f"\n  Input:  {test_input.shape}")
print(f"  Output: {logits.shape}")
print(f"  → At each position, outputs logits over {vocab_size} vocabulary words")

# Parameter count, put in perspective against well-known models
total_params = sum(p.numel() for p in transformer.parameters())
print(f"\n  Total parameters: {total_params:,}")
print("\n  For reference:")
print("    GPT-2 Small:  124M parameters")
print("    GPT-3:        175B parameters")
print(f"    Our mini-GPT: {total_params:,} parameters")

In [None]:
#@title 🎧 Transition: Training Results Intro
# Play this section's narration clip if the audio bundle has been downloaded.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/03_24_training_results_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 7. Training and Results

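Let us visualize the attention patterns across all layers and heads to see what different parts of the model focus on. Keep in mind that the model above has not been trained yet, so apart from the causal mask these patterns reflect random initialization rather than learned behavior.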

In [None]:
#@title 🎧 What to Look For: Visualization Across Layers Heads
# Play this section's narration clip when the file exists on disk;
# otherwise print a reminder to run the download cell.
from IPython.display import Audio, display
import os as _os

_f = "/content/narration/03_25_visualization_across_layers_heads.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Visualize attention across all layers
#
# NOTE: `all_weights` comes from the forward pass of the freshly constructed
# model, so the maps below show attention at random initialization.

# Each all_weights[layer] is indexed below as [batch, head] -> 2-D attention map;
# read the sequence length from it instead of hard-coding 8.
num_tokens = all_weights[0].shape[-1]
words_vis = [f"w{i}" for i in range(num_tokens)]  # placeholder labels

# squeeze=False keeps `axes` 2-D even when num_layers == 1 or num_heads == 1.
fig, axes = plt.subplots(num_layers, num_heads, figsize=(16, 10), squeeze=False)

for layer_idx in range(num_layers):
    for head_idx in range(num_heads):
        ax = axes[layer_idx][head_idx]
        # First batch element only; .cpu() makes this work if the model lives on GPU.
        w = all_weights[layer_idx][0, head_idx].detach().cpu().numpy()

        # Colour scale is per-panel (0 .. that head's own maximum).
        ax.imshow(w, cmap='Blues', vmin=0, vmax=w.max())
        ax.set_xticks(range(num_tokens))
        ax.set_xticklabels(words_vis, fontsize=8, rotation=45)
        ax.set_yticks(range(num_tokens))
        ax.set_yticklabels(words_vis, fontsize=8)

        if layer_idx == 0:
            ax.set_title(f'Head {head_idx+1}', fontsize=11, fontweight='bold')
        if head_idx == 0:
            ax.set_ylabel(f'Layer {layer_idx+1}', fontsize=11, fontweight='bold')

plt.suptitle('Attention Patterns Across Layers and Heads\n'
             '(each head in each layer learns a different pattern)',
             fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("Notice: the causal mask ensures each position only attends to earlier positions.")
print("Different heads specialize — some focus on nearby words, others on distant ones.")

In [None]:
#@title 🎧 What to Look For: Visualization Architecture Comparison
# Play this section's narration clip when the file exists on disk;
# otherwise print a reminder to run the download cell.
from IPython.display import Audio, display
import os as _os

_f = "/content/narration/03_26_visualization_architecture_comparison.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Compare architectures: N-gram vs Neural LM vs RNN vs Transformer

# Each key becomes one table ROW; 'Feature' holds the feature names and
# therefore acts as the header row.
comparison = {
    'Feature': ['Context Window', 'Word Similarity', 'Long-Range Dependencies',
                'Parallel Training', 'Typical Parameters'],
    'N-gram': ['n-1 words (fixed)', 'None', 'None (n < 5)',
               'N/A (counting)', '~10M entries'],
    'Neural LM': ['n-1 words (fixed)', 'Learned embeddings', 'Limited by window',
                  'Yes (batch)', '~1-10M'],
    'RNN/LSTM': ['Unlimited (theory)', 'Learned embeddings', '~10-200 tokens',
                 'No (sequential)', '~10-100M'],
    'Transformer': ['Full sequence', 'Learned embeddings', 'Full sequence',
                    'Yes (fully parallel)', '100M - 1T'],
}

row_names = list(comparison)
num_features = len(comparison['Feature'])

fig, ax = plt.subplots(figsize=(14, 6))
ax.axis('off')

table = ax.table(
    cellText=[comparison[name] for name in row_names],
    rowLabels=row_names,
    loc='center',
    cellLoc='center'
)

table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.8)

# Color the header row (row 0 = the 'Feature' entries)
for j in range(num_features):
    table[0, j].set_facecolor('#E3F2FD')
    table[0, j].set_text_props(fontweight='bold')

# Highlight the Transformer ROW. Architectures are laid out as rows here, so
# colouring column 4 would highlight 'Typical Parameters' instead.
# Column -1 is the row-label cell that matplotlib adds for `rowLabels`.
transformer_row = row_names.index('Transformer')
for j in range(-1, num_features):
    table[transformer_row, j].set_facecolor('#E8F5E9')

ax.set_title('Architecture Comparison: The Journey from N-grams to Transformers',
             fontsize=15, fontweight='bold', y=0.95)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Transition: Final Output Intro
# Play this section's narration clip when the file exists on disk;
# otherwise print a reminder to run the download cell.
from IPython.display import Audio, display
import os as _os

_f = "/content/narration/03_27_final_output_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 8. Final Output

In [None]:
#@title 🎧 What to Look For: Final Visualization
# Play this section's narration clip when the file exists on disk;
# otherwise print a reminder to run the download cell.
from IPython.display import Audio, display
import os as _os

_f = "/content/narration/03_28_final_visualization.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Final demonstration: attention weight visualization on a real-looking sentence
#
# The three matrices below are hand-constructed for illustration; they are
# NOT produced by the model defined earlier in this notebook.

sentence = ["The", "cat", "sat", "on", "the", "mat", "because", "it"]
n = len(sentence)


def _causal_normalize(scores):
    """Zero out future positions (j > i) and rescale every row to sum to 1."""
    causal = np.tril(scores)
    return causal / causal.sum(axis=1, keepdims=True)


# Head 1: local attention (nearby words) -- score decays with distance |i - j|
positions = np.arange(n)
distance = np.abs(positions[:, None] - positions[None, :])
local_attn = _causal_normalize(np.exp(-distance * 0.5))

# Head 2: syntactic attention (subject-verb, article-noun)
syntactic_pairs = {1: 0, 2: 1, 5: 4, 7: 5}  # each word attends to its syntactic partner
syntactic_attn = np.zeros((n, n))
for i in range(n):
    # Small uniform background weight over the visible (j <= i) positions...
    syntactic_attn[i, :i+1] = 0.3 / max(i, 1)
    # ...and a dominant weight on the partner, when this word has one.
    partner = syntactic_pairs.get(i)
    if partner is not None:
        syntactic_attn[i, partner] = 0.7
syntactic_attn = _causal_normalize(syntactic_attn)

# Head 3: coreference attention ("it" → "mat")
it_idx = sentence.index("it")
mat_idx = sentence.index("mat")
coref_attn = np.tril(np.full((n, n), 0.1))  # uniform background over j <= i
coref_attn[it_idx, mat_idx] = 0.6           # "it" attends mostly to "mat"
coref_attn = _causal_normalize(coref_attn)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
titles = ['Head 1: Local Proximity', 'Head 2: Syntactic Structure', 'Head 3: Coreference']
matrices = [local_attn, syntactic_attn, coref_attn]

for ax, title, matrix in zip(axes, titles, matrices):
    ax.imshow(matrix, cmap='Blues', vmin=0, vmax=0.8)
    ax.set_xticks(range(n))
    ax.set_xticklabels(sentence, rotation=45, ha='right', fontsize=11)
    ax.set_yticks(range(n))
    ax.set_yticklabels(sentence, fontsize=11)
    ax.set_title(title, fontsize=13, fontweight='bold')

    # Annotate every non-negligible cell; switch to white text on dark cells.
    for i in range(n):
        for j in range(n):
            if matrix[i, j] > 0.01:
                color = 'white' if matrix[i, j] > 0.4 else 'black'
                ax.text(j, i, f'{matrix[i,j]:.2f}', ha='center', va='center',
                       fontsize=8, color=color)

plt.suptitle('Multi-Head Attention: Each Head Learns Different Relationships',
             fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("Each attention head specializes:")
print("  Head 1 — focuses on nearby words (local context)")
print("  Head 2 — connects syntactically related words (subject↔verb, article↔noun)")
print("  Head 3 — resolves coreferences ('it' → 'mat')")
print("\nThis is why multi-head attention is so powerful:")
print("it captures MULTIPLE types of relationships simultaneously.")

In [None]:
#@title 🎧 Listen: Reflection
# Play this section's narration clip when the file exists on disk;
# otherwise print a reminder to run the download cell.
from IPython.display import Audio, display
import os as _os

_f = "/content/narration/03_29_reflection.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 9. Reflection and Next Steps

**What we learned:**

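1. **Self-attention lets every word look at every other word directly** (in a causal decoder like ours, at every earlier word). Information no longer has to travel through a long chain of recurrent steps, so there is no sequential bottleneck and no vanishing gradient across time steps.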

2. **Q, K, V are learned projections** that determine what each word is looking for (Q), what it advertises (K), and what information it carries (V).

3. **Scaling by sqrt(d_k) is essential** to keep softmax in a useful gradient range.

4. **Multi-head attention captures multiple relationship types simultaneously** -- syntax, semantics, coreference, and more.

5. **Positional encoding gives the model word-order information** that is otherwise lost in the permutation-invariant attention operation.

6. **A Transformer block = Attention + Feed-Forward + Residuals + LayerNorm** -- and you stack N of these.

**What comes next:**

In the next notebook, we will put everything together: build a complete mini-GPT model, train it on real text, and generate coherent language. We will implement the training loop with next-token prediction, measure perplexity, and see the model learn to write.

In [None]:
# Closing banner: summary of what this notebook covered and what comes next.
_rule = "=" * 60
_banner_lines = [
    _rule,
    "  NOTEBOOK COMPLETE: Self-Attention & the Transformer",
    "  You built attention from scratch, implemented",
    "  multi-head attention, positional encoding, and",
    "  assembled a complete Transformer block.",
    "",
    "  Next: Building a Tiny Language Model (Mini-GPT)",
    _rule,
]
print("\n".join(_banner_lines))