In [None]:
#@title üéß Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="1RJjttCvltRK-j5XaI_Tp752cibGKRYMf", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/02_00_intro.mp3"))


In [None]:
#@title üéß Code Walkthrough: Setup Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_01_setup_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# üîß Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("‚ö†Ô∏è No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

print(f"\nüì¶ Python {sys.version.split()[0]}")
print(f"üî• PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"üé≤ Random seed set to {SEED}")

%matplotlib inline

In [None]:
#@title üéß Listen: Why It Matters
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_02_why_it_matters.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Listen: Building Intuition Intro
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_03_building_intuition_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


# Neural Language Models and Word Embeddings -- Vizuara

---

## 1. Why Does This Matter?

In the previous notebook, we built an N-gram language model that predicts the next word by counting. It works, but it has two fatal flaws: it assigns zero probability to unseen word combinations, and it has absolutely no notion that similar words should behave similarly.

In 2003, Yoshua Bengio asked a simple question: **what if, instead of counting words, we could learn to represent them?** His answer — the **Neural Probabilistic Language Model** — came in one of the most important papers in the history of NLP.

The core idea: represent each word as a **dense vector of real numbers** (an embedding), then use a neural network to predict the next word from these embeddings. Words that appear in similar contexts end up with similar vectors, so knowledge transfers automatically from one word to another.

In this notebook, you will:
- Build Bengio's neural language model from scratch in PyTorch
- Train word embeddings and visualize semantic relationships
- Implement Word2Vec (Skip-gram) and see the famous King - Man + Woman = Queen
- Build a simple RNN language model and see its limitations
- Understand why fixed-context and sequential processing are fundamental bottlenecks

Let us move from counting to learning.

---

## 2. Building Intuition

Imagine you are in a foreign city looking for a good restaurant. The N-gram approach is like having a massive phone book: you look up the exact address. If the restaurant is not in the book, you are stuck.

The neural approach is like having a **map.** Even if the specific restaurant is not marked, you can see that there is a cluster of restaurants in a particular neighborhood and walk there. The "map" is the **embedding space**, and similar words live in the same neighborhood.

Let us see what "word as a number" means in practice.

In [None]:
#@title üéß What to Look For: Embedding Visualization
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_04_embedding_visualization.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Train Bengiolm Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_12_train_bengiolm_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Visualize Embeddings Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_15_visualize_embeddings_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Train Rnn Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_26_train_rnn_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Rnn Hidden State Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_29_rnn_hidden_state_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Summary Visualization Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_35_summary_visualization_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

# N-gram models identify each word by an arbitrary integer ID:
#   "cat" = 0, "dog" = 1, "fish" = 2
# Those IDs say nothing about how the words relate to one another.
#
# Neural models identify each word by a VECTOR — a point in continuous space:
#   "cat"  = [0.2, 0.8, -0.1, 0.5, ...]
#   "dog"  = [0.3, 0.7, -0.2, 0.4, ...]   <-- close to cat!
#   "fish" = [-0.5, 0.1, 0.9, -0.3, ...]  <-- farther away

# Two panels: discrete indices on the left, a hand-made 2-D embedding on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_left, ax_right = axes[0], axes[1]

# --- Left panel: every word sits at its integer index on a single line ---
words = ["cat", "dog", "fish", "bird", "mat", "rug"]
indices = list(range(len(words)))

ax_left.scatter(indices, [0]*len(indices), s=200, c='#E53935', zorder=5)
for position, label in zip(indices, words):
    ax_left.annotate(
        label, (position, 0),
        textcoords="offset points", xytext=(0, 15),
        ha='center', fontsize=13, fontweight='bold',
    )
ax_left.set_xlim(-0.5, len(words)-0.5)
ax_left.set_ylim(-0.5, 0.5)
ax_left.set_title('N-gram View: Words as Discrete Indices\n(no relationships)', fontsize=13, fontweight='bold')
ax_left.set_xlabel('Word Index', fontsize=11)
ax_left.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax_left.set_yticks([])

# --- Right panel: illustrative (not learned) 2-D coordinates per word ---
np.random.seed(42)
embeddings_2d = {
    "cat": [1.2, 2.1], "dog": [1.5, 1.8], "bird": [1.8, 2.5], "fish": [2.2, 2.0],
    "mat": [-1.0, -0.5], "rug": [-0.7, -0.3],
}
# One color per word, in the same order as the dict above: red = animals, blue = surfaces
colors = ['#E53935', '#E53935', '#E53935', '#E53935', '#1E88E5', '#1E88E5']

for word, c in zip(embeddings_2d, colors):
    x, y = embeddings_2d[word]
    ax_right.scatter(x, y, s=200, c=c, zorder=5)
    ax_right.annotate(word, (x, y), textcoords="offset points",
                      xytext=(8, 8), fontsize=13, fontweight='bold')

# Dashed ellipses and italic captions marking the two clusters
from matplotlib.patches import Ellipse
cluster_outlines = [
    ((1.6, 2.1), 1.8, 1.2, '#E53935'),
    ((-0.85, -0.4), 1.0, 0.6, '#1E88E5'),
]
cluster_captions = [
    (1.6, 2.9, 'animals', '#E53935'),
    (-0.85, -0.9, 'surfaces', '#1E88E5'),
]
for center, width, height, outline_color in cluster_outlines:
    ax_right.add_patch(Ellipse(center, width, height, fill=False,
                               linestyle='--', color=outline_color, alpha=0.5, linewidth=2))
for caption_x, caption_y, caption, caption_color in cluster_captions:
    ax_right.text(caption_x, caption_y, caption, ha='center', fontsize=11,
                  color=caption_color, fontstyle='italic')

ax_right.set_title('Embedding View: Words as Vectors\n(similar words are nearby)', fontsize=13, fontweight='bold')
ax_right.set_xlabel('Dimension 1', fontsize=11)
ax_right.set_ylabel('Dimension 2', fontsize=11)
ax_right.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("Key insight: In embedding space, 'cat' and 'dog' are CLOSE together.")
print("Anything the model learns about 'cat' automatically helps it predict 'dog'.")

In [None]:
#@title üéß Listen: Math Explanation
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_05_math_explanation.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 3. The Mathematics

Bengio's model computes:

$$P(w_t \mid w_{t-n+1}, \ldots, w_{t-1}) = \text{softmax}(W \cdot h + b)$$

where:

$$h = \tanh(H \cdot x + d)$$

$$x = [C(w_{t-n+1}); \ldots; C(w_{t-1})]$$

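Here, $C$ is the **embedding matrix** — each row is a word's vector representation. The key parameters:

- $C \in \mathbb{R}^{|V| \times d}$ : embedding matrix (|V| words, d dimensions each)
- $H \in \mathbb{R}^{h \times (n-1)d}$ : hidden layer weights
- $W \in \mathbb{R}^{|V| \times h}$ : output layer weights

A note on notation: in the equations above, $d$ and $b$ are the bias vectors of the hidden and output layers, and $h$ is the vector of hidden activations. In the dimension expressions of this list, $d$ instead denotes the embedding dimension and $h$ the number of hidden units.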

The loss function is **cross-entropy** (negative log-likelihood):

$$\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N} \log P(w_i \mid \text{context}_i)$$

Let us trace through the forward pass with actual numbers.

In [None]:
#@title üéß Code Walkthrough: Manual Forward Pass Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_06_manual_forward_pass_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß What to Look For: Manual Forward Pass Results
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_07_manual_forward_pass_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Manual forward pass through Bengio's model
# Vocabulary: cat=0, sat=1, mat=2 (3 words)
# Context window: n=2 (bigram — predict from 1 previous word)
# Embedding dimension: d=3
# Hidden dimension: h=4

# Every tensor below is written out by hand, so nothing here draws random
# numbers; the seed only pins the RNG state for whatever runs next.
torch.manual_seed(42)

vocab = {"cat": 0, "sat": 1, "mat": 2}
idx_to_word = {idx: word for word, idx in vocab.items()}  # reverse lookup for reporting
V = len(vocab)
d = 3   # embedding dimension
h = 4   # hidden dimension

# Embedding matrix C: each row is a word's vector
C = torch.tensor([
    [0.2, 0.8, -0.1],   # cat
    [0.5, 0.1, 0.7],    # sat
    [0.9, 0.3, -0.5],   # mat
], dtype=torch.float32)

print("Embedding matrix C:")
for word, idx in vocab.items():
    print(f"  '{word}' (idx={idx}) → {C[idx].tolist()}")

# Step 1: Look up embedding for context word "cat"
# With a single context word there is nothing to concatenate: x is just one row of C.
context_word = "cat"
x = C[vocab[context_word]]  # shape: (d,) = (3,)
print(f"\nStep 1: Embedding lookup for '{context_word}'")
print(f"  x = C[{vocab[context_word]}] = {x.tolist()}")

# Step 2: Hidden layer
H = torch.tensor([
    [0.3, -0.1, 0.2],
    [0.4, 0.2, -0.3],
    [-0.1, 0.5, 0.1],
    [0.2, -0.2, 0.4],
], dtype=torch.float32)  # shape: (h, d) = (4, 3)
d_bias = torch.zeros(h)  # hidden bias; named d_bias because `d` already holds the embedding dimension

hidden = torch.tanh(H @ x + d_bias)
print(f"\nStep 2: Hidden layer h = tanh(H @ x)")
print(f"  H @ x = {(H @ x).tolist()}")
print(f"  h = tanh(H @ x) = {hidden.tolist()}")

# Step 3: Output layer + softmax
W = torch.tensor([
    [0.5, -0.3, 0.2, 0.1],
    [-0.1, 0.4, 0.3, -0.2],
    [0.2, 0.1, -0.4, 0.5],
], dtype=torch.float32)  # shape: (V, h) = (3, 4)
b = torch.zeros(V)

logits = W @ hidden + b
probs = F.softmax(logits, dim=0)

# Sanity check: softmax must yield a probability distribution over the vocabulary.
assert torch.isclose(probs.sum(), torch.tensor(1.0)), "softmax output should sum to 1"

print(f"\nStep 3: Output logits = W @ h = {logits.tolist()}")
print(f"  Softmax probabilities:")
for word, idx in vocab.items():
    print(f"    P('{word}' | '{context_word}') = {probs[idx].item():.4f}")

print(f"\n  Most likely next word: '{idx_to_word[probs.argmax().item()]}'")

In [None]:
#@title üéß Transition: Transition To Pytorch
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_08_transition_to_pytorch.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 4. Let's Build It -- Component by Component

Now let us build Bengio's neural language model as a proper PyTorch module and train it.

In [None]:
#@title üéß Code Walkthrough: Bengiolm Class
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_09_bengiolm_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class BengioLM(nn.Module):
    """
    Feed-forward neural language model after Bengio et al. (2003).

    A fixed window of context word indices is embedded, the embeddings are
    concatenated into one vector, passed through a single tanh hidden layer,
    and projected to one score per vocabulary word.
    """

    def __init__(self, vocab_size, embed_dim, context_size, hidden_dim):
        """
        Args:
            vocab_size: Number of distinct words (rows of the embedding table
                and width of the output layer)
            embed_dim: Length of each word vector
            context_size: How many preceding words the model conditions on
            hidden_dim: Number of units in the tanh hidden layer
        """
        super().__init__()

        # Width of the concatenated context vector fed to the hidden layer
        concat_dim = context_size * embed_dim

        self.embeddings = nn.Embedding(vocab_size, embed_dim)  # embedding matrix C
        self.hidden = nn.Linear(concat_dim, hidden_dim)        # concatenated context -> hidden
        self.output = nn.Linear(hidden_dim, vocab_size)        # hidden -> vocabulary logits

    def forward(self, context_indices):
        """
        Score every vocabulary word as the continuation of each context.

        Args:
            context_indices: (batch_size, context_size) tensor of word indices
        Returns:
            (batch_size, vocab_size) tensor of unnormalized scores (logits)
        """
        word_vectors = self.embeddings(context_indices)  # (batch, context_size, embed_dim)

        # Lay the context vectors of each example side by side.
        concatenated = word_vectors.reshape(word_vectors.shape[0], -1)  # (batch, context_size * embed_dim)

        activations = torch.tanh(self.hidden(concatenated))  # (batch, hidden_dim)
        return self.output(activations)  # (batch, vocab_size)


# Toy corpus: short sentences in which related words appear in similar contexts
corpus = [
    "the cat sat on the mat",
    "the dog sat on the rug",
    "the cat ate the fish",
    "the dog ate the bone",
    "the bird flew over the house",
    "the bird sat on the tree",
    "the cat ran after the dog",
    "the dog ran after the cat",
    "the fish swam in the pond",
    "the bird flew over the pond",
    "a cat sat on a mat",
    "a dog sat on a rug",
    "the cat is a nice pet",
    "the dog is a good pet",
    "a bird sang in the tree",
]

# Vocabulary: "<unk>" takes index 0, then words from most to least frequent
all_words = [token for line in corpus for token in line.lower().split()]
word_counts = Counter(all_words)

vocab_list = ["<unk>"]
vocab_list.extend(token for token, _ in word_counts.most_common())

# Lookup tables in both directions
word2idx = dict(zip(vocab_list, range(len(vocab_list))))
idx2word = dict(enumerate(vocab_list))

print(f"Vocabulary: {len(word2idx)} words")
print(f"Words: {vocab_list}")

In [None]:
#@title üéß Code Walkthrough: Data Prep
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_10_data_prep.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Create training data: (context, target) pairs, where the context is the CONTEXT_SIZE words preceding the target
CONTEXT_SIZE = 2  # Use 2 previous words (trigram-style)

def make_training_data(corpus, context_size, word2idx):
    """
    Build next-word prediction examples from a list of sentences.

    Each sentence is lower-cased and split on whitespace. Every token that has
    at least `context_size` predecessors becomes a target, paired with the ids
    of those predecessors. Words missing from `word2idx` are mapped to id 0.

    Returns:
        A list of (context_ids, target_id) tuples, where context_ids is a list
        of `context_size` ints.
    """
    pairs = []
    for line in corpus:
        # Convert the sentence to ids once, then slide the window over the ids.
        ids = [word2idx.get(token, 0) for token in line.lower().split()]
        for target_pos in range(context_size, len(ids)):
            window = [ids[pos] for pos in range(target_pos - context_size, target_pos)]
            pairs.append((window, ids[target_pos]))
    return pairs

training_data = make_training_data(corpus, CONTEXT_SIZE, word2idx)

# Convert to tensors: one row of context ids per example, plus the matching target ids
contexts = torch.tensor([context for context, _ in training_data])
targets = torch.tensor([target for _, target in training_data])

print(f"Training examples: {len(training_data)}")
print(f"\nFirst 5 examples:")
# Cap the preview at the number of examples so a very small corpus cannot
# cause an IndexError here.
for i in range(min(5, len(training_data))):
    ctx_words = [idx2word[c.item()] for c in contexts[i]]
    tgt_word = idx2word[targets[i].item()]
    print(f"  Context: {ctx_words} → Target: '{tgt_word}'")

In [None]:
#@title üéß Code Walkthrough: Train Bengiolm Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_11_train_bengiolm_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Hyperparameters for the feed-forward language model
EMBED_DIM = 16
HIDDEN_DIM = 64
LEARNING_RATE = 0.01
EPOCHS = 200

model = BengioLM(len(word2idx), EMBED_DIM, CONTEXT_SIZE, HIDDEN_DIM)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Full-batch training: every epoch is one gradient step on all examples.
losses = []
for epoch in range(EPOCHS):
    optimizer.zero_grad()

    logits = model(contexts)
    loss = criterion(logits, targets)

    loss.backward()
    optimizer.step()

    losses.append(loss.item())

    # Report progress every 50 epochs
    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item():.4f}")

# Loss curve over the whole run
loss_fig, loss_ax = plt.subplots(figsize=(10, 4))
loss_ax.plot(losses, color='#1E88E5', linewidth=2)
loss_ax.set_xlabel('Epoch', fontsize=12)
loss_ax.set_ylabel('Cross-Entropy Loss', fontsize=12)
loss_ax.set_title('Training Loss: Bengio Neural Language Model', fontsize=14, fontweight='bold')
loss_ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
#@title üéß Listen: Visualize Embeddings Intro
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_13_visualize_embeddings_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


Now let us visualize the learned embeddings -- the model should have placed similar words near each other.

In [None]:
#@title üéß Code Walkthrough: Visualize Embeddings Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_14_visualize_embeddings_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Extract learned embeddings and visualize with PCA
from matplotlib.lines import Line2D
from sklearn.decomposition import PCA

# Get embedding vectors.
# The embedding weight is a trainable parameter, and .numpy() refuses tensors
# that require grad (torch.no_grad() does not change that), so detach first.
# .cpu() keeps this working if the model lives on a GPU.
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()

# Reduce to 2D with PCA
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embedding_matrix)

# Color code by category: legend label -> (color, member words)
category_styles = {
    'Animals': ('#E53935', {"cat", "dog", "bird", "fish", "pet"}),
    'Surfaces': ('#1E88E5', {"mat", "rug"}),
    'Actions': ('#43A047', {"sat", "ate", "ran", "flew", "swam", "sang"}),
    'Locations': ('#FF9800', {"tree", "house", "pond"}),
}
other_color = '#9E9E9E'  # words outside every category above

# Flatten to a word -> color lookup; the categories do not overlap.
word_color = {
    member: color
    for color, members in category_styles.values()
    for member in members
}

# Plot one labelled point per vocabulary entry (row i of the embedding
# matrix belongs to vocab_list[i]).
fig, ax = plt.subplots(figsize=(12, 8))
for i, word in enumerate(vocab_list):
    x, y = embeddings_2d[i]
    ax.scatter(x, y, c=word_color.get(word, other_color), s=100, zorder=5)
    ax.annotate(word, (x, y), textcoords="offset points",
                xytext=(5, 5), fontsize=10)

# Legend: one proxy marker per category, plus one for the uncategorized words
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label)
    for label, (color, _) in category_styles.items()
]
legend_elements.append(
    Line2D([0], [0], marker='o', color='w', markerfacecolor=other_color, markersize=10, label='Other')
)
ax.legend(handles=legend_elements, loc='best', fontsize=11)
ax.set_title('Learned Word Embeddings (PCA Projection)\nSimilar words cluster together!',
             fontsize=14, fontweight='bold')
ax.set_xlabel('Principal Component 1', fontsize=11)
ax.set_ylabel('Principal Component 2', fontsize=11)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
#@title üéß Before You Start: Todo1 Word2vec Intro
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_16_todo1_word2vec_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 5. Your Turn

**TODO 1: Implement Word2Vec (Skip-gram)**

Word2Vec flips the prediction: instead of predicting the next word from context, it predicts **context words from the center word.** This is more efficient for learning embeddings.

Given center word $w_c$, predict each context word $w_o$ within a window:

$$P(w_o \mid w_c) = \frac{\exp(v'_{w_o} \cdot v_{w_c})}{\sum_{w=1}^{V} \exp(v'_w \cdot v_{w_c})}$$

In [None]:
#@title üéß Before You Start: Todo1 Word2vec Task
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_17_todo1_word2vec_task.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# TODO: Complete the Word2Vec Skip-gram model
#
# Instructions:
# 1. Fill in the forward() method
# 2. The model should have TWO embedding layers:
#    - center_embeddings: for center words
#    - context_embeddings: for context words
# 3. Forward pass: dot product between center and context embeddings

class Word2VecSkipGram(nn.Module):
    """
    Exercise skeleton for a Skip-gram Word2Vec model.

    The two embedding tables are already created in __init__; forward() is
    left for the reader to implement and returns None until it is completed.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Two separate embedding tables, both (vocab_size, embed_dim):
        # one used when a word is the center word, one when it is a context word.
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center_word, context_word):
        """
        Score (center, context) word pairs by the dot product of their embeddings.

        Args:
            center_word: (batch_size,) indices of center words
            context_word: (batch_size,) indices of context words
        Returns:
            scores: (batch_size,) dot product scores
        """
        # TODO: Compute dot product between center and context embeddings
        # center_embed = self.center_embeddings(center_word)  # (batch, embed_dim)
        # context_embed = ???
        # score = ???  # dot product
        # return score
        pass

# TODO: Create training pairs (center_word, context_word) with window_size=2
# TODO: Train the model for 100 epochs
# TODO: Visualize the learned embeddings

In [None]:
#@title üéß Before You Start: Todo2 Cosine Intro
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_19_todo2_cosine_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


**TODO 2: Measure Cosine Similarity Between Word Pairs**

After training, compute the cosine similarity between word pairs to verify that similar words have similar embeddings.

$$\text{cosine}(u, v) = \frac{u \cdot v}{\|u\| \|v\|}$$

In [None]:
#@title üéß Before You Start: Todo2 Cosine Task
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_20_todo2_cosine_task.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# TODO: Compute cosine similarities between word pairs
#
# Instructions:
# 1. Extract the embedding vectors from the trained Bengio model
# 2. Compute cosine similarity for these pairs:
#    - ("cat", "dog") — should be high
#    - ("cat", "fish") — should be moderate
#    - ("cat", "mat") — should be low
#    - ("mat", "rug") — should be high
# 3. Create a bar chart comparing the similarities

# YOUR CODE HERE
# def cosine_similarity(word1, word2, model, word2idx):
#     with torch.no_grad():
#         v1 = model.embeddings.weight[word2idx[word1]]
#         v2 = model.embeddings.weight[word2idx[word2]]
#         return F.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0)).item()
#
# pairs = [("cat", "dog"), ("cat", "fish"), ("cat", "mat"), ("mat", "rug")]
# for w1, w2 in pairs:
#     sim = cosine_similarity(w1, w2, model, word2idx)
#     print(f"  cosine('{w1}', '{w2}') = {sim:.4f}")

In [None]:
#@title üéß Transition: Transition To Rnns
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_22_transition_to_rnns.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 6. Putting It All Together

Now let us build a simple **RNN language model** to see how recurrent processing extends the context window beyond a fixed number of words.

In [None]:
#@title üéß Code Walkthrough: Rnn Model Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_23_rnn_model_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
#@title üéß Code Walkthrough: Rnn Data Prep
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_24_rnn_data_prep.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
class RNNLanguageModel(nn.Module):
    """
    A simple RNN language model.

    Unlike Bengio's model which uses a fixed context window,
    the RNN carries a hidden state that (in theory) summarizes
    the entire history of the sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        """
        Args:
            vocab_size: Number of words in the vocabulary
            embed_dim: Dimension of the word embeddings
            hidden_dim: Dimension of the recurrent hidden state
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(vocab_size, embed_dim)

        # RNN cell: h_t = tanh(W_hh * h_{t-1} + W_xh * x_t + b)
        # Only W_xh carries a bias, so the cell has the single bias term b.
        self.W_xh = nn.Linear(embed_dim, hidden_dim)
        self.W_hh = nn.Linear(hidden_dim, hidden_dim, bias=False)

        # Output: project hidden state to vocabulary
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, hidden=None):
        """
        Process a sequence one token at a time.

        Args:
            input_seq: (batch_size, seq_len) token indices
            hidden: initial hidden state of shape (batch_size, hidden_dim),
                or None for zeros
        Returns:
            logits: (batch_size, seq_len, vocab_size)
            hidden: final hidden state, (batch_size, hidden_dim)
        """
        batch_size, seq_len = input_seq.shape

        embeds = self.embeddings(input_seq)  # (batch, seq_len, embed_dim)

        if hidden is None:
            # Take device and dtype from the embeddings so the default state
            # matches the model's parameters (e.g. after .to('cuda') or
            # .double()) instead of always being float32 on the CPU.
            hidden = embeds.new_zeros(batch_size, self.hidden_dim)

        step_logits = []
        for t in range(seq_len):
            x_t = embeds[:, t, :]  # (batch, embed_dim)

            # RNN step: h_t = tanh(W_xh * x_t + W_hh * h_{t-1})
            hidden = torch.tanh(self.W_xh(x_t) + self.W_hh(hidden))

            # Predict next word from hidden state
            step_logits.append(self.output(hidden))  # (batch, vocab_size)

        if step_logits:
            logits = torch.stack(step_logits, dim=1)  # (batch, seq_len, vocab_size)
        else:
            # Zero-length sequences: return an empty logits tensor rather than
            # failing on an empty list.
            logits = embeds.new_empty(batch_size, 0, self.output.out_features)
        return logits, hidden


# Prepare sequence data for RNN
def prepare_rnn_data(corpus, word2idx, max_len=None):
    """
    Convert sentences to a right-padded tensor of word indices.

    Each sentence is lower-cased and split on whitespace. Words missing from
    `word2idx` map to index 0, and unused positions are also filled with 0,
    so index 0 serves as both the unknown-word id and the padding value.

    Args:
        corpus: iterable of sentence strings
        word2idx: mapping from word to integer index
        max_len: number of columns in the result; longer sentences are
            truncated. None means "length of the longest sentence".
    Returns:
        LongTensor of shape (num_sentences, max_len). An empty corpus with
        max_len=None yields a (0, 0) tensor.
    """
    sequences = [
        [word2idx.get(token, 0) for token in sentence.lower().split()]
        for sentence in corpus
    ]

    if max_len is None:
        # default=0 keeps an empty corpus from raising ValueError in max()
        max_len = max((len(seq) for seq in sequences), default=0)

    # Pad sequences: start from all zeros and copy each sentence into its row
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)
    for row, seq in enumerate(sequences):
        length = min(len(seq), max_len)
        padded[row, :length] = torch.tensor(seq[:length], dtype=torch.long)

    return padded

# Next-word prediction pairs: the target row is the input row shifted left by one
sequences = prepare_rnn_data(corpus, word2idx)
inputs, targets_rnn = sequences[:, :-1], sequences[:, 1:]


def _decode_row(row):
    """Map a 1-D tensor of indices back to words, keeping only ids present in idx2word."""
    return [idx2word[token.item()] for token in row if token.item() in idx2word]


print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets_rnn.shape}")
print(f"\nExample:")
print(f"  Input:  {_decode_row(inputs[0])}")
print(f"  Target: {_decode_row(targets_rnn[0])}")

In [None]:
#@title üéß Code Walkthrough: Train Rnn Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_25_train_rnn_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# Train the RNN
# Hyperparameters are named once here so the loop bound and the progress
# message cannot drift apart.
RNN_EMBED_DIM = 16
RNN_HIDDEN_DIM = 32
RNN_LEARNING_RATE = 0.01
RNN_EPOCHS = 300
RNN_LOG_EVERY = 100

rnn_model = RNNLanguageModel(
    vocab_size=len(word2idx),
    embed_dim=RNN_EMBED_DIM,
    hidden_dim=RNN_HIDDEN_DIM
)

optimizer_rnn = torch.optim.Adam(rnn_model.parameters(), lr=RNN_LEARNING_RATE)
# Targets equal to 0 are excluded from the loss. Index 0 is the padding value
# and also the "<unk>" id, so any unknown-word target would be skipped too.
criterion_rnn = nn.CrossEntropyLoss(ignore_index=0)  # ignore padding

rnn_losses = []

for epoch in range(RNN_EPOCHS):
    logits_rnn, _ = rnn_model(inputs)

    # Reshape for cross-entropy: (batch*seq_len, vocab_size) logits
    # against (batch*seq_len,) targets
    loss_rnn = criterion_rnn(
        logits_rnn.reshape(-1, len(word2idx)),
        targets_rnn.reshape(-1)
    )

    optimizer_rnn.zero_grad()
    loss_rnn.backward()
    optimizer_rnn.step()

    rnn_losses.append(loss_rnn.item())

    if (epoch + 1) % RNN_LOG_EVERY == 0:
        print(f"Epoch {epoch+1}/{RNN_EPOCHS}, Loss: {loss_rnn.item():.4f}")

# Plot both training curves (`losses` comes from the feed-forward training cell).
# The two models are scored on different targets — the RNN loss covers every
# position of every sentence — so compare the shapes of the curves with care.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses, label='Bengio Feed-Forward LM', color='#1E88E5', linewidth=2, alpha=0.8)
ax.plot(rnn_losses, label='RNN Language Model', color='#E53935', linewidth=2, alpha=0.8)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Cross-Entropy Loss', fontsize=12)
ax.set_title('Training Comparison: Feed-Forward vs RNN', fontsize=14, fontweight='bold')
ax.legend(fontsize=12)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
#@title üéß Listen: Rnn Advantage Limitation Intro
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_27_rnn_advantage_limitation_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


---

## 7. Training and Results

Let us demonstrate the RNN's key advantage — and its key limitation.

In [None]:
#@title üéß Code Walkthrough: Rnn Hidden State Code
# Play this section's narration clip, or point the reader at the download cell.
import os as _os
from IPython.display import Audio, display

_f = "/content/narration/02_28_rnn_hidden_state_code.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))


In [None]:
# RNN advantage: it uses the ENTIRE sequence history, not just a fixed window
#
# This cell replays the recurrence by hand, one token at a time, so that the
# hidden state after every word can be captured and drawn as a heatmap, with
# the model's next-word guess at each step listed underneath.
# NOTE(review): the manual step below reaches into rnn_model.embeddings / W_xh /
# W_hh / output and assumes it mirrors RNNLanguageModel.forward — confirm
# against the class definition.
rnn_model.eval()
test_sentence = "the cat sat on the mat"
test_tokens = test_sentence.split()
# Words missing from the vocabulary fall back to index 0.
test_indices = torch.tensor([[word2idx.get(w, 0) for w in test_tokens]])

with torch.no_grad():
    embeds = rnn_model.embeddings(test_indices)
    hidden = torch.zeros(1, rnn_model.hidden_dim)

    # First entry is the all-zero initial state, before any word is read.
    hidden_states = [hidden.squeeze().numpy().copy()]
    predictions = []

    for t in range(len(test_tokens)):
        x_t = embeds[:, t, :]
        hidden = torch.tanh(rnn_model.W_xh(x_t) + rnn_model.W_hh(hidden))
        hidden_states.append(hidden.squeeze().numpy().copy())

        logit = rnn_model.output(hidden)
        pred_idx = logit.argmax(dim=-1).item()
        predictions.append(idx2word.get(pred_idx, "<unk>"))

# Visualize hidden states
hidden_matrix = np.array(hidden_states)

fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Hidden state heatmap (transposed so time runs left to right)
im = axes[0].imshow(hidden_matrix.T, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
axes[0].set_yticks(range(0, rnn_model.hidden_dim, 4))
step_labels = ["h₀"] + [f"'{w}'" for w in test_tokens]
axes[0].set_xticks(range(len(step_labels)))
axes[0].set_xticklabels(step_labels, fontsize=11)
axes[0].set_ylabel('Hidden Dimension', fontsize=12)
axes[0].set_title('RNN Hidden State Evolution\nEach column accumulates more information',
                  fontsize=14, fontweight='bold')
plt.colorbar(im, ax=axes[0], label='Activation')

# Predictions at each step
axes[1].axis('off')
header = "Step  |  Input Word  |  Hidden State Info  |  Predicted Next Word"
axes[1].text(0.05, 0.92, header, fontsize=11, fontfamily='monospace',
             fontweight='bold', transform=axes[1].transAxes)
# axhline builds its own transform (x as axes fraction, y in data units) and
# raises ValueError if `transform` is passed. Pinning the y-limits to 0..1
# makes data y equal to axes-fraction y, so the rule sits just under the header.
axes[1].set_ylim(0, 1)
axes[1].axhline(y=0.88, xmin=0.03, xmax=0.97, color='gray', linewidth=1)

for t, (word, pred) in enumerate(zip(test_tokens, predictions)):
    y_pos = 0.82 - t * 0.12
    info = f"Encodes words 1..{t+1}"
    line = f"  {t+1}   |  {word:10s}  |  {info:20s}  |  '{pred}'"
    axes[1].text(0.05, y_pos, line, fontsize=10, fontfamily='monospace',
                 transform=axes[1].transAxes)

plt.tight_layout()
plt.show()

In [None]:
#@title üéß Listen: Vanishing Gradient Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_30_vanishing_gradient_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


Now let us see the vanishing gradient problem in action.

In [None]:
#@title üéß Code Walkthrough: Vanishing Gradient Code
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_31_vanishing_gradient_code.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
#@title üéß What to Look For: Vanishing Gradient Results
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_32_vanishing_gradient_results.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# The vanishing gradient problem: gradients shrink exponentially
#
# Toy simulation (no real network is involved): the gradient is taken to be 1.0
# at the last time step and is multiplied by a constant factor for every step
# travelled back towards the start of the sequence.
#   Left panel:  relative gradient magnitude per time step for several lengths.
#   Right panel: how many steps back the gradient stays above each threshold.

# Simulate gradient flow through an RNN
seq_lengths = [5, 10, 20, 50, 100]
weight_scale = 0.8  # assumed per-step shrink factor for this illustration

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: gradient magnitude over time steps
for length in seq_lengths:
    steps = list(range(length))
    # Step `length - 1` (the last) has gradient 1.0; each step further back
    # is scaled by another factor of weight_scale.
    gradients = [weight_scale ** (length - 1 - step) for step in steps]
    axes[0].plot(steps, gradients, linewidth=2, label=f'T={length}')

axes[0].set_xlabel('Time Step (from start)', fontsize=12)
axes[0].set_ylabel('Relative Gradient Magnitude', fontsize=12)
axes[0].set_title('Vanishing Gradients in RNNs\nGradient signal fades for early words',
                  fontsize=13, fontweight='bold')
axes[0].set_yscale('log')
axes[0].grid(alpha=0.3)
axes[0].axhline(y=0.01, color='red', linestyle='--', alpha=0.5, label='Negligible')
# The legend only picks up artists that exist when it is called, so it has to
# come after the labelled threshold line for 'Negligible' to be listed.
axes[0].legend(fontsize=10)

# Right: effective memory window
effective_memory = []
thresholds = [0.1, 0.01, 0.001]
for thresh in thresholds:
    # How many steps back can gradient reach with magnitude > threshold?
    # Solves weight_scale ** k = thresh for k and truncates to whole steps.
    steps_back = int(np.log(thresh) / np.log(weight_scale))
    effective_memory.append(steps_back)

bars = axes[1].bar(
    [f'>{t}' for t in thresholds],
    effective_memory,
    color=['#4CAF50', '#FF9800', '#F44336'],
    edgecolor='white', linewidth=2
)
# Write the step count just above each bar.
for bar, mem in zip(bars, effective_memory):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{mem} steps', ha='center', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Gradient Threshold', fontsize=12)
axes[1].set_ylabel('Effective Memory (steps)', fontsize=12)
axes[1].set_title('How Far Back Can an RNN "Remember"?\n(with weight scale = 0.8)',
                  fontsize=13, fontweight='bold')
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("Key takeaway: With typical weight scales, RNN gradients fade within 10-20 steps.")
print("This means the model CANNOT effectively use long-range context.")
print("LSTMs/GRUs extend this to ~100-200 tokens, but the fundamental")
print("problem remains: sequential processing prevents parallelism.")

In [None]:
#@title üéß Transition: Summary Visualization Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_33_summary_visualization_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


---

## 8. Final Output

In [None]:
#@title üéß Code Walkthrough: Summary Visualization Code
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_34_summary_visualization_code.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Summary visualization: the complete picture so far
#
# Three hand-built, purely illustrative panels (none of the numbers come from
# the models trained above): a bigram count table, a 2-D embedding sketch,
# and a left-to-right RNN chain whose arrows fade with position.

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: N-grams (count table)
axes[0].set_title('Era 1: N-grams\n"Count"', fontsize=14, fontweight='bold')
table_data = [
    ['the → cat', '3/10', '✓'],
    ['the → dog', '2/10', '✓'],
    ['cat → sat', '2/3', '✓'],
    ['cat → ran', '0/3', '✗'],
    ['dog → flew', '0/2', '✗'],
]
table = axes[0].table(cellText=table_data,
                       colLabels=['Bigram', 'P', 'Seen?'],
                       loc='center', cellLoc='center')
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 1.5)
axes[0].axis('off')
axes[0].text(0.5, -0.05, 'Limitation: Zero probability\nfor unseen combinations',
             ha='center', fontsize=10, color='#E53935', transform=axes[0].transAxes)

# Panel 2: Neural LM (embeddings)
axes[1].set_title('Era 2: Neural LMs\n"Learn"', fontsize=14, fontweight='bold')
words_demo = {"cat": [0.8, 1.5], "dog": [1.0, 1.3], "bird": [0.6, 1.8],
              "mat": [-0.5, -0.8], "rug": [-0.3, -0.6]}
for w, (x, y) in words_demo.items():
    # Animals in red, floor coverings in blue.
    color = '#E53935' if w in ['cat', 'dog', 'bird'] else '#1E88E5'
    axes[1].scatter(x, y, s=120, c=color, zorder=5)
    axes[1].annotate(w, (x, y), textcoords="offset points", xytext=(5, 5), fontsize=11)
axes[1].set_xlabel('Dim 1', fontsize=10)
axes[1].set_ylabel('Dim 2', fontsize=10)
axes[1].grid(alpha=0.3)
# This panel keeps its axis visible, so the caption is anchored by its top edge
# below the tick labels and the 'Dim 1' label instead of on top of them.
axes[1].text(0.5, -0.2, 'Strength: Similarity transfers\nLimit: Fixed context window',
             ha='center', va='top', fontsize=10, color='#FF9800',
             transform=axes[1].transAxes)

# Panel 3: RNN (sequential, vanishing gradients)
axes[2].set_title('Era 2.5: RNNs\n"Remember"', fontsize=14, fontweight='bold')
words_seq = ['the', 'cat', 'sat', 'on', 'the', 'mat']
for i, w in enumerate(words_seq):
    axes[2].add_patch(plt.Rectangle((i*1.3, 0.8), 1.0, 0.6, fill=True,
                                      facecolor='#E3F2FD', edgecolor='#1E88E5', linewidth=2))
    axes[2].text(i*1.3 + 0.5, 1.1, w, ha='center', fontsize=10)
    if i < len(words_seq) - 1:
        # Arrow to the next box; opacity drops with position, floored at 0.2.
        axes[2].annotate('', xy=((i+1)*1.3, 1.1), xytext=(i*1.3 + 1.0, 1.1),
                         arrowprops=dict(arrowstyle='->', color='#1E88E5',
                                         alpha=max(0.2, 1.0 - i*0.15), linewidth=2))
axes[2].set_xlim(-0.3, 8.5)
axes[2].set_ylim(0.3, 1.8)
axes[2].axis('off')
axes[2].text(0.5, -0.05, 'Strength: Unlimited context (theory)\nLimit: Vanishing gradients + sequential',
             ha='center', fontsize=10, color='#FF9800', transform=axes[2].transAxes)

plt.suptitle('The Journey So Far: From Counting to Learning',
             fontsize=16, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()

In [None]:
#@title üéß Listen: Reflection Next Steps
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_36_reflection_next_steps.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


---

## 9. Reflection and Next Steps

**What we learned:**

1. **Neural language models replace count tables with learned parameters.** Every weight -- embeddings, hidden layers, output projections -- is trained end-to-end.

2. **Word embeddings solve the sparsity problem.** Similar words get similar vectors, so knowledge transfers automatically.

3. **Bengio's 2003 model was the breakthrough.** Embedding lookup, concatenation, hidden layer, softmax -- a simple recipe with profound consequences.

4. **RNNs extend context but hit walls.** The vanishing gradient problem limits effective memory to ~10-20 steps (LSTMs extend to ~100-200).

5. **Sequential processing is a bottleneck.** RNNs process one word at a time, making them slow on modern parallel hardware.

**What comes next:**

Bengio's model and RNNs are each limited in how they use context: Bengio's model only sees a **fixed window**, while an RNN must process words **sequentially.** What if every word could look at every other word simultaneously and decide what is important?

That is exactly what the **Transformer** does with **self-attention.** In the next notebook, we will build the attention mechanism from scratch.

In [None]:
#@title üéß Wrap-Up: Closing
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/02_37_closing.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Closing banner: a 60-character rule above and below a short recap, with one
# blank line before the pointer to the next notebook.
print(
    "=" * 60,
    "  NOTEBOOK COMPLETE: Neural Language Models",
    "  You built Bengio's model, trained embeddings,",
    "  and saw why RNNs hit the vanishing gradient wall.",
    "",
    "  Next: Self-Attention and the Transformer",
    "=" * 60,
    sep="\n",
)