In [None]:
#@title üéß Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="1_yOuaRupWcvvBB5tNnjVrtDllXqg6x4Q", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/03_00_intro.mp3"))


In [None]:
# 🔧 Setup: run this cell before the code cells below.
# Reports the available compute device and the Python / PyTorch versions, then
# seeds the Python, NumPy and PyTorch random number generators.
# (Nothing is installed by this cell.)

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seeds every visible CUDA device, not only the current one.
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

In [None]:
#@title üéß Listen: Why It Matters
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_02_why_it_matters.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
#@title üéß Listen: Intuition
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_03_intuition.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
#@title üéß Listen: Math Overview
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_04_math_overview.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


# Score Matching and Denoising Score Matching -- Vizuara

## 1. Why Does This Matter?

So far we have seen that the score function is a powerful tool for sampling from distributions. But there was a catch: we assumed we KNEW the score function. In practice, we do not know the true data distribution, so we cannot compute the true score.

This notebook introduces two landmark techniques that solve this problem:
1. **Score Matching** (Hyvarinen, 2005) -- learn the score from data without knowing the true distribution
2. **Denoising Score Matching** (Vincent, 2010) -- a computationally efficient alternative that connects directly to diffusion models

**By the end of this notebook, you will:**
- Implement the tractable score matching loss
- Train a neural network to estimate the score function
- Implement denoising score matching
- Generate samples from a learned score using Langevin dynamics
- Understand the deep connection to modern diffusion models

## 2. Building Intuition

### Why Can't We Just Use MSE Against the True Score?

The obvious approach is to train a neural network $s_\theta(x)$ by minimizing:

$$J(\theta) = \frac{1}{2} \mathbb{E}_{p(x)} \left[\|s_\theta(x) - \nabla_x \log p(x)\|^2\right]$$

But this requires the TRUE score $\nabla_x \log p(x)$, which we do not have.

### Hyvarinen's Insight

Hyvarinen showed in 2005 that through integration by parts, this loss can be rewritten into a form that only requires the model and the data -- no true score needed. This is truly amazing.

### The Denoising Trick (Vincent, 2010)

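Even Hyvarinen's formulation requires computing the Jacobian trace, which is expensive. Vincent's insight was simpler: add Gaussian noise to each data point; the score of the noising distribution $q(\tilde{x}|x)$ is then known analytically. Training the network to match this known target is, in expectation, equivalent to matching the score of the noise-perturbed data distribution.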

Think of it like invisible magnets on a table: you flick a ball away from a magnet and train a neural network to predict the direction back to the magnet.

## 3. The Mathematics

### Tractable Score Matching (Hyvarinen 2005)

$$J(\theta) = \mathbb{E}_{p(x)} \left[\text{tr}(\nabla_x s_\theta(x)) + \frac{1}{2}\|s_\theta(x)\|^2\right]$$

**Term 1** (Jacobian trace): forces score arrows to converge inward at data points. Computationally, we compute the diagonal of the Jacobian matrix and sum the entries.

**Term 2** (score magnitude): penalizes large scores, making high-density points stationary. Computationally, this is just the squared L2 norm of the score vector.

### Denoising Score Matching (Vincent 2010)

$$J_{DSM}(\theta) = \mathbb{E}_{p(x)\, q(\tilde{x}|x)} \left[\left\|s_\theta(\tilde{x}) + \frac{\tilde{x} - x}{\sigma^2}\right\|^2\right]$$

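Here $q(\tilde{x}|x) = \mathcal{N}(\tilde{x};\, x,\, \sigma^2 I)$ is the Gaussian noising distribution, whose score is $\nabla_{\tilde{x}} \log q(\tilde{x}|x) = -(\tilde{x} - x)/\sigma^2$. Computationally: add Gaussian noise to each data point, then train the network to predict the direction from the noisy point back to the clean point, scaled by $1/\sigma^2$.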

**Numerical example:** Clean point $x = 2$, noise $\epsilon = 0.6$, $\sigma = 0.5$. Noisy point $\tilde{x} = x + \sigma\epsilon = 2 + 0.5 \times 0.6 = 2.3$. Target score: $-(2.3 - 2)/0.25 = -1.2$, which equals $-\epsilon/\sigma = -0.6/0.5$.

## 4. Let's Build It -- Component by Component

### 4.1 The Score Network

In [None]:
#@title üéß Code Walkthrough: Score Network Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_05_score_network_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

class ScoreNetwork(nn.Module):
    """
    MLP approximation of the score function s_theta(x).

    Maps a batch of ``dim``-dimensional points to score vectors of the same
    dimensionality through three hidden layers of width ``hidden``, each
    followed by a SiLU activation.
    """
    def __init__(self, dim=2, hidden=128):
        super().__init__()
        widths = [dim, hidden, hidden, hidden]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.SiLU())
        # Final projection back to the input dimensionality (no activation).
        layers.append(nn.Linear(hidden, dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.net(x)
        return scores

In [None]:
#@title üéß Listen: Training Data Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_06_training_data_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


### 4.2 Training Data: Mixture of Gaussians

In [None]:
#@title üéß What to Look For: Training Data Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_07_training_data_viz.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
def sample_mixture_of_gaussians(n, centers=None, std=0.3):
    """
    Draw ``n`` samples from an equally weighted mixture of isotropic Gaussians.

    Args:
        n: Number of samples to draw.
        centers: Component means, as a nested list or a tensor of shape
            (n_components, dim). Defaults to two 2D components centred at
            (-2, 0) and (2, 0).
        std: Standard deviation shared by every component.

    Returns:
        Tensor of shape (n, dim), where ``dim`` is taken from ``centers``.
    """
    if centers is None:
        centers = [[-2.0, 0.0], [2.0, 0.0]]
    # as_tensor accepts nested lists and existing tensors alike (torch.tensor
    # warns when handed a tensor).
    centers = torch.as_tensor(centers)
    # Read the dimensionality from the centers instead of assuming 2D.
    n_components, dim = centers.shape

    # Random component assignment (uniform over components)
    idx = torch.randint(0, n_components, (n,), device=centers.device)
    noise = torch.randn(n, dim, device=centers.device)
    samples = noise * std + centers[idx]
    return samples

# Draw a training set and show it as a scatter plot
data = sample_mixture_of_gaussians(2000)

plt.figure(figsize=(8, 6))
# Transposing the (n, 2) array yields the x1 and x2 coordinate rows.
plt.scatter(*data.numpy().T, s=5, alpha=0.3, c='steelblue')
plt.axis('equal')
plt.grid(True, alpha=0.3)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Training Data: Mixture of Two Gaussians', fontsize=14)
plt.show()

In [None]:
#@title üéß Code Walkthrough: Sm Loss Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_08_sm_loss_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


### 4.3 Tractable Score Matching Loss

In [None]:
def score_matching_loss(model, x):
    """
    Compute the tractable score matching loss (Hyvarinen 2005).

    L = E[ tr(Jacobian of s_theta) + 0.5 * ||s_theta||^2 ]

    The Jacobian trace is built from one autograd call per input dimension,
    keeping only the diagonal entry d s_i / d x_i from each call.

    Args:
        model: Callable mapping a (batch, dim) tensor to (batch, dim) scores.
        x: Data points of shape (batch, dim); any floating dtype.

    Returns:
        Scalar loss tensor, differentiable w.r.t. the model parameters.
    """
    x = x.clone().requires_grad_(True)
    score = model(x)

    # Term 2: score magnitude (easy)
    score_sq = 0.5 * (score ** 2).sum(dim=-1)

    # Term 1: trace of Jacobian (harder -- need autograd).
    # Start from zeros with the dtype/device of the scores so that float64
    # inputs work too, and accumulate out-of-place.
    trace = torch.zeros_like(score_sq)
    for i in range(x.shape[1]):
        # Summing over the batch is safe: sample j's score depends only on x[j],
        # so row j of the gradient is d s_i(x_j) / d x_j.
        # create_graph=True keeps the graph so the trace can be backpropagated.
        grad_i = torch.autograd.grad(score[:, i].sum(), x, create_graph=True)[0]
        trace = trace + grad_i[:, i]  # Only the diagonal element

    loss = (trace + score_sq).mean()
    return loss

In [None]:
#@title üéß Listen: Sm Loss Explanation
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_09_sm_loss_explanation.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


Let us understand this code. We compute two terms: the squared magnitude of the predicted score (simple), and the trace of its Jacobian (requires `autograd` to differentiate each component of the score with respect to the corresponding input dimension). The sum gives us the tractable loss.

In [None]:
#@title üéß Code Walkthrough: Sm Loss Test
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_10_sm_loss_test.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Smoke test: the loss should evaluate on an untrained network without raising
model = ScoreNetwork(dim=2, hidden=64)
test_x = torch.randn(32, 2)
test_loss = score_matching_loss(model, test_x)
print("Score matching loss (random model): {:.4f}".format(test_loss.item()))
print("Loss computed successfully!")

In [None]:
#@title üéß Code Walkthrough: Training Sm
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_11_training_sm.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


### 4.4 Training with Score Matching

In [None]:
# Fit a score network with the tractable score matching objective.
model_sm = ScoreNetwork(dim=2, hidden=128)
optimizer = torch.optim.Adam(model_sm.parameters(), lr=1e-3)
losses = []

for epoch in range(500):
    # A fresh mini-batch is drawn from the data distribution at every step.
    x = sample_mixture_of_gaussians(256)

    optimizer.zero_grad()
    loss = score_matching_loss(model_sm, x)
    loss.backward()
    optimizer.step()

    losses.append(loss.item())
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:4d} | Loss: {losses[-1]:.4f}")

In [None]:
#@title üéß What to Look For: Sm Loss Plot
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_12_sm_loss_plot.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Visualization checkpoint: raw per-step loss (blue) with a 20-step moving average (red)
plt.figure(figsize=(10, 4))
plt.plot(losses, 'b-', alpha=0.5, linewidth=0.5)
plt.plot(np.convolve(losses, np.full(20, 1 / 20), mode='valid'), 'r-', linewidth=2)
plt.title('Training Loss (Score Matching)')
plt.xlabel('Epoch')
plt.ylabel('Score Matching Loss')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
#@title üéß What to Look For: Sm Field Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_13_sm_field_viz.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Evaluate the learned score on a regular grid and draw it as a vector field
n_grid = 20
g = torch.linspace(-4, 4, n_grid)
G1, G2 = torch.meshgrid(g, g, indexing='ij')
grid_pts = torch.stack((G1.reshape(-1), G2.reshape(-1)), dim=-1)

with torch.no_grad():
    scores = model_sm(grid_pts)

fig, ax = plt.subplots(figsize=(8, 8))

# Fresh data samples as a faint backdrop
data_vis = sample_mixture_of_gaussians(1000)
ax.scatter(data_vis[:, 0], data_vis[:, 1], s=5, alpha=0.2, c='gray')

# One arrow per grid point; the last axis of the reshaped scores holds the
# (x1, x2) components.
ax.quiver(G1.numpy(), G2.numpy(),
          scores.reshape(n_grid, n_grid, 2)[..., 0].numpy(),
          scores.reshape(n_grid, n_grid, 2)[..., 1].numpy(),
          color='darkblue', scale=60, width=0.004)

ax.set_title('Learned Score Field (Score Matching)', fontsize=14)
ax.set(xlabel='x1', ylabel='x2', aspect='equal')
plt.tight_layout()
plt.show()
print("Arrows should point toward the two data clusters.")

In [None]:
#@title üéß Before You Start: Dsm Intro Todo
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_14_dsm_intro_todo.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 5. Your Turn -- Denoising Score Matching

The tractable score matching loss works but is slow because of the Jacobian computation. Denoising Score Matching is much faster.

In [None]:
def dsm_loss(model, x, sigma=0.5):
    """
    TODO: Implement the Denoising Score Matching loss.

    Steps:
    1. Add Gaussian noise to x: x_noisy = x + sigma * epsilon
    2. Compute the target score: target = -(x_noisy - x) / sigma^2
       (which simplifies to: target = -epsilon / sigma)
    3. Predict the score at x_noisy using the model
    4. Return the squared error between predicted and target scores, summed
       over the coordinates and averaged over the batch (the same reduction
       dsm_loss_solution uses, so the two values are directly comparable)

    Args:
        model: Score network
        x: Clean data points (batch_size, dim)
        sigma: Noise level

    Returns:
        Scalar loss value

    Raises:
        NotImplementedError: while the placeholders below are still None.
    """
    # ============ TODO ============
    # Replace each None with your implementation (hints are in the comments).
    # The placeholders are valid Python so that "Run All" still works before
    # the exercise has been completed.
    # Step 1: Sample noise
    epsilon = None  # torch.randn_like(x)

    # Step 2: Create noisy data
    x_noisy = None  # x + sigma * epsilon

    # Step 3: Compute target score
    target = None  # -epsilon / sigma  (or equivalently -(x_noisy - x) / sigma**2)

    # Step 4: Predict score and compute MSE
    pred = None  # model(x_noisy)
    loss = None  # MSE between pred and target
    # ==============================

    if loss is None:
        raise NotImplementedError(
            "dsm_loss is an exercise: replace the None placeholders with your implementation."
        )
    return loss

# Test (uncomment after implementing):
# model_test = ScoreNetwork(dim=2, hidden=64)
# x_test = torch.randn(32, 2)
# l = dsm_loss(model_test, x_test, sigma=0.5)
# print(f"DSM loss: {l.item():.4f}")

In [None]:
#@title üéß Code Walkthrough: Dsm Verification
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_15_dsm_verification.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Verification: compare DSM against known solution
def dsm_loss_solution(model, x, sigma=0.5):
    """
    Reference denoising score matching loss.

    Perturbs ``x`` with Gaussian noise of scale ``sigma``, evaluates ``model``
    at the noisy points and compares against the target ``-noise / sigma``.
    The squared error is summed over the last dimension and averaged over the
    remaining ones.
    """
    noise = torch.randn_like(x)
    residual = model(x + sigma * noise) - (-noise / sigma)
    return residual.pow(2).sum(dim=-1).mean()

# Evaluate the reference loss once on an untrained network and random inputs
model_verify = ScoreNetwork(dim=2, hidden=64)
x_verify = torch.randn(64, 2)
loss_val = dsm_loss_solution(model_verify, x_verify)
print("DSM loss (verification): {:.4f}".format(loss_val.item()))
print("If your implementation gives a similar value, it is correct!")

In [None]:
#@title üéß Before You Start: Train Dsm Todo
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_16_train_dsm_todo.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 5.2 Your Turn -- Train with DSM and Compare

In [None]:
def train_dsm(n_epochs=2000, sigma=0.5, lr=1e-3, batch_size=256, hidden=128, log_every=500):
    """
    Train a score network with denoising score matching (DSM).

    Follows the same pattern as the score matching training loop, but uses
    dsm_loss_solution() instead of score_matching_loss(). The update step is
    already filled in; as an exercise, swap in your own dsm_loss() once it works.

    Args:
        n_epochs: Number of optimisation steps (one fresh mini-batch per step).
        sigma: Noise level passed to the DSM loss.
        lr: Adam learning rate.
        batch_size: Number of samples drawn per step.
        hidden: Hidden width of the ScoreNetwork.
        log_every: Print the loss every ``log_every`` steps; 0 or None disables logging.

    Returns:
        (model, losses): the trained ScoreNetwork and the per-step loss values.
    """
    model = ScoreNetwork(dim=2, hidden=hidden)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    losses = []

    for epoch in range(n_epochs):
        x = sample_mixture_of_gaussians(batch_size)

        # Compute DSM loss, backprop, update
        loss = dsm_loss_solution(model, x, sigma)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch+1:4d} | DSM Loss: {loss.item():.4f}")

    return model, losses

model_dsm, dsm_losses = train_dsm(n_epochs=2000, sigma=0.5)

In [None]:
#@title üéß What to Look For: Dsm Sm Comparison Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_17_dsm_sm_comparison_viz.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
# Visualization: score fields of the two trained networks, side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 7))

data_bg = sample_mixture_of_gaussians(500)
n_g = 20
g = torch.linspace(-4, 4, n_g)
G1, G2 = torch.meshgrid(g, g, indexing='ij')
gp = torch.stack((G1.reshape(-1), G2.reshape(-1)), dim=-1)

for ax, (model, title) in zip(axes, [(model_sm, 'Score Matching'),
                                     (model_dsm, 'Denoising Score Matching')]):
    with torch.no_grad():
        s = model(gp)
    # Faint data backdrop, then one arrow per grid point.
    ax.scatter(data_bg[:, 0], data_bg[:, 1], s=5, alpha=0.2, c='gray')
    ax.quiver(G1.numpy(), G2.numpy(),
              s.reshape(n_g, n_g, 2)[..., 0].numpy(),
              s.reshape(n_g, n_g, 2)[..., 1].numpy(),
              color='darkblue', scale=60, width=0.004)
    ax.set_aspect('equal')
    ax.set_title(title, fontsize=14)

plt.suptitle('Comparison: Score Matching vs DSM', fontsize=15, y=1.02)
plt.tight_layout()
plt.show()
print("Both methods learn similar score fields, but DSM is MUCH faster to train!")

In [None]:
#@title üéß Listen: Langevin Intro
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_18_langevin_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 6. Putting It All Together -- Sample with Langevin Dynamics

In [None]:
#@title üéß What to Look For: Langevin Sampling Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_19_langevin_sampling_viz.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


In [None]:
def langevin_sample(model, n_samples=500, n_steps=1000, step_size=0.01, dim=2,
                    n_track=5, record_every=50):
    """
    Sample using Langevin dynamics from a learned score network.

    Each step applies
        x <- x + step_size * model(x) + sqrt(2 * step_size) * noise
    with fresh standard normal noise.

    Args:
        model: Callable mapping a (n_samples, dim) tensor to scores of the same shape.
        n_samples: Number of parallel chains.
        n_steps: Number of Langevin updates.
        step_size: Step size of each update.
        dim: Dimensionality of the samples.
        n_track: Number of leading chains whose positions are recorded.
        record_every: Record a snapshot every this many steps (must be >= 1).

    Returns:
        (x, trajectories): the final samples of shape (n_samples, dim) and a
        list of snapshots of the first ``n_track`` chains. The list starts with
        the initial positions and always ends with the final positions.
    """
    # Start the chains on the model's device; fall back to CPU for plain
    # callables or parameter-free modules.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cpu')

    x = torch.randn(n_samples, dim, device=device) * 3  # Start from noise
    trajectories = [x[:n_track].clone()]
    noise_scale = (2 * step_size) ** 0.5

    with torch.no_grad():
        for step in range(1, n_steps + 1):
            score = model(x)
            noise = torch.randn_like(x)
            x = x + step_size * score + noise_scale * noise

            # Counting completed steps (rather than the 0-based index) makes the
            # last snapshot coincide with the returned samples.
            if step % record_every == 0 or step == n_steps:
                trajectories.append(x[:n_track].clone())

    return x, trajectories

# Draw samples with the DSM-trained score network
samples_dsm, trajs = langevin_sample(model_dsm, n_samples=1000, n_steps=1500)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Left panel: reference samples from the true mixture
true_data = sample_mixture_of_gaussians(1000)
axes[0].scatter(true_data[:, 0], true_data[:, 1], s=5, alpha=0.3, c='steelblue')

# Middle panel: paths of the recorded chains
# (green circle = first recorded point, red square = last recorded point)
for i in range(5):
    tx = torch.stack(trajs)[:, i, 0].tolist()
    ty = torch.stack(trajs)[:, i, 1].tolist()
    axes[1].plot(tx, ty, '-', alpha=0.5, linewidth=1)
    axes[1].plot(tx[0], ty[0], 'go', markersize=5)
    axes[1].plot(tx[-1], ty[-1], 'rs', markersize=5)

# Right panel: the generated samples
axes[2].scatter(samples_dsm[:, 0].numpy(), samples_dsm[:, 1].numpy(),
                s=5, alpha=0.3, c='coral')

# Shared title and axis formatting for the three panels
for ax, title in zip(axes, ['True Data', 'Langevin Trajectories', 'Generated Samples (DSM)']):
    ax.set_title(title, fontsize=14)
    ax.set_aspect('equal')
    ax.set_xlim(-5, 5)
    ax.set_ylim(-4, 4)

plt.suptitle('Full Pipeline: Train Score (DSM) + Sample (Langevin)', fontsize=15, y=1.02)
plt.tight_layout()
plt.show()
print("The generated samples match the true data distribution! Not bad right?")

In [None]:
#@title üéß Listen: Diffusion Connection
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_20_diffusion_connection.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 7. Training and Results -- Connection to Diffusion Models

In [None]:
# The DSM loss is equivalent to predicting the added noise
# This is EXACTLY what DDPM does!

# Let us verify this connection numerically
x_clean = sample_mixture_of_gaussians(100)
sigma = 0.5
epsilon = torch.randn_like(x_clean)
x_noisy = x_clean + sigma * epsilon

# DSM target: -(x_noisy - x_clean) / sigma^2
dsm_target = -(x_noisy - x_clean) / sigma**2

# Noise prediction target (DDPM style): epsilon
noise_target = epsilon

# These are related by: dsm_target = -noise_target / sigma
reconstructed = -noise_target / sigma

# (x_clean + sigma * epsilon) - x_clean only recovers sigma * epsilon up to
# float32 round-off. For targets close to zero that error exceeds allclose's
# default tolerances, so compare with an explicit absolute tolerance.
targets_match = torch.allclose(dsm_target, reconstructed, atol=1e-5)
print(f"DSM target and -epsilon/sigma match: {targets_match}")
print("\nThis confirms: learning the score function is the SAME as")
print("learning to predict the noise -- the foundation of DDPM!")

In [None]:
#@title üéß What to Look For: Final Output Viz
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_21_final_output_viz.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 8. Final Output

In [None]:
# Grand finale: 4-panel summary of the entire pipeline
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Panel 1: True data
true_data = sample_mixture_of_gaussians(2000)
axes[0, 0].scatter(true_data[:, 0], true_data[:, 1], s=3, alpha=0.3, c='steelblue')
axes[0, 0].set_title('1. True Data Distribution', fontsize=13)

# Panel 2: Learned score field
n_g = 25
g = torch.linspace(-4, 4, n_g)
G1, G2 = torch.meshgrid(g, g, indexing='ij')
gp = torch.stack([G1.flatten(), G2.flatten()], dim=-1)
with torch.no_grad():
    sc = model_dsm(gp)
axes[0, 1].quiver(G1.numpy(), G2.numpy(),
                   sc[:, 0].reshape(n_g, n_g).numpy(),
                   sc[:, 1].reshape(n_g, n_g).numpy(),
                   color='darkblue', scale=80, width=0.003)
axes[0, 1].set_title('2. Learned Score Field (DSM)', fontsize=13)

# Panel 3: Sampling trajectories
samples_final, trajs_final = langevin_sample(model_dsm, n_samples=500, n_steps=1000)
# Plot each recorded chain once (at most 8). Clamping the index instead would
# redraw the last recorded chain for every index past the number recorded.
n_traj = min(8, trajs_final[0].shape[0])
for i in range(n_traj):
    tx = [t[i, 0].item() for t in trajs_final]
    ty = [t[i, 1].item() for t in trajs_final]
    axes[1, 0].plot(tx, ty, '-', alpha=0.4, linewidth=1)
axes[1, 0].set_title('3. Langevin Sampling Trajectories', fontsize=13)

# Panel 4: Generated vs true overlay
axes[1, 1].scatter(true_data[:, 0], true_data[:, 1], s=3, alpha=0.15, c='steelblue', label='True')
axes[1, 1].scatter(samples_final[:, 0].numpy(), samples_final[:, 1].numpy(),
                    s=3, alpha=0.3, c='coral', label='Generated')
axes[1, 1].legend(fontsize=11)
axes[1, 1].set_title('4. Generated vs True Data', fontsize=13)

# Same aspect ratio, limits and grid on every panel so they are comparable
for ax in axes.flat:
    ax.set_aspect('equal')
    ax.set_xlim(-5, 5)
    ax.set_ylim(-4, 4)
    ax.grid(True, alpha=0.2)

plt.suptitle('Score-Based Generative Modeling: Complete Pipeline', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

print("\nSummary:")
print("1. We trained a neural network to estimate the score function using DSM")
print("2. DSM only requires adding noise and learning to undo it")
print("3. We sampled new data using Langevin dynamics with the learned score")
print("4. This is the foundation of modern diffusion models!")

In [None]:
#@title üéß Wrap-Up: Reflection
from IPython.display import Audio, display
import os as _os
_f = "/content/narration/03_22_reflection.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")


## 9. Reflection and Next Steps

### Think About These Questions:
1. The DSM loss uses a single noise level $\sigma$. What happens if $\sigma$ is too small? Too large? (Hint: think about the bias-variance tradeoff)
2. How does this connect to DDPM, which uses MANY noise levels? What advantage does multi-scale noise provide?
3. Computing the Jacobian trace in the tractable score matching loss takes $O(D)$ backward passes per sample. For a 256x256 RGB image ($D = 256 \times 256 \times 3 = 196608$), how many times slower is this compared to DSM?
4. Can you think of distributions where Langevin dynamics would struggle, even with a perfect score function?

### What's Next
The next notebook brings everything together: we will apply the full pipeline (DSM + Langevin) to a more complex dataset and explore what happens when we use multiple noise scales -- the bridge to modern diffusion models.