In [None]:
#@title 🎧 Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="18Lc8XC_lV-uzRcNcgg-LLZXfkKSQ0vDX", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/00_intro.mp3"))

In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Select the compute device and report which hardware was found
_cuda_ok = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_ok else 'cpu')
if _cuda_ok:
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("‚ö†Ô∏è No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

# Report interpreter and framework versions
print(f"\nüì¶ Python {sys.version.split()[0]}")
print(f"üî• PyTorch {torch.__version__}")

# Seed every random number generator the notebook relies on
import random
import numpy as np

SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"üé≤ Random seed set to {SEED}")

%matplotlib inline

# 🚀 Video Diffusion from Scratch: Generating Moving Digits

*Part 1 of the Vizuara series on Diffusion Models for Video Generation*
*Estimated time: 45 minutes*

In [None]:
# 🔧 Setup — Run this cell first
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import math

%matplotlib inline

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Fix the PyTorch and NumPy seeds so reruns produce the same data and weights
for _seed_fn in (torch.manual_seed, np.random.seed):
    _seed_fn(42)

In [None]:
#@title 🎧 Listen: Why It Matters
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/01_why_it_matters.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 1. Why Does This Matter?

We have seen how diffusion models can generate stunning **images** from pure noise. But what about **video**?

A video is just a sequence of images played in rapid succession. If we can generate one image from noise, surely we can generate a *sequence* of images — a video — from noise too?

That is exactly what we will build in this notebook. By the end, you will have trained a neural network that takes **pure random noise** and transforms it, step by step, into a coherent video of a digit moving across the screen.

Let us start by creating our dataset.

In [None]:
#@title 🎧 Listen: Building Intuition
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/02_building_intuition.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 2. Building Intuition — What Is a Video, Really?

Think of a flipbook. Each page has a slightly different drawing, and when you flip fast enough, it comes to life as smooth motion.

A video works the same way. It is a **4D tensor** with shape `(T, C, H, W)`:
- **T** = number of frames (time)
- **C** = channels (1 for grayscale, 3 for RGB)
- **H, W** = height and width of each frame

The key insight for video diffusion: we treat this entire 4D tensor as a *single data point*. We add noise to the entire video at once, and we train a network to denoise the entire video at once. This forces the network to learn not just what individual frames look like, but how they relate to each other over time.

### 🤔 Think About This

If we generated each frame independently using an image diffusion model, what would happen? Why can't we just run an image model 16 times and stitch the results together?

*Hint: Think about what "independently sampled" means for consistency between frames.*

In [None]:
#@title 🎧 Listen: Creating Dataset
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/03_creating_dataset.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 3. Creating Our Dataset: Moving MNIST

We will create a simple but powerful dataset: MNIST digits that **move** across a canvas. Each video shows a digit translating smoothly from one position to another.

In [None]:
from torchvision import datasets, transforms

# Fetch the MNIST training split (downloaded into ./data on first use) as tensors
mnist = datasets.MNIST(
    "./data",
    download=True,
    train=True,
    transform=transforms.ToTensor(),
)

print(
    f"MNIST loaded: {len(mnist)} images",
    f"Image shape: {mnist[0][0].shape}",
    sep="\n",
)

In [None]:
def create_moving_mnist_video(digit_img, canvas_size=32, num_frames=16, digit_size=14):
    """
    Create a video of a single MNIST digit moving across a canvas.

    The digit is resized to ``digit_size`` x ``digit_size`` and translated in a
    straight line between a random start and a random end position, both drawn
    from NumPy's global random state.

    Args:
        digit_img: (1, 28, 28) MNIST digit tensor
        canvas_size: size of the video canvas (square)
        num_frames: number of frames in the video
        digit_size: side length the digit is resized to before being placed
            on the canvas (default 14, the size previously hard-coded)

    Returns:
        video: (num_frames, 1, canvas_size, canvas_size) tensor

    Raises:
        ValueError: if the resized digit does not fit on the canvas.
    """
    if digit_size > canvas_size:
        raise ValueError(
            f"digit_size ({digit_size}) must not exceed canvas_size ({canvas_size})"
        )

    # Resize digit to fit on canvas
    digit = F.interpolate(digit_img.unsqueeze(0), size=digit_size,
                          mode="bilinear", align_corners=False)[0]
    dh, dw = digit.shape[1], digit.shape[2]

    # Largest top-left offsets that keep the digit fully inside the canvas
    max_y = canvas_size - dh
    max_x = canvas_size - dw

    # np.random.randint excludes its upper bound, so add 1 to make the far edge
    # reachable; this also keeps the call valid when the digit fills the canvas
    # (max offset 0), which previously raised ValueError.
    start_y = np.random.randint(0, max_y + 1)
    start_x = np.random.randint(0, max_x + 1)
    end_y = np.random.randint(0, max_y + 1)
    end_x = np.random.randint(0, max_x + 1)

    video = torch.zeros(num_frames, 1, canvas_size, canvas_size)

    for t in range(num_frames):
        # Linear interpolation of position; max(..., 1) guards single-frame videos
        frac = t / max(num_frames - 1, 1)
        y = int(start_y + frac * (end_y - start_y))
        x = int(start_x + frac * (end_x - start_x))
        video[t, :, y:y+dh, x:x+dw] = digit

    return video


def create_dataset(num_videos=2000, num_frames=16, canvas_size=32):
    """
    Build a stack of moving-digit videos from randomly chosen MNIST images.

    Each video uses one digit sampled (with replacement) from the module-level
    `mnist` dataset and is rendered by `create_moving_mnist_video`.

    Returns:
        Tensor of shape (num_videos, num_frames, 1, canvas_size, canvas_size).
    """
    clips = []
    for _ in range(num_videos):
        source_idx = np.random.randint(0, len(mnist))
        image, _label = mnist[source_idx]  # image: (1, 28, 28)
        clips.append(create_moving_mnist_video(image, canvas_size, num_frames))
    return torch.stack(clips)  # (N, T, 1, H, W)


# Build the training set and summarise its dimensions
NUM_FRAMES, CANVAS_SIZE = 16, 32
dataset = create_dataset(
    num_frames=NUM_FRAMES,
    canvas_size=CANVAS_SIZE,
    num_videos=2000,
)
print(
    f"Dataset shape: {dataset.shape}",
    f"  {dataset.size(0)} videos, {dataset.size(1)} frames each",
    f"  Resolution: {dataset.size(3)}x{dataset.size(4)}",
    f"  Total values per video: {dataset[0].numel():,}",
    sep="\n",
)

In [None]:
#@title 🎧 Listen: Dataset Visualization
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/04_dataset_visualization.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

In [None]:
# Preview the first four videos: one row per video, one column per frame
fig, axes = plt.subplots(4, NUM_FRAMES, figsize=(20, 5))
for row, frame_axes in enumerate(axes):
    for t, ax in enumerate(frame_axes):
        ax.imshow(dataset[row, t, 0].numpy(), cmap="gray", vmin=0, vmax=1)
        ax.axis("off")
        # Frame indices are only labelled along the top row
        if row == 0:
            ax.set_title(f"t={t}", fontsize=8)
fig.suptitle("Sample Videos ‚Äî Moving MNIST", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Listen: Math
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/05_math.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 4. The Mathematics of Video Diffusion

Now let us formalize the diffusion process for video. This is **exactly** the same math as image diffusion — the only difference is that our data point $\mathbf{v}$ is a 4D video tensor `(T, C, H, W)` instead of a 3D image tensor `(C, H, W)`.

### Forward Process: Adding Noise

At each diffusion timestep $t$, we add Gaussian noise:

$$q(\mathbf{v}_t \mid \mathbf{v}_{t-1}) = \mathcal{N}(\mathbf{v}_t;\, \sqrt{1-\beta_t}\,\mathbf{v}_{t-1},\, \beta_t\,\mathbf{I})$$

Computationally, this means: scale the video down by $\sqrt{1-\beta_t}$ and add noise with standard deviation $\sqrt{\beta_t}$.

Using the reparameterization trick, we can jump directly to any timestep $t$:

$$\mathbf{v}_t = \sqrt{\bar{\alpha}_t}\,\mathbf{v}_0 + \sqrt{1-\bar{\alpha}_t}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(0, \mathbf{I})$$

where $\bar{\alpha}_t = \prod_{s=1}^{t}(1-\beta_s)$.

This is exactly what we want — a single formula to noise a clean video to any level.

In [None]:
#@title 🎧 Listen: Noise Schedule
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/06_noise_schedule.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 5. Let's Build It — Component by Component

### 5.1 The Noise Schedule

The noise schedule $\{\beta_t\}$ controls how quickly we add noise. We will use a cosine schedule, which adds noise more gradually than a linear schedule.

In [None]:
def cosine_beta_schedule(num_timesteps, s=0.008):
    """
    Cosine noise schedule (Nichol & Dhariwal, 2021).

    Evaluates a squared-cosine curve on num_timesteps + 1 evenly spaced points,
    normalises it by its first value to get the cumulative signal level, and
    converts consecutive ratios into per-step betas.

    Args:
        num_timesteps: number of diffusion steps
        s: small offset applied inside the cosine

    Returns:
        Tensor of num_timesteps betas, clamped to [0.0001, 0.999].
    """
    grid = torch.linspace(0, num_timesteps, num_timesteps + 1) / num_timesteps
    angle = (grid + s) / (1 + s) * math.pi / 2
    curve = torch.cos(angle) ** 2
    alpha_bar = curve / curve[0]
    step_ratio = alpha_bar[1:] / alpha_bar[:-1]
    return (1 - step_ratio).clamp(0.0001, 0.999)


# Precompute every schedule tensor the forward and reverse processes index into.
# All of them live on `device` and have one entry per diffusion timestep.
NUM_DIFFUSION_STEPS = 200
betas = cosine_beta_schedule(NUM_DIFFUSION_STEPS).to(device)
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)  # alpha_bar_t
sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)

# Report the signal level at the start, middle and end of the schedule. The
# indices are derived from NUM_DIFFUSION_STEPS so the report stays valid (and
# correctly labelled) if the number of steps is changed.
print(f"Noise schedule: {NUM_DIFFUSION_STEPS} timesteps")
print(f"  alpha_bar at t=0:   {alphas_cumprod[0]:.4f} (almost no noise)")
print(f"  alpha_bar at t={NUM_DIFFUSION_STEPS // 2}: {alphas_cumprod[NUM_DIFFUSION_STEPS // 2]:.4f} (moderate noise)")
print(f"  alpha_bar at t={NUM_DIFFUSION_STEPS - 1}: {alphas_cumprod[-1]:.4f} (almost pure noise)")

In [None]:
# Plot the schedule: remaining signal on the left, noise level on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
timesteps = range(NUM_DIFFUSION_STEPS)

_panels = (
    (ax1, alphas_cumprod, "·æ±_t (signal remaining)", "Cumulative Signal Retention"),
    (ax2, sqrt_one_minus_alphas_cumprod, "‚àö(1-·æ±_t) (noise level)", "Noise Level Over Time"),
)
for _ax, _curve, _ylabel, _title in _panels:
    _ax.plot(timesteps, _curve.cpu().numpy())
    _ax.set_xlabel("Diffusion Timestep t")
    _ax.set_ylabel(_ylabel)
    _ax.set_title(_title)
    _ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Listen: Forward Process
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/07_forward_process.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

### 5.2 The Forward Noising Process

Now let us implement the forward process. Given a clean video $\mathbf{v}_0$ and a timestep $t$, we compute the noisy video $\mathbf{v}_t$ and the noise $\boldsymbol{\epsilon}$ that was added.

In [None]:
def forward_diffusion(v_0, t, noise=None):
    """
    Add noise to a clean video at timestep t (closed-form forward process).

    Uses the module-level schedule tensors `sqrt_alphas_cumprod` and
    `sqrt_one_minus_alphas_cumprod`, indexed by `t`.

    Args:
        v_0: clean tensor with a leading batch dimension, e.g. a video batch
            of shape (B, T, C, H, W). Any rank is accepted; the per-sample
            schedule values are broadcast over all non-batch dimensions.
        t: timestep tensor, shape (B,)
        noise: optional pre-sampled noise with the same shape as v_0. When
            omitted, standard Gaussian noise is drawn. Passing it makes the
            result deterministic, which is useful for checks and debugging.

    Returns:
        v_t: noisy video at timestep t
        noise: the noise that was added (our training target)
    """
    if noise is None:
        noise = torch.randn_like(v_0)

    # One schedule value per sample, reshaped to (B, 1, ..., 1) so it broadcasts
    # against v_0 whatever its rank (previously hard-wired to 5-D input).
    bcast_shape = (-1,) + (1,) * (v_0.dim() - 1)
    sqrt_alpha = sqrt_alphas_cumprod[t].view(bcast_shape)
    sqrt_one_minus_alpha = sqrt_one_minus_alphas_cumprod[t].view(bcast_shape)

    # v_t = sqrt(alpha_bar_t) * v_0 + sqrt(1 - alpha_bar_t) * noise
    v_t = sqrt_alpha * v_0 + sqrt_one_minus_alpha * noise

    return v_t, noise

In [None]:
# Visualize the forward noising process on a single video:
# one row per diffusion timestep, one column per video frame.
sample_video = dataset[0:1].to(device)  # (1, T, 1, H, W)

# Timesteps are expressed relative to NUM_DIFFUSION_STEPS so the figure stays
# valid if the schedule length changes (for 200 steps: 0, 25, 75, 150, 199).
timesteps_to_show = [
    0,
    NUM_DIFFUSION_STEPS // 8,
    3 * NUM_DIFFUSION_STEPS // 8,
    3 * NUM_DIFFUSION_STEPS // 4,
    NUM_DIFFUSION_STEPS - 1,
]

# squeeze=False keeps `axes` two-dimensional even for a single row or column
fig, axes = plt.subplots(len(timesteps_to_show), NUM_FRAMES,
                         figsize=(20, 6), squeeze=False)

for row, ts in enumerate(timesteps_to_show):
    t_tensor = torch.tensor([ts], device=device)
    noisy, _ = forward_diffusion(sample_video, t_tensor)
    for col in range(NUM_FRAMES):
        ax = axes[row, col]
        ax.imshow(noisy[0, col, 0].cpu().numpy(), cmap="gray")
        # Hide ticks and spines instead of calling axis("off"): turning the
        # axis off also suppresses the y-label, so the row labels never showed.
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        if col == 0:
            ax.set_ylabel(f"t={ts}", fontsize=10)

fig.suptitle("Forward Diffusion: Clean Video ‚Üí Pure Noise", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Listen: Denoising Network
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/08_denoising_network.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

This is exactly what we want. At $t=0$, the video is almost perfectly clean (only a tiny amount of noise has been added). As $t$ increases, the noise gradually destroys the signal. By $t=199$, the video is indistinguishable from random noise.

Now we need a neural network that can **reverse** this process — take a noisy video and predict the noise that was added.

### 5.3 The Denoising Network: A Simple Video U-Net

We will build a small 3D U-Net that processes the entire video tensor jointly. It takes the noisy video and the timestep as input, and outputs the predicted noise.

The key idea: we use **3D convolutions** that operate across both space and time, so the network naturally learns spatial-temporal patterns.

In [None]:
class SinusoidalTimeEmbedding(nn.Module):
    """
    Encode the diffusion timestep as a vector of sines and cosines.

    The first half of the output holds sin(t * f_k) and the second half
    cos(t * f_k), with frequencies f_k decaying geometrically from 1 towards
    1/10000. For even ``dim`` >= 4 the output is identical to the original
    implementation.

    Args:
        dim: size of the embedding vector produced for each timestep.
    """
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        """
        Args:
            t: timestep tensor of shape (B,)

        Returns:
            Tensor of shape (B, dim).
        """
        half_dim = self.dim // 2
        # max(..., 1) avoids a division by zero when dim is 2 or 3 (half_dim == 1)
        decay = math.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, device=t.device) * -decay)
        angles = t.float().unsqueeze(1) * freqs.unsqueeze(0)
        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        # For odd dim the sin/cos halves only fill dim - 1 columns; zero-pad the
        # last column so the output width always matches `dim`.
        if self.dim % 2 == 1:
            emb = F.pad(emb, (0, 1))
        return emb

In [None]:
class ResBlock3D(nn.Module):
    """
    Two 3x3x3 convolutions with a residual shortcut and timestep conditioning.

    Each convolution is followed by SiLU and then GroupNorm (8 groups). A linear
    projection of the timestep embedding is added per channel between the two
    convolutions. The shortcut is a 1x1x1 convolution when the channel count
    changes and an identity otherwise.
    """
    def __init__(self, in_ch, out_ch, time_dim):
        super().__init__()
        # Layers are created in the same order as before so that seeded weight
        # initialisation is unchanged.
        self.conv1 = nn.Conv3d(in_ch, out_ch, kernel_size=3, padding=1)
        self.conv2 = nn.Conv3d(out_ch, out_ch, kernel_size=3, padding=1)
        self.time_mlp = nn.Linear(time_dim, out_ch)
        self.norm1 = nn.GroupNorm(8, out_ch)
        self.norm2 = nn.GroupNorm(8, out_ch)
        if in_ch == out_ch:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Conv3d(in_ch, out_ch, 1)

    def forward(self, x, t_emb):
        shortcut = self.skip(x)

        out = self.conv1(x)
        out = self.norm1(F.silu(out))

        # Inject the timestep: one scalar shift per (sample, channel)
        shift = self.time_mlp(F.silu(t_emb))
        out = out + shift[:, :, None, None, None]

        out = self.conv2(out)
        out = self.norm2(F.silu(out))
        return out + shortcut

In [None]:
class SimpleVideoUNet(nn.Module):
    """
    A small 3D U-Net for video noise prediction.

    Takes: noisy video (B, T, C, H, W) + timestep (B,)
    Returns: predicted noise with the same shape as the input video

    The network downsamples twice with stride-2 3D convolutions and upsamples
    twice with stride-2 transposed convolutions, concatenating encoder features
    onto the decoder (skip connections). Because both time and space are halved
    twice, T, H and W must each be divisible by 4.

    Args:
        channels: number of input/output channels per frame
        base_dim: channel width of the first encoder stage (doubled once)
        time_dim: size of the timestep embedding
    """
    def __init__(self, channels=1, base_dim=32, time_dim=64):
        super().__init__()
        self.time_embed = nn.Sequential(
            SinusoidalTimeEmbedding(time_dim),
            nn.Linear(time_dim, time_dim),
            nn.SiLU(),
        )

        # Encoder: (B, C, T, H, W) -> (B, base, T, H, W) -> (B, 2*base, T/2, H/2, W/2)
        self.enc1 = ResBlock3D(channels, base_dim, time_dim)
        self.enc2 = ResBlock3D(base_dim, base_dim * 2, time_dim)
        self.down = nn.Conv3d(base_dim, base_dim, 3, stride=2, padding=1)
        self.down2 = nn.Conv3d(base_dim * 2, base_dim * 2, 3, stride=2, padding=1)

        # Bottleneck
        self.mid = ResBlock3D(base_dim * 2, base_dim * 2, time_dim)

        # Decoder: mirror of encoder with skip connections
        self.up2 = nn.ConvTranspose3d(base_dim * 2, base_dim * 2,
                                       4, stride=2, padding=1)
        self.dec2 = ResBlock3D(base_dim * 4, base_dim, time_dim)
        self.up1 = nn.ConvTranspose3d(base_dim, base_dim,
                                       4, stride=2, padding=1)
        self.dec1 = ResBlock3D(base_dim * 2, base_dim, time_dim)

        self.out_conv = nn.Conv3d(base_dim, channels, 1)

    def forward(self, v, t):
        """
        Args:
            v: noisy video, shape (B, T, C, H, W)
            t: diffusion timesteps, shape (B,)

        Returns:
            Predicted noise, shape (B, T, C, H, W).

        Raises:
            ValueError: if T, H or W is not divisible by 4.
        """
        _, T, _, H, W = v.shape
        # Each stride-2 stage maps n -> ceil(n / 2) and each transposed conv maps
        # n -> 2n, so the skip concatenations only line up when every downsampled
        # dimension is even at both levels. Fail early with a clear message
        # rather than with a size mismatch inside torch.cat.
        if T % 4 or H % 4 or W % 4:
            raise ValueError(
                f"frames, height and width must all be divisible by 4, got T={T}, H={H}, W={W}"
            )

        # Conv3d expects channels first: (B, T, C, H, W) -> (B, C, T, H, W)
        x = v.permute(0, 2, 1, 3, 4)

        t_emb = self.time_embed(t)

        # Encoder
        h1 = self.enc1(x, t_emb)         # (B, base, T, H, W)
        h1_down = self.down(h1)          # (B, base, T/2, H/2, W/2)
        h2 = self.enc2(h1_down, t_emb)   # (B, 2*base, T/2, H/2, W/2)
        h2_down = self.down2(h2)         # (B, 2*base, T/4, H/4, W/4)

        # Bottleneck
        mid = self.mid(h2_down, t_emb)

        # Decoder with skip connections
        up2 = self.up2(mid)                  # (B, 2*base, T/2, H/2, W/2)
        up2 = torch.cat([up2, h2], dim=1)    # (B, 4*base, ...)
        d2 = self.dec2(up2, t_emb)           # (B, base, T/2, H/2, W/2)

        up1 = self.up1(d2)                   # (B, base, T, H, W)
        up1 = torch.cat([up1, h1], dim=1)    # (B, 2*base, ...)
        d1 = self.dec1(up1, t_emb)           # (B, base, T, H, W)

        out = self.out_conv(d1)              # (B, C, T, H, W)
        return out.permute(0, 2, 1, 3, 4)    # back to (B, T, C, H, W)


# Instantiate the denoiser and report its size
model = SimpleVideoUNet(base_dim=32, time_dim=64, channels=1).to(device)
num_params = sum(map(torch.numel, model.parameters()))
print(f"Model parameters: {num_params:,}")

# Sanity check: the predicted noise must have the same shape as the input video
test_v = torch.randn(2, NUM_FRAMES, 1, CANVAS_SIZE, CANVAS_SIZE, device=device)
test_t = torch.randint(0, NUM_DIFFUSION_STEPS, (2,), device=device)
test_out = model(test_v, test_t)
print(
    f"Input shape:  {test_v.shape}",
    f"Output shape: {test_out.shape}",
    sep="\n",
)
assert test_v.shape == test_out.shape, "Shape mismatch!"
print("‚úÖ Model works correctly!")

In [None]:
#@title 🎧 Listen: Todo Training
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/09_todo_training.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 6. 🔧 Your Turn — Implement the Training Loop

Now it is your turn. The training objective is simple: predict the noise that was added.

$$\mathcal{L} = \mathbb{E}_{\mathbf{v}_0, \boldsymbol{\epsilon}, t}\left[\|\boldsymbol{\epsilon} - \boldsymbol{\epsilon}_\theta(\mathbf{v}_t, t)\|^2\right]$$

Computationally: sample a clean video, sample a random timestep, add noise, predict the noise, and minimize the MSE between predicted and actual noise.

In [None]:
def train_one_step(model, optimizer, batch):
    """
    Perform one training step of video diffusion.

    EXERCISE: the four `???` placeholders below are intentional and must be
    replaced before this cell will run; until then the cell does not parse and
    `train_one_step` is not defined.

    Args:
        model: the denoising network
        optimizer: the optimizer
        batch: clean video batch, shape (B, T, C, H, W)

    Returns:
        loss value (float)
    """
    # ============ TODO ============
    # Step 1: Sample random timesteps for each video in the batch
    #         (integers from 0 to NUM_DIFFUSION_STEPS-1)
    # Step 2: Use forward_diffusion() to get noisy videos and target noise
    # Step 3: Predict the noise using the model
    # Step 4: Compute MSE loss between predicted and actual noise
    # Step 5: Backpropagate and update weights
    # ==============================

    t = ???  # YOUR CODE HERE - random timesteps, shape (B,)
    noisy_video, noise = ???  # YOUR CODE HERE - forward diffusion
    predicted_noise = ???  # YOUR CODE HERE - model prediction

    loss = ???  # YOUR CODE HERE - MSE loss

    # Step 5 is already provided: clear old gradients, backpropagate, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() converts the scalar loss tensor to a plain Python float
    return loss.item()

In [None]:
# Verification: run a single training step on a throwaway model and check that
# the returned loss is a float in a plausible range
test_batch = dataset[:4].to(device)
test_model = SimpleVideoUNet(channels=1, base_dim=32, time_dim=64).to(device)
test_opt = torch.optim.Adam(test_model.parameters(), lr=1e-3)

try:
    test_loss = train_one_step(test_model, test_opt, test_batch)
    assert isinstance(test_loss, float), "Loss should be a float"
    assert 0 < test_loss < 10, f"Loss {test_loss:.4f} seems wrong (expected 0-10 range)"
except Exception as e:
    print(
        f"‚ùå Error: {e}",
        "\nHint: t should be torch.randint(0, NUM_DIFFUSION_STEPS, (B,), device=device)",
        "Hint: Use F.mse_loss(predicted_noise, noise)",
        sep="\n",
    )
else:
    print(f"‚úÖ Training step works! Loss = {test_loss:.4f}")

# The throwaway model and optimizer are no longer needed
del test_model, test_opt

In [None]:
#@title 🎧 Listen: Training
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/10_training.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 7. Training the Model

Now let us train our video diffusion model. With our small dataset and model, this should take about 5 minutes on a T4 GPU.

In [None]:
# Training configuration
BATCH_SIZE, NUM_EPOCHS = 16, 30
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

# Shuffled mini-batches; the final incomplete batch of each epoch is dropped
train_loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    drop_last=True,
    batch_size=BATCH_SIZE,
)

# `losses` collects the mean training loss of every epoch
losses = []
for epoch in range(NUM_EPOCHS):
    epoch_losses = []
    for batch in train_loader:
        batch = batch.to(device)
        loss = train_one_step(model, optimizer, batch)
        epoch_losses += [loss]
    avg_loss = np.mean(epoch_losses)
    losses += [avg_loss]
    # Progress report every fifth epoch
    if not (epoch + 1) % 5:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} ‚Äî Loss: {avg_loss:.4f}")

print("\n‚úÖ Training complete!")

In [None]:
# Plot the mean training loss of each epoch
_loss_fig, _loss_ax = plt.subplots(figsize=(8, 4))
_loss_ax.plot(losses, linewidth=2)
_loss_ax.set_xlabel("Epoch")
_loss_ax.set_ylabel("MSE Loss")
_loss_ax.set_title("Training Loss ‚Äî Video Diffusion Model")
_loss_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Listen: Todo Sampling
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/11_todo_sampling.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 8. Sampling — Generating Videos from Noise

Now for the exciting part: generating videos from pure noise using the trained model!

The sampling process reverses the forward diffusion. Starting from pure noise $\mathbf{v}_T \sim \mathcal{N}(0, \mathbf{I})$, we iteratively denoise using our trained model:

$$\mathbf{v}_{t-1} = \frac{1}{\sqrt{\alpha_t}}\left(\mathbf{v}_t - \frac{\beta_t}{\sqrt{1-\bar{\alpha}_t}}\,\boldsymbol{\epsilon}_\theta(\mathbf{v}_t, t)\right) + \sigma_t \mathbf{z}$$

where $\mathbf{z} \sim \mathcal{N}(0, \mathbf{I})$ and $\sigma_t = \sqrt{\beta_t}$.

### 🔧 TODO: Implement the DDPM Sampling Step

The sampling process reverses the forward diffusion using the formula above. Complete the core reverse step inside the loop:

In [None]:
@torch.no_grad()
def sample_videos(model, num_videos=4, num_frames=16, size=32):
    """
    Generate videos by iteratively denoising from pure noise.

    EXERCISE: the `???` placeholders inside the loop are intentional and must
    be replaced before this cell will run; until then the cell does not parse.

    Args:
        model: the trained denoising network, called as model(v, t_batch)
        num_videos: number of videos to generate (batch size)
        num_frames: frames per generated video
        size: height and width of each (square) frame

    Returns:
        A tuple (videos, intermediates):
            videos: CPU tensor of shape (num_videos, num_frames, 1, size, size)
            intermediates: list of CPU tensors of that same shape - the initial
                noise followed by a snapshot after every reverse step whose
                index t is a multiple of 40

    Note:
        Runs under torch.no_grad(), puts the model into eval mode for the
        duration of sampling and always switches it back to train mode before
        returning, whatever mode it was in when called.
    """
    model.eval()
    shape = (num_videos, num_frames, 1, size, size)

    # Start from pure noise, created on the notebook-level `device`
    v = torch.randn(shape, device=device)

    # Store intermediate steps for visualization
    intermediates = [v.cpu().clone()]

    # Walk the diffusion timesteps backwards: NUM_DIFFUSION_STEPS-1, ..., 1, 0
    for t in reversed(range(NUM_DIFFUSION_STEPS)):
        # The same timestep for every video in the batch
        t_batch = torch.full((num_videos,), t, device=device, dtype=torch.long)

        # ============ TODO ============
        # Step 1: Use the model to predict the noise: predicted_noise = model(v, t_batch)
        # Step 2: Get alpha_t, alpha_bar_t, and beta_t from the schedule
        # Step 3: Compute the denoised mean using the DDPM formula:
        #         v = (1/sqrt(alpha_t)) * (v - (beta_t/sqrt(1-alpha_bar_t)) * predicted_noise)
        # Step 4: If t > 0, add noise: v = v + sqrt(beta_t) * random_noise
        # ==============================

        predicted_noise = ???  # YOUR CODE HERE (Step 1)
        alpha_t = ???  # YOUR CODE HERE (Step 2)
        alpha_bar_t = ???  # YOUR CODE HERE (Step 2)
        beta_t = ???  # YOUR CODE HERE (Step 2)

        v = ???  # YOUR CODE HERE (Step 3 - DDPM mean formula)

        # No noise is added on the final step (t == 0)
        if t > 0:
            v = ???  # YOUR CODE HERE (Step 4 - add noise)

        # Save snapshots (t == 0 is already covered by t % 40 == 0)
        if t % 40 == 0 or t == 0:
            intermediates.append(v.cpu().clone())

    model.train()
    return v.cpu(), intermediates

In [None]:
# Verification: sample a single video and check its shape and that intermediate
# snapshots were recorded
try:
    test_gen, test_inters = sample_videos(
        model,
        num_videos=1,
        num_frames=NUM_FRAMES,
        size=CANVAS_SIZE,
    )
    assert test_gen.shape == (1, NUM_FRAMES, 1, CANVAS_SIZE, CANVAS_SIZE), f"Wrong shape: {test_gen.shape}"
    assert len(test_inters) > 1, "Should have intermediate snapshots"
except Exception as e:
    print(
        f"‚ùå Error: {e}",
        "Hint: alpha_t = alphas[t], alpha_bar_t = alphas_cumprod[t], beta_t = betas[t]",
        "Hint: For Step 4, use torch.randn_like(v)",
        sep="\n",
    )
else:
    print(
        f"‚úÖ Sampling works! Generated shape: {test_gen.shape}",
        f"   Saved {len(test_inters)} intermediate snapshots",
        sep="\n",
    )

In [None]:
#@title 🎧 Listen: Results
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/12_results.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

In [None]:
# Generate a batch of videos from pure noise with the trained model.
# The frame count and resolution are passed explicitly so the samples always
# match NUM_FRAMES / CANVAS_SIZE, which the plotting cells below index by,
# instead of relying on sample_videos' built-in defaults.
print("Generating videos from pure noise...")
generated_videos, intermediates = sample_videos(
    model, num_videos=8, num_frames=NUM_FRAMES, size=CANVAS_SIZE
)
print(f"Generated {generated_videos.shape[0]} videos!")
print(f"Shape: {generated_videos.shape}")

In [None]:
# Visualize the denoising process for the first generated video:
# one row per saved snapshot, one column per frame.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(len(intermediates), NUM_FRAMES,
                         figsize=(20, 2 * len(intermediates)), squeeze=False)

# Row labels must mirror what sample_videos stores: the initial noise first,
# then one snapshot after every reverse step t with t % 40 == 0
# (160, 120, 80, 40, 0 for 200 steps). The earlier labels (199, 159, 119, 79)
# did not correspond to the snapshots that were actually saved.
titles = [f"t={NUM_DIFFUSION_STEPS}"] + [
    f"t={t}" for t in reversed(range(NUM_DIFFUSION_STEPS)) if t % 40 == 0
]

for row, (snap, title) in enumerate(zip(intermediates, titles)):
    for col in range(NUM_FRAMES):
        ax = axes[row, col]
        ax.imshow(snap[0, col, 0].clamp(0, 1).numpy(),
                  cmap="gray", vmin=0, vmax=1)
        # Hide ticks and spines instead of calling axis("off"): turning the
        # axis off also suppresses the y-label, so the row labels never showed.
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        if col == 0:
            ax.set_ylabel(title, fontsize=10)

fig.suptitle("Reverse Diffusion: Noise ‚Üí Video", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
#@title 🎧 Listen: Final Gallery
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/13_final_gallery.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 9. 🎯 Final Output — Generated Video Gallery

In [None]:
# Show each generated video as a static montage: its frames are tiled into a
# 4x4 grid, one panel per video
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

for i in range(8):
    # Frames of video i, clipped to the displayable [0, 1] range
    frames = generated_videos[i, :, 0].clamp(0, 1).numpy()
    montage = np.zeros((4 * CANVAS_SIZE, 4 * CANVAS_SIZE))
    # Place up to 16 frames row by row; unused tiles stay black
    for idx in range(min(16, NUM_FRAMES)):
        r, c = divmod(idx, 4)
        top, left = r * CANVAS_SIZE, c * CANVAS_SIZE
        montage[top:top + CANVAS_SIZE, left:left + CANVAS_SIZE] = frames[idx]
    panel = axes[i]
    panel.imshow(montage, cmap="gray", vmin=0, vmax=1)
    panel.set_title(f"Video {i+1}", fontsize=11)
    panel.axis("off")

fig.suptitle("üéâ Generated Videos ‚Äî Each 4√ó4 Grid Shows 16 Frames", fontsize=14)
plt.tight_layout()
plt.show()

print(
    "üéâ Congratulations! You've trained a video diffusion model from scratch!",
    "   Each grid above shows a 16-frame video of a moving digit,",
    "   generated entirely from random noise.",
    sep="\n",
)

In [None]:
#@title 🎧 Listen: Reflection
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it
_f = "/content/narration/14_reflection.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 10. Reflection and Next Steps

### 🤔 Reflection Questions

1. **Why 3D convolutions?** Our model used 3D convolutions that process space and time jointly. What are the advantages and disadvantages of this approach? Would it be cheaper to process spatial and temporal dimensions separately?

2. **Temporal coherence:** Look at the generated videos. Are the digits moving smoothly? What would improve temporal consistency?

3. **Scaling up:** Our model had roughly 1.1M parameters (see the count printed when the model was created) and operated on 32×32 grayscale videos. What challenges would we face scaling to 256×256 RGB videos at 30fps?

### 🏆 Optional Challenges

1. **Different motion:** Modify `create_moving_mnist_video` to add rotation or scaling — not just translation. How does this affect generation quality?

2. **Longer videos:** Try generating 32-frame videos instead of 16. Does quality degrade?

3. **Conditional generation:** Add the digit class (0-9) as a conditioning signal. Can the model learn to generate specific digits?

**Next notebook:** We will build the **factorized spatial-temporal attention** mechanism that makes modern video diffusion models so much more powerful and efficient than simple 3D convolutions.

In [None]:
#@title 💬 AI Teaching Assistant — Click ▶ to start
#@markdown This AI chatbot reads your notebook and can answer questions about any concept, code, or exercise.

import json as _json
import requests as _requests
from google.colab import output as _output
from IPython.display import display, HTML as _HTML, Markdown as _Markdown

# --- Read notebook content for context ---
def _get_notebook_context():
    """
    Return the text of this notebook's cells as one string for the chatbot.

    Asks the Colab front end for the notebook JSON, skips cells tagged
    "chatbot" and cells with no content, and joins the rest with "---"
    separators, each prefixed by its upper-cased cell type. Any failure
    (for example when not running inside Colab) yields a fixed fallback string.
    """
    try:
        from google.colab import _message
        response = _message.blocking_request("get_ipynb", request="", timeout_sec=10)
        sections = []
        for cell in response.get("ipynb", {}).get("cells", []):
            text = "".join(cell.get("source", []))
            cell_tags = cell.get("metadata", {}).get("tags", [])
            if "chatbot" in cell_tags:
                continue
            if not text.strip():
                continue
            kind = cell.get("cell_type", "unknown").upper()
            sections.append(f"[{kind}]\n{text}")
        return "\n\n---\n\n".join(sections)
    except Exception:
        return "Notebook content unavailable."

# Snapshot of the notebook text, captured once when this cell runs; later edits
# to the notebook are not reflected until the cell is re-run.
_NOTEBOOK_CONTEXT = _get_notebook_context()
# Running conversation as {'role': ..., 'content': ...} dicts, appended to by
# _notebook_chat.
_CHAT_HISTORY = []
# Endpoint that _notebook_chat POSTs to.
# NOTE(review): up to 100,000 characters of notebook text plus recent chat
# history are sent to this external service - confirm that is acceptable
# before using the assistant with private content.
_API_URL = "https://course-creator-brown.vercel.app/api/chat"

def _notebook_chat(question):
    """
    Send a question to the teaching-assistant endpoint and return its answer.

    The request carries the question, the first 100,000 characters of the
    notebook snapshot and the last ten chat-history entries. On success the
    exchange is appended to `_CHAT_HISTORY`; on any failure an error string is
    returned instead of raising.
    """
    try:
        payload = {
            'question': question,
            'context': _NOTEBOOK_CONTEXT[:100000],
            'history': _CHAT_HISTORY[-10:],
        }
        reply = _requests.post(_API_URL, json=payload, timeout=60).json()
        answer = reply.get('answer', 'Sorry, I could not generate a response.')
        _CHAT_HISTORY.extend([
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer},
        ])
        return answer
    except Exception as e:
        return f'Error connecting to teaching assistant: {str(e)}'

_output.register_callback('notebook_chat', _notebook_chat)

def ask(question):
    """Ask the AI teaching assistant a question about this notebook.

    Sends ``question`` through ``_notebook_chat`` and renders the returned text
    as Markdown in the calling cell's output. Returns nothing.
    """
    display(_Markdown(_notebook_chat(question)))

# Announce readiness and point at both entry points: the widget and ask().
print(
    "\u2705 AI Teaching Assistant is ready!",
    "\U0001f4a1 Use the chat below, or call ask(\'your question\') in any cell.",
    sep="\n",
)

# --- Display chat widget ---
display(_HTML('''<style>
  .vc-wrap{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;max-width:100%;border-radius:16px;overflow:hidden;box-shadow:0 4px 24px rgba(0,0,0,.12);background:#fff;border:1px solid #e5e7eb}
  .vc-hdr{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;padding:16px 20px;display:flex;align-items:center;gap:12px}
  .vc-avatar{width:42px;height:42px;background:rgba(255,255,255,.2);border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:22px}
  .vc-hdr h3{font-size:16px;font-weight:600;margin:0}
  .vc-hdr p{font-size:12px;opacity:.85;margin:2px 0 0}
  .vc-msgs{height:420px;overflow-y:auto;padding:16px;background:#f8f9fb;display:flex;flex-direction:column;gap:10px}
  .vc-msg{display:flex;flex-direction:column;animation:vc-fade .25s ease}
  .vc-msg.user{align-items:flex-end}
  .vc-msg.bot{align-items:flex-start}
  .vc-bbl{max-width:85%;padding:10px 14px;border-radius:16px;font-size:14px;line-height:1.55;word-wrap:break-word}
  .vc-msg.user .vc-bbl{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;border-bottom-right-radius:4px}
  .vc-msg.bot .vc-bbl{background:#fff;color:#1a1a2e;border:1px solid #e8e8e8;border-bottom-left-radius:4px}
  .vc-bbl code{background:rgba(0,0,0,.07);padding:2px 6px;border-radius:4px;font-size:13px;font-family:'Fira Code',monospace}
  .vc-bbl pre{background:#1e1e2e;color:#cdd6f4;padding:12px;border-radius:8px;overflow-x:auto;margin:8px 0;font-size:13px}
  .vc-bbl pre code{background:none;padding:0;color:inherit}
  .vc-bbl h3,.vc-bbl h4{margin:10px 0 4px;font-size:15px}
  .vc-bbl ul,.vc-bbl ol{margin:4px 0;padding-left:20px}
  .vc-bbl li{margin:2px 0}
  .vc-chips{display:flex;flex-wrap:wrap;gap:8px;padding:0 16px 12px;background:#f8f9fb}
  .vc-chip{background:#fff;border:1px solid #d1d5db;border-radius:20px;padding:6px 14px;font-size:12px;cursor:pointer;transition:all .15s;color:#4b5563}
  .vc-chip:hover{border-color:#667eea;color:#667eea;background:#f0f0ff}
  .vc-input{display:flex;padding:12px 16px;background:#fff;border-top:1px solid #eee;gap:8px}
  .vc-input input{flex:1;padding:10px 16px;border:2px solid #e8e8e8;border-radius:24px;font-size:14px;outline:none;transition:border-color .2s}
  .vc-input input:focus{border-color:#667eea}
  .vc-input button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;border:none;border-radius:50%;width:42px;height:42px;cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:18px;transition:transform .1s}
  .vc-input button:hover{transform:scale(1.05)}
  .vc-input button:disabled{opacity:.5;cursor:not-allowed;transform:none}
  .vc-typing{display:flex;gap:5px;padding:4px 0}
  .vc-typing span{width:8px;height:8px;background:#667eea;border-radius:50%;animation:vc-bounce 1.4s infinite ease-in-out}
  .vc-typing span:nth-child(2){animation-delay:.2s}
  .vc-typing span:nth-child(3){animation-delay:.4s}
  @keyframes vc-bounce{0%,80%,100%{transform:scale(0)}40%{transform:scale(1)}}
  @keyframes vc-fade{from{opacity:0;transform:translateY(8px)}to{opacity:1;transform:translateY(0)}}
  .vc-note{text-align:center;font-size:11px;color:#9ca3af;padding:8px 16px 12px;background:#fff}
</style>
<div class="vc-wrap">
  <div class="vc-hdr">
    <div class="vc-avatar">&#129302;</div>
    <div>
      <h3>Vizuara Teaching Assistant</h3>
      <p>Ask me anything about this notebook</p>
    </div>
  </div>
  <div class="vc-msgs" id="vcMsgs">
    <div class="vc-msg bot">
      <div class="vc-bbl">&#128075; Hi! I've read through this entire notebook. Ask me about any concept, code block, or exercise &mdash; I'm here to help you learn!</div>
    </div>
  </div>
  <div class="vc-chips" id="vcChips">
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Explain the main concept</span>
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Help with the TODO exercise</span>
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Summarize what I learned</span>
  </div>
  <div class="vc-input">
    <input type="text" id="vcIn" placeholder="Ask about concepts, code, exercises..." />
    <button id="vcSend" onclick="vcSendMsg()">&#10148;</button>
  </div>
  <div class="vc-note">AI-generated &middot; Verify important information &middot; <a href="#" onclick="vcClear();return false" style="color:#667eea">Clear chat</a></div>
</div>
<script>
(function(){
  // Look up the widget nodes once; every handler below closes over them.
  var msgs=document.getElementById('vcMsgs');    // scrolling message list
  var inp=document.getElementById('vcIn');       // question text box
  var btn=document.getElementById('vcSend');     // send button
  var chips=document.getElementById('vcChips');  // suggested-question chips

  // HTML-escape a string by letting the browser serialize it as a text node.
  function esc(s){
    var holder=document.createElement('div');
    holder.textContent=s;
    return holder.innerHTML;
  }

  // Convert an assistant reply (a small Markdown subset) into HTML for a bubble.
  //  - The whole reply is HTML-escaped first, so markup inside an answer is
  //    shown as text instead of being injected into the page.
  //  - Fenced and inline code are set aside before the other rules run, so
  //    asterisks, dashes or hashes inside code are left untouched.
  //  - No backslash escapes are used. This script is embedded in a non-raw
  //    Python string, where Python turns backslash-n into a real line break;
  //    that splits a regex literal across lines, which is a JavaScript syntax
  //    error that stops the whole script from loading.
  // Supported: fenced code, inline code, bold, italic, ## / ### / #### headings,
  // numbered and dashed list items, and line breaks.
  function md(t){
    var NL=String.fromCharCode(10),MARK=String.fromCharCode(1),kept=[];
    function safe(s){return s.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')}
    // Park finished HTML and leave a marker the later rules cannot match.
    function keep(html){kept.push(html);return MARK+(kept.length-1)+MARK}
    // Put parked HTML back; an entry can only contain markers of earlier entries.
    function restore(s){
      return s.replace(new RegExp(MARK+'([0-9]+)'+MARK,'g'),function(m,i){
        return +i<kept.length?restore(kept[+i]):m;
      });
    }
    var fence=new RegExp('```([A-Za-z0-9_]*)'+NL+'([^]*?)```','g');
    // Drop stray marker characters from the input so markers stay unambiguous.
    var html=safe(t).split(MARK).join('')
      .replace(fence,function(_,lang,code){return keep('<pre><code>'+code+'</code></pre>')})
      .replace(/`([^`]+)`/g,function(_,code){return keep('<code>'+code.split(NL).join('<br>')+'</code>')})
      .replace(/[*][*]([^*]+)[*][*]/g,'<strong>$1</strong>')
      .replace(/[*]([^*]+)[*]/g,'<em>$1</em>')
      .replace(/^#### (.+)$/gm,'<h4>$1</h4>')
      .replace(/^### (.+)$/gm,'<h4>$1</h4>')
      .replace(/^## (.+)$/gm,'<h3>$1</h3>')
      .replace(/^[0-9]+[.] (.+)$/gm,'<li>$1</li>')
      .replace(/^- (.+)$/gm,'<li>$1</li>')
      .split(NL).join('<br>');
    return restore(html);
  }

  // Append one chat bubble and keep the list scrolled to the newest message.
  // User text is inserted escaped; assistant text is rendered through md().
  function addMsg(text,isUser){
    var row=document.createElement('div');
    row.className='vc-msg '+(isUser?'user':'bot');
    var bubble=document.createElement('div');
    bubble.className='vc-bbl';
    if(isUser){
      bubble.innerHTML=esc(text);
    }else{
      bubble.innerHTML=md(text);
    }
    row.appendChild(bubble);
    msgs.appendChild(row);
    msgs.scrollTop=msgs.scrollHeight;
  }

  // Show the three-dot typing indicator as a temporary assistant row.
  // The row gets the id vcTyping so hideTyping() can find it again.
  function showTyping(){
    var row=document.createElement('div');
    row.id='vcTyping';
    row.className='vc-msg bot';
    row.innerHTML='<div class="vc-bbl"><div class="vc-typing"><span></span><span></span><span></span></div></div>';
    msgs.appendChild(row);
    msgs.scrollTop=msgs.scrollHeight;
  }

  // Remove the typing indicator if one is currently shown.
  function hideTyping(){
    var dots=document.getElementById('vcTyping');
    if(dots){dots.remove()}
  }

  // Decode the repr() text of a Python str (surrounding quotes plus backslash
  // escapes) back into the string it denotes. Anything that is not quoted is
  // returned unchanged. The backslash is built with fromCharCode because this
  // script sits inside a non-raw Python string literal.
  function pyStr(s){
    var BS=String.fromCharCode(92),q=s.charAt(0),end=s.length-1;
    if(end<1||(q!=='"'&&q!=="'")||s.charAt(end)!==q)return s;
    var ctl={n:10,r:13,t:9},out='',i=1;
    while(i<end){
      var c=s.charAt(i++);
      if(c!==BS||i>=end){out+=c;continue}
      var e=s.charAt(i++),w=e==='x'?2:e==='u'?4:e==='U'?8:0;
      if(w){out+=String.fromCodePoint(parseInt(s.substr(i,w),16));i+=w}
      else{out+=ctl[e]?String.fromCharCode(ctl[e]):e}
    }
    return out;
  }

  // Extract the reply text from an invokeFunction result.
  // A value under application/json is used as before (strings as-is, anything
  // else stringified). NOTE(review): the registered Python callback returns a
  // plain str, which presumably arrives as its repr under text/plain rather
  // than under application/json -- hence the fallback; confirm on a live
  // runtime. With neither key present this throws, which lands in the error
  // bubble of the caller.
  function replyText(r){
    var d=(r&&r.data)||{},j=d['application/json'];
    if(typeof j==='string')return j;
    if(j!==undefined&&j!==null)return JSON.stringify(j);
    if(typeof d['text/plain']==='string')return pyStr(d['text/plain']);
    throw new Error('empty reply');
  }

  // Send the text in the input box to the Python callback and show the reply.
  window.vcSendMsg=function(){
    // Enter can fire while the send button is disabled; allow one request at a time.
    if(btn.disabled)return;
    var q=inp.value.trim();if(!q)return;
    inp.value='';chips.style.display='none';
    addMsg(q,true);showTyping();btn.disabled=true;
    google.colab.kernel.invokeFunction('notebook_chat',[q],{})
      .then(function(r){
        hideTyping();
        addMsg(replyText(r),false);
      })
      .catch(function(){
        hideTyping();
        addMsg('Sorry, I encountered an error. Please check your internet connection and try again.',false);
      })
      .finally(function(){btn.disabled=false;inp.focus()});
  };

  // Chip click handler: put the chip text into the input box and send it.
  window.vcAsk=function(q){
    inp.value=q;
    vcSendMsg();
  };
  // Reset the message list to a single greeting and show the suggestion chips
  // again. Only the on-screen messages are reset; nothing here calls the kernel.
  window.vcClear=function(){
    chips.style.display='flex';
    msgs.innerHTML='<div class="vc-msg bot"><div class="vc-bbl">&#128075; Chat cleared. Ask me anything!</div></div>';
  };

  // Pressing Enter in the input box sends the message; focus the box on load.
  function onKeypress(e){
    if(e.key==='Enter'){vcSendMsg()}
  }
  inp.addEventListener('keypress',onKeypress);
  inp.focus();
})();
</script>'''))