In [None]:
# 🔧 Setup: run this cell first!
# Reports the available compute device and the library versions, then seeds
# every random number generator used in this notebook. Nothing is installed
# here; the environment is expected to provide torch and numpy already.

import random
import sys

import numpy as np
import torch

SEED = 42

# Query CUDA once and reuse the answer for device selection and seeding.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')

if has_gpu:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when present)
# so that repeated runs draw the same random numbers.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if has_gpu:
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Noise Conditioned Score Networks -- Architecture and Training -- Vizuara

## 1. Why Does This Matter?

In the previous notebook, we saw that score estimation fails in low-density regions. The score function becomes unreliable precisely where we need it most -- in the empty spaces of high-dimensional data.

Noise Conditioned Score Networks (NCSN) solve this with a brilliantly simple idea: **perturb the data with noise at multiple scales**, then train a single neural network to predict the score at every noise level.

By the end of this notebook, you will:
- Understand why multi-scale noise perturbation fills empty space
- Implement the geometric noise schedule
- Build a noise-conditioned score network from scratch
- Train it on 2D data and visualize the learned score fields at each noise level

## 2. Building Intuition

Think of it like fogging up a window. Behind the glass is a picture -- your data.

- **Thin fog** (small noise): You can still see the shapes clearly. The data structure is preserved, but you can see the glass itself -- the score is defined on the glass surface too.
- **Medium fog**: The shapes are blurred, but you can tell something is there. The score now covers more area.
- **Thick fog** (large noise): Everything is covered. You cannot see the picture at all, but you know something is behind the glass. The score is now defined everywhere.

Each fog thickness gives you different information. The thick fog tells you "there is data somewhere over there" (coarse, global structure). The thin fog tells you "the data has this precise shape right here" (fine, local structure).

NCSN learns all these levels simultaneously in a single neural network.

## 3. The Mathematics

### 3.1 Noise Perturbation

Given data distribution $p(x)$ and noise level $\sigma$, the perturbed distribution is:

$$q_\sigma(\tilde{x}) = \int p(x) \cdot \mathcal{N}(\tilde{x} \mid x, \sigma^2 I) \, dx$$

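For a single clean point $x$, the Gaussian perturbation kernel $q_\sigma(\tilde{x} \mid x) = \mathcal{N}(\tilde{x} \mid x, \sigma^2 I)$ has a closed-form score. Denoising score matching regresses onto this conditional score; averaged over the clean points $x$ that could have produced $\tilde{x}$, it equals the score of the perturbed distribution $q_\sigma(\tilde{x})$:

$$\nabla_{\tilde{x}} \log q_\sigma(\tilde{x} \mid x) = -\frac{\tilde{x} - x}{\sigma^2} = -\frac{\epsilon}{\sigma}$$

where $\tilde{x} = x + \sigma\epsilon$ and $\epsilon \sim \mathcal{N}(0, I)$ is the noise that was added.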

**Worked example:** Data point $x = 3$, noise $\epsilon = 0.4$, $\sigma = 0.5$.
- Noisy sample: $\tilde{x} = 3 + 0.5 \times 0.4 = 3.2$
- Score target: $-\epsilon / \sigma = -0.4 / 0.5 = -0.8$
- The score is negative, so it points left -- from $\tilde{x} = 3.2$ back toward the original data point $x = 3$ -- with magnitude 0.8. This is exactly what we want.

### 3.2 The Geometric Noise Schedule

We use $L$ noise levels arranged in a geometric sequence:

$$\sigma_i = \sigma_1 \cdot \left(\frac{\sigma_L}{\sigma_1}\right)^{\frac{i-1}{L-1}}$$

**Worked example:** $\sigma_1 = 10$, $\sigma_L = 0.01$, $L = 5$.

$$\sigma_1 = 10, \quad \sigma_2 = 10 \cdot (0.001)^{0.25} = 10 \cdot 0.1778 = 1.778$$
$$\sigma_3 = 10 \cdot (0.001)^{0.5} = 10 \cdot 0.03162 = 0.3162$$
$$\sigma_4 = 10 \cdot (0.001)^{0.75} = 10 \cdot 0.005623 = 0.05623, \quad \sigma_5 = 0.01$$

### 3.3 The NCSN Training Objective

$$\mathcal{L}(\theta) = \frac{1}{L} \sum_{i=1}^{L} \sigma_i^2 \, \mathbb{E}_{p(x)} \mathbb{E}_{\epsilon \sim \mathcal{N}(0,I)} \left[ \left\| s_\theta(\tilde{x}, \sigma_i) + \frac{\epsilon}{\sigma_i} \right\|^2 \right]$$

The $\sigma_i^2$ weighting ensures that losses at different scales are comparable.

**Worked example:** Suppose $s_\theta$ predicts $[-0.7]$ but the target is $[-0.8]$ at $\sigma = 0.5$.
- Unweighted loss: $(-0.7 - (-0.8))^2 = 0.01$
- Weighted loss: $0.5^2 \times 0.01 = 0.0025$

At $\sigma = 5.0$ with the same error:
- Weighted loss: $5.0^2 \times 0.01 = 0.25$

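For the same absolute error, the $\sigma^2$ factor makes the large-$\sigma$ term bigger. That is intentional: score targets have magnitude of order $1/\sigma$, so unweighted squared errors at small $\sigma$ are typically of order $1/\sigma^2$ and would otherwise dominate the sum. Multiplying by $\sigma^2$ brings the loss at every noise level to a comparable scale.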

## 4. Let's Build It -- Component by Component

### 4.1 Generate 2D Training Data

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

def generate_data(n=5000):
    """Draw ``n`` points from a two-component Gaussian mixture in 2D.

    The components are centred at (-3, 0) and (3, 0); each point is its
    component centre plus isotropic Gaussian noise with standard deviation 0.5.

    Args:
        n: number of points to draw.

    Returns:
        Float tensor of shape (n, 2).
    """
    component_means = torch.tensor([[-3.0, 0.0], [3.0, 0.0]])
    # One uniform draw per point decides which component it belongs to.
    coin = torch.rand(n, 1)
    component = (coin > 0.5).long().squeeze()
    jitter = torch.randn(n, 2)
    return component_means[component] + 0.5 * jitter

# Draw the training set and look at it before doing anything else.
data = generate_data(5000)

fig, ax = plt.subplots(figsize=(8, 5))
x1, x2 = data[:, 0].numpy(), data[:, 1].numpy()
ax.scatter(x1, x2, s=2, alpha=0.3, c='blue')
ax.set_title('Training Data: Mixture of 2 Gaussians')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
# Equal axis scaling keeps the round clusters from looking stretched.
ax.axis('equal')
ax.grid(True, alpha=0.3)
plt.show()
print(f"Data shape: {data.shape}")

### 4.2 Define the Geometric Noise Schedule

In [None]:
def geometric_noise_schedule(sigma_1, sigma_L, L):
    """Create L noise levels in a geometric sequence from sigma_1 to sigma_L.

    Level ``i`` (0-based) is ``sigma_1 * (sigma_L / sigma_1) ** (i / (L - 1))``,
    so consecutive levels share a constant ratio, the first level is
    ``sigma_1`` and the last is ``sigma_L`` (up to floating-point rounding).

    Args:
        sigma_1: first noise level. Must be non-zero because it is used as a
            divisor.
        sigma_L: last noise level.
        L: number of noise levels.

    Returns:
        1D float tensor of length ``L``. ``L == 1`` yields ``[sigma_1]`` and
        ``L <= 0`` yields an empty tensor.
    """
    if L == 1:
        # The general formula divides by (L - 1); a one-level schedule is
        # simply the starting noise level.
        return torch.tensor([float(sigma_1)])

    ratio = sigma_L / sigma_1
    return torch.tensor([sigma_1 * ratio ** (i / (L - 1)) for i in range(L)])

# Build the schedule used by the rest of the notebook
L = 10
sigma_1, sigma_L = 10.0, 0.01
sigmas = geometric_noise_schedule(sigma_1, sigma_L, L)

print("Noise levels:")
for level, value in enumerate(sigmas, start=1):
    print(f"  sigma_{level:2d} = {value:.4f}")

# On a log-scaled y axis the geometric decay shows up as evenly spaced steps.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(range(1, L + 1), sigmas.numpy(), color='steelblue')
ax.set_yscale('log')
ax.set_xlabel('Noise Level Index')
ax.set_ylabel('sigma (log scale)')
ax.set_title('Geometric Noise Schedule')
ax.grid(True, alpha=0.3, axis='y')
plt.show()

### 4.3 Visualize the Effect of Different Noise Levels

In [None]:
# The same clean data perturbed with four noise levels, largest sigma first.
noise_levels_to_show = [0, 3, 6, 9]  # indices into sigmas

fig, axes = plt.subplots(1, 4, figsize=(20, 4))

for panel, level in zip(axes, noise_levels_to_show):
    sigma = sigmas[level]
    # Fresh Gaussian noise for every panel, scaled by that panel's sigma
    noisy_data = data + sigma * torch.randn_like(data)
    xs, ys = noisy_data[:, 0].numpy(), noisy_data[:, 1].numpy()
    panel.scatter(xs, ys, s=1, alpha=0.2, c='blue')
    panel.set_title(f'sigma = {sigma:.4f}')
    # Shared limits so the spread is directly comparable between panels
    panel.set_xlim(-15, 15)
    panel.set_ylim(-10, 10)
    panel.set_aspect('equal')
    panel.grid(True, alpha=0.3)

fig.suptitle('Data Perturbed at Different Noise Levels', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

print("Notice: large sigma spreads data everywhere, small sigma preserves structure")

### 4.4 Build the Noise Conditioned Score Network

In [None]:
class ScoreNet(nn.Module):
    """Noise Conditioned Score Network for 2D data.

    A small MLP that receives a 2D point concatenated with its noise level and
    returns a 2D score estimate for that point at that noise level.

    Args:
        hidden_dim: width of each of the three hidden layers.
    """
    def __init__(self, hidden_dim=128):
        super().__init__()
        # Input: 2D data point + 1D noise level = 3
        self.net = nn.Sequential(
            nn.Linear(3, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, 2),  # Output: 2D score
        )

    def forward(self, x, sigma):
        """Predict the score at ``x`` for the given noise level(s).

        Args:
            x: points, shape (batch, 2).
            sigma: noise level per point, shape (batch,) or (batch, 1). A
                Python number or 0-dim tensor is also accepted and is shared
                by every point in the batch.

        Returns:
            Tensor of shape (batch, 2).
        """
        # Accept plain numbers and align dtype/device with x so the
        # concatenation below cannot fail on a mismatch.
        sigma = torch.as_tensor(sigma, dtype=x.dtype, device=x.device)
        if sigma.dim() == 0:
            # One noise level for the whole batch
            sigma = sigma.expand(x.shape[0])
        if sigma.dim() == 1:
            sigma = sigma.unsqueeze(1)
        net_input = torch.cat([x, sigma], dim=1)
        return self.net(net_input)

# Instantiate the network and report its size
model = ScoreNet(hidden_dim=128)
total_params = sum(map(torch.numel, model.parameters()))
print(f"Model created with {total_params:,} parameters")

# Smoke test: four random points, one noise level per point
test_x, test_sigma = torch.randn(4, 2), torch.tensor([1.0, 2.0, 0.5, 0.1])
output = model(test_x, test_sigma)
print(f"Input shape: {test_x.shape}, Sigma shape: {test_sigma.shape}")
print(f"Output shape: {output.shape}")

**Checkpoint:** The model should accept a batch of 2D points and their noise levels, and output 2D score vectors.

## 5. Your Turn

### TODO 1: Implement the NCSN Training Loss

In [None]:
def ncsn_loss(model, data, sigmas):
    """
    Compute the NCSN training loss.

    Exercise stub: as written, only ``batch_size`` and ``L`` are computed and
    the function returns None. Fill in the steps below to return the loss.

    Args:
        model: ScoreNet that takes (x, sigma) and returns score
        data: clean data tensor of shape (batch, 2)
        sigmas: tensor of all noise levels, shape (L,)

    Returns:
        Scalar loss value

    Steps:
        1. Sample random noise level indices for each data point
        2. Get the corresponding sigma values
        3. Sample noise epsilon ~ N(0, I)
        4. Create noisy data: x_tilde = data + sigma * epsilon
        5. Compute target: -epsilon / sigma
        6. Get model prediction: model(x_tilde, sigma)
        7. Compute weighted MSE loss: mean of sigma^2 * ||pred - target||^2

    Shape note:
        Indexing ``sigmas`` with a (batch,) index tensor gives shape (batch,).
        Add a trailing dimension, giving (batch, 1), before multiplying or
        dividing against the (batch, 2) data and noise tensors so the
        operation broadcasts per row.
    """
    # Number of data points in this batch
    batch_size = data.shape[0]
    # Number of available noise levels; valid indices are 0 .. L - 1
    L = len(sigmas)

    # YOUR CODE HERE
    # idx = ...
    # sigma = ...
    # epsilon = ...
    # noisy_data = ...
    # target = ...
    # pred = ...
    # loss = ...
    pass

# Verify: loss should be a positive scalar
# test_loss = ncsn_loss(model, data[:32], sigmas)
# print(f"Test loss: {test_loss.item():.4f}")

### TODO 2: Implement the Training Loop

In [None]:
def train_ncsn(model, data, sigmas, n_epochs=3000, lr=1e-3, batch_size=256):
    """
    Train the NCSN model.

    Exercise stub: as written, the body is empty and the function returns
    None. Implement the loop described in the hints below.

    Args:
        model: ScoreNet
        data: training data
        sigmas: noise levels
        n_epochs: number of epochs
        lr: learning rate
        batch_size: mini-batch size

    Returns:
        List of loss values per epoch

    Hints:
        - Use Adam optimizer
        - Sample random batches each epoch
        - Call ncsn_loss for each batch
        - Print loss every 500 epochs
    """
    # YOUR CODE HERE
    pass

# losses = train_ncsn(model, data, sigmas)

## 6. Putting It All Together

In [None]:
# Full training (reference implementation)
# Each "epoch" here is a single full-batch gradient step over all of `data`.
model = ScoreNet(hidden_dim=128)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
data = generate_data(5000)

# Take the number of noise levels from the schedule itself rather than from
# the notebook-level `L`, so this cell stays correct if `sigmas` is rebuilt
# with a different length.
n_levels = len(sigmas)

losses = []
for epoch in range(3000):
    # Sample random noise level for each data point
    idx = torch.randint(0, n_levels, (data.shape[0],))
    sigma = sigmas[idx].unsqueeze(1)  # (N, 1) so it broadcasts over both coordinates

    # Add noise
    noise = torch.randn_like(data)
    noisy_data = data + sigma * noise

    # Target: -noise / sigma (score of the Gaussian perturbation kernel)
    target = -noise / sigma

    # Predict score
    pred = model(noisy_data, sigma.squeeze(1))

    # Weighted loss: sigma^2 * ||pred - target||^2, averaged over points and coordinates
    loss = (sigma ** 2 * (pred - target) ** 2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1:4d}, Loss: {loss.item():.4f}")

In [None]:
# Training curve, one point per optimisation step, on a log-scaled y axis
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (log scale)')
ax.set_title('NCSN Training Loss')
ax.grid(True, alpha=0.3)
plt.show()

## 7. Training and Results

In [None]:
# Visualize learned score fields at different noise levels
def plot_score_field(model, sigma_val, ax, grid_range=(-8, 8), n_grid=20,
                     y_range=(-5, 5)):
    """Plot the learned score field at a specific noise level.

    Evaluates ``model`` on an ``n_grid`` x ``n_grid`` lattice and draws the
    result as a quiver plot on ``ax``. Also sets the title, axis limits,
    equal aspect ratio and grid of ``ax``.

    Args:
        model: callable taking (points, sigma), with points of shape (N, 2)
            and sigma of shape (N,), and returning scores of shape (N, 2).
        sigma_val: noise level (a number) at which to evaluate the model.
        ax: matplotlib axes to draw on.
        grid_range: (min, max) extent of the lattice along x.
        n_grid: number of lattice points per axis.
        y_range: (min, max) extent of the lattice along y. The default
            matches the extent that used to be hard-coded.
    """
    x = np.linspace(grid_range[0], grid_range[1], n_grid)
    y = np.linspace(y_range[0], y_range[1], n_grid)
    X, Y = np.meshgrid(x, y)
    points = torch.tensor(np.stack([X.ravel(), Y.ravel()], axis=1),
                          dtype=torch.float32)
    # Explicit float dtype: an integer sigma_val would otherwise create an
    # integer tensor that does not match the float points.
    sigma_tensor = torch.full((points.shape[0],), float(sigma_val),
                              dtype=torch.float32)

    with torch.no_grad():
        scores = model(points, sigma_tensor).numpy()

    U = scores[:, 0].reshape(n_grid, n_grid)
    V = scores[:, 1].reshape(n_grid, n_grid)

    # Normalize arrow lengths for visibility. The 95th percentile is used as
    # the scale so a few very large vectors do not shrink all the others;
    # the small constant avoids dividing by zero.
    norms = np.sqrt(U**2 + V**2)
    max_norm = np.percentile(norms, 95) + 1e-6

    ax.quiver(X, Y, U / max_norm, V / max_norm,
              color='steelblue', alpha=0.7)
    ax.set_title(f'sigma = {sigma_val:.4f}', fontsize=12)
    ax.set_xlim(grid_range)
    ax.set_ylim(y_range[0], y_range[1])
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.2)

# One panel per selected noise level, largest sigma first
sigma_indices = [0, 3, 6, 9]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# The same 500 training points are overlaid on every panel for reference
overlay_x, overlay_y = data[:500, 0].numpy(), data[:500, 1].numpy()
for panel, level in zip(axes.flat, sigma_indices):
    plot_score_field(model, sigmas[level].item(), panel)
    panel.scatter(overlay_x, overlay_y, s=1, alpha=0.2, c='red')

fig.suptitle('Learned Score Fields at Different Noise Levels', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

print("Large sigma: single broad basin pulling everything inward")
print("Small sigma: two distinct basins at the cluster centers")
print("This is exactly what we want!")

## 8. Final Output

In [None]:
# Summary: compare score reliability across noise levels
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (index into sigmas, panel title) for the large / medium / small noise panels
panels = [
    (0, f'Large Noise (sigma={sigmas[0]:.1f})\nScore defined EVERYWHERE'),
    (5, f'Medium Noise (sigma={sigmas[5]:.3f})\nTwo basins emerging'),
    (9, f'Small Noise (sigma={sigmas[9]:.4f})\nPrecise cluster structure'),
]

for panel, (level, caption) in zip(axes, panels):
    plot_score_field(model, sigmas[level].item(), panel, n_grid=25)
    # Replace the generic title written by plot_score_field
    panel.set_title(caption, fontsize=11, fontweight='bold')

# Overlay the same 300 training points on every panel
for panel in axes:
    panel.scatter(data[:300, 0].numpy(), data[:300, 1].numpy(), s=2, alpha=0.3, c='red')

fig.suptitle('NCSN: One Network Learns Score at All Noise Levels', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

print("\nThe NCSN has learned to provide meaningful score estimates at EVERY noise level.")
print("At large sigma: global structure (where are the clusters roughly?)")
print("At small sigma: precise structure (exact cluster boundaries)")
print("The low-density problem from Notebook 1 is solved!")

## 9. Reflection and Next Steps

**What we learned:**
1. Adding noise at multiple scales fills the empty space, making score estimation reliable everywhere
2. A geometric noise schedule spans from coarse to fine resolution
3. The NCSN takes both data and noise level as input, outputting a noise-level-specific score
4. The $\sigma^2$ weighting ensures balanced learning across all scales

**Reflection questions:**
- Why is a geometric sequence better than a linear sequence for the noise levels?
- What would happen if we used too few noise levels (e.g., L = 2)?
- How does the choice of $\sigma_1$ (largest noise) affect the generated samples?

**Next notebook:** We will use the trained NCSN to actually generate new samples using Annealed Langevin Dynamics -- the "coarse to fine" sampling procedure.