In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the random number generators

import torch
import sys

# Pick the compute device: fall back to the CPU unless PyTorch can see a CUDA GPU.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    device = torch.device('cuda')
    # Report the name and total memory (in GB) of the first visible GPU.
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Report the interpreter and PyTorch versions in use.
print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42

# Seed every random number generator used by the notebook (PyTorch CPU/CUDA,
# NumPy, and Python's built-in `random`) with the same value.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Energy Functions and the Boltzmann Distribution -- Vizuara

## 1. Why Does This Matter?

Energy-Based Models (EBMs) are one of the most general frameworks in machine learning. Unlike models that directly output probabilities, EBMs assign a single scalar "energy" to every possible configuration of data. Lower energy means more likely.

This idea comes from statistical physics -- the same equations that describe how gas molecules distribute themselves in a room also describe how we can model complex data distributions.

**By the end of this notebook, you will:**
- Understand how energy functions map data to scalar values
- Implement the Boltzmann distribution to convert energy to probability
- Visualize energy landscapes and their corresponding probability distributions
- Experience the partition function problem firsthand

## 2. Building Intuition

### The Ball on a Landscape

Imagine you are holding a ball at the top of a hilly landscape. When you release it, the ball rolls downhill and settles at the lowest point. This is a fundamental principle from physics: systems naturally settle into configurations that minimize their energy.

Now here is the key insight: **what if we could use this same principle to describe data?** Suppose we assign low energy to data points that look like real images and high energy to random noise. Then the energy "landscape" naturally has its valleys at real data -- and, as we will see, the corresponding probability peaks there.

### Think About This

Before we write any code, ask yourself:
- If you had an energy function, how would you convert it to a probability?
- What properties should the probability function have?
- What mathematical function flips "low energy = good" into "high probability"?

## 3. The Mathematics

The energy function takes any data point $x$ and outputs a scalar:

$$E_\theta(x) \in \mathbb{R}$$

To convert energy to probability, we use the **Boltzmann distribution**:

$$p(x) = \frac{\exp(-E_\theta(x))}{Z}, \quad Z = \int \exp(-E_\theta(x)) \, dx$$

Computationally, this means: for each data point, compute the negative energy, exponentiate it (making low energy values large and high energy values small), and then divide by the sum of all such values to normalize.

**Numerical example:** Suppose we have three states with energies $E(x_1)=3$, $E(x_2)=1$, $E(x_3)=2$:
- $\exp(-3) = 0.050$, $\exp(-1) = 0.368$, $\exp(-2) = 0.135$
- $Z = 0.050 + 0.368 + 0.135 = 0.553$
- $p(x_1) = 0.090$, $p(x_2) = 0.665$, $p(x_3) = 0.245$

The state with the lowest energy ($x_2$) gets the highest probability. This is exactly what we want.

## 4. Let's Build It -- Component by Component

### 4.1 Defining Simple Energy Functions

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Let's define some simple 1D energy functions
def quadratic_energy(x):
    """Single-well parabola E(x) = x^2, whose only minimum sits at x = 0.

    Works elementwise on anything supporting `**` (numbers or tensors).
    """
    energy = pow(x, 2)
    return energy

def double_well_energy(x):
    """Symmetric double well E(x) = (x^2 - 1)^2 with minima at x = -1 and x = +1.

    Works elementwise on anything supporting `**` (numbers or tensors).
    """
    distance_from_wells = pow(x, 2) - 1
    return pow(distance_from_wells, 2)

def asymmetric_energy(x):
    """Tilted double well: E(x) = 0.5 (x + 1)^2 (x - 2)^2 - 0.3 x.

    The quartic term creates wells near x = -1 and x = 2; the linear tilt
    lowers the right-hand well, making it the deeper of the two.
    """
    left_factor = pow(x + 1, 2)
    right_factor = pow(x - 2, 2)
    quartic = 0.5 * left_factor * right_factor
    tilt = 0.3 * x
    return quartic - tilt

Let us visualize these energy landscapes.

In [None]:
# Evaluate each energy function on a shared 1D grid and draw one panel per function.
x = torch.linspace(-3, 3, 500)

energies = [quadratic_energy, double_well_energy, asymmetric_energy]
titles = ["Quadratic Energy", "Double-Well Energy", "Asymmetric Energy"]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, (energy_fn, title) in enumerate(zip(energies, titles)):
    ax = axes[panel]
    E = energy_fn(x)
    ax.plot(x.numpy(), E.numpy(), 'b-', linewidth=2)

    # Highlight the grid point with the lowest energy (first one if there are ties)
    min_idx = torch.argmin(E)
    ax.plot(x[min_idx], E[min_idx], 'ro', markersize=10, label=f'Min at x={x[min_idx]:.2f}')

    # Axis decoration
    ax.set_xlabel('x')
    ax.set_ylabel('E(x)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()

### 4.2 Converting Energy to Probability (Boltzmann Distribution)

In [None]:
def boltzmann_distribution(energy_fn, x, temperature=1.0):
    """
    Convert an energy function to a probability density on a 1D grid
    using the Boltzmann distribution p(x) = exp(-E(x) / T) / Z.

    Args:
        energy_fn: Function that maps a tensor of points x -> tensor of energies
        x: 1D tensor of evenly spaced grid points (at least two points,
           because the spacing is taken from x[1] - x[0])
        temperature: Controls the sharpness (lower = sharper peaks).
            Must be non-zero.

    Returns:
        A tuple (probabilities, Z):
            probabilities: density values at each x, normalized so that
                sum(probabilities) * dx == 1
            Z: the partition function, approximated by a Riemann sum.
               Z itself may overflow to inf for extremely negative
               energies; the probabilities stay finite regardless.

    Raises:
        ValueError: if temperature is zero or x has fewer than two points.
    """
    if isinstance(temperature, (int, float)) and temperature == 0:
        raise ValueError("temperature must be non-zero")
    if len(x) < 2:
        raise ValueError("x must contain at least two grid points")

    E = energy_fn(x)
    logits = -E / temperature

    # Subtract the largest logit before exponentiating so every term lies in
    # (0, 1]. Without the shift, exp() overflows to inf for strongly negative
    # energies or small temperatures, and inf / inf yields NaN probabilities.
    shift = logits.max()
    unnormalized = torch.exp(logits - shift)

    # The partition function Z: sum (discrete) or integral (continuous)
    # For our discrete grid, we approximate the integral as a sum
    dx = x[1] - x[0]  # grid spacing (assumes a uniform grid)
    Z_shifted = torch.sum(unnormalized) * dx

    probabilities = unnormalized / Z_shifted
    # Undo the shift so the reported Z matches the unshifted definition.
    Z = Z_shifted * torch.exp(shift)
    return probabilities, Z

In [None]:
# Visualize energy -> probability for the quadratic energy
x = torch.linspace(-4, 4, 1000)
E = quadratic_energy(x)
p, Z = boltzmann_distribution(quadratic_energy, x)

fig, axes = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

# One row per panel: (axes, curve, line style, fill color, y-label, title).
# Top panel is the energy landscape, bottom panel the probability distribution.
panels = [
    (axes[0], E, 'b-', 'blue', 'E(x)', 'Energy Landscape'),
    (axes[1], p, 'r-', 'red', 'p(x)', f'Probability Distribution (Z = {Z.item():.4f})'),
]
for ax, values, line_style, fill_color, y_label, panel_title in panels:
    ax.plot(x.numpy(), values.numpy(), line_style, linewidth=2)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(panel_title, fontsize=14)
    ax.fill_between(x.numpy(), values.numpy(), alpha=0.1, color=fill_color)
    ax.grid(True, alpha=0.3)

# The x-axis is shared, so only the bottom panel carries its label.
axes[1].set_xlabel('x', fontsize=12)

plt.tight_layout()
plt.show()
print(f"Notice: where energy is LOW, probability is HIGH. This is exactly what we want.")

## 5. Your Turn -- Temperature Effects

### TODO: Explore Temperature

The temperature parameter $T$ controls how "peaked" the probability distribution is. Low temperature makes the distribution sharper (more concentrated at the energy minimum), while high temperature makes it flatter (more uniform).

In [None]:
def plot_temperature_effects(energy_fn, x, temperatures):
    """
    TODO: Complete this function to plot the Boltzmann distribution
    at different temperatures.

    For each temperature in the list:
    1. Compute the Boltzmann distribution using the given energy function
    2. Plot the resulting probability distribution

    Hint: Use the boltzmann_distribution() function we defined above,
    passing the temperature parameter.

    Args:
        energy_fn: Energy function, intended to be forwarded to
            boltzmann_distribution() (see the hint above)
        x: Tensor of grid points; converted with .numpy() for plotting
        temperatures: Iterable of temperature values; one curve is drawn
            and labelled per value

    Returns:
        None. The figure is displayed with plt.show().
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    for T in temperatures:
        # ============ TODO ============
        # Step 1: Compute probabilities at this temperature
        # Step 2: Plot the probability curve with a label
        # Hint: p, Z = boltzmann_distribution(energy_fn, x, temperature=T)
        # ==============================

        # NOTE: `???` is an exercise placeholder and is not valid Python;
        # this cell raises a SyntaxError until it is replaced.
        p, Z = ???  # YOUR CODE HERE
        ax.plot(x.numpy(), p.numpy(), linewidth=2, label=f'T={T}')

    ax.set_xlabel('x')
    ax.set_ylabel('p(x)')
    ax.set_title('Effect of Temperature on Boltzmann Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

# Test with these temperatures
x = torch.linspace(-4, 4, 1000)
temperatures = [0.1, 0.5, 1.0, 2.0, 5.0]
# Uncomment once the TODO above is complete:
# plot_temperature_effects(quadratic_energy, x, temperatures)

In [None]:
# Verification: Run this after completing the TODO
x = torch.linspace(-4, 4, 1000)
p_cold, _ = boltzmann_distribution(quadratic_energy, x, temperature=0.1)
p_hot, _ = boltzmann_distribution(quadratic_energy, x, temperature=5.0)


def _distribution_std(p, x):
    """Standard deviation of x under the gridded density p (Riemann-sum moments).

    Args:
        p: Density values on the grid, normalized so that sum(p) * dx == 1
        x: 1D tensor of evenly spaced grid points matching p

    Returns:
        0-d tensor holding the standard deviation of the distribution.
    """
    dx = x[1] - x[0]
    mean = torch.sum(x * p) * dx
    variance = torch.sum((x - mean) ** 2 * p) * dx
    return torch.sqrt(variance)


# Cold temperature should be more peaked
assert p_cold.max() > p_hot.max(), "Cold temperature should have a higher peak"
# Hot temperature should be flatter, i.e. its probability mass is spread over a
# wider range of x. We compare the spread of the *distribution*; p.std() would
# instead measure how much the density values vary across the grid, which is
# larger for the sharply peaked cold distribution.
assert _distribution_std(p_cold, x) < _distribution_std(p_hot, x), "Hot temperature should be more spread out"
print("Correct! Lower temperature = sharper peak, higher temperature = flatter distribution.")

## 5.2 Your Turn -- 2D Energy Landscapes

In [None]:
def plot_2d_energy_landscape(energy_fn, title="2D Energy Landscape"):
    """
    TODO: Visualize a 2D energy function as a contour plot
    and its corresponding 2D Boltzmann distribution.

    Steps:
    1. Create a meshgrid of x1 and x2 values from -3 to 3
    2. Compute E(x1, x2) at every grid point
    3. Compute p(x1, x2) = exp(-E) / Z at every grid point
    4. Plot both as filled contour plots side by side

    Hint: Use torch.meshgrid() and plt.contourf()

    Args:
        energy_fn: 2D energy function; the example call below the function
            passes a callable taking (x1, x2)
        title: Figure title string. NOTE(review): not referenced by the
            template body as written.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # 200 x 200 evaluation grid over [-3, 3] x [-3, 3]; indexing='ij' makes
    # the first axis of X1/X2 follow x1 and the second follow x2.
    x1 = torch.linspace(-3, 3, 200)
    x2 = torch.linspace(-3, 3, 200)
    X1, X2 = torch.meshgrid(x1, x2, indexing='ij')

    # ============ TODO ============
    # Step 1: Compute the energy at each grid point
    # Step 2: Compute exp(-E) for each point
    # Step 3: Compute Z as the sum of all exp(-E) values
    #         (multiply by the cell area dx1 * dx2 to get a density,
    #         mirroring the `* dx` used in boltzmann_distribution)
    # Step 4: Compute normalized probabilities
    # ==============================

    # NOTE: `???` is an exercise placeholder and is not valid Python;
    # this cell raises a SyntaxError until it is replaced.
    E = ???  # YOUR CODE HERE

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].contourf(X1.numpy(), X2.numpy(), E.numpy(), levels=30, cmap='viridis')
    axes[0].set_title('Energy E(x)')
    axes[0].set_xlabel('x1')
    axes[0].set_ylabel('x2')

    # Uncomment once `p` has been computed in the TODO above:
    # axes[1].contourf(X1.numpy(), X2.numpy(), p.numpy(), levels=30, cmap='hot')
    # axes[1].set_title('Probability p(x)')
    plt.tight_layout()
    plt.show()

# Example 2D energy: E(x1, x2) = x1^2 + x2^2
# plot_2d_energy_landscape(lambda x1, x2: x1**2 + x2**2)

## 6. Putting It All Together -- The Partition Function Problem

Now let us experience firsthand why the partition function is a problem.

In [None]:
def _format_scientific(value):
    """Format an integer as '1.23e+45', even when it is too large for a float."""
    try:
        return f"{value:.2e}"
    except OverflowError:
        # Python ints are unbounded but floats stop near 1.8e308, so e.g.
        # 100**256 cannot be formatted through float. Decimal handles it.
        from decimal import Decimal
        return f"{Decimal(value):.2e}"


def partition_function_scaling(dims_list, grid_points=100):
    """
    Demonstrate how the cost of computing Z scales with dimension.

    With `grid_points` points per dimension, a full grid over d dimensions
    needs grid_points ** d energy evaluations, i.e. the cost grows
    exponentially with d. One table row is printed per dimension.

    Args:
        dims_list: Iterable of integer dimensionalities to tabulate
        grid_points: Number of grid points per dimension (default 100)

    Returns:
        List of exact integer evaluation counts, one per entry of dims_list
    """
    print("Dimension  |  Grid Points  |  Total Evaluations")
    print("-" * 50)

    totals = []
    for d in dims_list:
        total = grid_points ** d  # exact arbitrary-precision integer
        totals.append(total)
        print(f"    {d:3d}     |     {grid_points:5d}     |  {_format_scientific(total)}")

    return totals

dims = [1, 2, 3, 5, 10, 50, 256]
counts = partition_function_scaling(dims)

In [None]:
# Visualize the exponential blowup
# Only the first five dimensionalities are plotted; the rest are far off-scale.
plotted_dims, plotted_counts = dims[:5], counts[:5]

fig, ax = plt.subplots(figsize=(8, 5))
ax.semilogy(plotted_dims, plotted_counts, 'ro-', linewidth=2, markersize=8)
ax.set_title('Partition Function: Computational Cost vs Dimension', fontsize=14)
ax.set_xlabel('Dimensionality', fontsize=12)
ax.set_ylabel('Number of Evaluations (log scale)', fontsize=12)
ax.grid(True, alpha=0.3)

# Add annotation, anchored at the dim=5 data point
annotation_box = {'boxstyle': 'round', 'facecolor': 'lightyellow'}
ax.annotate(
    'For a 16x16 image\n(dim=256): 100^256\n= impossible!',
    xy=(5, counts[3]),
    fontsize=10,
    color='red',
    bbox=annotation_box,
)

plt.tight_layout()
plt.show()

for message in (
    "\nThis is why we CANNOT directly compute Z for real-world data.",
    "We need a way to train EBMs WITHOUT computing the partition function.",
    "This brings us to the score function...",
):
    print(message)

## 7. Training and Results

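Let us set up a simple neural network as an energy function on 1D data. We will not actually train it in this notebook: training without computing $Z$ requires the score function, which is the subject of the next notebook.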

In [None]:
import torch.nn as nn

class EnergyNetwork(nn.Module):
    """Small MLP mapping a 1D input to one scalar energy per sample.

    Architecture: Linear(1, hidden) -> SiLU -> Linear(hidden, hidden)
    -> SiLU -> Linear(hidden, 1).
    """

    def __init__(self, hidden=64):
        """Build the network; `hidden` is the width of both hidden layers."""
        super().__init__()
        layers = [
            nn.Linear(1, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the energy of `x` with the trailing size-1 dimension removed."""
        energy = self.net(x)
        return energy.squeeze(-1)

# Create synthetic 1D data from a mixture of Gaussians
def sample_mixture(n=1000):
    """Draw `n` samples from an equal-weight mixture of N(+1.5, 0.3^2) and N(-1.5, 0.3^2)."""
    # Draw the component assignment first, then the noise, so the random
    # number stream is consumed in a fixed order.
    in_right_mode = torch.rand(n) < 0.5
    noise = torch.randn(n) * 0.3
    # Shift each sample to the centre of its assigned component.
    return torch.where(in_right_mode, noise + 1.5, noise - 1.5)

data = sample_mixture(2000)

# Visualize the data as a normalized histogram
fig, ax = plt.subplots(figsize=(10, 3))
ax.hist(data.numpy(), bins=50, density=True, alpha=0.7, color='steelblue')
ax.set_title('Data Distribution (Mixture of Two Gaussians)')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Visualize what the energy landscape looks like for this data
# A good energy function should have LOW energy where data is dense
# and HIGH energy where data is sparse

model = EnergyNetwork()
x_grid = torch.linspace(-4, 4, 500).unsqueeze(-1)  # column of 500 grid points

# Evaluate the untrained network; no gradients are needed for plotting.
with torch.no_grad():
    E_random = model(x_grid).numpy()

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(x_grid.squeeze().numpy(), E_random, 'b-', linewidth=2)
ax.set_title('Energy Function (Random Initialization)')
ax.set_xlabel('x')
ax.set_ylabel('E(x)')
ax.grid(True, alpha=0.3)
plt.show()

for message in (
    "Before training, the energy landscape is random.",
    "We need a training method that does NOT require computing Z.",
    "This is what the score function enables -- see the next notebook!",
):
    print(message)

## 8. Final Output

In [None]:
# Final visualization: the key insight of this notebook
x = torch.linspace(-4, 4, 500)

# (panel title, energy function) pairs, one per subplot
landscapes = [
    ("Single Well", quadratic_energy),
    ("Double Well", double_well_energy),
    ("Asymmetric", asymmetric_energy),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (title, energy_fn) in zip(axes, landscapes):
    # Energy goes on the left axis, probability on a twin right-hand axis.
    ax2 = ax.twinx()

    E = energy_fn(x)
    p, Z = boltzmann_distribution(energy_fn, x)

    ax.plot(x.numpy(), E.numpy(), 'b-', linewidth=2, label='Energy')
    ax2.plot(x.numpy(), p.numpy(), 'r-', linewidth=2, label='Probability')

    ax.set_title(title)
    ax.set_xlabel('x')
    ax.set_ylabel('E(x)', color='blue')
    ax2.set_ylabel('p(x)', color='red')
    ax.grid(True, alpha=0.3)

fig.suptitle('Energy Landscapes and Their Probability Distributions', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

for takeaway in (
    "\nKey Takeaways:",
    "1. Low energy = high probability (and vice versa)",
    "2. The Boltzmann distribution converts energy to probability",
    "3. But computing Z is intractable in high dimensions",
    "4. We need the SCORE FUNCTION to bypass Z -- next notebook!",
):
    print(takeaway)

## 9. Reflection and Next Steps

### Think About These Questions:
1. Why does the temperature parameter matter? What happens in the limit as $T \to 0$ and $T \to \infty$?
2. Can you think of other functions besides $\exp(-E)$ that could convert energy to probability while satisfying the required properties (positive, higher when energy is lower)?
3. In physics, the Boltzmann distribution describes thermal equilibrium. What is the analogy in machine learning?
4. The partition function is intractable for high-dimensional continuous data. Is it always intractable? When CAN you compute it exactly?

### What's Next
In the next notebook, we will discover the **score function** -- the gradient of the log probability density -- which completely bypasses the partition function. We will also learn **Langevin dynamics**, a sampling method that uses only the score function to generate data.