# Kernel Design & Function Priors

**Bayesian Optimisation Series · Notebook 2 of 3**

The kernel is the *only* place where prior knowledge about the objective function enters the GP model. This notebook:
1. Implements RBF, Matérn 5/2, Matérn 3/2, and Periodic kernels from scratch
2. Visualises the correlation structure each kernel implies
3. Shows how length-scale $\ell$ controls smoothness
4. Demonstrates kernel composition (sum and product)
5. Computes the log marginal likelihood for hyperparameter selection

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from scipy.special import gamma as gamma_fn

# Global plot styling shared by every figure in this notebook.
rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 12,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Seed NumPy's global RNG so the random draws below are reproducible.
np.random.seed(42)

## 1. Kernel Implementations

All kernels take two input matrices of shapes `(n, d)` and `(m, d)` and return an `(n, m)` kernel matrix.

In [None]:
def _cdist(X1, X2):
    """Return the matrix of pairwise Euclidean distances between rows.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b so that the
    whole matrix comes out of a single matrix product.

    Args:
        X1: 2-D array; each row is one point.
        X2: 2-D array with the same number of columns as ``X1``.

    Returns:
        Array whose ``[i, j]`` entry is the distance between ``X1[i]`` and
        ``X2[j]``.
    """
    row_norms = np.sum(X1**2, axis=1)[:, np.newaxis]
    col_norms = np.sum(X2**2, axis=1)[np.newaxis, :]
    cross = 2 * X1 @ X2.T
    sq_dists = row_norms + col_norms - cross
    # Round-off can push tiny squared distances below zero; clamp before sqrt.
    return np.sqrt(np.clip(sq_dists, 0, None))


def rbf_kernel(X1, X2, l=1.0, sf=1.0):
    """Squared-exponential (RBF) kernel matrix.

    Evaluates ``sf**2 * exp(-0.5 * (r / l)**2)`` where ``r`` is the pairwise
    Euclidean distance between the rows of ``X1`` and ``X2``.

    Args:
        X1: 2-D array of input points, one per row.
        X2: 2-D array of input points, one per row.
        l: Length-scale dividing the distances.
        sf: Signal scale; the kernel is multiplied by ``sf**2``.

    Returns:
        Kernel matrix with one row per point in ``X1`` and one column per
        point in ``X2``.
    """
    scaled = _cdist(X1, X2) / l
    variance = sf**2
    return variance * np.exp(-0.5 * scaled**2)


def matern52_kernel(X1, X2, l=1.0, sf=1.0):
    """Matérn 5/2 kernel matrix.

    With ``s = sqrt(5) * r / l`` and ``r`` the pairwise Euclidean distance,
    evaluates ``sf**2 * (1 + s + s**2 / 3) * exp(-s)``.

    Args:
        X1: 2-D array of input points, one per row.
        X2: 2-D array of input points, one per row.
        l: Length-scale dividing the distances.
        sf: Signal scale; the kernel is multiplied by ``sf**2``.

    Returns:
        Kernel matrix with one row per point in ``X1`` and one column per
        point in ``X2``.
    """
    scaled = np.sqrt(5) * _cdist(X1, X2) / l
    polynomial = 1 + scaled + scaled**2 / 3
    return sf**2 * polynomial * np.exp(-scaled)


def matern32_kernel(X1, X2, l=1.0, sf=1.0):
    """Matérn 3/2 kernel matrix.

    With ``s = sqrt(3) * r / l`` and ``r`` the pairwise Euclidean distance,
    evaluates ``sf**2 * (1 + s) * exp(-s)``.

    Args:
        X1: 2-D array of input points, one per row.
        X2: 2-D array of input points, one per row.
        l: Length-scale dividing the distances.
        sf: Signal scale; the kernel is multiplied by ``sf**2``.

    Returns:
        Kernel matrix with one row per point in ``X1`` and one column per
        point in ``X2``.
    """
    scaled = np.sqrt(3) * _cdist(X1, X2) / l
    linear_term = 1 + scaled
    return sf**2 * linear_term * np.exp(-scaled)


def periodic_kernel(X1, X2, l=1.0, sf=1.0, period=1.0):
    """Periodic (exp-sine-squared) kernel matrix.

    Evaluates ``sf**2 * exp(-2 * sin(pi * r / period)**2 / l**2)`` where ``r``
    is the pairwise Euclidean distance between the rows of ``X1`` and ``X2``.

    Args:
        X1: 2-D array of input points, one per row.
        X2: 2-D array of input points, one per row.
        l: Length-scale applied inside the exponent.
        sf: Signal scale; the kernel is multiplied by ``sf**2``.
        period: Distance after which the kernel value repeats.

    Returns:
        Kernel matrix with one row per point in ``X1`` and one column per
        point in ``X2``.
    """
    phase = np.pi * _cdist(X1, X2) / period
    exponent = -2 * np.sin(phase)**2 / l**2
    return sf**2 * np.exp(exponent)

## 2. Kernel Correlation Profiles

$k(r)$ vs distance $r = |x - x'|$ — each kernel decays differently.

In [None]:
# Evaluate every kernel between a grid of points and the origin, so that the
# plotted curve is k(r) as a function of the distance r alone.
origin = np.zeros((1, 1))
r = np.linspace(0, 5, 300).reshape(-1, 1)

# (legend label, kernel function, line colour)
kernels = [
    ('RBF (C∞ smooth)', rbf_kernel, 'steelblue'),
    ('Matérn 5/2 (C² smooth)', matern52_kernel, '#2ca02c'),
    ('Matérn 3/2 (C¹ smooth)', matern32_kernel, '#d62728'),
]

fig, ax = plt.subplots(figsize=(10, 5))

for name, kern, color in kernels:
    # The kernel returns a (300, 1) column; flatten it for plotting.
    vals = kern(r, origin, l=1.0, sf=1.0).ravel()
    ax.plot(r, vals, label=name, linewidth=2.5, color=color)

# Thin zero line as a visual reference for the decay.
ax.axhline(0, color='gray', linewidth=0.5)
ax.set(
    xlabel='Distance r = |x − x\'|',
    ylabel='k(r)',
    title='Kernel Correlation Profiles — How Quickly Correlation Decays with Distance',
)
ax.legend(fontsize=11)
plt.tight_layout()
plt.show()

## 3. GP Samples Under Different Kernels

Same random seed, different kernels → different function classes. The kernel *is* the prior.

In [None]:
# Input grid on which every GP prior sample is evaluated.
X = np.linspace(-5, 5, 300).reshape(-1, 1)

# (panel title, kernel function of two input matrices)
kernel_configs = [
    ('RBF (ℓ=1.0)', lambda X1, X2: rbf_kernel(X1, X2, l=1.0)),
    ('Matérn 5/2 (ℓ=1.0)', lambda X1, X2: matern52_kernel(X1, X2, l=1.0)),
    ('Matérn 3/2 (ℓ=1.0)', lambda X1, X2: matern32_kernel(X1, X2, l=1.0)),
    ('Periodic (p=2.0, ℓ=1.0)', lambda X1, X2: periodic_kernel(X1, X2, l=1.0, period=2.0)),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 8), sharex=True, sharey=True)

# Draw the standard-normal vectors ONCE and reuse them for every kernel, so
# the panels genuinely share the same randomness and differ only through the
# Cholesky factor of each kernel matrix. Drawing inside the loop would give
# each kernel a different set of random numbers.
# NOTE: this consumes fewer values from the global RNG than drawing per
# kernel, so the samples produced by later cells depend on this choice.
Z = np.random.randn(len(X), 4)

for ax, (name, kern_fn) in zip(axes.ravel(), kernel_configs):
    # Small diagonal jitter keeps the Cholesky factorisation numerically stable.
    K = kern_fn(X, X) + 1e-8 * np.eye(len(X))
    L = np.linalg.cholesky(K)
    # L @ z has covariance L L^T = K, i.e. each column is one prior sample.
    samples = L @ Z

    for i in range(4):
        ax.plot(X, samples[:, i], alpha=0.7, linewidth=1.5)
    ax.set_title(name, fontsize=12, fontweight='bold')
    ax.set_xlabel('x')

fig.suptitle('GP Prior Samples — Same Randomness, Different Kernels', fontsize=15, y=1.02)
plt.tight_layout()
plt.show()

## 4. Length-Scale Controls Smoothness

Large $\ell$ → slow-varying, smooth functions. Small $\ell$ → rapidly changing, wiggly functions.

In [None]:
# One panel per RBF length-scale, each showing four prior samples on the grid X.
length_scales = [0.3, 1.0, 3.0]
fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=True)

for ax, ls in zip(axes, length_scales):
    # Small diagonal jitter keeps the Cholesky factorisation numerically stable.
    jitter = 1e-8 * np.eye(len(X))
    K = rbf_kernel(X, X, l=ls) + jitter
    L = np.linalg.cholesky(K)
    samples = L @ np.random.randn(len(X), 4)

    # Each column of `samples` is one function drawn from the prior.
    for path in samples.T:
        ax.plot(X, path, alpha=0.7, linewidth=1.5)
    ax.set_title(f'ℓ = {ls}', fontsize=13, fontweight='bold')
    ax.set_xlabel('x')

axes[0].set_ylabel('f(x)')
fig.suptitle('RBF Kernel — Effect of Length-Scale on Function Smoothness', fontsize=15, y=1.02)
plt.tight_layout()
plt.show()

## 5. Kernel Composition — Sum and Product

Valid kernels can be combined:
- **Sum** $k_1 + k_2$: models additive structure (e.g., trend + periodicity)
- **Product** $k_1 \cdot k_2$: models interaction (e.g., locally periodic = RBF × Periodic)

In [None]:
def sample_gp(kern_fn, X, n_samples=3, jitter=1e-8):
    """Draw samples from a zero-mean GP prior evaluated at the points ``X``.

    Args:
        kern_fn: Callable ``kern_fn(X1, X2)`` returning the kernel matrix
            between two sets of input points.
        X: Input points; ``len(X)`` gives the number of evaluation points.
        n_samples: Number of independent functions to draw.
        jitter: Value added to the diagonal of the kernel matrix before the
            Cholesky factorisation. Increase it if the factorisation fails
            for a nearly singular kernel matrix.

    Returns:
        Array of shape ``(len(X), n_samples)``; each column is one sample.

    Raises:
        numpy.linalg.LinAlgError: If the jittered kernel matrix is not
            positive definite.
    """
    n_points = len(X)
    K = kern_fn(X, X) + jitter * np.eye(n_points)
    L = np.linalg.cholesky(K)
    # L @ z has covariance L L^T = K when z is standard normal.
    return L @ np.random.randn(n_points, n_samples)

# (panel title, composite kernel built by summing or multiplying base kernels)
composed_kernels = [
    (
        'RBF + Periodic\n(trend + oscillation)',
        lambda X1, X2: (
            rbf_kernel(X1, X2, l=3.0, sf=0.5)
            + periodic_kernel(X1, X2, l=1.0, period=2.0, sf=0.5)
        ),
    ),
    (
        'RBF × Periodic\n(locally periodic)',
        lambda X1, X2: (
            rbf_kernel(X1, X2, l=3.0, sf=1.0)
            * periodic_kernel(X1, X2, l=1.0, period=2.0, sf=1.0)
        ),
    ),
    (
        'Matérn 5/2 + RBF\n(multi-scale)',
        lambda X1, X2: (
            matern52_kernel(X1, X2, l=0.5, sf=0.5)
            + rbf_kernel(X1, X2, l=3.0, sf=0.5)
        ),
    ),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=True)

for ax, (name, kern_fn) in zip(axes, composed_kernels):
    # sample_gp returns three samples by default, one per column.
    samples = sample_gp(kern_fn, X)
    for path in samples.T:
        ax.plot(X, path, alpha=0.7, linewidth=1.5)
    ax.set_title(name, fontsize=11, fontweight='bold')
    ax.set_xlabel('x')

axes[0].set_ylabel('f(x)')
fig.suptitle('Kernel Composition — Building Complex Priors from Simple Kernels', fontsize=15, y=1.05)
plt.tight_layout()
plt.show()

## 6. Log Marginal Likelihood — Hyperparameter Selection

$$\log p(\mathbf{y} | X, \theta) = -\frac{1}{2} \mathbf{y}^\top (K + \sigma_n^2 I)^{-1} \mathbf{y} - \frac{1}{2} \log |K + \sigma_n^2 I| - \frac{n}{2} \log 2\pi$$

This balances data fit (first term) against model complexity (second term).

In [None]:
def log_marginal_likelihood(X_train, y_train, l, sf, noise_var=0.01):
    """Log marginal likelihood of the training targets under an RBF-kernel GP.

    Evaluates
    ``-0.5 * y^T A^{-1} y - 0.5 * log|A| - 0.5 * n * log(2 * pi)``
    with ``A = K + noise_var * I``, using a Cholesky factorisation of ``A``.

    Args:
        X_train: Training inputs passed to ``rbf_kernel``; ``len(X_train)``
            is the number of observations ``n``.
        y_train: Training targets, one per observation.
        l: RBF length-scale.
        sf: RBF signal scale.
        noise_var: Observation-noise variance added to the diagonal.

    Returns:
        The log marginal likelihood (a scalar when ``y_train`` is 1-D).
    """
    n_obs = len(X_train)
    K_noisy = rbf_kernel(X_train, X_train, l=l, sf=sf) + noise_var * np.eye(n_obs)
    chol = np.linalg.cholesky(K_noisy)

    # Two solves against the Cholesky factor give alpha = K_noisy^{-1} y.
    whitened = np.linalg.solve(chol, y_train)
    alpha = np.linalg.solve(chol.T, whitened)

    quad_term = -0.5 * y_train @ alpha
    # log|K_noisy| = 2 * sum(log(diag(chol))), so the factor 0.5 cancels.
    log_det_term = -np.sum(np.log(np.diag(chol)))
    norm_term = -0.5 * n_obs * np.log(2 * np.pi)

    return quad_term + log_det_term + norm_term


def true_fn(x):
    """Noise-free objective used to generate the toy training targets."""
    return np.sin(x) + 0.5 * np.cos(2.5 * x)


# Seven training inputs with noisy observations of true_fn.
X_train = np.array([-3.5, -2.0, -0.5, 0.8, 2.0, 3.2, 4.0]).reshape(-1, 1)
y_train = true_fn(X_train.ravel()) + 0.1 * np.random.randn(len(X_train))

# Sweep the length-scale on a grid (sf fixed at 1.0) and keep the maximiser.
ls_range = np.linspace(0.1, 5.0, 100)
lml_values = [
    log_marginal_likelihood(X_train, y_train, l=ell, sf=1.0)
    for ell in ls_range
]
best_l = ls_range[np.argmax(lml_values)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ls_range, lml_values, linewidth=2.5, color='steelblue')
ax.axvline(best_l, color='#d62728', linestyle='--', linewidth=1.5, label=f'optimal ℓ = {best_l:.2f}')
ax.set(
    xlabel='Length-scale ℓ',
    ylabel='Log Marginal Likelihood',
    title='Hyperparameter Selection via Log Marginal Likelihood',
)
ax.legend(fontsize=12)
plt.tight_layout()
plt.show()

print(f'Optimal length-scale: ℓ = {best_l:.2f}')

---

**Previous:** [Notebook 1 — GP Surrogate](./01_gp_surrogate.ipynb) · **Next:** [Notebook 3 — Acquisition Functions & Full BO Loop](./03_acquisition_functions.ipynb)

**Back to article:** [Bayesian Optimisation — Mathematical Deep Dive](https://omkarray.com/bayesian-optimization.html)