# Gaussian Processes for Machine Learning

> [pdf](./../RW-2006-gaussian-processes-for-machine-learning.pdf)

## Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gaussian_process.GPfunctions as gp
from gaussian_process import GaussianProcess
from gaussian_process.kernels import SquaredExponentialKernel, CubicKernel

In [None]:
# Objective function f(x) = -x * sin(x), its analytic derivative, and a small
# random training set picked from a dense evaluation grid.


def objectiveFunction(x):
    """Return -x * sin(x); accepts scalars or NumPy arrays (elementwise)."""
    return -x * np.sin(x)


def objectiveFunctionDerivative(x):
    """Return the derivative of the objective, -x * cos(x) - sin(x)."""
    return -x * np.cos(x) - np.sin(x)


# Dense grid on [-3, 3]: the pool the training points are drawn from.
X = np.linspace(start=-3.0, stop=3.0, num=1_000)
y = objectiveFunction(X)
g = objectiveFunctionDerivative(X)

# Draw distinct grid indices with a fixed seed so reruns pick the same points.
sample_count = 6
rng = np.random.default_rng(1)
training_indices = rng.choice(np.arange(y.size), size=sample_count, replace=False)
X_train, y_train, g_train = (
    X[training_indices],
    y[training_indices],
    g[training_indices],
)

In [None]:
# Squared exponential kernel: GP without observations ("Prior") next to the GP
# conditioned on the training points ("Posterior").
kernel = SquaredExponentialKernel()

GP_prior = GaussianProcess(kernel)
prior_mean, prior_variance = GP_prior(X)

# ---- Figure 1: GP band plus three sampled functions per panel ----
fig, axes = plt.subplots(ncols=2, sharey=True)
ax1, ax2 = axes

gp.plot_gp(ax1, X, prior_mean, prior_variance)
# Draws consume the shared rng, so their order relative to the posterior draws
# below is kept fixed.
prior_draws = [GP_prior.sample(rng, X) for _ in range(3)]
gp.plot_samples(ax1, X, prior_draws)
gp.plot_label(ax1, "Prior")

# NOTE(review): f_noise=1e-14 is presumably a tiny jitter on the function
# observations for numerical stability — confirm against GaussianProcess.
GP_posterior = GaussianProcess(kernel, x_known=X_train, f_known=y_train, f_noise=1e-14)
posterior_mean, posterior_variance = GP_posterior(X)

gp.plot_objective(ax2, X, y, X_train, y_train)
gp.plot_gp(ax2, X, posterior_mean, posterior_variance)
posterior_draws = [GP_posterior.sample(rng, X) for _ in range(3)]
gp.plot_samples(ax2, X, posterior_draws)
gp.plot_label(ax2, "Posterior")

fig.suptitle("Squared exponential kernel sampled")
fig.set_figwidth(15)
plt.show()

# ---- Figure 2: the same two panels without the sampled functions ----
fig, axes = plt.subplots(ncols=2, sharey=True)
ax1, ax2 = axes

gp.plot_gp(ax1, X, prior_mean, prior_variance)
gp.plot_label(ax1, "Prior")

gp.plot_objective(ax2, X, y, X_train, y_train)
gp.plot_gp(ax2, X, posterior_mean, posterior_variance)
gp.plot_label(ax2, "Posterior")

fig.suptitle("Squared exponential kernel")
fig.set_figwidth(15)
plt.show()

In [None]:
# Cubic spline kernel: prior and posterior side by side.
# No sampled functions are drawn here: sampling does not work, since the
# covariance matrix is not symmetric positive-definite.

kernel = CubicKernel()

# Build both processes first, then evaluate them on the dense grid.
GP_prior = GaussianProcess(kernel)
GP_posterior = GaussianProcess(kernel, x_known=X_train, f_known=y_train)

prior_mean, prior_variance = GP_prior(X)
posterior_mean, posterior_variance = GP_posterior(X)

fig, axes = plt.subplots(ncols=2, sharey=True)
ax1, ax2 = axes

# Left panel: GP without observations.
gp.plot_gp(ax1, X, prior_mean, prior_variance)
gp.plot_label(ax1, "Prior")

# Right panel: objective with training points, then the conditioned GP.
gp.plot_objective(ax2, X, y, X_train, y_train)
gp.plot_gp(ax2, X, posterior_mean, posterior_variance)
gp.plot_label(ax2, "Posterior")

fig.suptitle("Cubic kernel")
fig.set_figwidth(15)
plt.show()

## GP with derivative information

In [None]:
# Squared exponential GP conditioned on function values AND derivative values
# at the training points.
kernel = SquaredExponentialKernel()

# Collect the observation keywords once; they are forwarded unchanged.
observations = dict(
    x_known=X_train,
    f_known=y_train,
    g_known=g_train,
    f_noise=1e-14,
    g_noise=1e-14,
)
GP_posterior = GaussianProcess(kernel=kernel, **observations)

posterior_mean, posterior_variance = GP_posterior(X)

# Single panel (1x1 grid is the default for plt.subplots).
fig, ax1 = plt.subplots(sharey=True)

gp.plot_objective(ax1, X, y, X_train, y_train)
gp.plot_gp(ax1, X, posterior_mean, posterior_variance)
gp.plot_label(ax1, "Posterior")

fig.suptitle("Squared exponential kernel with gradient")
plt.show()

In [None]:
# Cubic kernel GP conditioned on function values and derivative values at the
# training points (no noise arguments are passed for this kernel).
kernel = CubicKernel()

observations = dict(x_known=X_train, f_known=y_train, g_known=g_train)
GP_posterior = GaussianProcess(kernel=kernel, **observations)

posterior_mean, posterior_variance = GP_posterior(X)

# Single panel (1x1 grid is the default for plt.subplots).
fig, ax1 = plt.subplots(sharey=True)

gp.plot_objective(ax1, X, y, X_train, y_train)
gp.plot_gp(ax1, X, posterior_mean, posterior_variance)
gp.plot_label(ax1, "Posterior")

fig.suptitle("Cubic kernel with gradient")
plt.show()

## Derivative vs no derivative

In [None]:
# Cubic kernel: overlay the posterior fitted on function values only with the
# posterior that additionally uses derivative values, on a single axis.
kernel = CubicKernel()

GP_posterior = GaussianProcess(kernel=kernel, x_known=X_train, f_known=y_train)
GP_posterior_gradients = GaussianProcess(
    kernel=kernel, x_known=X_train, f_known=y_train, g_known=g_train
)

fig, ax1 = plt.subplots(1, 1)

# Objective and training points go first; the two GP curves are drawn after it.
gp.plot_objective(ax1, X, y, X_train, y_train)

posterior_mean, posterior_variance = GP_posterior(X)

gp.plot_gp(ax1, X, posterior_mean, posterior_variance, label="No gradient")

# posterior_mean / posterior_variance are reused and now hold the
# gradient-informed result.
posterior_mean, posterior_variance = GP_posterior_gradients(X)

gp.plot_gp(ax1, X, posterior_mean, posterior_variance, label="Gradient")

# Label spelling fixed ("kernal" -> "kernel").
gp.plot_label(ax1, "Cubic kernel")
fig.suptitle("Gradient vs no gradient")
plt.show()

In [None]:
# Squared exponential kernel: overlay the posterior fitted on function values
# only with the posterior that additionally uses derivative values.
kernel = SquaredExponentialKernel()

GP_posterior = GaussianProcess(
    kernel=kernel, x_known=X_train, f_known=y_train, f_noise=1e-14
)
GP_posterior_gradients = GaussianProcess(
    kernel=kernel,
    x_known=X_train,
    f_known=y_train,
    g_known=g_train,
    f_noise=1e-14,
    g_noise=1e-14,
)

fig, ax1 = plt.subplots()

gp.plot_objective(ax1, X, y, X_train, y_train)

# Evaluate and draw each model in turn; after the loop posterior_mean and
# posterior_variance hold the gradient-informed result.
for curve_label, model in (
    ("No gradient", GP_posterior),
    ("Gradient", GP_posterior_gradients),
):
    posterior_mean, posterior_variance = model(X)
    gp.plot_gp(ax1, X, posterior_mean, posterior_variance, label=curve_label)

gp.plot_label(ax1, "Squared exponential kernel")
fig.suptitle("Gradient vs no gradient")
plt.show()

## Error at points

In [None]:
def _report_point_errors(title, model, include_gradient=False):
    """Print the GP's absolute error at every training point, then the totals.

    Keeping the loops inside a function stops the loop variables from
    rebinding notebook-level names (notably the derivative array ``g``).

    Args:
        title: Heading line printed before the per-point errors.
        model: Fitted GP; called as ``model(x)[0]`` for the value and, when
            ``include_gradient`` is set, ``model.derivative(x)[0]`` for the
            derivative. Must provide ``f_error()`` (and ``g_error()`` when
            ``include_gradient`` is set).
        include_gradient: Also report derivative errors against ``g_train``.
    """
    print(title)
    if include_gradient:
        for x_i, f_i, g_i in zip(X_train, y_train, g_train):
            print(
                f"x={x_i}: abs(f(x)-gp(x))={np.abs(f_i-model(x_i)[0])}, abs(g(x)-g_gp(x))={np.abs(g_i-model.derivative(x_i)[0])}"
            )
    else:
        for x_i, f_i in zip(X_train, y_train):
            print(f"x={x_i}: abs(f(x)-gp(x))={np.abs(f_i-model(x_i)[0])}")
    print(f"With a total f-error of {model.f_error()}")
    if include_gradient:
        print(f"With a total g-error of {model.g_error()}")


# Squared exponential kernel, function values only.
GP_posterior = GaussianProcess(
    SquaredExponentialKernel(), X_train, y_train, f_noise=1e-14
)
_report_point_errors(
    "For gp with squared exponential kernel and no gradient info we have:",
    GP_posterior,
)

# Cubic kernel, function values only.
GP_posterior = GaussianProcess(CubicKernel(), X_train, y_train)
print()
_report_point_errors(
    "For gp with cubic kernel and no gradient info we have:", GP_posterior
)

# Squared exponential kernel, function and derivative values.
GP_posterior = GaussianProcess(
    SquaredExponentialKernel(), X_train, y_train, g_train, f_noise=1e-14, g_noise=1e-14
)
print()
_report_point_errors(
    "For gp with squared exponential kernel and gradient info we have:",
    GP_posterior,
    include_gradient=True,
)

# Cubic kernel, function and derivative values.
GP_posterior = GaussianProcess(CubicKernel(), X_train, y_train, g_train)
print()
_report_point_errors(
    "For gp with cubic kernel and gradient info we have:",
    GP_posterior,
    include_gradient=True,
)

## Sklearn

In [None]:
# Column-vector (n, 1) copies of the grid and training data, in the 2-D layout
# handed to scikit-learn below.
X_sklearn, y_sklearn, X_train_sklearn, y_train_sklearn = (
    np.reshape(values, (-1, 1)) for values in (X, y, X_train, y_train)
)

In [None]:
# Reference implementation via scikit-learn: the same prior/posterior figure,
# built with GaussianProcessRegressor.

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

# Length scale; also read by the comparison cell further down.
l = 1.0

# Constant(1.0) * RBF(l) with both hyperparameters marked "fixed".
amplitude = ConstantKernel(1.0, constant_value_bounds="fixed")
rbf = RBF(length_scale=l, length_scale_bounds="fixed")
kernel = amplitude * rbf
gaussian_process = GaussianProcessRegressor(kernel=kernel)

x_flat = X_sklearn.ravel()

# predict() before fit(): scikit-learn then predicts from the GP prior.
mean_prediction, std_prediction = gaussian_process.predict(X_sklearn, return_std=True)

fig, axes = plt.subplots(ncols=2, sharey=True)
ax1, ax2 = axes

gp.plot_gp(ax1, x_flat, mean_prediction, std_prediction)
# Seeds 0, 1, 2 make the three sampled functions reproducible.
prior_draws = [gaussian_process.sample_y(X_sklearn, random_state=i) for i in range(3)]
gp.plot_samples(ax1, X_sklearn, prior_draws)
gp.plot_label(ax1, "Prior")

gaussian_process.fit(X_train_sklearn, y_train_sklearn)

# mean_prediction / std_prediction now hold the posterior; the comparison cell
# reads them.
mean_prediction, std_prediction = gaussian_process.predict(X_sklearn, return_std=True)

gp.plot_objective(ax2, X_sklearn, y_sklearn, X_train_sklearn, y_train_sklearn)
gp.plot_gp(ax2, x_flat, mean_prediction, std_prediction)
posterior_draws = [
    gaussian_process.sample_y(X_sklearn, random_state=i) for i in range(3)
]
gp.plot_samples(ax2, X_sklearn, posterior_draws)
gp.plot_label(ax2, "Posterior")

fig.suptitle("Sklearn")
fig.set_figwidth(15)
plt.show()

## Compare custom vs sklearn

In [None]:
# Custom GP set up to mirror the scikit-learn model: same length scale `l` and
# the same observation noise.
GP_custom = GaussianProcess(
    SquaredExponentialKernel(l=l), X_train, y_train, f_noise=1e-10
)  # sklearn adds 1e-10 noise

mean, variance = GP_custom(X)
std = GP_custom.std_deviation(X, variance)

fig, ax1 = plt.subplots(1, 1)
gp.plot_objective(ax1, X, y, X_train, y_train)
# NOTE(review): plot_gp receives a standard deviation here, while earlier cells
# of this notebook pass the variance returned by GaussianProcess — confirm
# which one plot_gp expects.
gp.plot_gp(ax1, X, mean, std, label="custom")
gp.plot_gp(ax1, X_sklearn.ravel(), mean_prediction, std_prediction, label="sklearn")
gp.plot_label(ax1, "Posterior")

fig.suptitle("Sklearn vs custom")
fig.set_figwidth(15)
plt.show()

print(
    f"Difference in log marginal likelihood is {np.abs(GP_custom.log_marginal_likelihood() - gaussian_process.log_marginal_likelihood())}"
)

# The sklearn model was fitted with a column-vector target, so depending on the
# scikit-learn version predict() may return an (n, 1) mean. Flatten both sides
# so the subtraction stays elementwise instead of broadcasting to (n, n).
mean_gap = np.abs(np.ravel(mean) - np.ravel(mean_prediction)).max()
std_gap = np.abs(np.ravel(std) - np.ravel(std_prediction)).max()
print(f"Max difference in mean is {mean_gap}")
print(f"max difference in std is {std_gap}")