To open on Google Colab\
https://colab.research.google.com/github/RodrigoAVargasHdz/CHEM-4PB3/blob/main/Course_Notes/Week5/gp_kernels.ipynb

# Extrapolation with GP

Example based on [CO2 data](https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py)

In [13]:
import numpy as np
import pandas as pd

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, Matern, ExpSineSquared
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

import matplotlib
import matplotlib.pyplot as plt

In [22]:
# Download the CO2 dataset from OpenML (data_id=41187) as a pandas DataFrame.
co2 = fetch_openml(data_id=41187, as_frame=True, parser="pandas")
co2_data = co2.frame
# Merge the separate year/month/day columns into one timestamp and index by it.
co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
co2_data = co2_data[["date", "co2"]].set_index("date")

# GP input: time expressed as a decimal year, shape (n_samples, 1).
# The day of the year is part of the fraction on purpose: `year + month / 12`
# drops the day, so every measurement taken within the same month would land on
# the same x value (duplicated inputs with different targets).
dates = co2_data.index
days_in_year = np.where(dates.is_leap_year, 366, 365)
X = (dates.year.to_numpy()
     + (dates.dayofyear.to_numpy() - 1) / days_in_year).reshape(-1, 1)

# GP target: CO2 values shifted to zero mean.
y = co2_data["co2"].to_numpy()
y = y - np.mean(y)

In [25]:
# Random 50/50 split of the samples. random_state is pinned so that the split,
# and therefore the fitted GP and the figures below, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
print(y_train.shape, y_test.shape)

(1112,) (1113,)


Train a GP with different kernels

In [27]:
# Kernel = (amplitude * periodic term) + (amplitude * RBF term).
kernel = ConstantKernel(1., (1E-2, 1E4)) * ExpSineSquared(
    length_scale=1.0,
    periodicity=10.0,
    length_scale_bounds=(1e-2, 100.0),
    periodicity_bounds=(1.0, 100.0),
)
kernel += ConstantKernel(1., (1E-2, 1E4)) * RBF()
# Other kernel pieces to experiment with:
# WhiteKernel(noise_level=1., noise_level_bounds=(1e-10, 1e-3))
# ExpSineSquared(length_scale=1.0, periodicity=10.0, length_scale_bounds=(1e-3, 20.0), periodicity_bounds=(1.0, 20.0))

# Fit on the training half; the hyperparameter optimizer is restarted 10 times
# and random_state=0 fixes the restart initialisations.
gpr = GaussianProcessRegressor(
    kernel=kernel, alpha=1E-4, random_state=0, n_restarts_optimizer=10)
gpr.fit(X_train, y_train)

# GP mean/std at every observed input (computed once; only used by the
# optional overlay that is commented out in the plotting section).
y_mean, y_std = gpr.predict(X, return_std=True)

# Evenly spaced prediction grid from 2000 to 2020.
# NOTE: this rebinds X_test, replacing the inputs returned by train_test_split.
X_test = np.linspace(start=2000, stop=2020, num=1_000).reshape(-1, 1)

# Show where the observed inputs end and where the prediction grid starts.
print(X[-5:])
print(X_test[:3])
mean_y_pred, std_y_pred = gpr.predict(X_test, return_std=True)

# Draw everything through the Axes object returned by plt.subplots.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(X.ravel(), y, s=5, label='data')
# ax.plot(X.ravel(), y_mean, c='k', label=r'$\mu_{GP}$')
ax.plot(X_test.ravel(), mean_y_pred, color="tab:blue",
        alpha=0.4, label="Gaussian process")
# Optional +/- one standard deviation band around the GP mean:
# ax.fill_between(
#     X_test.ravel(),
#     mean_y_pred - std_y_pred,
#     mean_y_pred + std_y_pred,
#     color="tab:blue",
#     alpha=0.2,
# )
ax.set_xlabel("year")
ax.set_ylabel("Monthly average of CO$_2$ concentration (ppm)")
# ax.set_xlim(1999, 2020)
ax.legend()

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
