
Commit

Added GPR tutorial.
Weizhe-Chen committed Aug 30, 2023
1 parent 2a230a8 commit 43ef102
Showing 6 changed files with 354 additions and 48 deletions.
1 change: 0 additions & 1 deletion docs/get_started.md

This file was deleted.

24 changes: 24 additions & 0 deletions docs/tutorials/gaussian_process_regression.md
@@ -0,0 +1,24 @@
# Gaussian Process Regression

## Prediction

Given $N$ training inputs $\mathbf{X}\in\mathbb{R}^{N\times{D}}$ and training targets $\mathbf{y}\in\mathbb{R}^{N}$, the posterior predictive distribution of the latent function value $f_{\star}$ at a test input $\mathbf{x}^{\star}$ has a closed-form expression:

$$
\begin{aligned}
p(f_{\star}\rvert\mathbf{y})&=\mathcal{N}(f_{\star}\rvert\mu,\nu),\\
\mu&=\mathbf{k}_{\star}^{\top}\mathbf{K}_{y}^{-1}\mathbf{y},\\
\nu&=k_{\star\star}-\mathbf{k}_{\star}^{\top}\mathbf{K}_{y}^{-1}\mathbf{k}_{\star},
\end{aligned}
$$

where $\mathbf{k}_{\star}$ is the vector of kernel values between the training inputs $\mathbf{X}$ and the test input $\mathbf{x}^{\star}$, $\mathbf{K}_{y}$ is shorthand for $\mathbf{K}_{\mathbf{x}}+\sigma^{2}\mathbf{I}$, $\mathbf{K}_{\mathbf{x}}$ is the covariance matrix given by the kernel function evaluated at every pair of training inputs, $\sigma^{2}$ is the observation-noise variance, and $k_{\star\star}\triangleq\mathtt{k}(\mathbf{x}^{\star},\mathbf{x}^{\star})$.
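
The following is a minimal NumPy sketch of these equations. The RBF kernel, the `gpr_predict` helper, and the default noise variance are illustrative assumptions made for the sketch, not PyPolo's API.

```python
import numpy as np

def rbf_kernel(a, b, lengthscale=1.0):
    # Squared-exponential kernel evaluated at every pair of rows in a and b.
    sq_dists = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * sq_dists / lengthscale ** 2)

def gpr_predict(x_train, y_train, x_test, noise_var=0.1):
    K_x = rbf_kernel(x_train, x_train)              # K_x
    K_y = K_x + noise_var * np.eye(len(x_train))    # K_y = K_x + sigma^2 I
    k_star = rbf_kernel(x_train, x_test)            # k_* for each test input
    alpha = np.linalg.solve(K_y, y_train)           # K_y^{-1} y
    mu = k_star.T @ alpha                           # predictive mean
    v = np.linalg.solve(K_y, k_star)                # K_y^{-1} k_*
    k_ss = rbf_kernel(x_test, x_test).diagonal()    # k_** for each test input
    nu = k_ss - np.sum(k_star * v, axis=0)          # predictive variance
    return mu, nu
```

For example, `gpr_predict(X, y, X_new)` returns the predictive mean and variance at each row of `X_new`.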

## Learning

Optimizing the hyperparameters, a process known as model selection, is common practice for obtaining better predictions.
Model selection is typically implemented by maximizing the model evidence, also known as the log marginal likelihood:

$$
\ln{p(\mathbf{y}\rvert\boldsymbol{\psi})}=\frac{1}{2}(\underbrace{-\mathbf{y}^{\top}\mathbf{K}_{y}^{-1}\mathbf{y}}_{\text{quadratic term}}-\underbrace{\ln{\mathrm{det}(\mathbf{K}_{y})}}_{\text{logdet term}}-\underbrace{N\ln(2\pi)}_{\text{constant term}}),
$$

where $\boldsymbol{\psi}$ collects the kernel hyperparameters and the noise variance $\sigma^{2}$.
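
Below is a minimal, self-contained PyTorch sketch of this objective together with a gradient-based optimization loop, in the spirit of `GPRModel.learn`. The RBF kernel, the toy data, and the variable names are assumptions made for the sketch rather than PyPolo's API.

```python
import torch

def log_marginal_likelihood(x, y, log_lengthscale, log_noise_var, jitter=1e-6):
    # RBF kernel matrix K_x with a learnable length-scale.
    lengthscale = log_lengthscale.exp()
    K_x = torch.exp(-0.5 * torch.cdist(x, x) ** 2 / lengthscale ** 2)
    K_y = K_x + (log_noise_var.exp() + jitter) * torch.eye(len(x))
    L = torch.linalg.cholesky(K_y)
    alpha = torch.cholesky_solve(y.unsqueeze(-1), L)          # K_y^{-1} y
    quad = (y.unsqueeze(-1) * alpha).sum()                    # y^T K_y^{-1} y
    logdet = 2.0 * torch.diagonal(L).log().sum()              # ln det(K_y)
    const = len(x) * torch.log(torch.tensor(2.0 * torch.pi))  # N ln(2 pi)
    return 0.5 * (-quad - logdet - const)

# Toy one-dimensional data.
x_train = torch.linspace(0.0, 1.0, 20).unsqueeze(-1)
y_train = torch.sin(2.0 * torch.pi * x_train).squeeze(-1) + 0.1 * torch.randn(20)

# Maximize the evidence by minimizing its negative with Adam.
log_lengthscale = torch.zeros((), requires_grad=True)
log_noise_var = torch.full((), -2.3, requires_grad=True)  # ln(0.1)
optimizer = torch.optim.Adam([log_lengthscale, log_noise_var], lr=0.01)
for _ in range(100):
    optimizer.zero_grad()
    loss = -log_marginal_likelihood(x_train, y_train, log_lengthscale, log_noise_var)
    loss.backward()
    optimizer.step()
```

After training, the optimized length-scale and noise variance can be plugged into the predictive equations above.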
3 changes: 2 additions & 1 deletion mkdocs.yml
@@ -7,7 +7,8 @@ copyright: Copyright © 2022 - 2023 PyPolo Developers
nav:
- Introduction: index.md
- Installation: installation.md
- Get Started: get_started.md
- Tutorials:
- Gaussian Process Regression: tutorials/gaussian_process_regression.md
- Resources:
- Books: resources/books.md
- Videos: resources/videos.md
4 changes: 1 addition & 3 deletions pypolo/models/base_model.py
@@ -2,7 +2,6 @@
from typing import Tuple, Union

import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter

from ..utils import torch_utils

@@ -27,8 +26,7 @@ def learn(self,
x_new: np.ndarray,
y_new: np.ndarray,
num_iter: int,
verbose: bool = True,
writer: Union[SummaryWriter, None] = None) -> None:
verbose: bool = True) -> None:
r"""Optimizes the model parameters.
Args:
56 changes: 13 additions & 43 deletions pypolo/models/gpr_model.py
@@ -4,7 +4,6 @@
import torch
from torch import nn
from torch.nn import Parameter
from torch.utils.tensorboard.writer import SummaryWriter
from tqdm import tqdm

from ..utils import torch_utils
@@ -14,23 +13,19 @@

class GPRModel(BaseModel, nn.Module):

def __init__(
self,
device_name,
kernel: BaseKernel,
noise: float,
lr_hyper: float = 0.01,
lr_nn: float = 0.001,
jitter: float = 1e-6,
) -> None:
def __init__(self,
device_name,
kernel: BaseKernel,
noise: float,
lr_hyper: float = 0.01,
jitter: float = 1e-6) -> None:
r"""Gaussian Process Regression.
Args:
device_name (str): The name of the device to run the model.
kernel (BaseKernel): The kernel function.
noise (float): The noise variance of the Gaussian likelihood.
lr_hyper (float, optional): Learning rate of hyper-parameters.
lr_nn (float, optional): Learning rate of network parameters.
jitter (float, optional): The jitter to add to the diagonal of the
covariance matrix. Defaults to 1e-6.
@@ -52,15 +47,14 @@ def __init__(
dtype=self.dtype,
device=self.device,
)))
self._init_optimizers(lr_hyper, lr_nn)
self._init_optimizers(lr_hyper)
self.jitter = jitter

def learn(self,
x_new: np.ndarray,
y_new: np.ndarray,
num_iter: int,
verbose: bool = True,
writer: Union[SummaryWriter, None] = None) -> None:
verbose: bool = True) -> None:
r"""Optimizes the model parameters.
Args:
@@ -78,20 +72,11 @@ def learn(self,
progress_bar = tqdm(range(num_iter), disable=not verbose)
for i in progress_bar:
self.opt_hyper.zero_grad()
if self.opt_nn is not None:
self.opt_nn.zero_grad()
loss = self._compute_loss()
loss.backward()
self.opt_hyper.step()
if self.opt_nn is not None:
self.opt_nn.step()
progress_bar.set_description(
f"Iter: {i:02d} loss: {loss.item(): .2f}")
if writer is not None:
writer.add_scalar('loss', loss.item(), i)
for name, param in self.named_parameters():
if param.grad is not None:
writer.add_histogram(name, param.grad, i)
self.eval()

def predict(self, x_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
@@ -293,31 +278,16 @@ def _compute_common(self):
iK_y = torch.cholesky_solve(self.y_train, L, upper=False)
return L, iK_y

def _init_optimizers(self, lr_hyper: float, lr_nn: float) -> None:
"""Initialize optimizers for hyper-parameters and, optinally,
neural network parameters in non-stationary kernels.
def _init_optimizers(self, lr_hyper: float) -> None:
"""Initialize optimizers for hyper-parameters.
Args:
lr_hyper (float, optional): Learning rate of hyper-parameters.
Defaults to 0.01.
lr_nn (float, optional): Learning rate of neural network parameters
in non-stationary kernels. Defaults to 0.001.
!!! note "Neural Network Parameters"
Neural network parameters are found by searching for the string
"nn" in the parameter name.
"""
self.lr_hyper, self.lr_nn = lr_hyper, lr_nn
hyper_params, nn_params = [], []
self.lr_hyper = lr_hyper
hyper_params = []
for name, param in self.named_parameters():
if "nn" in name:
nn_params.append(param)
else:
hyper_params.append(param)
hyper_params.append(param)
self.opt_hyper = torch.optim.Adam(hyper_params, lr=lr_hyper)
if nn_params:
self.opt_nn = torch.optim.Adam(nn_params, lr=lr_nn)
else:
self.opt_nn = None

