In [3]:
from tensorflow.keras import Model, Sequential, layers, optimizers, metrics, losses
import tensorflow as tf
import tensorflow_probability as tfp
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [4]:
seed = 213
np.random.seed(seed)
tf.random.set_seed(seed)
dtype = tf.float32

In [5]:
# load data and labels
boston = load_boston()
data = boston.data
targets = boston.target

# divide into train and test splits
X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=0.2)

# Scale our inputs
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Build and train neural network which we will use as basis function

In [6]:
# construct our model
model = Sequential()
model.add(layers.Dense(20, input_dim=13, activation='relu', name='layer_1'))
model.add(layers.Dense(8, activation='relu', name='layer_2'))
model.add(layers.Dense(1, activation='relu', name='layer_3'))

In [7]:
# compile the model
model.compile(optimizer=optimizers.Adam(),
              loss=losses.MeanSquaredError(),
              metrics=[metrics.RootMeanSquaredError()],)

In [8]:
# train the model for 200 epochs
num_epochs = 200
model.fit(X_train, y_train, epochs=num_epochs)
mse, rmse = model.evaluate(X_test, y_test)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [11]:
# access the pen-ultimate laters
basis_func = Model(inputs=model.input,
                   outputs=model.get_layer('layer_2').output)

In [13]:
# obtain output of the second hidden layer by calling predict method
layer_2_output = basis_func.predict(X_test)


In [14]:
layer_2_output = basis_func.predict(X_test)


In [15]:
layer_2_output

array([[6.67803884e-01, 4.85449028e+00, 0.00000000e+00, 6.59669971e+00,
        0.00000000e+00, 0.00000000e+00, 4.74888086e+00, 7.09521484e+00],
       [4.93824100e+00, 2.08727241e+00, 0.00000000e+00, 1.10894699e+01,
        3.50147414e+00, 0.00000000e+00, 5.50131464e+00, 8.15977764e+00],
       [0.00000000e+00, 1.34282188e+01, 0.00000000e+00, 1.14597454e+01,
        0.00000000e+00, 0.00000000e+00, 1.52780943e+01, 1.26173525e+01],
       [0.00000000e+00, 1.08129768e+01, 0.00000000e+00, 1.09407024e+01,
        0.00000000e+00, 0.00000000e+00, 1.20765619e+01, 1.09764414e+01],
       [2.44981170e-01, 5.23128128e+00, 0.00000000e+00, 1.14304829e+01,
        6.48803830e-01, 0.00000000e+00, 9.23081589e+00, 9.65928936e+00],
       [5.88344240e+00, 3.30265355e+00, 0.00000000e+00, 1.21880646e+01,
        2.81414151e+00, 0.00000000e+00, 5.90702343e+00, 9.70213604e+00],
       [0.00000000e+00, 7.61287022e+00, 0.00000000e+00, 1.03393860e+01,
        0.00000000e+00, 0.00000000e+00, 9.73524284e+00, 9.

Build Bayesian linear regressor

In [32]:
class BayesianLastLayer():

  def __init__(self,
                model,
                basis_layer,
                n_samples=1e4,
                n_burnin=5e3,
                step_size=1e-4,
                n_leapfrog=10,
                adaptive=False):
      # Setting up our model
      self.model = model
      self.basis_layer = basis_layer
      self.initialize_basis_function()
      # HMC Settings
      # number of hmc samples
      self.n_samples = int(n_samples)
      # number of burn-in steps
      self.n_burnin = int(n_burnin)
      # HMC step size
      self.step_size = step_size
      # HMC leapfrog steps
      self.n_leapfrog = n_leapfrog
      # whether to be adaptive or not
      self.adaptive = adaptive

  # define helper function for using the neural network as a basis function
  def initialize_basis_function(self):
      self.basis_func = Model(inputs=self.model.input,
                              outputs=self.model.get_layer(self.basis_layer).output)

  # define helper function to easily get predictions from helper function
  def get_basis(self, X):
      return self.basis_func.predict(X)

  # define fuction for fitting the Bayesian linear regressor on data
  def fit(self, X, y):
      X = tf.convert_to_tensor(self.get_basis(X), dtype=dtype)
      y = tf.convert_to_tensor(y, dtype=dtype)
      y = tf.reshape(y, (-1, 1))
      D = X.shape[1]

      # Define our joint distribution
      distribution = tfp.distributions.JointDistributionNamedAutoBatched(
          dict(
              sigma=tfp.distributions.HalfNormal(scale=tf.ones([1])),
              alpha=tfp.distributions.Normal(
                  loc=tf.zeros([1]),
                  scale=tf.ones([1]),
              ),
              beta=tfp.distributions.Normal(
                  loc=tf.zeros([D,1]),
                  scale=tf.ones([D,1]),
              ),
              y=lambda beta, alpha, sigma:
                  tfp.distributions.Normal(
                      loc=tf.linalg.matmul(X, beta) + alpha,
                      scale=sigma
                  )
              )
          )

      # Define the log probability function
      def target_log_prob_fn(beta, alpha, sigma):
          return distribution.log_prob(beta=beta, alpha=alpha, sigma=sigma, y=y)

      # Define the HMC kernel we'll be using for sampling
      hmc_kernel  = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=target_log_prob_fn,
        step_size=self.step_size,
        num_leapfrog_steps=self.n_leapfrog
      )

      # We can use adaptive HMC to automatically adjust the kernel step size
      if self.adaptive:
          adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
            inner_kernel = hmc_kernel,
            num_adaptation_steps=int(self.n_burnin * 0.8)
          )

      # If we define a function, we can extend this to multiple chains.
      @tf.function
      def run_chain():
          states, kernel_results = tfp.mcmc.sample_chain(
                num_results=self.n_samples,
                num_burnin_steps=self.n_burnin,
                current_state=[
                    tf.zeros((X.shape[1],1), name='init_model_coeffs'),
                    tf.zeros((1), name='init_bias'),
                    tf.ones((1), name='init_noise'),
                ],
                kernel=hmc_kernel
              )
          return states, kernel_results

      print(f'Running HMC with {self.n_samples} samples.')
      states, kernel_results = run_chain()

      print('Completed HMC sampling.')
      coeffs, bias, noise_std = states
      accepted_samples = kernel_results.is_accepted[self.n_burnin:]
      acceptance_rate = 100*np.mean(accepted_samples)
      # Print the acceptance rate - if this is low, we need to check our
      # HMC parameters
      print('Acceptance rate: %0.1f%%' % (acceptance_rate))

      # Obtain the post-burnin samples
      self.model_coeffs = coeffs[self.n_burnin:,:,0]
      self.bias = bias[self.n_burnin:]
      self.noise_std = noise_std[self.n_burnin:]

  def get_pred_dist(self, X):
      predictions = (tf.matmul(X, tf.transpose(self.model_coeffs)) +
                    self.bias[:,0])
      noise = (self.noise_std[:,0] *
              tf.random.normal([self.noise_std.shape[0]]))
      return predictions + noise

  def predict(self, X):
      X = tf.convert_to_tensor(self.get_basis(X), dtype=dtype)
      pred_dist = np.zeros((X.shape[0], self.model_coeffs.shape[0]))
      X = tf.reshape(X, (-1, 1, X.shape[1]))
      for i in range(X.shape[0]):
        pred_dist[i,:] = self.get_pred_dist(X[i,:])

      y_pred = np.mean(pred_dist, axis=1)
      y_std = np.std(pred_dist, axis=1)
      return y_pred, y_std

In [33]:
# instatiate last layer model
bll = BayesianLastLayer(model, 'layer_2')
# fit model on training data
bll.fit(X_train, y_train)
# perform inference
y_pred, y_std = bll.predict(X_test)

Running HMC with 10000 samples.




Completed HMC sampling.
Acceptance rate: 100.0%
