## Problem 7.1: Writing your own MCMC sampler

In [1]:
import itertools

import numpy as np
import pandas as pd
import scipy.stats as st
import random

import numba

import bebi103

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()

First we write a function that takes or rejects a Metropolis-Hastings step.

In [3]:
def mh_step(x, logpost, logpost_current, sigma, args=()):
    """
    Parameters
    ----------
    x : ndarray, shape (n_variables,)
        The present location of the walker in parameter space.
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    logpost_current : float
        The current value of the log posterior.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.

    Returns
    -------
    output : ndarray, shape (n_variables,)
        The position of the walker after the Metropolis-Hastings
        step. If no step is taken, returns the inputted `x`.
    """
    # Get next step
    x_next = np.random.multivariate_normal(x, sigma)

    # Calculate r
    theta_p = np.exp(logpost(x_next, *args))
    theta_i = np.exp(logpost_current)
    r = theta_p / theta_i
#     print(r)
    
    # Choose to accept or reject step    
    p = np.random.uniform(0, 1)
    if r >= 1:
        return x_next, 1
    elif p <= r:
        return x_next, 1
    else:
        return x, 0

### can r be greater than 1?

Now we write a function that uses the previous step function to take samples. We'd like to add some automatic tuning of sigma so that the acceptance rate is approximately 0.4. I've checked to see that the acceptance rate during the burn steps is approximately equal to the acceptance rate during the sampling steps (it's sometimes higher, sometimes lower). As a result, we'll be checking the acceptance rate after the burn steps, and if it is not in our desired range, we will recalculate sigma and restart the walking process. I'm thinking of checking right after the burn steps because we don't want to also take all our samples and use more computational power if the acceptance rate is not in the range we want.

In [70]:
def mh_sample(logpost, x0, sigma, args=(), n_burn=1000, n_steps=1000,
              variable_names=None):
    """
    Parameters
    ----------
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    x0 : ndarray, shape (n_variables,)
        The starting location of a walker in parameter space.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.
    n_burn : int, default 1000
        Number of burn-in steps.
    n_steps : int, default 1000
        Number of steps to take after burn-in.
    variable_names : list, length n_variables
        List of names of variables. If None, then variable names
        are sequential integers.
    
    Returns
    -------
    output : DataFrame
        The first `n_variables` columns contain the samples.
        Additionally, column 'lnprob' has the log posterior value
        at each sample.
    """
    x = x0
    mu, inv_cov = args
    n_accept = 0
    
    # Steps that will be burned
    for i in range(n_burn):
        logpost_current = logpost(x, *args)
        x, accept = mh_step(x, logpost, logpost_current, sigma, args=(mu, inv_cov))
        n_accept += accept

    accept_rate = n_accept/n_burn
    if accept_rate < 0.2 or accept_rate > 0.5:
        print('Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is', accept_rate)
        return pd.DataFrame(), accept_rate
        
    # Set up empty arrays and variables to store sample info
    n_variables = []
    lnprob = []
    n_accept = 0
    
    # Draw samples
    for i in range(n_steps):
        n_variables.append(x)
        lnprob.append(logpost_current)
        
        logpost_current = logpost(x, *args)
        x, accept = mh_step(x, logpost, logpost_current, sigma, args=(mu, inv_cov))
        n_accept += accept
        
    df = pd.DataFrame(data=n_variables, columns=['x', 'y'])
    df['lnprob'] = lnprob
    
    accept_rate = n_accept / n_steps
    print('Current acceptance rate is', accept_rate)
    
    return df, n_accept/n_steps

These are the given means, covariances, and log posterior functions for the problem.

In [42]:
mu = np.array([10.0, 20])
cov = np.array([[4, -2],[-2, 6]])
inv_cov = np.linalg.inv(cov)

@numba.jit(nopython=True)
def log_test_distribution(x, mu, inv_cov):
    """
    Unnormalized log posterior of a multivariate Gaussian.
    """
    return -np.dot((x-mu), np.dot(inv_cov, (x-mu))) / 2

This is a function to tune sigma automatically based on the acceptance rate.

In [54]:
def tune_sigma(accept_rate, sigma):
    if accept_rate < 0.001:
        return sigma * 0.1
    elif accept_rate < 0.05:
        return sigma * 0.5
    elif accept_rate < 0.2:
        return sigma * 0.9
    elif accept_rate > 0.5:
        return sigma * 1.1
    elif accept_rate > 0.75:
        return sigma * 2
    elif accept_rate > 0.95:
        return sigma * 10
    else:
        return sigma

Now let's test our sampler with an arbitrary x0 and sigma.

In [71]:
# Choose arbitrary x0 and sigma
x0 = np.array([10, 5])
sigma = np.array([[100, -3],[-3, 100]])

# Take samples
df_samples, accept_rate = mh_sample(log_test_distribution, x0, sigma, args=(mu, inv_cov), n_burn=1000, n_steps=5000, variable_names=None)
while accept_rate < 0.2 or accept_rate > 0.5:
    sigma = tune_sigma(accept_rate, sigma)
    df_samples, accept_rate = mh_sample(log_test_distribution, x0, sigma, args=(mu, inv_cov), n_burn=1000, n_steps=5000, variable_names=None)
    
# Take a look
df_samples.head()

Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.074
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.1
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.11
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.102
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.122
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.108
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.143
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.145
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.172
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.174
Current acceptance rate is 0.1684
Acceptance rate is not in desired range from 0.2 to 0.5. Current acce

Unnamed: 0,x,y,lnprob
0,10.284638,17.422893,-0.602947
1,10.284638,17.422893,-0.602947
2,10.284638,17.422893,-0.602947
3,10.284638,17.422893,-0.602947
4,10.284638,17.422893,-0.602947


We see that actually if we take the burn step acceptance rate to check if the acceptance rate is in our desired range, it's not actually always the same as our acceptance rate during the sampling process. They are often close, so this may pose a problem. Maybe we should just take the acceptance rate of the actual samples, or take the acceptance rate during the burn steps as the rate we judge overall instead of calculating one during the burn steps and one during the sampling steps.

Now let's plot to check that our samples are actually drawn from the distribution we expect.

In [31]:
# Plot
p = bokeh.plotting.figure(width=400, height=400,
                          x_axis_label='x', 
                          y_axis_label='y')

# Plot samples
p.circle(df_samples['x'], df_samples['y'], alpha=0.025)

# Overlay multivariate gaussian
x, y = np.random.multivariate_normal(mu, cov, 5000).T
p.circle(x, y, alpha=0.025, color='orange')
bokeh.io.show(p)

Looks like our samples in blue match the multivariate gaussian distribution in orange. We also want to check that the covariance of our samples is similar to the inputted covariance.

In [32]:
np.cov([df_samples['x'], df_samples['y']])

array([[ 4.00109112, -2.04779239],
       [-2.04779239,  6.41828623]])

Yes it's similar :)

Now let's plot a corner plot to double check that the sample come from the distribution we expect.

In [34]:
# For corner plot
df_samples['divergent__'] = 0

# Plot
bokeh.io.show(bebi103.viz.corner(df_samples, pars=['x', 'y']))

Let's try another example where the acceptance rate starts out too high.

In [72]:
# Choose arbitrary x0 and sigma
x0 = np.array([5, 5])
sigma = np.array([[4, -2],[-2, 2]])

# Take samples
df_samples, accept_rate = mh_sample(log_test_distribution, x0, sigma, args=(mu, inv_cov), n_burn=1000, n_steps=5000, variable_names=None)
while accept_rate < 0.2 or accept_rate > 0.5:
    sigma = tune_sigma(accept_rate, sigma)
    df_samples, accept_rate = mh_sample(log_test_distribution, x0, sigma, args=(mu, inv_cov), n_burn=1000, n_steps=5000, variable_names=None)
    
# Take a look
df_samples.head()

Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.654
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.652
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.652
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.622
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.614
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.584
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.566
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.556
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.525
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.511
Acceptance rate is not in desired range from 0.2 to 0.5. Current acceptance rate is 0.519
Current ac

Unnamed: 0,x,y,lnprob
0,7.787633,24.444352,-1.24847
1,12.716907,20.859085,-1.726158
2,8.779579,23.494464,-1.414445
3,8.779579,23.494464,-1.01807
4,8.779579,23.494464,-1.01807


Again let's plot the samples.

In [73]:
# Plot
p = bokeh.plotting.figure(width=400, height=400,
                          x_axis_label='x', 
                          y_axis_label='y')

# Plot samples
p.circle(df_samples['x'], df_samples['y'], alpha=0.025)

# Overlay multivariate gaussian
x, y = np.random.multivariate_normal(mu, cov, 5000).T
p.circle(x, y, alpha=0.025, color='orange')
bokeh.io.show(p)

And check the covariance.

In [74]:
np.cov([df_samples['x'], df_samples['y']])

array([[ 3.68986738, -1.85621187],
       [-1.85621187,  6.03159754]])

And also check the corner plot.

In [75]:
# For corner plot
df_samples['divergent__'] = 0

# Plot
bokeh.io.show(bebi103.viz.corner(df_samples, pars=['x', 'y']))

I think it looks good. Just need to think about how we decide the tuning?