## Problem 7.1: Writing your own MCMC sampler

In [269]:
import itertools

import numpy as np
import pandas as pd
import scipy.stats as st
import random

import numba

import bebi103

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()

In [219]:
mu = np.array([10.0, 20])
cov = np.array([[4, -2],[-2, 6]])
inv_cov = np.linalg.inv(cov)

@numba.jit(nopython=True)
def log_test_distribution(x, mu, inv_cov):
    """
    Unnormalized log posterior of a multivariate Gaussian.
    """
    return -np.dot((x-mu), np.dot(inv_cov, (x-mu))) / 2

In [249]:
def mh_step(x, logpost, logpost_current, sigma, args=()):
    """
    Parameters
    ----------
    x : ndarray, shape (n_variables,)
        The present location of the walker in parameter space.
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    logpost_current : float
        The current value of the log posterior.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.

    Returns
    -------
    output : ndarray, shape (n_variables,)
        The position of the walker after the Metropolis-Hastings
        step. If no step is taken, returns the inputted `x`.
    """
    # Get next step
    x_next = np.random.multivariate_normal(x, sigma)

    # Calculate r
    theta_p = np.exp(logpost(x_next, *args))
    theta_i = np.exp(logpost_current)
    r = theta_p / theta_i
#     print(r)
    
    # Choose to accept or reject step    
    p = np.random.uniform(0, 1)
    if r >= 1:
        return x_next, 1
    elif p <= r:
        return x_next, 1
    else:
        return x, 0

In [264]:
def mh_sample(logpost, x0, sigma, args=(), n_burn=1000, n_steps=1000,
              variable_names=None):
    """
    Parameters
    ----------
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    x0 : ndarray, shape (n_variables,)
        The starting location of a walker in parameter space.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.
    n_burn : int, default 1000
        Number of burn-in steps.
    n_steps : int, default 1000
        Number of steps to take after burn-in.
    variable_names : list, length n_variables
        List of names of variables. If None, then variable names
        are sequential integers.
    
    Returns
    -------
    output : DataFrame
        The first `n_variables` columns contain the samples.
        Additionally, column 'lnprob' has the log posterior value
        at each sample.
    """
    x = x0
    mu, inv_cov = args
    
    # Steps that will be burned
    for i in range(n_burn):
        logpost_current = logpost(x, *args)
        x, accept = mh_step(x, logpost, logpost_current, sigma, args=(mu, inv_cov))
    
    # Set up empty arrays to store info
    n_variables = []
    lnprob = []
#     n_accept = 0
    
    # Step
    for i in range(n_steps):
        n_variables.append(x)
        lnprob.append(logpost_current)
        
        logpost_current = logpost(x, *args)
        x, accept = mh_step(x, logpost, logpost_current, sigma, args=(mu, inv_cov))
#         n_accept += accept
        
    df = pd.DataFrame(data=n_variables, columns=['x', 'y'])
    df['lnprob'] = lnprob
    
    return df

### can r be greater than 1?

I've adjusted the acceptance rate to be around 0.4 for this x0 but we should add some logic to the code to automatically do that

In [285]:
# Choose arbitrary x0 and sigma
x0 = np.array([10, 5])
sigma = np.array([[9, -3],[-3, 10]])

# Take samples
df_samples = mh_sample(log_test_distribution, x0, sigma, args=(mu, inv_cov), n_burn=1000, n_steps=5000, variable_names=None)

# Take a look
df_samples.head()

Unnamed: 0,x,y,lnprob
0,13.470926,18.109169,-0.939787
1,13.470926,18.109169,-1.50833
2,13.470926,18.109169,-1.50833
3,11.557008,19.902113,-1.50833
4,9.027488,20.705668,-0.349358


Now let's plot to check.

In [284]:
# Plot
p = bokeh.plotting.figure(width=400, height=400,
                          x_axis_label='x', 
                          y_axis_label='y')

# Plot samples
p.circle(df_samples['x'], df_samples['y'], alpha=0.025)

# Overlay multivariate gaussian
x, y = np.random.multivariate_normal(mu, cov, 5000).T
p.circle(x, y, alpha=0.025, color='orange')
bokeh.io.show(p)

Looks like our samples in blue match the multivariate gaussian distribution in orange.