In [None]:
import itertools

import numpy as np
import pandas as pd

import altair as alt
import bebi103
import altair_catplot as altcat
import scipy.special as sp
import numba

import bokeh.io
import bokeh.layouts
import bokeh.plotting
bokeh.io.output_notebook()
color_palette=['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f', '#edc948', '#b07aa1', '#ff9da7', '#9c755f', '#bab0ac']

In [None]:
# Stan program used for prior predictive checks. It has no `parameters` or
# `model` block: each draw samples theta from beta_rng(a, b) and then N counts
# n[i] from binomial_rng(Nt, theta) in `generated quantities`.
# Data: N (number of simulated counts per draw), a and b (Beta prior
# parameters), Nt (number of binomial trials).
model_code_prior_pred = """
data {
  int N;  
  real a;
  real b;
  int Nt;
}

generated quantities{
  real n[N];
  real theta = beta_rng(a, b);
  
  for (i in 1:N) {
    n[i] = binomial_rng(Nt, theta);
  }
}"""

In [None]:
# Build the prior predictive model object from the Stan program text
# (presumably this compiles the program and can take a while -- confirm
# against the bebi103 documentation).
sm_gen = bebi103.stan.StanModel(model_code=model_code_prior_pred)

In [None]:
def mcmc_theta(sm_gen, alpha, beta, Num_t, Num=500, N_iter=1000):
    """
    Draw prior predictive samples from a Beta-binomial generative model.

    Parameters
    ----------
    sm_gen : object
        Model object exposing a `sampling()` method (presumably a
        compiled Stan model -- confirm with the caller). Its Stan
        program is expected to accept data entries `N`, `a`, `b`, `Nt`.
    alpha, beta : float
        Passed to the model as `a` and `b`.
    Num_t : int
        Passed to the model as `Nt`.
    Num : int, default 500
        Passed to the model as `N`.
    N_iter : int, default 1000
        Passed to `sampling()` as `iter`.

    Returns
    -------
    output
        Whatever `sm_gen.sampling()` returns, unmodified.
    """
    stan_data = {'N': Num, 'a': alpha, 'b': beta, 'Nt': Num_t}

    # Nothing is being inferred here, so sampling runs with the fixed-parameter
    # algorithm, no warmup and a single chain.
    fit = sm_gen.sampling(data=stan_data,
                          algorithm='Fixed_param',
                          warmup=0,
                          chains=1,
                          iter=N_iter)
    return fit


# Prior predictive check for a Beta(0.2, 8) prior with 126 trials -- the same
# (alpha, beta, Nt) values that are used for the 'WT' strain further down.
alpha, beta, Nt = 0.2, 8, 126

# Plot the predictive ECDF of the simulated counts 'n'.
bokeh.io.show(
    bebi103.viz.predictive_ecdf(mcmc_theta(sm_gen, alpha, beta, Nt), 
                                'n', 
                                x_axis_label='number of reversals'))
    

In [None]:
# Prior predictive check for a Beta(5.5, 18) prior with 124 trials -- the same
# (alpha, beta, Nt) values that are used for the 'ASH' strain further down.
alpha, beta, Nt = 5.5, 18., 124

# Plot the predictive ECDF of the simulated counts 'n'.
bokeh.io.show(
    bebi103.viz.predictive_ecdf(mcmc_theta(sm_gen, alpha, beta, Nt), 
                                'n', 
                                x_axis_label='number of reversals'))

In [None]:
# Prior predictive check for a Beta(8, 3) prior with 124 trials -- the same
# (alpha, beta, Nt) values that are used for the 'AVA' strain further down.
alpha, beta, Nt = 8., 3., 124

# Plot the predictive ECDF of the simulated counts 'n'.
bokeh.io.show(
    bebi103.viz.predictive_ecdf(mcmc_theta(sm_gen, alpha, beta, Nt), 
                                'n', 
                                x_axis_label='number of reversals'))

In [None]:
# Stan program for the single-strain posterior: theta in [0, 1] with a
# beta(a, b) prior and a binomial(Nt, theta) likelihood for the observed
# count n. Data: a, b (prior parameters), Nt (trials), n (observed count).
model_code_prior_gen = """
data { 
  real a;
  real b;
  int Nt;
  int n;
}


parameters {
  real<lower=0, upper=1> theta;
}


model {
  // Priors
  theta ~ beta(a, b);

  // Likelihood
  n ~ binomial(Nt, theta);
}
"""

In [None]:
# Build the single-strain posterior model object from the Stan program text
# (presumably this compiles the program -- confirm against bebi103 docs).
sm_gen_pri = bebi103.stan.StanModel(model_code=model_code_prior_gen)

In [None]:
def mcmc_theta_sampling(sm_gen, params , N_iter=100000):
    """
    Sample the single-strain posterior and return the draws as a DataFrame.

    Parameters
    ----------
    sm_gen : object
        Model object exposing a `sampling()` method; its program is
        expected to accept data entries `a`, `b`, `Nt` and `n`.
    params : sequence of length 4
        `(alpha, beta, Num_t, num)`, passed to the model as
        `a`, `b`, `Nt` and `n` respectively.
    N_iter : int, default 100000
        Passed to `sampling()` as `iter`.

    Returns
    -------
    output
        Result of `bebi103.stan.to_dataframe()` on the samples, called
        with `diagnostics=False` and `inc_warmup=False`.
    """
    # Unpacking enforces that exactly four values were supplied.
    prior_a, prior_b, n_trials, n_observed = params

    fit = sm_gen.sampling(
        data={'a': prior_a, 'b': prior_b, 'Nt': n_trials, 'n': n_observed},
        iter=N_iter,
    )
    return bebi103.stan.to_dataframe(fit, diagnostics=False, inc_warmup=False)

In [None]:
# (alpha, beta, Nt, n) in the order unpacked by mcmc_theta_sampling().
params = [0.2, 8., 126, 13]

# Sample the posterior with the Stan model, requesting 100000 iterations.
df_mcmc = mcmc_theta_sampling(sm_gen_pri, params, 100000)

# Tag the draws with the strain label used in the cells below.
df_mcmc['Strain'] = 'WT'

# Take a look
df_mcmc.head()

In [None]:
# Sample the posterior of theta for each strain with Stan, overlay the
# histograms in one figure (p1) and the ECDFs in another (p2).
N_iter = 10000

strain = ['ASH','AVA','WT']

# One row per strain, aligned with `strain`: (alpha, beta, Nt, n) in the
# order unpacked by mcmc_theta_sampling().
params = [[5.5, 18., 124, 39], 
          [8., 3., 124, 91],
          [0.2, 8., 126, 13]]

# Accumulator for the draws of all strains; overwrites any earlier df_mcmc.
df_mcmc = pd.DataFrame()

p1 = bokeh.plotting.figure(width=400, height=300, title='Stan sampler')


for i, strain_name in enumerate(strain):
    temp = mcmc_theta_sampling(sm_gen_pri, params[i], N_iter)
    temp['Strain'] = strain_name
    # Draw this strain's histogram onto the shared figure, one palette color
    # per strain.
    p1 = bebi103.viz.histogram(temp['theta'],
                               p=p1,
                               bins=35,
                               line_width=2,
                               density=True,
                               x_axis_label='theta',
                               y_axis_label='g(theta|y)',
                               color = color_palette[i])
    
    df_mcmc = pd.concat([df_mcmc, temp])
        
# Each per-strain frame brings its own index, so renumber after stacking.
df_mcmc = df_mcmc.reset_index(drop=True)

p2 = bebi103.viz.ecdf_collection(data=df_mcmc, 
                                cats='Strain',
                                val='theta',
                                formal=True,
                                line_width=2,
                                plot_width=600,
                                plot_height=300)

# p1 and p2 are shown again later next to the Metropolis-Hastings results.
bokeh.io.show(bokeh.layouts.gridplot([p1, p2], ncols=2))

In [None]:
def mh_step(x, logpost, logpost_current, sigma, args=()):
    """
    Take one Metropolis-Hastings step with a Gaussian proposal.

    Parameters
    ----------
    x : ndarray, shape (n_variables,)
        The present location of the walker in parameter space.
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    logpost_current : float
        The current value of the log posterior.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.

    Returns
    -------
    x_out : ndarray, shape (n_variables,)
        The position of the walker after the Metropolis-Hastings
        step. If no step is taken, this is the inputted `x`.
    accepted : int
        1 if the proposed step was accepted, 0 if it was rejected.
    """
    # Propose the next position from a Gaussian centered on the current one.
    x_next = np.random.normal(x, sigma)
    logpost_next = logpost(x_next, *args)

    # Always draw the acceptance variate so the random stream is consumed the
    # same way whether or not the proposal can be accepted.
    u = np.random.uniform(0, 1)

    # A proposal with zero (or undefined) posterior density is never accepted.
    if np.isnan(logpost_next) or logpost_next == -np.inf:
        return x, 0

    # Compare in log space: exponentiating the two log posteriors separately
    # underflows to 0/0 = NaN when both are very negative, which would
    # reject every proposal.
    log_r = logpost_next - logpost_current
    if np.isnan(log_r):
        return x, 0

    # Accept with probability min(1, r). `u` lies in [0, 1), so guard u == 0
    # before taking its logarithm.
    if log_r >= 0 or u == 0 or np.log(u) <= log_r:
        return x_next, 1
    return x, 0

In [None]:
def mh_sample(logpost, x0, sigma, args=(), n_burn=1000, n_steps=1000,
              variable_names=None):
    """
    Run a single Metropolis-Hastings walker and collect its samples.

    Parameters
    ----------
    logpost : function
        The function to compute the log posterior. It has call
        signature `logpost(x, *args)`.
    x0 : ndarray, shape (n_variables,)
        The starting location of a walker in parameter space.
    sigma : ndarray, shape (n_variables, )
        The standard deviations for the proposal distribution.
    args : tuple
        Additional arguments passed to `logpost()` function.
    n_burn : int, default 1000
        Number of burn-in steps.
    n_steps : int, default 1000
        Number of steps to take after burn-in. Must be positive.
    variable_names : list, length n_variables
        List of names of variables, used as column names. If None,
        the single column name 'theta' is used.

    Returns
    -------
    df : DataFrame
        One row per post-burn-in step. The first columns contain the
        samples; column 'lnprob' has the log posterior value at each
        sample. The DataFrame is empty when the acceptance rate falls
        outside [0.2, 0.5], which signals that `sigma` needs retuning.
    accept_rate : float
        Fraction of post-burn-in proposals that were accepted.

    Raises
    ------
    ValueError
        If `n_steps` is not positive.
    """
    if n_steps <= 0:
        raise ValueError('n_steps must be a positive integer.')

    x = x0
    # Evaluate once up front so the value exists even when n_burn == 0, and
    # refresh it only when the walker moves so it always describes `x`.
    logpost_current = logpost(x, *args)

    # Burn-in: advance the walker without recording anything.
    for _ in range(n_burn):
        x, accepted = mh_step(x, logpost, logpost_current, sigma, args=args)
        if accepted:
            logpost_current = logpost(x, *args)

    # Draw samples
    samples = []
    lnprob = []
    n_accept = 0
    for _ in range(n_steps):
        # Record the position together with the log posterior at that position.
        samples.append(x)
        lnprob.append(logpost_current)

        x, accepted = mh_step(x, logpost, logpost_current, sigma, args=args)
        if accepted:
            n_accept += 1
            logpost_current = logpost(x, *args)

    accept_rate = n_accept / n_steps

    # Outside this acceptance window the samples are discarded; callers
    # treat an empty DataFrame as the cue to retune sigma.
    if accept_rate < 0.2 or accept_rate > 0.5:
        return pd.DataFrame(), accept_rate

    columns = ['theta'] if variable_names is None else list(variable_names)
    df = pd.DataFrame(data=samples, columns=columns)
    df['lnprob'] = lnprob
    return df, accept_rate

In [None]:
# @numba.jit(nopython=True)
def log_test_distribution(theta, alpha, beta, N, n):
    """
    Log of a binomial likelihood times a Beta prior, evaluated at theta.

    Sums the log binomial pmf of `n` successes in `N` trials with
    success probability `theta` and the log Beta(`alpha`, `beta`)
    density of `theta`, including both normalization constants.
    Returns -inf when `theta` is not strictly between 0 and 1.
    """
    # Zero density outside the open unit interval.
    if theta >= 1 or theta <= 0:
        return -np.inf

    # Accumulate the terms one at a time, in a fixed order.
    # Log binomial coefficient.
    log_post = sp.loggamma(N + 1)
    log_post = log_post - sp.loggamma(n + 1)
    log_post = log_post - sp.loggamma(N - n + 1)

    # Theta-dependent kernel shared by likelihood and prior.
    log_post = log_post + (n + alpha - 1) * np.log(theta)
    log_post = log_post + (N - n + beta - 1) * np.log(1 - theta)

    # Log normalization constant of the Beta prior.
    log_post = log_post + sp.loggamma(alpha + beta)
    log_post = log_post - sp.loggamma(alpha)
    log_post = log_post - sp.loggamma(beta)

    return log_post


In [None]:
def tune_sigma(accept_rate, sigma):
    """
    Rescale the proposal width based on the observed acceptance rate.

    A low acceptance rate shrinks `sigma` and a high one enlarges it;
    the further the rate is from the window [0.2, 0.5], the larger
    the correction. Inside the window `sigma` is returned unchanged.

    Parameters
    ----------
    accept_rate : float
        Fraction of proposals accepted in the last run.
    sigma : float or ndarray
        Current proposal standard deviation(s).

    Returns
    -------
    output : float or ndarray
        The rescaled `sigma`.
    """
    # Too few acceptances: shrink, checking the most extreme case first.
    if accept_rate < 0.001:
        return sigma * 0.1
    if accept_rate < 0.05:
        return sigma * 0.5
    if accept_rate < 0.2:
        return sigma * 0.9

    # Too many acceptances: enlarge. The most extreme threshold must be tested
    # first, otherwise the `> 0.5` test swallows the stronger corrections.
    if accept_rate > 0.95:
        return sigma * 10
    if accept_rate > 0.75:
        return sigma * 2
    if accept_rate > 0.5:
        return sigma * 1.1

    return sigma

In [None]:
# Single Metropolis-Hastings run for one (alpha, beta, Nt, n) parameter set.
# The commented-out groups below are alternative starting points and
# parameter sets that can be swapped in for the active one.

# Alternative: parameter set [8., 3., 124, 91]
# x0 = 0.8
# sigma = 1

# pa = [8., 3., 124, 91]
N_iter =10000
x0 = 0.2
sigma = 1

# Active parameter set, passed to log_test_distribution() as
# (alpha, beta, N, n).
pa = [0.2, 8., 126, 13]

# Alternative: parameter set [5.5, 18., 124, 39]
# x0 = 0.4
# sigma = 1

# pa = [5.5, 18., 124, 39]

df_samples = pd.DataFrame()

# Take samples
df_samples, accept_rate = mh_sample(log_test_distribution, 
                                    x0, 
                                    sigma, 
                                    args=pa, 
                                    n_burn=1000, 
                                    n_steps=N_iter, 
                                    variable_names=None)
# mh_sample() returns an empty DataFrame when the acceptance rate is outside
# its accepted window; retune sigma and rerun from x0 until it is non-empty.
# NOTE: the tuned `sigma` is left in the global namespace.
while len(df_samples) == 0:
    sigma = tune_sigma(accept_rate, sigma)
    df_samples, accept_rate = mh_sample(log_test_distribution, 
                                        x0, 
                                        sigma, 
                                        args=pa, 
                                        n_burn=1000, 
                                        n_steps=N_iter, variable_names=None)

bokeh.io.show(bebi103.viz.ecdf(df_samples['theta']))

In [None]:
def hw7_sampling(x0, sigma, params, log_test_distribution, num_burn, N_iter=10000, variable_names=None):
    """
    Run `mh_sample()` repeatedly, retuning `sigma`, until it yields samples.

    Each attempt restarts from `x0` with a fresh burn-in. When
    `mh_sample()` returns an empty DataFrame, `sigma` is rescaled with
    `tune_sigma()` using the reported acceptance rate and the run is
    repeated. There is no cap on the number of attempts.

    Parameters
    ----------
    x0 : float or ndarray
        Starting location of the walker.
    sigma : float or ndarray
        Initial proposal standard deviation(s).
    params : sequence
        Extra arguments forwarded to `log_test_distribution` via
        `mh_sample(..., args=params)`.
    log_test_distribution : function
        Log posterior with call signature
        `log_test_distribution(x, *params)`.
    num_burn : int
        Number of burn-in steps per attempt.
    N_iter : int, default 10000
        Number of post-burn-in steps per attempt.
    variable_names : list, default None
        Forwarded to `mh_sample()`.

    Returns
    -------
    output : DataFrame
        The first non-empty DataFrame returned by `mh_sample()`.
    """
    while True:
        # Take samples
        df_samples, accept_rate = mh_sample(log_test_distribution,
                                            x0,
                                            sigma,
                                            args=params,
                                            n_burn=num_burn,
                                            n_steps=N_iter,
                                            variable_names=variable_names)

        if len(df_samples) > 0:
            return df_samples

        # Empty result means the acceptance rate was rejected: retune and retry.
        sigma = tune_sigma(accept_rate, sigma)

In [None]:
# Sample the posterior of theta for each strain with the hand-written
# Metropolis-Hastings sampler, overlay the histograms (p3) and ECDFs (p4),
# and show them below the Stan results (p1, p2) for comparison.
N_iter = 100000

strain = ['ASH','AVA','WT']

# One row per strain, aligned with `strain`; passed to
# log_test_distribution() as (alpha, beta, N, n).
params = [[5.5, 18., 124, 39], 
          [8., 3., 124, 91],
          [0.2, 8., 126, 13]]

# Starting position of the walker for each strain.
x0s = [0.3, 0.8, 0.1]

df_samples_mh = pd.DataFrame()

p3 = bokeh.plotting.figure(height=300, width=400, title='Metropolis-Hastings sampler')

for i, strain_name in enumerate(strain):
    # NOTE(review): `sigma` is not set in this cell; it is whatever value the
    # earlier single-run cell left in the global namespace after tuning.
    # hw7_sampling() retunes it per strain without changing the global.
    temp = hw7_sampling(x0s[i], sigma, params[i], log_test_distribution, 2000, N_iter)
    temp['Strain'] = strain_name
    p3 = bebi103.viz.histogram(temp['theta'],
                               p=p3,
                               bins=35,
                               line_width=2,
                               density=True,
                               x_axis_label='theta',
                               y_axis_label='g(theta|y)',
                               color = color_palette[i])
    
    df_samples_mh = pd.concat([df_samples_mh, temp])
        
# Each per-strain frame brings its own index, so renumber after stacking.
df_samples_mh = df_samples_mh.reset_index(drop=True)

p4 = bebi103.viz.ecdf_collection(data=df_samples_mh,
                                cats='Strain',
                                val='theta',
                                formal=True,
                                line_width=2,
                                plot_width=600,
                                plot_height=300)

# p1 and p2 come from the Stan sampling cell and must already exist.
bokeh.io.show(bokeh.layouts.gridplot([p1, p2, p3, p4], ncols=2))

In [None]:
# Stan program for comparing two strains: theta1 and theta2 each get their
# own beta prior and binomial likelihood, and the generated quantity
# dtheta = theta2 - theta1 is reported for every draw.
model_code_diff_gen = """
data { 
  real a1;
  real b1;
  real a2;
  real b2;
  int Nt1;
  int n1;
  int Nt2;
  int n2;
}


parameters {
  real<lower=0, upper=1> theta1;
  real<lower=0, upper=1> theta2;
}


model {
  // Priors
  theta1 ~ beta(a1, b1);
  theta2 ~ beta(a2, b2);

  // Likelihood
  n1 ~ binomial(Nt1, theta1);
  n2 ~ binomial(Nt2, theta2);
}

generated quantities {
  real dtheta = theta2 - theta1;
}
"""

In [None]:
# Build the two-strain difference model object from the Stan program text
# (presumably this compiles the program -- confirm against bebi103 docs).
sm_diff_gen = bebi103.stan.StanModel(model_code=model_code_diff_gen)

In [None]:
def mcmc_theta_diff_sampling(sm_gen, params , N_iter=100000):
    """
    Sample the two-strain model and return the draws as a DataFrame.

    Parameters
    ----------
    sm_gen : object
        Model object exposing a `sampling()` method; its program is
        expected to accept data entries `a1`, `b1`, `a2`, `b2`,
        `Nt1`, `n1`, `Nt2` and `n2`.
    params : sequence of length 8
        `(alpha1, beta1, Num_t1, num1, alpha2, beta2, Num_t2, num2)`,
        i.e. the four values of the first strain followed by the four
        values of the second strain.
    N_iter : int, default 100000
        Passed to `sampling()` as `iter`.

    Returns
    -------
    output
        Result of `bebi103.stan.to_dataframe()` on the samples, called
        with `diagnostics=False` and `inc_warmup=False`.
    """
    # Unpacking enforces that exactly eight values were supplied.
    (prior_a1, prior_b1, trials1, count1,
     prior_a2, prior_b2, trials2, count2) = params

    stan_data = {
        'a1': prior_a1,
        'b1': prior_b1,
        'a2': prior_a2,
        'b2': prior_b2,
        'Nt1': trials1,
        'n1': count1,
        'Nt2': trials2,
        'n2': count2,
    }
    fit = sm_gen.sampling(data=stan_data, iter=N_iter)
    return bebi103.stan.to_dataframe(fit, diagnostics=False, inc_warmup=False)

In [None]:
# Sample the difference in theta between pairs of strains with the two-strain
# Stan model and plot the histograms (p5) and ECDFs (p6) of dtheta.
N_iter = 10000

strain = ['ASH','AVA','WT']

# One row per strain, aligned with `strain`: (alpha, beta, Nt, n).
params = [[5.5, 18., 124, 39], 
          [8., 3., 124, 91],
          [0.2, 8., 126, 13]]

# Pairs of indices into `strain` / `params` to compare.
strain_diff = [[0,1],[2,0],[2,1]]


df_diff_mcmc = pd.DataFrame()

p5 = bokeh.plotting.figure(width=400, height=300, title='Difference of theta')


for i, cp in enumerate(strain_diff):
    # Concatenate the two 4-element parameter lists into the 8-element list
    # expected by mcmc_theta_diff_sampling(): first strain, then second.
    para_temp = params[cp[0]] + params[cp[1]]
    temp = mcmc_theta_diff_sampling(sm_diff_gen, para_temp, N_iter)
    # NOTE(review): the Stan program computes dtheta = theta2 - theta1, i.e.
    # second strain minus first, while this label reads "first-second".
    # Confirm whether the hyphen is meant as a minus sign.
    temp['Strain'] = strain[cp[0]] + '-' + strain[cp[1]]
    p5 = bebi103.viz.histogram(temp['dtheta'],
                               p=p5,
                               bins=35,
                               line_width=2,
                               density=True,
                               x_axis_label='delta theta',
                               y_axis_label='g(delta theta|y)',
                               color = color_palette[i])    
    df_diff_mcmc = pd.concat([df_diff_mcmc, temp])
        
# Each per-pair frame brings its own index, so renumber after stacking.
df_diff_mcmc = df_diff_mcmc.reset_index(drop=True)

p6 = bebi103.viz.ecdf_collection(data=df_diff_mcmc, 
                                cats='Strain',
                                val='dtheta',
                                formal=True,
                                line_width=2,
                                plot_width=600,
                                plot_height=300)

bokeh.io.show(bokeh.layouts.gridplot([p5, p6], ncols=2))

In [None]:
# Take a look. Leaving the frame as the cell's last expression lets the
# notebook render it as a table instead of printing plain text.
df_diff_mcmc.head()