In [None]:
import pandas as pd
import numpy as np

import bebi103

import os.path

import bokeh.io
import bokeh.plotting
import bokeh.layouts
bokeh.io.output_notebook()

# Ten-color categorical palette used for data points and model comparison plots
color_palette = [
    '#4e79a7',
    '#f28e2b',
    '#e15759',
    '#76b7b2',
    '#59a14f',
    '#edc948',
    '#b07aa1',
    '#ff9da7',
    '#9c755f',
    '#bab0ac',
]

In [None]:
# Read in the image-processing results (path is relative to the notebook's
# working directory) and take a look at the first rows
df = pd.read_csv('../data/hw_4.2_caulobacter_growth_image_processing_results.csv')

df.head()

Since the duration differs between growth events, we add another column whose time count restarts from 0 every time there is a division.

In [None]:
# Make a new column `t` that counts rows (frames) within each growth event and
# goes back to 0 at every start of growth.
#
# A new run starts on the first row (diff is NaN there, and NaN != 0) and on
# every row where the growth event OR the bacterium differs from the previous
# row.  Including the bacterium guarantees a reset when the data switch from
# one bacterium to the next even if the growth-event number does not change.
starts_new_run = (df['growth_event'].diff() != 0) | (df['bacterium'].diff() != 0)

# Label each run with a running id and number the rows inside each run 0, 1, 2, ...
df['t'] = df.groupby(starts_new_run.cumsum()).cumcount()

In [None]:
# Rename for convenience: a column name without spaces or units is easier to
# reference in the cells below
df = df.rename(columns={'area (sq um)': 'area'})
df.head()

Let's start with a subset of the data first. We will look at bacterium 1.

In [None]:
# Keep only the rows that belong to bacterium 1
df_bacterium1 = df[df['bacterium'].eq(1)]

Let's plot the data of bacterium 1 to take a look first.

In [None]:
# Cell area versus time for bacterium 1
p = bokeh.plotting.figure(
    plot_width=650,
    plot_height=250,
    x_axis_label='time (min)',
    y_axis_label='cell area (sq µm)',
)

# Alternate between two shades of blue so that consecutive growth events
# can be told apart
colors = ['#1f78b4', '#a6cee3']
for event, event_df in df_bacterium1.groupby('growth_event'):
    shade = colors[event % 2]
    p.circle(event_df['time (min)'], event_df['area'], size=3, color=shade)

bokeh.io.show(p)

We decided to start our analysis with a non-hierarchical model.

#### No hierarchy

Let's just model a single growth event for now with no hierarchy using the exponential model.  

In [None]:
# Prior predictive sampler for the non-hierarchical exponential model
def data_prior_pred(t):
    '''
    Draw one set of parameters from the prior and simulate areas at
    the time points given in t.

    The parameters are drawn in the order a, k, sigma; the returned
    array has one normally distributed area per entry of t, centered
    on a * exp(k * t).
    '''
    # One draw of each parameter from its prior (sigma is half-normal)
    a, k = np.random.normal(1.4, 0.3), np.random.normal(0.01, 0.002)
    sigma = abs(np.random.normal(0, 0.1))

    # Exponential growth curve, then measurement noise on top of it
    mean_area = a * np.exp(k * t)
    return np.random.normal(mean_area, sigma)

In [None]:
# Prior predictive check: overlay 100 simulated data sets
p = bokeh.plotting.figure(
    height=300,
    width=450,
    x_axis_label='time',
    y_axis_label='area',
)

# Time points of growth event 0 of bacterium 1
t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Each pass draws fresh parameters from the prior and plots one data set
for i in range(100):
    simulated_area = data_prior_pred(t)
    p.circle(t, simulated_area, size=3, alpha=0.1)

bokeh.io.show(p)

This broad prior captures the trend of growth but definitely misses a lot of information.

Now let's try the non-hierarchical linear model.

In [None]:
# Prior predictive sampler for the non-hierarchical linear model
def data_prior_pred_linear(t):
    '''
    Draw one set of parameters from the prior and simulate areas at
    the time points given in t.

    The parameters are drawn in the order a, b, sigma; the returned
    array has one normally distributed area per entry of t, centered
    on a + b * t.
    '''
    # One draw of each parameter from its prior (sigma is half-normal)
    a, b = np.random.normal(1.4, 0.3), np.random.normal(0.01, 0.003)
    sigma = abs(np.random.normal(0, 0.1))

    # Straight-line growth curve, then measurement noise on top of it
    mean_area = a + b * t
    return np.random.normal(mean_area, sigma)

In [None]:
# Prior predictive check: overlay 100 simulated data sets
p = bokeh.plotting.figure(
    height=300,
    width=450,
    x_axis_label='time',
    y_axis_label='area',
)

# Time points of growth event 0 of bacterium 1
t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Each pass draws fresh parameters from the prior and plots one data set
for i in range(100):
    simulated_area = data_prior_pred_linear(t)
    p.circle(t, simulated_area, size=3, alpha=0.1)

bokeh.io.show(p)

The prior again captures the trend of growth. But the single-level model neglects a lot of information, such as the variation between growth events and between bacteria. So let's consider the variation between growth events first and build a two-level hierarchical model for one of the bacteria.

#### Two level hierarchical model

Let's try a two level hierarchical model for bacterium 1.

But before we do anything, we would like to construct a function to plot the predictive data. In this case every data point of a growth event has a corresponding time; if we just used a simple predictive ECDF, we would lose all of the time information, which doesn't make sense. To plot time-series predictive data we constructed a similar function in HW9.2, so we also use it here. The detailed description of this function is in HW9.2.

In [None]:
# Function to plot the predictive data
def hw92_predictive(df, x, y=None, namex='index_1', name='F_ppc', perc=[80, 60, 40, 20],
                    x_axis_label=None, y_axis_label=None, title=None, plot_width=350, plot_height=225,
                    color='blue', data_color=color_palette[1], diff=False, p=None, baseline=0):
    '''Plot percentile envelopes of predictive samples against an input variable.

    Mimic of a predictive ECDF for samples that are indexed by an input
    variable (e.g. time): for every entry of `x`, the samples that belong to
    it are summarized by their median and by the central percentile intervals
    listed in `perc`, which are drawn as shaded envelopes.

    df - MCMC sampling data frame in long format, containing the columns
         `namex` and `name`
    x - input variable; entry i is matched to the rows of `df` where
        `df[namex] == i + 1 + baseline`
    y - data to overlay (optional); `y[baseline:baseline + len(x)]` is plotted
    namex - name of the input variable (index) column in the data frame
    name - name of the predictive results in the data frame
    perc - list, default [80, 60, 40, 20]
            Percentiles for making colored envelopes for confidence
            intervals. Maximally four can be specified.
    x_axis_label, y_axis_label, title, plot_width, plot_height - used to
        build a new figure when `p` is None
    color - name of the envelope palette: 'green', 'blue', 'red', 'gray',
            'purple', 'orange' or 'betancourt'
    data_color - color of the overlaid data points
    diff - if True, summarize the step-to-step differences of the samples
           (and of `y`) instead of the values themselves; the first entry
           of `x` is dropped
    p - existing bokeh figure to draw on (optional)
    baseline - offset added to the index when matching `x` to `df[namex]`

    Returns the bokeh figure. Raises RuntimeError for an unknown `color`.'''

    # Copied from predictive ECDF, choose color
    colors = {'blue': ['#9ecae1','#6baed6','#4292c6','#2171b5','#084594'],
              'green': ['#a1d99b','#74c476','#41ab5d','#238b45','#005a32'],
              'red': ['#fc9272','#fb6a4a','#ef3b2c','#cb181d','#99000d'],
              'orange': ['#fdae6b','#fd8d3c','#f16913','#d94801','#8c2d04'],
              'purple': ['#bcbddc','#9e9ac8','#807dba','#6a51a3','#4a1486'],
              'gray': ['#bdbdbd','#969696','#737373','#525252','#252525'],
              'betancourt': ['#DCBCBC', '#C79999', '#B97C7C',
                             '#A25050', '#8F2727', '#7C0000']}

    if color not in colors:
        # The message now lists every palette that is actually accepted
        raise RuntimeError("Only allowed colors are 'green', 'blue', 'red', 'gray', "
                           "'purple', 'orange', 'betancourt'")

    # Initialize the figure if needed
    if p is None:
        p = bokeh.plotting.figure(plot_width=plot_width,
                                  plot_height=plot_height,
                                  x_axis_label=x_axis_label,
                                  y_axis_label=y_axis_label,
                                  title=title)

    # If diff, take the difference of each step
    if diff:
        x = x[1:]
        if y is not None:
            y = np.diff(y)

    Nb = len(x)

    # Pull the two columns out once instead of re-indexing the frame per point
    index_values = df[namex].values
    sample_values = df[name].values

    # Rows 2j and 2j+1 hold the lower/upper bound of perc[j]; the last row
    # holds the median
    y_ppc = np.empty((len(perc) * 2 + 1, Nb))
    for i in range(Nb):
        # Take all the sampling results for this point
        draws = sample_values[index_values == i + 1 + baseline]
        if diff:
            # Difference to the next point; the same baseline offset is used
            # here as for the overlaid data, so both stay aligned
            draws = sample_values[index_values == i + 2 + baseline] - draws

        # Take the median and corresponding percentiles
        y_ppc[-1, i] = np.median(draws)
        for j, width in enumerate(perc):
            y_ppc[j * 2, i] = np.percentile(draws, 50 - width / 2)
            y_ppc[j * 2 + 1, i] = np.percentile(draws, 50 + width / 2)

    # Plot the envelopes, widest interval first
    for j in range(len(perc)):
        bebi103.viz.fill_between(x, y_ppc[j * 2, :],
                                 x, y_ppc[j * 2 + 1, :],
                                 p=p,
                                 show_line=False,
                                 fill_color=colors[color][j])

    # Median of the predictive samples
    p.circle(x, y_ppc[-1, :],
             size=4,
             color=colors[color][-1])

    # Overlay the data, honoring the requested data color
    if y is not None:
        p.circle(x, y[baseline:baseline + Nb], size=4, color=data_color)

    return p

Code up the prior predictive check for the two-level linear model. Our model is:

\begin{gather}
a_0 \sim \mbox{Norm}(a_{hyper}, \sigma_a) \\
k \sim \mbox{Norm}(k_{hyper}, \sigma_k) \\
\tau_a \sim \mbox{HalfNorm}(\tau_{a, hyper}) \\
\tau_k \sim \mbox{HalfNorm}(\tau_{k, hyper}) \\
\sigma_0 \sim \mbox{HalfNorm}(\sigma_{hyper}) \\
a_{01} \sim \mbox{Norm}(a_0, \tau_a) \\
k_1 \sim \mbox{Norm}(k, \tau_k) \\
a_{temp}(t) = a_{01} + {k_1 t} \\
a(t) \sim \mbox{Norm}(a_{temp}(t), \sigma_0)\\
\end{gather}

In [None]:
# Stan program for the prior predictive check of the two-level linear model.
# It only has a generated quantities block: the hyperparameters are drawn from
# their priors, then one intercept/slope pair per growth event (J_1 of them),
# then N areas per growth event.  J_2 is declared as data but not used here.
# The string is kept for reference; the next cell compiles from a .Stan file.
model_code_pri_pred = """
data {
  // Total Number of data points 
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;
  int J_2;
  vector[N] t;
  
  
  // Input of parameters of the priors
  real hyper_a0_mu;
  real hyper_a0_sigma;
  
  real hyper_k0_mu;
  real hyper_k0_sigma;
  
  real hyper_a0_tau;
  real hyper_k0_tau;
  
  real hyper_sigma;
}


generated quantities {
  // Total number of data points 
  real area[N * J_1];
  
  // Priors
  real a0 = normal_rng(hyper_a0_mu, hyper_a0_sigma);
  real k0 = normal_rng(hyper_k0_mu, hyper_k0_sigma);
  
  real a0_tau = fabs(normal_rng(0, hyper_a0_tau));
  real k0_tau = fabs(normal_rng(0, hyper_k0_tau));
  
  real sigma = fabs(normal_rng(0, hyper_sigma));
  
  // Second layer
  real a_1[J_1]; 
  real k_1[J_1];
  
  for (i in 1:J_1) {
    a_1[i] = normal_rng(a0, a0_tau);
    k_1[i] = normal_rng(k0, k0_tau);
    for (j in 1:N) {
      area[(i - 1) * N + j] = normal_rng((a_1[i] + k_1[i] * t[j]), sigma);
  }
  }
  
  }
"""

In [None]:
# Compile the prior predictive model of the two-level linear model from its
# standalone Stan file
sm_gen = bebi103.stan.StanModel(file='hw91_model_code_pri_pred.Stan')

Let's specify the parameter for the prior and sample from the prior (two-level linear model)

In [None]:
# Number of prior predictive draws
N_iter = 100
# Time points 0, 1, ..., 99 at which areas are simulated
T = list(range(100))

# Model inputs: sizes, time points and the hyperparameters of the priors
data = {
    'N': 100,
    'J_1': 2,
    'J_2': 3,
    't': T,
    'hyper_a0_mu': 1.4,
    'hyper_a0_sigma': 0.3,
    'hyper_k0_mu': 0.01,
    'hyper_k0_sigma': 0.002,
    'hyper_a0_tau': 0.1,
    'hyper_k0_tau': 0.001,
    'hyper_sigma': 0.1,
}

# Sample from the prior: a single chain without warmup and with fixed
# parameters, since values are only being simulated
df_pred = sm_gen.sampling(
    data=data,
    algorithm='Fixed_param',
    warmup=0,
    chains=1,
    iter=N_iter,
)

Let's extract the 'area' from the sample we generated and take a look.

In [None]:
# Pull the simulated 'area' array out of the prior predictive samples as a
# data frame and preview it
df_samples = bebi103.stan.extract_array(df_pred, name='area')

df_samples.head()

Now let's use the function we mentioned before to visualize the generated samples in a time-series manner. This is a good way to do prior predictive check.

In [None]:
# Prior predictive check by visualizing the time-series data generated from our prior.
# Only len(T) points are plotted, i.e. the indices 1..len(T) of the 'area' array.
time = T

p1 = hw92_predictive(df_samples, time, name='area', plot_width=500, plot_height=400)


bokeh.io.show(p1)

The trend looks quite reasonable, so it seems we can move on with the prior in hand for the two-level linear model.  
But before that, let's also perform a prior predictive check on the two-level exponential model.  

Our two-level exponential model is:

\begin{gather}
a_0 \sim \mbox{Norm}(a_{hyper}, \sigma_a) \\
k \sim \mbox{Norm}(k_{hyper}, \sigma_k) \\
\tau_a \sim \mbox{HalfNorm}(\tau_{a, hyper}) \\
\tau_k \sim \mbox{HalfNorm}(\tau_{k, hyper}) \\
\sigma_0 \sim \mbox{HalfNorm}(\sigma_{hyper}) \\
a_{01} \sim \mbox{Norm}(a_0, \tau_a) \\
k_1 \sim \mbox{Norm}(k, \tau_k) \\
a_{temp}(t) = a_{01} \, \mathrm{e}^{k_1 t} \\
a(t) \sim \mbox{Norm}(a_{temp}(t), \sigma_0)\\
\end{gather}

Let's code up the Stan model for the prior predictive check.

In [None]:
# Stan program for the prior predictive check of the two-level exponential
# model.  Identical to the linear version except for the mean of the area,
# which is a_1 * exp(k_1 * t).  J_2 is declared as data but not used here.
# The string is kept for reference; the model is compiled from a .Stan file.
model_code_pri_pred_ex = """
data {
  // Total Number of data points 
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;
  int J_2;
  vector[N] t;
  
  
  // Input of parameters of the priors
  real hyper_a0_mu;
  real hyper_a0_sigma;
  
  real hyper_k0_mu;
  real hyper_k0_sigma;
  
  real hyper_a0_tau;
  real hyper_k0_tau;
  
  real hyper_sigma;
}


generated quantities {
  // Total number of data points 
  real area[N * J_1];
  
  // Priors
  real a0 = normal_rng(hyper_a0_mu, hyper_a0_sigma);
  real k0 = normal_rng(hyper_k0_mu, hyper_k0_sigma);
  
  real a0_tau = fabs(normal_rng(0, hyper_a0_tau));
  real k0_tau = fabs(normal_rng(0, hyper_k0_tau));
  
  real sigma = fabs(normal_rng(0, hyper_sigma));
  
  // Second layer
  real a_1[J_1]; 
  real k_1[J_1];
  
  for (i in 1:J_1) {
    a_1[i] = normal_rng(a0, a0_tau);
    k_1[i] = normal_rng(k0, k0_tau);
    for (j in 1:N) {
      area[(i - 1) * N + j] = normal_rng(a_1[i] * exp(k_1[i] * t[j]), sigma);
  }
  }
  
  }
"""

The Stan code is attached above for reference, but the model is compiled from the standalone file.

In [None]:
# Compile the prior predictive model of the two-level exponential model.
# NOTE: this reuses the name sm_gen, replacing the linear model compiled earlier.
sm_gen = bebi103.stan.StanModel(file='hw91_model_code_pri_pred_ex.Stan')

In [None]:
# Number of prior predictive draws
N_iter = 100
# Time points 0, 1, ..., 99 at which areas are simulated
T = list(range(100))

# Model inputs: sizes, time points and the hyperparameters of the priors
data = {
    'N': 100,
    'J_1': 2,
    'J_2': 3,
    't': T,
    'hyper_a0_mu': 1.4,
    'hyper_a0_sigma': 0.3,
    'hyper_k0_mu': 0.01,
    'hyper_k0_sigma': 0.002,
    'hyper_a0_tau': 0.1,
    'hyper_k0_tau': 0.001,
    'hyper_sigma': 0.1,
}

# Sample from the prior: a single chain without warmup and with fixed
# parameters, since values are only being simulated
df_pred_ex = sm_gen.sampling(
    data=data,
    algorithm='Fixed_param',
    warmup=0,
    chains=1,
    iter=N_iter,
)

Let's extract the 'area' from the sample we generated and take a look

In [None]:
# Pull the simulated 'area' array out of the exponential-model prior predictive
# samples (df_pred_ex).  Extracting from df_pred would silently repeat the
# linear-model check instead.
df_samples_ex = bebi103.stan.extract_array(df_pred_ex, name='area')

df_samples_ex.head()

Now let's use the function we mentioned before to visualize the generated samples in a time-series manner. This is a good way to do prior predictive check.

In [None]:
# Prior predictive check by visualizing the time-series data generated from our prior.
# Only len(T) points are plotted, i.e. the indices 1..len(T) of the 'area' array.
time = T

p2 = hw92_predictive(df_samples_ex, time, name='area', plot_width=500, plot_height=400)


bokeh.io.show(p2)

The trend also looks quite reasonable, so it seems we can move on with the prior in hand for the two-level exponential model.  
So let's move on to modeling with the two-level linear model and the two-level exponential model. 

Let's also add a code block so that, once samples have been generated, we can load them and do the analysis without regenerating them.

In [None]:
# Notebook-wide switches for reusing dumped samples:
#   global_load    - try to load dumped samples in every model section
#   global_no_load - never load, always compile and sample
global_load, global_no_load = True, False

# The two switches contradict each other, so at most one may be set
assert (not global_load) or (not global_no_load), "It is just not possible"

Let's code up the Stan code for Noncentered Linear Model:

In [None]:
# Stan program of the two-level linear model in noncentered form: the
# per-growth-event parameters a_1 and k_1 are built from standard-normal
# "tilde" variables, and the mean area is a_1 + k_1 * t.  The generated
# quantities provide posterior predictive draws (area_ppc) and the pointwise
# log likelihood (log_lik) used for model comparison.
model_code_linear_noncentered = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;

  //Index arrays to keep track of hierarchical structure
  int index_1[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
  vector[N] area_temp;
  
  for (i in 1:N) {
    area_temp[i] = a_1[index_1[i]] + k_1[index_1[i]] * t[i];
  }
}

model {
  a ~ normal(1.4, 0.3);
  k ~ normal(0.01, 0.002);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.001);

  a_1_tilde ~ normal(0, 1);
  k_1_tilde ~ normal(0, 1);

  area ~ normal(area_temp, sigma);
}

generated quantities {
  vector[N] area_ppc;
  real log_lik[N];
  
  // Posterior predictive check
  for (i in 1:N) {
    area_ppc[i] = normal_rng(area_temp[i], sigma);
  }
  
  // Compute pointwise log likelihood
  for (i in 1:N) {
    log_lik[i] = normal_lpdf(area[i] | area_temp[i], sigma);
  }
}
"""

The Stan code is attached above for reference, but the model is compiled from the standalone file.

In [None]:
# Load dumped samples of the two-level linear model if possible, otherwise
# compile the model so that it can be sampled below.

# Name of the dumped file.  Defined unconditionally because the cell that
# saves the samples later also needs it, whether or not loading was attempted.
dump_filename = 'linear_level1'

# Try to load unless loading was switched off globally
# (global_load wins; with neither switch set the default is to load)
load_dump_file = global_load or not global_no_load

if load_dump_file:
    # See if the file exists
    if os.path.isfile(dump_filename):
        # If it exists, load the samples together with the compiled model.
        # NOTE: this unpickles the file, so only load dumps you created yourself.
        samples_linear, sm_linear = bebi103.stan.pickle_load_samples(dump_filename)
    else:
        # Nothing to load: fall back to compiling, and remember that sampling
        # still has to be done
        print('No dumped file found, compiling the model instead')
        load_dump_file = False
        sm_linear = bebi103.stan.StanModel(model_code=model_code_linear_noncentered)
# Compile the model if no dumped file is to be loaded
else:
    sm_linear = bebi103.stan.StanModel(file='hw91_model_code_linear_noncentered.Stan')

In order to test our model, let's just use a subset of the data first.  
Let's slice out the data of two growth events (growth events 1 and 2) of bacterium 1.

In [None]:
# Build the test subset: growth events 1 and 2 of bacterium 1
df_sub1, df_sub2 = (
    df_bacterium1.loc[df_bacterium1['growth_event'] == event]
    for event in (1, 2)
)
df_sub = pd.concat([df_sub1, df_sub2])

# Preview the subset
df_sub.head()

Make the data into a dict for input to the model.

In [None]:
# Inputs of the two-level model.  The growth-event numbers of the subset
# (1 and 2) are used directly as the index of the hierarchy.
data = {
    'N': len(df_sub),
    'J_1': 2,
    'index_1': df_sub['growth_event'].values,
    'area': df_sub['area'].values,
    't': df_sub['t'].values,
}

Perform the sampling and save the samples in a data frame for further analysis.  
Also run some diagnostics on the samples.

In [None]:
# Run the sampler only when nothing was loaded from disk
if not load_dump_file:
    samples_linear = sm_linear.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 11},
    )

# Tidy data frame of the samples for easy use later
df_linear = bebi103.stan.to_dataframe(samples_linear)

# Sampler diagnostics
bebi103.stan.check_all_diagnostics(samples_linear)

Looks great! Then we can look at the trace plot and corner plot to see what parameter estimates we have.

In [None]:
# Trace plot of the level-0 hyperparameters a and k of the two-level linear model
bokeh.io.show(bebi103.viz.trace_plot(samples_linear, pars=['a', 'k'], line_width=2))

In [None]:
# Corner plot of the level-0 hyperparameters a and k of the two-level linear model
bokeh.io.show(bebi103.viz.corner(samples_linear, pars=['a', 'k'], plot_width=300))

Let's do the posterior predictive check using the function we mentioned before.

In [None]:
# x values, measured areas and posterior predictive draws of the subset
time = df_sub['time (min)'].values
val = df_sub['area'].values
df_linear_ppc = bebi103.stan.extract_array(samples_linear, name='area_ppc')

# Posterior predictive envelopes with the data overlaid
p1 = hw92_predictive(
    df_linear_ppc,
    time,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=500,
    plot_height=400,
    title='Linear (two-level)',
)

bokeh.io.show(p1)

In [None]:
# ECDFs of the marginal posterior of each level-0 hyperparameter
plots = [
    bebi103.viz.ecdf(
        df_linear[param],
        x_axis_label=param,
        plot_height=200,
        plot_width=250,
    )
    for param in ['a', 'k']
]
grid = bokeh.layouts.gridplot(plots, ncols=3)
bokeh.io.show(grid)

We want to save the samples in a separate file for further analysis.

In [None]:
# Dump the samples together with the compiled model, but never overwrite an
# existing dump (dump_filename is set in the load-or-compile cell above)
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(fit=samples_linear, model=sm_linear, pkl_file=dump_filename)

Let's move on and code up the Noncentered exponential model:

For the exponential model, we initially did the same as for the linear model, but we found that we needed a very large adaptive delta to eliminate the divergences. We assume this is because the values of $k$ in the exponential model are even smaller, so that the sampler hits a 'funnel' just due to the very small values. We therefore make the following modification: when we sample $k$ and the parameters generated from it ($k_1$, etc.), we sample them from distributions that are 100 times as wide as the intended ones, and when we compute the mean of the area in the Gaussian distribution in the final layer, we divide the sampled parameters by 100. We think this helps with the funnel caused by small values.

In [None]:
# Stan program of the two-level exponential model in noncentered form.  k and
# tau_k are sampled on a scale that is 100 times larger than the growth rate
# itself; the factor is divided out again where the mean area
# a_1 * exp(k_1 * t / 100) is computed.  The generated quantities provide
# posterior predictive draws (area_ppc) and the pointwise log likelihood
# (log_lik) used for model comparison.
model_code_exp_noncentered = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;

  //Index arrays to keep track of hierarchical structure
  int index_1[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
  vector[N] area_temp;
  
  // Divide k by 100 when computing posterior
  for (i in 1:N) {
    area_temp[i] = a_1[index_1[i]] * exp(k_1[index_1[i]] * t[i] / 100);
  }
}

model {
  a ~ normal(1.4, 0.3);
  // 100 times larger prior for k and tau_k
  k ~ normal(1, 0.2);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.1);

  a_1_tilde ~ normal(0, 1);
  k_1_tilde ~ normal(0, 1);

  area ~ normal(area_temp, sigma);
}

generated quantities {
  vector[N] area_ppc;
  real log_lik[N];
  
  // Posterior predictive check
  for (i in 1:N) {
    area_ppc[i] = normal_rng(area_temp[i], sigma);
  }
  
  // Compute pointwise log likelihood
  for (i in 1:N) {
    log_lik[i] = normal_lpdf(area[i] | area_temp[i], sigma);
  }
}
"""

The Stan code is attached above for reference, but the model is compiled from the standalone file.

In [None]:
# Load dumped samples of the two-level exponential model if possible,
# otherwise compile the model so that it can be sampled below.

# Name of the dumped file.  Defined unconditionally because the cell that
# saves the samples later also needs it, whether or not loading was attempted;
# otherwise it could still point at the dump of a previous model.
dump_filename = 'exp_level1'

# Try to load unless loading was switched off globally
# (global_load wins; with neither switch set the default is to load)
load_dump_file = global_load or not global_no_load

if load_dump_file:
    # See if the file exists
    if os.path.isfile(dump_filename):
        # If it exists, load the samples together with the compiled model.
        # NOTE: this unpickles the file, so only load dumps you created yourself.
        samples_exp, sm_exp = bebi103.stan.pickle_load_samples(dump_filename)
    else:
        # Nothing to load: fall back to compiling, and remember that sampling
        # still has to be done
        print('No dumped file found, compiling the model instead')
        load_dump_file = False
        sm_exp = bebi103.stan.StanModel(model_code=model_code_exp_noncentered)
# Compile the model if no dumped file is to be loaded
else:
    sm_exp = bebi103.stan.StanModel(file='hw91_model_code_exp_noncentered.Stan')

In order to test our model, let's just use a subset of the data first.  
Let's again use the data of growth events 1 and 2 of bacterium 1.

In [None]:
# Build the test subset: growth events 1 and 2 of bacterium 1
df_sub1, df_sub2 = (
    df_bacterium1.loc[df_bacterium1['growth_event'] == event]
    for event in (1, 2)
)
df_sub = pd.concat([df_sub1, df_sub2])

# Preview the subset
df_sub.head()

Make the data into a dict for input to the model.

In [None]:
# Inputs of the two-level model.  The growth-event numbers of the subset
# (1 and 2) are used directly as the index of the hierarchy.
data = {
    'N': len(df_sub),
    'J_1': 2,
    'index_1': df_sub['growth_event'].values,
    'area': df_sub['area'].values,
    't': df_sub['t'].values,
}

Perform the sampling and save the samples in a data frame for further analysis.  
Also run some diagnostics on the samples.

In [None]:
# Run the sampler only when nothing was loaded from disk
if not load_dump_file:
    samples_exp = sm_exp.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 13},
    )

# Sampler diagnostics
bebi103.stan.check_all_diagnostics(samples_exp)

Looks OK! Then we can look at the trace plot and corner plot to see what parameter estimates we have.

In [None]:
# Trace plot of the level-0 hyperparameters a and k of the two-level exponential
# model (k is on the 100-times-larger scale used for sampling)
bokeh.io.show(bebi103.viz.trace_plot(samples_exp, pars=['a', 'k'], line_width=2))

In [None]:
# Corner plot of the level-0 hyperparameters a and k of the two-level exponential
# model (k is on the 100-times-larger scale used for sampling)
bokeh.io.show(bebi103.viz.corner(samples_exp, pars=['a', 'k'], plot_width=300))

Do the posterior predictive check on this model.  
Plot the linear(2-level) and exponential(2-level) posterior predictive check side by side.

In [None]:
# x values, measured areas and posterior predictive draws of the subset
time = df_sub['time (min)'].values
val = df_sub['area'].values
df_exp_ppc = bebi103.stan.extract_array(samples_exp, name='area_ppc')

# Posterior predictive envelopes with the data overlaid
p2 = hw92_predictive(
    df_exp_ppc,
    time,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=500,
    plot_height=400,
    title='EXP (two-level)',
)

# Linear (p1, from the earlier cell) and exponential check side by side
side_by_side = bokeh.layouts.gridplot([[p1, p2]])
bokeh.io.show(side_by_side)

Save the sample of the two-level exponential model.

In [None]:
# Dump the samples together with the compiled model, but never overwrite an
# existing dump (dump_filename is set in the load-or-compile cell above)
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(fit=samples_exp, model=sm_exp, pkl_file=dump_filename)

Now let's compare the two two-level models:

In [None]:
def hw91_predictive_compare(df, x, y, namex='index_1', name='area_ppc',
                    x_axis_label=None, y_axis_label=None, title=None, plot_width=350, plot_height=225,
                    color=color_palette[0], p=None):
    '''Compare absolute values of the differences between sampling results and the data

    For every predictive sample whose index lies in 1..len(x), the absolute
    difference to the corresponding data point is computed; the ECDF of the
    result is plotted.  Samples with an index outside that range are left
    unchanged.  The input data frame is not modified.

    df - MCMC sampling data frame
    x - input variable (only its length is used)
    y - data; y[i] is compared with the samples where df[namex] == i + 1
    namex - name of the input variable in the data frame
    name - name of the predictive results in the data frame
    x_axis_label, y_axis_label, title, plot_width, plot_height - figure options
    color - color of the ECDF
    p - existing bokeh figure to draw on (optional)

    Returns the bokeh figure holding the ECDF.'''

    if p is None:
        p = bokeh.plotting.figure(plot_width=plot_width,
                                  plot_height=plot_height,
                                  x_axis_label=x_axis_label,
                                  y_axis_label=y_axis_label,
                                  title=title)

    # Work on a copy so that the caller's samples stay untouched
    new_df = df.copy(deep=True)

    # Match every sample to its data point in one vectorized pass instead of
    # rebuilding a boolean mask over the whole frame for each point
    observed = np.asarray(y)
    point = new_df[namex].values
    matched = (point >= 1) & (point <= len(x))
    residual = new_df.loc[matched, name].values - observed[point[matched].astype(int) - 1]
    new_df.loc[matched, name] = np.abs(residual)

    p = bebi103.viz.ecdf(new_df[name],
                         plot_width=plot_width,
                         plot_height=plot_height,
                         x_axis_label=x_axis_label,
                         y_axis_label=y_axis_label,
                         title=title,
                         color=color,
                         p=p)

    return p

In [None]:
# ECDFs of |posterior predictive sample - data| for both two-level models on
# one figure: exponential in the default color (color_palette[0]), linear in
# color_palette[1]
pc1 = hw91_predictive_compare(df_exp_ppc, time, val, name='area_ppc')
pc1 = hw91_predictive_compare(df_linear_ppc, time, val, name='area_ppc', p=pc1, color=color_palette[1])

bokeh.io.show(pc1)

For further model comparison, let's compute the LOO and Akaike weight of these samples: 

In [None]:
# Compare the two two-level models with LOO, based on the pointwise log
# likelihood stored as 'log_lik' in the generated quantities of each model
bebi103.stan.compare({'linear': samples_linear,
                      'exp': samples_exp},
                     log_likelihood='log_lik',
                     ic='loo')

We know that the smaller the LOO is, the bigger the elpd is, indicating a smaller Kullback-Leibler divergence (a better model).
So in general, the smaller the LOO and the larger the weight, the closer the model is to the true generative model. 
Using this standard, we can easily tell that the exponential (two-level) model is better than the linear (two-level) model.  

Now let's move on and consider the fact that we have 2 bacteria. So let's add one level to the hierarchical model to account for the variation between different bacteria.

#### Three-level hierarchical model

When considering the variation between bacteria, we basically add a top layer. The prior predictive check here is very similar to that of the two-level model, so we skip it in this part.

Let's code up our model for three-level linear model:

In [None]:
# Stan program of the three-level linear model in noncentered form.  Level 1
# has J_1 entries and level 2 has J_2 entries; index_1 maps every level-2
# entry to its level-1 parent and index_2 maps every data point to its
# level-2 entry.  The same tau_a and tau_k are used for both levels.  The
# generated quantities provide posterior predictive draws (area_ppc) and the
# pointwise log likelihood (log_lik).
model_code_linear_2 = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;
  int J_2;
  
  //Index arrays to keep track of hierarchical structure
  int index_1[J_2];
  int index_2[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
  
  // Hyperparameters level 2
  vector[J_2] a_2_tilde;
  vector[J_2] k_2_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
  
  vector[J_2] a_2 = a_1[index_1] + tau_a * a_2_tilde;
  vector[J_2] k_2 = k_1[index_1] + tau_k * k_2_tilde;
  
  vector[N] area_temp;
  
  for (i in 1:N) {
    area_temp[i] = a_2[index_2[i]] + k_2[index_2[i]] * t[i];
  }
}

model {
  a ~ normal(1.4, 0.3);
  k ~ normal(0.01, 0.002);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.001);

  a_1_tilde ~ normal(0, 1);
  k_1_tilde ~ normal(0, 1);
  
  a_2_tilde ~ normal(0, 1);
  k_2_tilde ~ normal(0, 1);

  area ~ normal(area_temp, sigma);
}

generated quantities {
  vector[N] area_ppc;
  real log_lik[N];
  
  // Posterior predictive check
  for (i in 1:N) {
    area_ppc[i] = normal_rng(area_temp[i], sigma);
  }
  
  // Compute pointwise log likelihood
  for (i in 1:N) {
    log_lik[i] = normal_lpdf(area[i] | area_temp[i], sigma);
  }
}
"""

The Stan code is attached above for reference, but the model is compiled from the standalone file.

In [None]:
# Load dumped samples of the three-level linear model if possible, otherwise
# compile the model so that it can be sampled below.

# Name of the dumped file.  Defined unconditionally because the cell that
# saves the samples later also needs it, whether or not loading was attempted;
# otherwise it could still point at the dump of a previous model.
dump_filename = 'linear_level2'

# Try to load unless loading was switched off globally
# (global_load wins; with neither switch set the default is to load)
load_dump_file = global_load or not global_no_load

if load_dump_file:
    # See if the file exists
    if os.path.isfile(dump_filename):
        # If it exists, load the samples together with the compiled model.
        # NOTE: this unpickles the file, so only load dumps you created yourself.
        samples_linear_2, sm_linear_2 = bebi103.stan.pickle_load_samples(dump_filename)
    else:
        # Nothing to load: fall back to compiling, and remember that sampling
        # still has to be done
        print('No dumped file found, compiling the model instead')
        load_dump_file = False
        sm_linear_2 = bebi103.stan.StanModel(model_code=model_code_linear_2)
# Compile the model if no dumped file is to be loaded
else:
    # The keyword is `file` (it was misspelled `fiel`, which made this branch fail)
    sm_linear_2 = bebi103.stan.StanModel(file='hw91_model_code_linear_2.Stan')

In order to test the model, we slice out a subset of the data: growth events 1 and 2 of bacterium 1 and growth event 3 of bacterium 2.

In [None]:
# Test subset: growth events 1 and 2 of bacterium 1 and growth event 3 of bacterium 2
df_sub1, df_sub2, df_sub3 = (
    df.loc[(df['growth_event'] == event) & (df['bacterium'] == bacterium)]
    for event, bacterium in ((1, 1), (2, 1), (3, 2))
)

# Stack the three pieces in that order, then rename for convenience
df_sub = pd.concat([df_sub1, df_sub2, df_sub3]).rename(columns={'area (sq um)': 'area'})

df_sub.head()

In [None]:
# Put the data into a dict for the three-level model: level_cols lists the
# columns that define the hierarchy (top level first), data_cols the columns
# that are passed on as data.
# NOTE(review): df_part is the second return value — presumably the data frame
# with the generated index columns; confirm against the bebi103 documentation.
data, df_part = bebi103.stan.df_to_datadict_hier(df_sub,
                                           level_cols=['bacterium', 'growth_event'],
                                           data_cols=['area', 't'])

Perform the sampling and save the samples in a data frame for further analysis.  
Also run some diagnostics on the samples.

In [None]:
# Run the sampler only when nothing was loaded from disk
if not load_dump_file:
    samples_linear_2 = sm_linear_2.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 13},
    )

# Sampler diagnostics
bebi103.stan.check_all_diagnostics(samples_linear_2)

It looks great! Then we can look at the trace plot and corner plot to see what parameter estimates we have.

In [None]:
# Trace plot of the level-0 hyperparameters a and k of the three-level linear model
bokeh.io.show(bebi103.viz.trace_plot(samples_linear_2, pars=['a', 'k'], line_width=2))

In [None]:
# Corner plot of the level-0 hyperparameters a and k of the three-level linear model
bokeh.io.show(bebi103.viz.corner(samples_linear_2, pars=['a', 'k'], plot_width=300))

Do the posterior predictive check on this model.

In [None]:
# Observed time points and areas of the subset
time = df_sub['time (min)'].values
val = df_sub['area'].values

# Posterior predictive draws of the area from the linear model
df_lin2_ppc = bebi103.stan.extract_array(samples_linear_2, name='area_ppc')

# Predictive plot built by the notebook's hw92_predictive helper
p3 = hw92_predictive(
    df_lin2_ppc,
    time,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=500,
    plot_height=400,
    title='Linear (Three-level)',
    baseline=0,
)

bokeh.io.show(p3)

Save the sample for further analysis.

In [None]:
# Pickle the samples together with the model, unless the dump file already exists
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(
        fit=samples_linear_2,
        model=sm_linear_2,
        pkl_file=dump_filename,
    )

Let's move on and code up the 3-level exponential model:

In [None]:
# Stan program for the three-level hierarchical exponential growth model.
#
# Hierarchy: hyperparameters (a, k) -> J_1 level-1 parameters (a_1, k_1)
# -> J_2 level-2 parameters (a_2, k_2). index_1 maps each level-2 entry to
# its level-1 parent and index_2 maps each data point to its level-2 entry.
# Both levels use a non-centered parametrization and share the same spread
# parameters tau_a and tau_k.
#
# The mean area is a_2 * exp(k_2 * t / 100), i.e. k is sampled on a scale
# 100 times larger than the rate that multiplies t. The likelihood is
# normal with a single sigma. The generated quantities block draws
# posterior predictive areas (area_ppc) and stores the pointwise log
# likelihood (log_lik).
model_code_exp_2 = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;
  int J_2;
  
  //Index arrays to keep track of hierarchical structure
  int index_1[J_2];
  int index_2[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
  
  // Hyperparameters level 2
  vector[J_2] a_2_tilde;
  vector[J_2] k_2_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
  
  vector[J_2] a_2 = a_1[index_1] + tau_a * a_2_tilde;
  vector[J_2] k_2 = k_1[index_1] + tau_k * k_2_tilde;
  
  vector[N] area_temp;
  
  // Divide k by 100 when computing posterior
  for (i in 1:N) {
    area_temp[i] = a_2[index_2[i]] * exp(k_2[index_2[i]] * t[i] / 100);
  }
}

model {
  a ~ normal(1.4, 0.3);
  // 100 times large prior for k and tau_k
  k ~ normal(1, 0.2);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.1);

  a_1_tilde ~ normal(0, 1);
  k_1_tilde ~ normal(0, 1);
  
  a_2_tilde ~ normal(0, 1);
  k_2_tilde ~ normal(0, 1);

  area ~ normal(area_temp, sigma);
}

generated quantities {
  vector[N] area_ppc;
  real log_lik[N];
  
  // Posterior predictive check
  for (i in 1:N) {
    area_ppc[i] = normal_rng(area_temp[i], sigma);
  }
  
  // Compute pointwise log likelihood
  for (i in 1:N) {
    log_lik[i] = normal_lpdf(area[i] | area_temp[i], sigma);
  }
}
"""

The Stan code is attached above for reference, but the model is compiled from the standalone file.

In [None]:
# Name of the pickle file holding the samples of the three-level exponential
# model (subset). It is assigned unconditionally because the save cell further
# down reads `dump_filename` even when loading is switched off; if it were only
# set inside the loading branch it would still hold the previous model's file
# name and the samples would be skipped or written under the wrong name.
dump_filename = 'exp_level2'

# Decide whether to try loading the dumped samples: the global switches take
# precedence, otherwise use the local default set in the final branch.
if global_load:
    # If load globally, set it locally True
    load_dump_file = True
elif global_no_load:
    # If not load globally, set it locally False
    load_dump_file = False
else:
    # If global parameter is not specified, set it manually
    load_dump_file = False

if load_dump_file and os.path.isfile(dump_filename):
    # The dump file exists: load the samples and the compiled model from it
    samples_exp_2, sm_exp_2 = bebi103.stan.pickle_load_samples(dump_filename)
elif load_dump_file:
    # Loading was requested but there is no dump file: fall back to
    # compiling, and clear the flag so the sampling cell draws new samples
    print('No dumped file found, compiling the model instead')
    load_dump_file = False
    sm_exp_2 = bebi103.stan.StanModel(model_code=model_code_exp_2)
else:
    # Loading is switched off: compile the model from the standalone file
    sm_exp_2 = bebi103.stan.StanModel(file='hw91_model_code_exp_2.Stan')

To test the model, we slice out a small subset of the data: growth events 1 and 2 of bacterium 1 and growth event 3 of bacterium 2.

In [None]:
# Select the three growth events used as the test subset
df_sub1 = df[(df['bacterium'] == 1) & (df['growth_event'] == 1)]
df_sub2 = df[(df['bacterium'] == 1) & (df['growth_event'] == 2)]
df_sub3 = df[(df['bacterium'] == 2) & (df['growth_event'] == 3)]

# Concatenate them in order and use the short name for the area column
df_sub = pd.concat([df_sub1, df_sub2, df_sub3]).rename(
    columns={'area (sq um)': 'area'}
)

df_sub.head()

In [None]:
# Build the data dictionary for the sampler from the subset, with
# bacterium and growth event as the hierarchy levels
data, df_part = bebi103.stan.df_to_datadict_hier(
    df_sub,
    level_cols=['bacterium', 'growth_event'],
    data_cols=['area', 't'],
)

Perform the sampling and save the sample in a data frame for further analysis.  
Also do some diagnostics on the sample.

In [None]:
# Draw samples only when they were not loaded from a dump file
if not load_dump_file:
    samples_exp_2 = sm_exp_2.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 13},
    )

# Check the sampler diagnostics for the (loaded or freshly drawn) samples
bebi103.stan.check_all_diagnostics(samples_exp_2)

Looks great! Then we can look at the trace plot and corner plot to see what parameter estimates we have.

In [None]:
# Trace plot of the hyperparameters a and k of the three-level exponential model
bokeh.io.show(
    bebi103.viz.trace_plot(samples_exp_2, pars=['a', 'k'], line_width=2)
)

In [None]:
# Corner plot of the hyperparameters a and k of the three-level exponential model
bokeh.io.show(
    bebi103.viz.corner(samples_exp_2, pars=['a', 'k'])
)

Do the posterior predictive check on this model.  
Plot the linear(3-level) and exponential(3-level) posterior predictive check side by side.

In [None]:
# Observed time points and areas of the subset
time = df_sub['time (min)'].values
val = df_sub['area'].values

# Posterior predictive draws of the area from the exponential model
df_exp2_ppc = bebi103.stan.extract_array(samples_exp_2, name='area_ppc')

# Predictive plot built by the notebook's hw92_predictive helper
p4 = hw92_predictive(
    df_exp2_ppc,
    time,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=500,
    plot_height=400,
    title='EXP (Three-level)',
)

# Show the linear (p3) and exponential (p4) plots next to each other
bokeh.io.show(bokeh.layouts.gridplot([[p3, p4]]))

Save the sample of the three-level exponential model.

In [None]:
# Pickle the samples together with the model, unless the dump file already exists
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(
        fit=samples_exp_2,
        model=sm_exp_2,
        pkl_file=dump_filename,
    )

Now let's compare the two three-level models:

In [None]:
# Start the comparison figure with the exponential model ...
pc2 = hw91_predictive_compare(df_exp2_ppc, time, val, name='area_ppc')
# ... then pass that figure back in (p=pc2) for the linear model, in a second color
pc2 = hw91_predictive_compare(
    df_lin2_ppc,
    time,
    val,
    name='area_ppc',
    p=pc2,
    color=color_palette[1],
)

# Show it next to pc1, the comparison figure created earlier in the notebook
bokeh.io.show(bokeh.layouts.gridplot([[pc1, pc2]]))

For further model comparison, let's compute the LOO and Akaike weight of these samples: 

In [None]:
# Compare the two three-level models by LOO, using the pointwise
# log likelihood stored in `log_lik`
bebi103.stan.compare(
    {'linear': samples_linear_2, 'exp': samples_exp_2},
    log_likelihood='log_lik',
    ic='loo',
)

We know that the smaller the LOO is, the larger the elpd is, indicating a smaller Kullback-Leibler divergence (a better model).
So in general, the smaller the LOO and the larger the weight, the closer the model is to the true generative model. 
Using this standard, we can easily tell that the exponential (three-level) model is better than the linear (three-level) model when dealing with this small subset of the data.  

Now let's move on and analyze the whole dataset.

In [None]:
# Build the sampler's data dictionary from the complete data frame, with
# bacterium and growth event as the hierarchy levels
data, df_part = bebi103.stan.df_to_datadict_hier(
    df,
    level_cols=['bacterium', 'growth_event'],
    data_cols=['area', 't'],
)

# Inspect the resulting dictionary
data

When using `bebi103.stan.df_to_datadict_hier`, we sometimes find that the input data are not in the right order, which causes great trouble in further analysis and plotting. To prevent that, we check the order of the input data before putting it into the sampler.

In [None]:
# Bacterium 1: plot the difference between the areas in the data frame and
# the first len_data entries of the sampler input
p = bokeh.plotting.figure(height=300)

# Number of data points belonging to bacterium 1
len_data = len(df.loc[df['bacterium'] == 1, 'time (min)'])

p.line(
    df.loc[df['bacterium'] == 1, 'time (min)'].values,
    df.loc[df['bacterium'] == 1, 'area'].values - data['area'][:len_data],
)
bokeh.io.show(p)

In [None]:
# Bacterium 2: plot the difference between the areas in the data frame and
# the sampler input entries that follow the bacterium 1 block
p = bokeh.plotting.figure(height=300)

# Number of data points belonging to bacterium 1, used as the offset
len_data = len(df.loc[df['bacterium'] == 1, 'time (min)'])

p.line(
    df.loc[df['bacterium'] == 2, 'time (min)'].values,
    df.loc[df['bacterium'] == 2, 'area'].values - data['area'][len_data:],
    color=color_palette[1],
)
bokeh.io.show(p)

The order of the input data is right! Let's put them into the sampler.

In [None]:
# Name of the pickle file holding the samples of the three-level linear model
# on the full data set. It is assigned unconditionally because the save cell
# further down reads `dump_filename` even when loading is switched off; if it
# were only set inside the loading branch it would still hold the previous
# model's file name.
dump_filename = 'linear_level2_full'

# Decide whether to try loading the dumped samples: the global switches take
# precedence, otherwise use the local default set in the final branch.
if global_load:
    # If load globally, set it locally True
    load_dump_file = True
elif global_no_load:
    # If not load globally, set it locally False
    load_dump_file = False
else:
    # If global parameter is not specified, set it manually
    load_dump_file = True

if load_dump_file and os.path.isfile(dump_filename):
    # The dump file exists: load the samples and the compiled model from it
    samples_linear_2_full, sm_linear_2 = bebi103.stan.pickle_load_samples(dump_filename)
else:
    if load_dump_file:
        # Loading was requested but there is no dump file: clear the flag
        # so the sampling cell draws new samples
        print('No dumped file found, compiling the model instead')
        load_dump_file = False
    # Compile the model whenever nothing was loaded
    sm_linear_2 = bebi103.stan.StanModel(model_code=model_code_linear_2)

Sampling (3-level linear model):

In [None]:
# Draw samples on the full data set only when they were not loaded from a dump file
if not load_dump_file:
    samples_linear_2_full = sm_linear_2.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 15},
    )

In [None]:
# Run the sampler diagnostics on the full-data samples of the
# three-level linear model
bebi103.stan.check_all_diagnostics(samples_linear_2_full)

The diagnostics look great!  
Let's save the sample from the 3-level linear model:

In [None]:
# Pickle the samples together with the model, unless the dump file already exists
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(
        fit=samples_linear_2_full,
        model=sm_linear_2,
        pkl_file=dump_filename,
    )

In [None]:
# Name of the pickle file holding the samples of the three-level exponential
# model on the full data set. It is assigned unconditionally because the save
# cell further down reads `dump_filename` even when loading is switched off;
# if it were only set inside the loading branch it would still hold the
# previous model's file name.
dump_filename = 'exp_level2_full'

# Decide whether to try loading the dumped samples: the global switches take
# precedence, otherwise use the local default set in the final branch.
if global_load:
    # If load globally, set it locally True
    load_dump_file = True
elif global_no_load:
    # If not load globally, set it locally False
    load_dump_file = False
else:
    # If global parameter is not specified, set it manually
    load_dump_file = True

if load_dump_file and os.path.isfile(dump_filename):
    # The dump file exists: load the samples and the compiled model from it
    samples_exp_2_full, sm_exp_2 = bebi103.stan.pickle_load_samples(dump_filename)
else:
    if load_dump_file:
        # Loading was requested but there is no dump file: clear the flag
        # so the sampling cell draws new samples
        print('No dumped file found, compiling the model instead')
        load_dump_file = False
    # Compile the model whenever nothing was loaded
    sm_exp_2 = bebi103.stan.StanModel(model_code=model_code_exp_2)

Sampling (3-level exponential model):

In [None]:
# Draw samples on the full data set only when they were not loaded from a dump file
if not load_dump_file:
    samples_exp_2_full = sm_exp_2.sampling(
        data=data,
        seed=2389412,
        control={'adapt_delta': 0.99, 'max_treedepth': 15},
    )

In [None]:
# Run the sampler diagnostics on the full-data samples of the
# three-level exponential model
bebi103.stan.check_all_diagnostics(samples_exp_2_full)

The diagnostics look great!  
Let's save the sample from the 3-level exponential model:

In [None]:
# Pickle the samples together with the model, unless the dump file already exists
if not os.path.isfile(dump_filename):
    bebi103.stan.pickle_dump_samples(
        fit=samples_exp_2_full,
        model=sm_exp_2,
        pkl_file=dump_filename,
    )

Let's take a look at the parameter of the 3-level linear model:

In [None]:
# Corner plot of the parameters a and k (linear model, full data set)
bokeh.io.show(
    bebi103.viz.corner(samples_linear_2_full, pars=['a', 'k'], plot_width=300)
)

Let's plot the predictive data. When we try to do the plotting, we realize that plotting the 2 bacteria together would be very confusing, because both start from time = 0 and would overlay each other in the plot. So we decided to plot Bacterium 1 and Bacterium 2 separately.

Plot the predictive data from 3-level linear model:

In [None]:
# Measured areas and time points of the whole data set
val = df['area'].values
time = df['time (min)'].values

# Time points of each bacterium on its own, for the per-bacterium plots
time1 = df[df['bacterium'] == 1]['time (min)'].values
time2 = df[df['bacterium'] == 2]['time (min)'].values

In [None]:
# Posterior predictive draws of the area (linear model, full data set)
df_linear_2_ppc_full = bebi103.stan.extract_array(
    samples_linear_2_full,
    name='area_ppc',
)

In [None]:
# Predictive plot for bacterium 1
pf1 = hw92_predictive(
    df_linear_2_ppc_full,
    time1,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=2000,
    plot_height=1000,
    title='Linear (Three-level), full data set, bacterium 1',
)

# Predictive plot for bacterium 2.
# NOTE(review): baseline=len(time1) presumably offsets into the rows that
# follow bacterium 1 -- confirm against hw92_predictive.
pf2 = hw92_predictive(
    df_linear_2_ppc_full,
    time2,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=2000,
    plot_height=1000,
    title='Linear (Three-level), full data set, bacterium 2',
    baseline=len(time1),
)

# Stack the two plots vertically
bokeh.io.show(bokeh.layouts.gridplot([[pf1], [pf2]]))

Let's take a look at the parameter of the 3-level exponential model:

In [None]:
# Corner plot of the hyperparameters a and k (exponential model, full data set)
bokeh.io.show(
    bebi103.viz.corner(samples_exp_2_full, pars=['a', 'k'], plot_width=300)
)

Plot the predictive data from the 3-level exponential model:

In [None]:
# Posterior predictive draws of the area (exponential model, full data set)
df_exp_2_ppc_full = bebi103.stan.extract_array(
    samples_exp_2_full,
    name='area_ppc',
)

In [None]:
# Predictive plot for bacterium 1
pf3 = hw92_predictive(
    df_exp_2_ppc_full,
    time1,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=2000,
    plot_height=1000,
    title='EXP (Three-level), full data set, bacterium 1',
)

# Predictive plot for bacterium 2.
# NOTE(review): baseline=len(time1) presumably offsets into the rows that
# follow bacterium 1 -- confirm against hw92_predictive.
pf4 = hw92_predictive(
    df_exp_2_ppc_full,
    time2,
    val,
    perc=[99, 75, 50, 25],
    name='area_ppc',
    plot_width=2000,
    plot_height=1000,
    title='EXP (Three-level), full data set, bacterium 2',
    baseline=len(time1),
)

# Stack the two plots vertically
bokeh.io.show(bokeh.layouts.gridplot([[pf3], [pf4]]))

Let's compare these 2 models:

In [None]:
# Compare the two three-level models on the full data set by LOO, using the
# pointwise log likelihood stored in `log_lik`
bebi103.stan.compare(
    {'linear': samples_linear_2_full, 'exp': samples_exp_2_full},
    log_likelihood='log_lik',
    ic='loo',
)

In [None]:
# Start the comparison figure with the exponential model ...
pc3 = hw91_predictive_compare(df_exp_2_ppc_full, time, val, name='area_ppc')
# ... then pass that figure back in (p=pc3) for the linear model, in a second color
pc3 = hw91_predictive_compare(
    df_linear_2_ppc_full,
    time,
    val,
    name='area_ppc',
    p=pc3,
    color=color_palette[1],
)

bokeh.io.show(bokeh.layouts.gridplot([[pc3]]))

In [None]:
%load_ext watermark

In [None]:
%watermark -v -p numpy,scipy,bokeh,jupyterlab