In [1]:
import pandas as pd
import numpy as np

import numba

import bebi103

import altair as alt
import altair_catplot as altcat

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()

color_palette=['#4e79a7', '#f28e2b', '#e15759', '#76b7b2', '#59a14f', '#edc948', '#b07aa1', '#ff9da7', '#9c755f', '#bab0ac']

In [2]:
df = pd.read_csv('../data/hw_4.2_caulobacter_growth_image_processing_results.csv')

df.head()

Unnamed: 0,time (min),area (sq um),growth_event,bacterium
0,1.0,1.300624,0,1
1,2.0,1.314144,0,1
2,3.0,1.295216,0,1
3,4.0,1.314144,0,1
4,5.0,1.341184,0,1


Since the length of division is different for all growth events we should add another column that restarts the time count from 0 every time there's a division.

In [3]:
time = []
j = 0
for i in df['growth_event'].diff():
    if i == 0:
        j += 1
        time.append(j)
    else:
        j = 0
        time.append(j)

df['t'] = time

In [4]:
df.head()

Unnamed: 0,time (min),area (sq um),growth_event,bacterium,t
0,1.0,1.300624,0,1,0
1,2.0,1.314144,0,1,1
2,3.0,1.295216,0,1,2
3,4.0,1.314144,0,1,3
4,5.0,1.341184,0,1,4


Let's start with a subset of the data. We will look at bacterium 1.

In [5]:
df_bacterium1 = df.loc[df['bacterium'] == 1]

In [6]:
p = bokeh.plotting.figure(plot_width=650,
                          plot_height=250,
                          x_axis_label='time (min)',
                          y_axis_label='cell area (sq µm)')

# Specify the glyphs
colors = ['#1f78b4', '#a6cee3']
for i, g in df_bacterium1.groupby('growth_event'):
    p.circle(g['time (min)'], g['area (sq um)'], size=3, color=colors[i%2])

bokeh.io.show(p)

#### No hierarchy

Let's just model a single growth event for now with no hierarchy using the exponential model.

In [None]:
def data_prior_pred(t):
    '''
    Samples parameter values according to the prior and generates
    data y at the values given in t.
    '''
    # Sample parameter values according to priors
    a = np.random.normal(1.2, 0.4)
    k = np.random.normal(0.01, 0.003)
    sigma = np.abs(np.random.normal(0, 0.1))
    
    # Generate random data according to the likelihood
    return np.random.normal(a * np.exp(k * t), sigma)

In [None]:
p = bokeh.plotting.figure(height=300, width=450,
                          x_axis_label='time',
                          y_axis_label='area')

t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Plot simulated data
for i in range(100):
    p.circle(t, data_prior_pred(t), size=3, alpha=0.1)

# Plot original data
p.circle(t, df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 'area (sq um)'].values, 
         color='black', size=4)
bokeh.io.show(p)

In [None]:
p = bokeh.plotting.figure(height=300, width=450,
                          x_axis_label='time',
                          y_axis_label='area')

t = df_bacterium1.loc[df_bacterium1['growth_event'] == 3, 't'].values

# Plot simulated data
for i in range(100):
    p.circle(t, data_prior_pred(t), size=3, alpha=0.1)

# Plot original data
p.circle(t, df_bacterium1.loc[df_bacterium1['growth_event'] == 3, 'area (sq um)'].values, 
         color='black', size=4)
bokeh.io.show(p)

Now let's try the linear model.

In [10]:
def data_prior_pred_linear(t):
    '''
    Samples parameter values according to the prior and generates
    data y at the values given in t.
    '''
    # Sample parameter values according to priors
    a = np.random.normal(1.2, 0.4)
    b = np.random.normal(0.01, 0.003)
    sigma = np.abs(np.random.normal(0, 0.1))
    
    # Generate random data according to the likelihood
    return np.random.normal(a + b * t, sigma)

In [11]:
p = bokeh.plotting.figure(height=300, width=450,
                          x_axis_label='time',
                          y_axis_label='area')

t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Plot simulated data
for i in range(100):
    p.circle(t, data_prior_pred_linear(t), size=3, alpha=0.1)

# Plot original data
p.circle(t, df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 'area (sq um)'].values, 
         color='black', size=4)
bokeh.io.show(p)

#### One level hierarchical model

Let's try a one level hierarchical model for bacterium 1.

In [27]:
model_code_pri_pred = """
data {
  // Number of data points for each experiment
  int N;
  // Number of entries in each level of the hierarchy
  int J_1;
  // Input of parameters of the priors
  real hyper_a0_mu;
  real hyper_a0_sigma;
  
  real hyper_k0_mu;
  real hyper_k0_sigma;
  
  real hyper_a0_tau;
  real hyper_k0_tau;
}


generated quantities {
  // Total number of data points 
  real a[N * J_1];
  real k[N * J_1];
  
  // Priors
  real a0 = normal_rng(hyper_a0_mu, hyper_a0_sigma);
  real k0 = normal_rng(hyper_k0_mu, hyper_k0_sigma);
  
  real a0_sigma = fabs(normal_rng(0, hyper_a0_sigma));
  real k0_sigma = fabs(normal_rng(0, hyper_k0_sigma));
  
  real a0_tau = fabs(normal_rng(0, hyper_a0_tau));
  real k0_tau = fabs(normal_rng(0, hyper_k0_tau));
  
  // Second layer
  real a_1[J_1]; 
  real k_1[J_1];
  
  for (i in 1:J_1) {
    a_1[i] = normal_rng(a0, a0_tau);
    k_1[i] = normal_rng(k0, k0_tau);
    for (j in 1:N) {
      a[(i - 1) * N + j] = normal_rng(a_1[i], a0_sigma);
      k[(i - 1) * N + j] = normal_rng(k_1[i], k0_sigma);
    }
  }
  
  }
"""

In [28]:
# Compile
sm_gen = bebi103.stan.StanModel(model_code=model_code_pri_pred)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_1304293a87b7270341545d57d6b7cef0 NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [32]:
# Number of iteration
N_iter = 1000

# input data from df
data = dict(N=100,
            J_1=3,
            hyper_a0_mu=1.4,
            hyper_a0_sigma=0.3,
            hyper_k0_mu=0.01,
            hyper_k0_sigma=0.002,
            hyper_a0_tau=0.1,
            hyper_k0_tau=0.001)

# Sample
df_pred = sm_gen.sampling(data=data,
                     algorithm='Fixed_param',
                     warmup=0,
                     chains=1,
                     iter=N_iter)

df_samples_a = bebi103.stan.extract_array(df_pred, name='a')
df_samples_k = bebi103.stan.extract_array(df_pred, name='k')

p_a = bebi103.viz.predictive_ecdf(df_pred, 
                                'a', 
                                x_axis_label='a')
p_k = bebi103.viz.predictive_ecdf(df_pred, 
                                'k', 
                                x_axis_label='k')
# Plot
bokeh.io.show(bokeh.layouts.gridplot([p_a, p_k], ncols=2))

Check for negative values:

In [13]:
len(df_samples_a[df_samples_a['a'] < 0]) / len(df_samples_a)

0.003425170068027211

In [14]:
len(df_samples_k[df_samples_k['k'] < 0]) / len(df_samples_k)

0.0021360544217687073

In [15]:
len(df_samples_a)

294000

Prior predictive checks

In [16]:
a = df_samples_a['a'].values
k = df_samples_k['k'].values

p = bokeh.plotting.figure(height=300, width=450,
                          x_axis_label='time',
                          y_axis_label='area')

t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Plot simulated data
for i in range(0, len(df_samples_a), 1000):
    p.circle(t, a[i] * np.exp(k[i] * t), size=3, alpha=0.1)

# Plot original data
p.circle(t, df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 'area (sq um)'].values, 
         color='black', size=4)
bokeh.io.show(p)

Trying the same priors/samples for the linear model.

In [17]:
a = df_samples_a['a'].values
k = df_samples_k['k'].values

p = bokeh.plotting.figure(height=300, width=450,
                          x_axis_label='time',
                          y_axis_label='area')

t = df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 't'].values

# Plot simulated data
for i in range(0, len(df_samples_a), 1000):
    p.circle(t, a[i] + k[i] * t, size=3, alpha=0.1)

# Plot original data
p.circle(t, df_bacterium1.loc[df_bacterium1['growth_event'] == 0, 'area (sq um)'].values, 
         color='black', size=4)
bokeh.io.show(p)

prior predictive check trial 2

In [50]:
model_code_pri_pred_2 = """
data {
  // Number of data points for each experiment
  int N;
  // Number of entries in each level of the hierarchy
  int J_1;
  // Time
  vector[N] t;
  
  // Input of parameters of the priors
  real hyper_a0_mu;
  real hyper_a0_sigma;
  
  real hyper_k0_mu;
  real hyper_k0_sigma;
  
  real hyper_a0_tau;
  real hyper_k0_tau;
  
  real sigma;
}


generated quantities {
  // Total number of data points 
  real a[N * J_1];
  real k[N * J_1];
  vector[N] area;
  
  // Priors
  real a0 = normal_rng(hyper_a0_mu, hyper_a0_sigma);
  real k0 = normal_rng(hyper_k0_mu, hyper_k0_sigma);
  
  real a0_sigma = fabs(normal_rng(0, hyper_a0_sigma));
  real k0_sigma = fabs(normal_rng(0, hyper_k0_sigma));
  
  real a0_tau = fabs(normal_rng(0, hyper_a0_tau));
  real k0_tau = fabs(normal_rng(0, hyper_k0_tau));
  
  // Second layer
  real a_1[J_1]; 
  real k_1[J_1];
  
  for (i in 1:J_1) {
    a_1[i] = normal_rng(a0, a0_tau);
    k_1[i] = normal_rng(k0, k0_tau);
    for (j in 1:N) {
      a[(i - 1) * N + j] = normal_rng(a_1[i], a0_sigma);
      k[(i - 1) * N + j] = normal_rng(k_1[i], k0_sigma);
    }
  }
  
  for (i in 1:N) {
    area[i] = random_rng(a[i] * exp(k[i] * t), sigma);
  }

  }
"""

# Compile
sm_gen = bebi103.stan.StanModel(model_code=model_code_pri_pred_2)

ValueError: Failed to parse Stan model 'anon_model_44a83536fa1a8de78872b4b39bcad053'. Error message:
SYNTAX ERROR, MESSAGE(S) FROM PARSER:

No matches for: 

  random_rng(vector, real)

Function random_rng not found.
  error in 'unknown file name' at line 54, column 54
  -------------------------------------------------
    52:   
    53:   for (i in 1:N) {
    54:     area[i] = random_rng(a[i] * exp(k[i] * t), sigma);
                                                             ^
    55:   }
  -------------------------------------------------



In [37]:
df_sub1 = df_bacterium1.loc[df_bacterium1['growth_event'] == 0]
df_sub2 = df_bacterium1.loc[df_bacterium1['growth_event'] == 1]
df_sub = pd.concat([df_sub1, df_sub2])

df_sub.head()

Unnamed: 0,time (min),area (sq um),growth_event,bacterium,t
0,1.0,1.300624,0,1,0
1,2.0,1.314144,0,1,1
2,3.0,1.295216,0,1,2
3,4.0,1.314144,0,1,3
4,5.0,1.341184,0,1,4


In [44]:
# Number of iteration
N_iter = 1000

# input data from df
data = dict(N=len(df_sub),
            J_1=len(df_sub['growth_event'].unique()),
            t=df_sub['t'].values,
            hyper_a0_mu=1.4,
            hyper_a0_sigma=0.3,
            hyper_k0_mu=0.01,
            hyper_k0_sigma=0.002,
            hyper_a0_tau=0.1,
            hyper_k0_tau=0.001,
            sigma=0.1)

# Sample
df_pred = sm_gen.sampling(data=data,
                     algorithm='Fixed_param',
                     warmup=0,
                     chains=1,
                     iter=N_iter)

df_samples = bebi103.stan.extract_array(df_pred, name='area')

p = bebi103.viz.predictive_ecdf(df_pred, 
                                'area', 
                                x_axis_label='area')
# Plot
bokeh.io.show(p)

RuntimeError: column 'area' is either absent or scalar-valued.

Noncentered model code:

In [76]:
model_code_linear_noncentered = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;

  //Index arrays to keep track of hierarchical structure
  int index_1[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
}

model {
  a ~ normal(1.4, 0.3);
  k ~ normal(0.01, 0.002);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.001);

  a_1_tilde ~ normal(0, 0.1);
  k_1_tilde ~ normal(0, 0.001);

  area ~ normal(a_1[index_1] + k_1[index_1] .* t, sigma);
}
"""

In [70]:
model_code_linear_noncentered = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;

  //Index arrays to keep track of hierarchical structure
  int index_1[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}

parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
  vector[J_1] area_temp = a_1 + k_1 .* t;
}

model {
  a ~ normal(1.4, 0.3);
  k ~ normal(0.01, 0.002);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.001);

  a_1_tilde ~ normal(0, 0.1);
  k_1_tilde ~ normal(0, 0.001);

  area ~ normal(a_1[index_1] + k_1[index_1] .* t, sigma);
}

generated quantities {
  real area_ppc[N];
  
  for (i in 1:N) {
    area_ppc[i] = normal_rng(area_temp[i], sigma);
  }
}
"""

In [77]:
sm_noncentered = bebi103.stan.StanModel(model_code=model_code_linear_noncentered)

Using cached StanModel.


In [None]:
# Rename for convenience
df_bacterium1 = df_bacterium1.rename(columns={'area (sq um)': 'area'})

df_bacterium1.head()

In [79]:
data, df_part = bebi103.stan.df_to_datadict_hier(df_bacterium1,
                                           level_cols=['growth_event'],
                                           data_cols=['area', 't'])

# Take a look
data

{'N': 1888,
 'J_1': 20,
 'index_1': array([ 1,  1,  1, ..., 20, 20, 20]),
 'area': array([1.300624, 2.019888, 2.00096 , ..., 1.644032, 1.600768, 2.355184]),
 't': array([ 0, 71, 70, ..., 21, 19, 80])}

In [80]:
# Sample
samples_linear = sm_noncentered.sampling(data=data, seed=2389412)

# Convert to data frame for easy use later
df_linear = bebi103.stan.to_dataframe(samples_linear)

bebi103.stan.check_all_diagnostics(samples_linear)



n_eff / iter looks reasonable for all parameters.
Rhat looks reasonable for all parameters.
0 of 4000 (0.0%) iterations ended with a divergence.
58 of 4000 (1.45%) iterations saturated the maximum tree depth of 10.
  Try running again with max_treedepth set to a larger value to avoid saturation.
E-BFMI indicated no pathological behavior.


8

In [82]:
df_linear_ppc = bebi103.stan.extract_array(samples_linear, name='k_1')

In [23]:
bokeh.io.show(bebi103.viz.corner(samples_linear, pars=['a', 'k']))

In [24]:
# Marginalized distributions of each parameter 
plots = [bebi103.viz.ecdf(df_linear[param], x_axis_label=param, plot_height=200, plot_width=250) 
                 for param in ['a', 'k']]
bokeh.io.show(bokeh.layouts.gridplot(plots, ncols=3))

How to do posterior predictive checks?

In [27]:
def hw92_predictive(df, x, y, namex='index_1', name='area_ppc', perc=[80, 60, 40, 20], 
                    x_axis_label=None, y_axis_label=None, title=None, plot_width=350, plot_height=225, 
                    color='blue', data_color=color_palette[1]):
    '''Mimic of predictive ECDF'''
    
    if color not in ['green', 'blue', 'red', 'gray',
                     'purple', 'orange', 'betancourt']:
        raise RuntimeError("Only allowed colors are 'green', 'blue', 'red', 'gray', 'purple', 'orange'")
    
    colors = {'blue': ['#9ecae1','#6baed6','#4292c6','#2171b5','#084594'],
              'green': ['#a1d99b','#74c476','#41ab5d','#238b45','#005a32'],
              'red': ['#fc9272','#fb6a4a','#ef3b2c','#cb181d','#99000d'],
              'orange': ['#fdae6b','#fd8d3c','#f16913','#d94801','#8c2d04'],
              'purple': ['#bcbddc','#9e9ac8','#807dba','#6a51a3','#4a1486'],
              'gray': ['#bdbdbd','#969696','#737373','#525252','#252525'],
              'betancourt': ['#DCBCBC', '#C79999', '#B97C7C',
                             '#A25050', '#8F2727', '#7C0000']}
    
    p = bokeh.plotting.figure(plot_width=plot_width,
                              plot_height=plot_height,
                              x_axis_label=x_axis_label,
                              y_axis_label=y_axis_label,
                              title=title)
    
    Nb = len(x)
    y_ppc = np.empty((len(perc) * 2 + 1, Nb))
    for i in range(Nb):
        temp = df.loc[df[namex]== i+1, name].values
        y_ppc[-1, i] = np.median(temp)
        for j in range(len(perc)):
            y_ppc[j * 2, i] = np.percentile(temp, 50 - perc[j] / 2)
            y_ppc[j * 2 + 1, i] = np.percentile(temp, 50 + perc[j] / 2)
    
    for j in range(len(perc)):
        bebi103.viz.fill_between(x, y_ppc[j * 2, :],
                     x, y_ppc[j * 2 + 1,:],
                     p=p,
                     show_line=False,
                     fill_color=colors[color][j])
        
    p.line(x, y_ppc[-1, :],
           line_width=2,
           color=colors[color][-1])
    
    p.line(x, y, line_width=2, color='orange')
    
    return p

In [None]:
# p1 = hw92_predictive(df_linear, conc_b, df['fluorescence'].values,name='F', plot_width=500, plot_height=400)

# bokeh.io.show(p1)

i think this stuff below is wrong probably

In [115]:
df_a = bebi103.stan.extract_array(samples_linear, 'a_1')
df_b = bebi103.stan.extract_array(samples_linear, 'k_1')

In [94]:
# Plot measured data set
p = bebi103.viz.ecdf(df_bacterium1['area'].values,
                     x_axis_label='area',
                     color='orange',
                     level='overlay')

# Plot posterior predictive ECDFs
for i in df_a['chain_idx'].unique()[::10]:
    a1 = df_a.loc[df_a['chain_idx']==i, 'a_1'] + \
    df_b.loc[df_b['chain_idx']==i, 'k_1'] * np.arange(len(df_b.loc[df_b['chain_idx']==i, 'k_1']))
    p = bebi103.viz.ecdf(a1, alpha=0.1, p=p)

bokeh.io.show(p)

Noncentered exponential model

In [None]:
model_code_exp_noncentered = """
data {
  // Total number of data points
  int N;
  
  // Number of entries in each level of the hierarchy
  int J_1;

  //Index arrays to keep track of hierarchical structure
  int index_1[N];
  
  // The measurements
  real area[N];
  
  // Time
  vector[N] t;
}


parameters {
  // Hyperparameters level 0
  real a;
  real k;
  real<lower=0> sigma;

  // How hyperparameters vary
  real<lower=0> tau_a;
  real<lower=0> tau_k;

  // Hyperparameters level 1
  vector[J_1] a_1_tilde;
  vector[J_1] k_1_tilde;
}

transformed parameters {
  // Transformations for noncentered
  vector[J_1] a_1 = a + tau_a * a_1_tilde;
  vector[J_1] k_1 = k + tau_k * k_1_tilde;
}

model {
  a ~ normal(1.4, 0.3);
  k ~ normal(0.01, 0.002);
  sigma ~ normal(0, 0.1);
  tau_a ~ normal(0, 0.1);
  tau_k ~ normal(0, 0.001);

  a_1_tilde ~ normal(0, 0.1);
  k_1_tilde ~ normal(0, 0.001);

  area ~ normal(a_1[index_1] .* exp(k_1[index_1] .* t), sigma);
}
"""

In [None]:
sm_exp = bebi103.stan.StanModel(model_code=model_code_exp_noncentered)

In [None]:
# Sample
samples_exp = sm_exp.sampling(data=data, seed=2389412, warmup=2000, iter=4000)

# Convert to data frame for easy use later
df_exp = bebi103.stan.to_dataframe(samples_exp)

bebi103.stan.check_all_diagnostics(samples_exp)

In [None]:
bokeh.io.show(bebi103.viz.corner(samples_exp, pars=['a', 'k']))

In [119]:
df_a = bebi103.stan.extract_array(samples_exp, 'a_1')
df_b = bebi103.stan.extract_array(samples_exp, 'k_1')

In [120]:
# Plot measured data set
p = bebi103.viz.ecdf(df_bacterium1['area'].values,
                     x_axis_label='area',
                     color='orange',
                     level='overlay')

# Plot posterior predictive ECDFs
for i in df_a['chain_idx'].unique()[::10]:
    a1 = df_a.loc[df_a['chain_idx']==i, 'a_1'] * \
    np.exp(df_b.loc[df_b['chain_idx']==i, 'k_1'] * np.arange(len(df_b.loc[df_b['chain_idx']==i, 'k_1'])))
    p = bebi103.viz.ecdf(a1, alpha=0.1, p=p)

bokeh.io.show(p)

In [122]:
%load_ext watermark

In [123]:
%watermark -v -p numpy,scipy,bokeh,jupyterlab

CPython 3.7.0
IPython 7.1.1

numpy 1.15.4
scipy 1.1.0
bokeh 1.0.1
jupyterlab 0.35.3
