# Computing the calibration factor (in progress)

In [1]:
import numpy as np 
import pandas as pd
import bokeh.io 
import bokeh.plotting
import bokeh_catplot as bkcat 
import scipy.optimize
import mwc.stats 
import imp
import mwc.bayes 
imp.reload(mwc.bayes)
import mwc.viz
import bokeh.models
import bokeh.transform
import tqdm
import bokeh.palettes
import scipy.stats
import scipy.special
import statsmodels.tools.numdiff as smnd
import tqdm
colors, color_list = mwc.viz.bokeh_theme()
bokeh.io.output_notebook()

In [59]:
# Load the snapshots
snaps = pd.read_csv('../../data/raw_compiled_snaps.csv')

# Load the lineages
lineages = pd.read_csv('../../data/raw_compiled_lineages.csv')

# Apply morphology filters to both. 
min_size = 0.5 / 0.065**2
max_size = 5 / 0.065**2
snaps = mwc.process.morphological_filter(snaps, area_bounds=[0.5, 5], 
                                         ar_bounds=[0, 0.8], ip_dist=0.065)
lineages = lineages[(lineages['area_1'] >= min_size) & (lineages['area_2'] >= min_size) &
                   (lineages['area_1'] <= max_size) & (lineages['area_2'] <= max_size)]

# Drop error frames 
lineages['error_frame'] = np.nan_to_num(lineages['error_frame'])
lineages = lineages[lineages['error_frame'] == 0].copy()


# Load the hierarchical model. 

model = mwc.bayes.StanModel('../stan/hierarchical_calibration_factor.stan', force_compile=True)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_1119c0132ed1322ef92595a04109d58c NOW.


Precompiled model not found. Compiling model...
finished!


In [60]:
# Start with examining **only** the glucose sample
carbon = 'glucose'
samp = lineages[(lineages['carbon']==carbon) & (lineages['temp']==37)].copy()
auto_samp = snaps[(snaps['carbon']==carbon) & (snaps['temp']==37) & (snaps['strain']=='auto')].copy()

# Iterate through all of the days and run numbers and subratcting the chosen value
funcs = {'mean_auto':np.mean, 'median_auto':np.mean}
for g, d in samp.groupby(['date', 'run_number']):
    _auto = auto_samp[(auto_samp['date']==g[0]) &
                     (auto_samp['run_number']==g[1])]['fluor2_mean_death'].values
    # Compute the summary statistica and add it to the samp
    for v, f in funcs.items():
        auto_mch = f(_auto)
        samp.loc[(samp['date']==g[0]) & (samp['run_number']==g[1]), v] = f(_auto)


In [61]:
# Perform the background subtraction and compute integrated intensity. 
samp['I_1_tot'] = samp['area_1'] * (samp['I_1'].values - samp['median_auto'])
samp['I_2_tot'] = samp['area_2'] * (samp['I_2'].values - samp['median_auto'])

# Remove unphysical values. 
samp = samp[(samp['I_1_tot'] >0) & (samp['I_2_tot'] > 0)]

# Add identifiers for each category.
samp['idx'] = samp.groupby(['date']).ngroup() + 1

# Set up the data dictionary. 
data_dict = {'J':samp['idx'].max(), 
             'N': len(samp),
             'idx': samp['idx'].values,
             'I_1':samp['I_1_tot'].values,
             'I_2':samp['I_2_tot'].values}

# Sample the motherfucker
fit, mcmc_samples = model.sample(data_dict, iter=4000, control=dict(max_treedepth=11))

Beginning sampling...




finished sampling!


In [62]:
fit

Inference for Stan model: anon_model_1119c0132ed1322ef92595a04109d58c.
4 chains, each with iter=4000; warmup=2000; thin=1; 
post-warmup draws per chain=2000, total post-warmup draws=8000.

              mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha_1      3.1e4   1.8e4  2.8e4 1467.3 1919.2  1.1e4  6.5e4  6.6e4      2   4.04
sigma        2.5e4  1597.7 2263.6  2.1e4  2.3e4  2.6e4  2.6e4  2.6e4      2  16.61
alpha_2[1]   1.1e7   4.5e6  6.4e6   1.32  6.5e6  1.5e7  1.5e7  1.5e7      2 269.69
alpha_2[2]   2.6e7   1.7e6  2.5e6  2.2e7  2.5e7  2.8e7  2.8e7  2.8e7      2  16.42
alpha_2[3]   1.4e7   5.7e6  8.0e6   1.28  8.2e6  1.8e7  1.8e7  1.9e7      2 291.04
alpha_2[4]   1.9e7   1.3e6  1.9e6  1.6e7  1.8e7  2.0e7  2.0e7  2.0e7      2   3.55
alpha_2[5]   1.7e7   1.2e6  1.9e6  1.4e7  1.8e7  1.8e7  1.8e7  1.8e7      2   2.31
alpha_2[6]   1.7e7   6.9e6  9.7e6   1.21 10.0e6  2.2e7  2.2e7  2.3e7      2 304.86
alpha_2[7]   1.6e7   6.5e6  9.2e6   1.47  9.5e6  2.1e7  2.1e7  2

In [55]:
model = mwc.bayes.StanModel('../stan/calibration_factor.stan') #, force_compile=True)

Found precompiled model. Loading...
finished!


In [56]:
stats_dfs = pd.DataFrame([])
for g, d in tqdm.tqdm(samp.groupby(['date', 'run_number'])):
    # Set up the data dict and sample
    data_dict = {'N': len(d), 'I1':d['I_1_tot'], 'I2':d['I_2_tot']}
    _, samples = model.sample(data_dict, iter=2000)

    # Compute the important stats of alpha. 
    mean_alpha = np.median(samples['alpha'])
    alpha_min,alpha_max = mwc.stats.compute_hpd(samples['alpha'], 0.95)
    stats_dfs = stats_dfs.append({'date':g[0], 'run_number':g[1], 
                                  'mean_alpha':mean_alpha,
                                  'alpha_min':alpha_min, 
                                  'alpha_max':alpha_max}, ignore_index=True)

  0%|          | 0/12 [00:00<?, ?it/s]

Beginning sampling...
finished sampling!


  8%|▊         | 1/12 [00:01<00:14,  1.35s/it]

Beginning sampling...
finished sampling!


 17%|█▋        | 2/12 [00:03<00:16,  1.63s/it]

Beginning sampling...
finished sampling!


 25%|██▌       | 3/12 [00:05<00:14,  1.59s/it]

Beginning sampling...
finished sampling!


 33%|███▎      | 4/12 [00:06<00:12,  1.57s/it]

Beginning sampling...
finished sampling!


 42%|████▏     | 5/12 [00:08<00:10,  1.55s/it]

Beginning sampling...
finished sampling!


 50%|█████     | 6/12 [00:09<00:09,  1.54s/it]

Beginning sampling...
finished sampling!


 58%|█████▊    | 7/12 [00:10<00:07,  1.44s/it]

Beginning sampling...
finished sampling!


 67%|██████▋   | 8/12 [00:12<00:05,  1.43s/it]

Beginning sampling...
finished sampling!


 75%|███████▌  | 9/12 [00:13<00:04,  1.36s/it]

Beginning sampling...
finished sampling!


 83%|████████▎ | 10/12 [00:14<00:02,  1.28s/it]

Beginning sampling...
finished sampling!


 92%|█████████▏| 11/12 [00:16<00:01,  1.58s/it]

Beginning sampling...
finished sampling!


100%|██████████| 12/12 [00:18<00:00,  1.68s/it]


In [57]:
def bin_by_value(df, bins):
    """
    Bins by predefined bins. Returns the mean and SEM of all points in that bin
    """
    # Iterate through the bins.
    df = df.copy()
    summed_means = np.zeros(len(bins) - 1)
    fluct_means = np.zeros(len(bins) - 1)
    summed_sems = np.zeros(len(bins) - 1)
    fluct_sems = np.zeros(len(bins) - 1)
    for i in range(len(bins) - 1):
        lower = bins[i] - 1
        upper = bins[i+1] + 1
        samps = df[(df['summed'] >= lower) & (df['summed'] <= upper)]
        summed_means[i] = np.mean(samps['summed'])
        fluct_means[i] = np.mean(samps['fluct'])
        summed_sems[i] = np.std(samps['summed']) / np.sqrt(len(samps))
        fluct_sems[i] = np.std(samps['fluct']) / np.sqrt(len(samps))
    # assemble into a dataframe
    _df = pd.DataFrame(np.array([summed_means, summed_sems, fluct_means, fluct_sems]).T,
                       columns=['summed_mean', 'summed_sem', 'fluct_mean', 'fluct_sem'])
    # Compute the mins and max for eaach. 
    _df['summed_min'] = _df['summed_mean'] - _df['summed_sem']
    _df['summed_max'] = _df['summed_mean'] + _df['summed_sem']
    _df['fluct_min'] = _df['fluct_mean'] - _df['fluct_sem']
    _df['fluct_max'] = _df['fluct_mean'] + _df['fluct_sem']
    return _df
        

In [58]:
samp['summed'] = samp['I_1_tot'] + samp['I_2_tot']
samp['fluct'] = (samp['I_1_tot'] - samp['I_2_tot'])**2
p = bokeh.plotting.figure(x_axis_type='log', y_axis_type='log', width=800, height=300)


i = 0
for g, d in samp.groupby(['date']):
    alpha_max = stats_dfs[stats_dfs['date']==g]['alpha_max'].values[0]
    alpha_min = stats_dfs[stats_dfs['date']==g]['alpha_min'].values[0]
    bins = np.logspace(1, 5.5)
    p.circle(d['summed'], d['fluct'], size=1, color=color_list[i], alpha=0.5,legend=str(g))
    binned = bin_by_value(d, bins)
    p.circle(binned['summed_mean'], binned['fluct_mean'], size=6, color=color_list[i], legend=str(g))
    p.line(bins, alpha_min * bins, color=color_list[i], legend=str(g))
    p.line(bins, alpha_max * bins, color=color_list[i], legend=str(g))
    i+=1 
    
p.legend.click_policy = 'hide'
p.legend.location = 'top_left'
bokeh.io.show(p)

Unnamed: 0,alpha_max,alpha_min,date,mean_alpha,run_number
0,1724.949262,1223.610274,20181019.0,1467.449526,1.0
1,1894.730574,1488.625673,20181022.0,1691.186303,1.0
2,954.216774,741.597977,20181023.0,845.017646,1.0
3,1041.57384,852.633636,20181025.0,942.392566,1.0
4,964.067797,717.978347,20181030.0,832.490136,1.0
5,1035.358367,734.467307,20181123.0,877.590809,1.0
6,644.094581,242.345625,20190306.0,404.252308,1.0
7,2975.426823,1664.891409,20190308.0,2287.298286,1.0
8,1209.379827,934.200665,20190318.0,1064.040771,1.0
9,2070.981273,1062.56013,20190327.0,1537.990227,1.0
