# Inferring A Fluorescence Calibration Factor 

(c) The authors. This work is licensed under the standard [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license and all code herein is licensed under the standard [MIT](https://opensource.org/licenses/MIT) permissive license.

---

In [1]:
import importlib
import itertools
import sys

import bokeh.io
import bokeh.plotting
import datashader as ds
import holoviews as hv
import numpy as np
import pandas as pd
import pystan
sys.path.insert(0, '../../')
import mwc.bayes
import mwc.stats
hv.extension('bokeh')
bokeh.io.output_notebook()

In this notebook, we infer the most-likely value for the calibration factor between fluorescence and protein copy number across a set of experiments.

## Derivation

## Inferential Model

## Implementation

In [2]:
# Read the compiled fluctuation and fold-change measurements from disk.
fluct_data = pd.read_csv('../../data/compiled_fluctuations.csv')
fc_data = pd.read_csv('../../data/compiled_fold_change.csv')

# Label every row with 1-based group numbers: one per carbon source
# ('media_idx') and one per (carbon, date, run) combination ('run_idx').
fluct_data['media_idx'] = 1 + fluct_data.groupby('carbon').ngroup()
fluct_data['run_idx'] = 1 + fluct_data.groupby(['carbon', 'date', 'run_no']).ngroup()
fc_data['media_idx'] = 1 + fc_data.groupby('carbon').ngroup()
fc_data['run_idx'] = 1 + fc_data.groupby(['carbon', 'date', 'run_number']).ngroup()

# Background-subtracted mCherry signal of every cell of the 'auto' strain.
auto_data = fc_data[fc_data['strain'] == 'auto']
mean_auto_mch = auto_data['mean_mCherry'].values - auto_data['mCherry_bg_val'].values

# Remove the per-row background plus the average autofluorescence from both
# intensity channels.
offset = fluct_data['bg_val'].values + mean_auto_mch.mean()
for channel in ('I_1', 'I_2'):
    fluct_data[channel] = fluct_data[channel] - offset

# Area-weighted intensities, their sum, and their squared difference.
integrated_1 = fluct_data['area_1'] * fluct_data['I_1'].values
integrated_2 = fluct_data['area_2'] * fluct_data['I_2'].values
fluct_data['summed'] = integrated_1 + integrated_2
fluct_data['fluct'] = (integrated_1 - integrated_2)**2

# Drop rows where either corrected intensity went negative.
keep = (fluct_data['I_1'] >= 0) & (fluct_data['I_2'] >= 0)
fluct_data = fluct_data[keep]

In [4]:
%%opts Scatter [logy=True, logx=True, width=500, height=300, legend_position='top_left']
# Scatter the squared fluctuation ('fluct') against the summed intensity
# ('summed') on log-log axes, producing one scatter per 'carbon' value.
hv.Scatter(fluct_data, kdims=['carbon', 'summed'], vdims=['fluct']).groupby('carbon')


In [5]:
fc_data.head()

Unnamed: 0,strain,area_pix,mean_yfp,mean_mCherry,fold_change,atc_ngml,date,carbon,temp,operator,run_number,yfp_bg_val,mCherry_bg_val,IPTGuM,media_idx,run_idx
0,delta,331.0,210.882175,172.501511,0.065102,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
1,delta,469.0,528.701493,178.066098,0.58343,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
2,delta,362.0,603.726519,191.737569,0.705788,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
3,delta,561.0,617.067736,188.69697,0.727546,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
4,delta,409.0,612.860636,187.190709,0.720685,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4


In [7]:
# Re-import mwc.bayes so edits made to the module on disk take effect without
# restarting the kernel. importlib.reload replaces the deprecated imp.reload.
importlib.reload(mwc.bayes)

<module 'mwc.bayes' from '../../mwc/bayes.py'>

In [None]:
# Keep only fold-change rows whose background-subtracted mCherry signal is
# non-negative.
mch_signal = fc_data['mean_mCherry'] - fc_data['mCherry_bg_val']
fc_data = fc_data[mch_signal >= 0]

# Split the autofluorescence and delta strains out of the fold-change data.
autofluo = fc_data[fc_data['strain'] == 'auto']
deltafluo = fc_data[fc_data['strain'] == 'delta']

# Each (media_idx, run_idx) pair present in the data; the first element of
# every pair is the media index that the run belongs to.
fluct_runs = fluct_data.groupby(['media_idx', 'run_idx']).groups
fc_runs = fc_data.groupby(['media_idx', 'run_idx']).groups

# Everything handed to the Stan model, keyed by the names used below.
data_dict = dict(
    J_1=np.max(fluct_data['media_idx']),
    J_2=np.max(fluct_data['run_idx']),
    N=len(fluct_data),
    M=len(fc_data),
    index_1=[media for media, _ in fluct_runs],
    index_2=fluct_data['run_idx'],
    fc_index_1=[media for media, _ in fc_runs],
    fc_index_2=fc_data['run_idx'],
    I_1=fluct_data['I_1'] * fluct_data['area_1'],
    I_2=fluct_data['I_2'] * fluct_data['area_2'],
    mCherry_exp=fc_data['mean_mCherry'] - fc_data['mCherry_bg_val'],
    yfp_exp=fc_data['mean_yfp'],
    area=fc_data['area_pix'],
)

# Build the model (always recompiling), then draw the samples.
model = mwc.bayes.StanModel(
    '../stan/hierarchical_calibration_factor.stan',
    data_dict,
    force_compile=True,
)
samples = model.sample(iter=800)

Precompiled model not found. Compiling model...


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0534a9ed19f745ed9f7bddd319abef29 NOW.


finished!


In [29]:
data_dict['delta_index_1']

[]

In [26]:
# Re-import mwc.bayes so edits made to the module on disk take effect without
# restarting the kernel. importlib.reload replaces the deprecated imp.reload.
importlib.reload(mwc.bayes)

<module 'mwc.bayes' from '../../mwc/bayes.py'>

In [21]:
_df = samples[0].to_dataframe(diagnostics=True)
_df.keys()

Index(['chain', 'chain_idx', 'warmup', 'divergent__', 'energy__',
       'treedepth__', 'accept_stat__', 'stepsize__', 'n_leapfrog__',
       'tau_alpha', 'alpha_1[1]', 'alpha_1[2]', 'alpha_1[3]',
       'alpha_2_tilde[1]', 'alpha_2_tilde[2]', 'alpha_2_tilde[3]',
       'alpha_2_tilde[4]', 'alpha_2_tilde[5]', 'alpha_2_tilde[6]',
       'alpha_2_tilde[7]', 'alpha_2_tilde[8]', 'alpha_2_tilde[9]',
       'alpha_2[1]', 'alpha_2[2]', 'alpha_2[3]', 'alpha_2[4]', 'alpha_2[5]',
       'alpha_2[6]', 'alpha_2[7]', 'alpha_2[8]', 'alpha_2[9]', 'lp__'],
      dtype='object')

In [10]:
# Re-import the local mwc modules so on-disk edits take effect without
# restarting the kernel. importlib.reload replaces the deprecated imp.reload.
importlib.reload(mwc.viz)
importlib.reload(mwc.bayes)

<module 'mwc.bayes' from '../../mwc/bayes.py'>

In [22]:
# parse the dataframe output
samples[0].extract()

OrderedDict([('tau_alpha',
              array([6.82765886, 7.37713319, 6.38381442, ..., 7.03911364, 7.27861746,
                     5.90066089])),
             ('alpha_1', array([[900.57039544, 597.95679128, 800.79022977],
                     [913.12705628, 664.6153705 , 815.5710399 ],
                     [956.32265565, 620.58073357, 800.42684792],
                     ...,
                     [695.52745137, 596.49936357, 879.6989315 ],
                     [841.46499633, 588.16612646, 747.10901337],
                     [839.99756007, 611.79882052, 771.93440589]])),
             ('alpha_2_tilde',
              array([[ -7.63711895, -22.63072878, -23.03629252, ...,  22.06071199,
                      -28.64601556,   6.96493543],
                     [ -6.75354892, -32.66857237, -33.14763999, ...,  19.69117922,
                      -28.89358346,   5.59261508],
                     [-14.86528973, -31.3900952 , -27.1067066 , ...,  19.49542018,
                      -21.14196609,   7

In [11]:
bokeh_traceplot(samples)

In [20]:
import holoviews as hv
hv.extension('bokeh')

In [8]:
samples_df = samples.to_dataframe()
samples_df[['alpha_1[1]', 'alpha_1[2]']].mean()

alpha_1[1]    510.415785
alpha_1[2]    825.924726
dtype: float64

## Validation function

In [5]:
%%opts Scatter [logy=True, logx=True]
%%opts Curve (color='k')
# Compare the measured fold-change against a reference curve on log-log axes.
# x holds 500 log-spaced values spanning 1 to 1e4.
x = np.logspace(0, 4, 500)
# Curve evaluated as 1 / (1 + (x / 5E6) * exp(13.9)).
# NOTE(review): 5E6 and 13.9 are unexplained constants - presumably model
# parameters for the fold-change prediction; confirm and name them.
fc = (1 + (x/5E6) * np.exp(13.9))**-1

# Scatter of the rows with positive fold-change, one group per 'date'.
# NOTE(review): the output recorded for this cell is a DataError reporting
# that fc_data has no 'max_repressors' column.
points = hv.Scatter(fc_data[fc_data['fold_change'] > 0], kdims=['max_repressors', 'date'], vdims=['fold_change']).groupby('date')
pred = hv.Curve((x, fc))

# Overlay the data points and the curve.
points * pred


DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['max_repressors']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html

In [None]:
# Assemble the sampling data dictionary. 
# The 'mCh' entry was previously an unterminated string literal fused with the
# 'J_run' keyword, which made the whole cell a syntax error.
# NOTE(review): the visible cells only create 'media_idx' and 'run_idx' on
# fluct_data, not 'idx' - confirm which column is intended here.
data_dict = dict(J_media=1, mch_media_idx=np.ones(len(fc_data)),
                 mCh=fc_data['area_pix'] * fc_data['mean_mCherry'],
                 J_run=len(fluct_data['idx'].unique()), N=len(fluct_data),
                 media_idx=np.ones(len(fluct_data)).astype(int), run_idx=fluct_data['idx'].values.astype(int),
                 I_1=fluct_data['I_1'], I_2=fluct_data['I_2'])

# Sample!
# NOTE(review): the earlier sampling cell calls model.sample(iter=...), whereas
# this one calls model.sampling(data_dict, ...) - confirm which `model` object
# this cell expects.
samples = model.sampling(data_dict, iter=1000, chains=4)

In [None]:
# Shaded band between the 'min' and 'max' columns of cred_df, plotted against
# 'summed' on log-log axes.
# NOTE(review): neither `alt` nor `cred_df` is defined in the cells visible in
# this notebook - confirm where they are supposed to come from.
theo = alt.Chart(cred_df).mark_area(opacity=0.5, color='dodgerblue').encode(
        x=alt.X('summed:Q', scale={'type':'log'}),
        y=alt.Y('min:Q', scale={'type':'log'}),
        y2=alt.Y('max:Q'))
# Interactive scatter of the measured 'fluct' versus 'summed' values.
points = alt.Chart(fluct_data).mark_point(filled=True, size=2, color='slategray').encode(
        x=alt.X('summed:Q', scale={'type':'log'}),
        y=alt.Y('fluct:Q', scale={'type':'log'})).interactive()
# Layer the points with the band.
points + theo

###  Making a traceplot viewer in Bokeh

In [15]:
# Get a list of the parameters:
import bokeh.io
import bokeh.plotting
import datashader as ds
from holoviews.operation.datashader import datashade
import datashader.bokeh_ext
import holoviews as hv


def bokeh_traceplot(samples, varnames=None):
    """
    Show the posterior ECDF and sampling history of each parameter with Bokeh.

    Two figures are drawn per parameter and placed side by side in a grid:
    the empirical cumulative distribution of the draws and the trace of the
    draws against sample number. A parameter with several components gets one
    line per component, colored by cycling through the Category10 palette.

    Parameters
    ----------
    samples : object
        Sampling result exposing an `extract()` method that returns a mapping
        from parameter name to an array of draws (draws along the first axis)
        and, when `varnames` is None, a `model_pars` attribute listing the
        parameter names.
    varnames : str or iterable of str, optional
        Parameter name(s) to plot. If None (default), every name in
        `samples.model_pars` is plotted.

    Returns
    -------
    The value returned by `bokeh.io.show` for the assembled grid plot.
    """
    # Import the submodules explicitly instead of relying on them having been
    # loaded as a side effect of `import bokeh.plotting`.
    import bokeh.layouts
    import bokeh.palettes

    sample_values = samples.extract()
    palette = bokeh.palettes.Category10_10

    # `is None` (not `!= None`) so array-like `varnames` do not trigger an
    # ambiguous element-wise comparison.
    if varnames is None:
        names = samples.model_pars
    elif isinstance(varnames, str):
        # A bare string would otherwise be iterated character by character.
        names = [varnames]
    else:
        names = varnames

    pairs = []
    for p in names:
        # Normalize the draws to shape (n_draws, n_components).
        draws = np.asarray(sample_values[p])
        if draws.ndim == 1:
            draws = draws[:, np.newaxis]
        elif draws.ndim > 2:
            n_components = int(np.prod(draws.shape[1:]))
            draws = draws.reshape(draws.shape[0], n_components)

        trace = bokeh.plotting.figure(plot_width=400, plot_height=200,
                                      x_axis_label='sample number', y_axis_label=f'{p}',
                                      title=f'sampling history for {p}', background_fill_color='#ecedef')

        dist = bokeh.plotting.figure(plot_width=400, plot_height=200,
                                     x_axis_label=f'{p}', y_axis_label='ECDF',
                                     title=f'posterior distribution for {p}', background_fill_color='#ecedef')

        # Add visual formatting
        for fig in (trace, dist):
            fig.xgrid.grid_line_color = '#FFFFFF'
            fig.ygrid.grid_line_color = '#FFFFFF'

        # The sample numbers and ECDF heights are shared by every component.
        n_draws = draws.shape[0]
        step_no = np.arange(n_draws)
        ecdf = step_no / n_draws

        # Restart the color cycle for every parameter.
        for i, color in zip(range(draws.shape[1]), itertools.cycle(palette)):
            values = draws[:, i]
            dist.line(np.sort(values), ecdf, line_width=2, color=color)
            trace.line(step_no, values, line_width=1, color=color)
        pairs.append([dist, trace])
    return bokeh.io.show(bokeh.layouts.gridplot(pairs))
bokeh_traceplot(samples)

AttributeError: 'DataFrame' object has no attribute 'model_pars'

In [19]:
samples

Inference for Stan model: anon_model_da28093df3b22a381b7b0ff8ec0dacd8.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

                   mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
tau_alpha          4.95    0.02   0.64   3.78   4.51   4.93   5.36   6.22    689    1.0
alpha_1[1]       514.77    1.58  26.04 466.04 496.91  513.8 531.76 564.78    273   1.03
alpha_1[2]       772.65    2.11  52.89 661.53 738.17 773.51 809.79 873.59    626    1.0
alpha_2_tilde[1] -19.85    0.29   6.04 -30.94 -24.31 -19.78 -15.63  -8.28    430   1.02
alpha_2_tilde[2]  -17.1    0.28   6.28 -29.15  -21.5 -17.19 -12.61  -4.86    490   1.02
alpha_2_tilde[3]  21.44     0.3   7.26   7.99  16.22  21.13  26.12  36.48    597    1.0
alpha_2_tilde[4]  -1.55    0.32   6.46 -14.47  -5.83  -1.35   2.89  10.95    415   1.01
alpha_2_tilde[5]  18.79     0.3   7.02   5.55  13.79  18.91  23.27  32.91    546   1.01
alpha_2_tilde[6]  25.6