# Inferring A Fluorescence Calibration Factor 

(c) The authors. This work is licensed under the standard [CC-BY 4.0]() and all code herin is licensed under the standard [MIT]() permissive license.

---

In [1]:
import sys
import numpy as np
import pystan
import pandas as pd
import datashader as ds
import holoviews as hv
import itertools
import bokeh.io
import bokeh.plotting
sys.path.insert(0, '../../')
import mwc.bayes
import mwc.stats
hv.extension('bokeh')
bokeh.io.output_notebook()

In this notebook, we infer the most-likely value for the calibration factor between fluorescence and protein copy number across a set of experiments.

## Derivation

## Inferential Model

## Implementation

In [6]:
# Load the fluctuation data. 
fluct_data = pd.read_csv('../../data/compiled_fluctuations.csv')

# Load the fold-change measurements 
fc_data = pd.read_csv('../../data/compiled_fold_change.csv')

# Add identifiers to the fluctuation data. 
fluct_data['media_idx'] = fluct_data.groupby('carbon').ngroup() + 1
fluct_data['run_idx'] = fluct_data.groupby(['carbon', 'date', 'run_no']).ngroup() + 1
fc_data['media_idx'] = fc_data.groupby('carbon').ngroup() + 1
fc_data['run_idx'] = fc_data.groupby(['carbon', 'date', 'run_number']).ngroup() + 1
# Compute the mean autofluorescence value. 
auto_data = fc_data[(fc_data['strain']=='auto')]
mean_auto_mch = auto_data['mean_mCherry'].values - auto_data['mCherry_bg_val'].values

# Subtract the background and prune
fluct_data['I_1'] -= (fluct_data['bg_val'].values + mean_auto_mch.mean())
fluct_data['I_2'] -= (fluct_data['bg_val'].values + mean_auto_mch.mean())
fluct_data['summed'] = (fluct_data['area_1'] * fluct_data['I_1'].values + fluct_data['area_2'] * fluct_data['I_2'].values)
fluct_data['fluct'] = (fluct_data['area_1'] * fluct_data['I_1'].values - fluct_data['area_2'] * fluct_data['I_2'].values)**2
fluct_data = fluct_data[(fluct_data['I_1'] >= 0) & (fluct_data['I_2'] >= 0)]

In [3]:
%%opts Scatter [logy=True, logx=True, width=500, height=300, legend_position='top_left']
hv.Scatter(fluct_data, kdims=['carbon', 'summed'], vdims=['fluct']).groupby('carbon')


In [8]:
fc_data.head()

Unnamed: 0,strain,area_pix,mean_yfp,mean_mCherry,fold_change,atc_ngml,date,carbon,temp,operator,run_number,yfp_bg_val,mCherry_bg_val,IPTGuM,media_idx,run_idx
0,delta,331.0,210.882175,172.501511,0.065102,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
1,delta,469.0,528.701493,178.066098,0.58343,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
2,delta,362.0,603.726519,191.737569,0.705788,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
3,delta,561.0,617.067736,188.69697,0.727546,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4
4,delta,409.0,612.860636,187.190709,0.720685,0.0,20181005,glucose,37,O2,1,172.484943,192.062071,0,2,4


In [9]:
# Assemble the sampling data dictionary. 
fc_data = fc_data[fc_data['mean_mCherry'] >= 0]

# Separate auto and delta strains from foldchange data
autofluo = fc_data[fc_data['strain']=='auto']
deltafluo = fc_data[fc_data['strain']=='deltafluo']

data_dict = {'J_1': np.max(fluct_data['media_idx']),
             'J_2': np.max(fluct_data['run_idx']),
             'N': len(fluct_data),
             'N_auto': len(autofluo),
             'N_delta': len(deltafluo),
             'M': len(fc_data),
             'index_1': [key[0] for key in fluct_data.groupby(['media_idx', 'run_idx']).groups],
             'index_2': fluct_data['run_idx'],
             'auto_index_1': [key[0] for key in autofluo.groupby(['media_idx', 'run_idx']).groups],
             'auto_index_2': autofluo['run_idx'],
             'delta_index_1': [key[0] for key in deltafluo.groupby(['media_idx', 'run_idx']).groups],
             'delta_index_2': deltafluo['run_idx'],
             'I_1': fluct_data['I_1'] * fluct_data['area_1'],
             'I_2': fluct_data['I_2'] * fluct_data['area_2'],
             'autofluo':autofluo['mean_yfp']- autofluo['yfp_bg_val'],
             'deltafluo':deltafluo['mean_yfp'] - deltafluo['yfp_bg_val'],
             'mCherry_exp': fc_data['mean_mCherry'] - fc_data['mCherry_bg_val'],
             'yfp_exp': fc_data['mean_yfp'] - fc_data['mean_yfp'],
             'area': fc_data['area_pix']}
model = mwc.bayes.StanModel('../stan/hierarchical_calibration_factor.stan', data_dict)
# Sample!
samples = model.sample()

Found precompiled model. Loading...
finished!


In [23]:
model.traceplot(varnames=['alpha_1', 'alpha_2'])

In [21]:
import imp
imp.reload(mwc.viz)
imp.reload(mwc.bayes)

<module 'mwc.bayes' from '../../mwc/bayes.py'>

In [16]:
samples

Unnamed: 0,chain,chain_idx,warmup,divergent__,energy__,treedepth__,accept_stat__,stepsize__,n_leapfrog__,tau_alpha,...,alpha_2_tilde[8],alpha_2[1],alpha_2[2],alpha_2[3],alpha_2[4],alpha_2[5],alpha_2[6],alpha_2[7],alpha_2[8],lp__
0,1,1,0,0,40803.412233,4,0.939376,0.434000,23,5.587839,...,-6.914286,854.545722,385.886399,435.466028,697.869123,616.411222,985.103654,626.757867,840.083812,-40798.249196
1,1,2,0,0,40800.804187,3,0.988969,0.434000,7,5.100780,...,-1.239735,827.325227,431.813639,407.749941,596.473454,584.788821,934.265209,689.283418,803.508930,-40794.151654
2,1,3,0,0,40796.881029,4,0.974275,0.434000,15,5.020628,...,6.992296,912.443617,414.524520,419.398009,643.381083,619.102679,891.021132,637.512932,825.771978,-40793.554160
3,1,4,0,0,40797.664152,3,0.929969,0.434000,7,5.642637,...,1.806049,864.415667,409.127845,435.257498,622.293242,583.449799,922.062050,642.118454,835.565412,-40795.198702
4,1,5,0,0,40797.027599,3,0.943491,0.434000,7,5.270020,...,2.830432,906.127280,410.728459,416.970714,628.368792,624.746408,904.072259,629.664017,775.741455,-40793.507588
5,1,6,0,0,40801.101895,3,0.446401,0.434000,7,5.191512,...,-2.973528,776.400072,405.961948,456.136433,627.705579,578.752381,980.329221,667.941861,834.040991,-40795.672776
6,1,7,0,0,40801.331663,3,0.886988,0.434000,7,5.172980,...,0.313007,724.602222,420.561986,471.346319,626.902931,593.777294,965.367545,619.525525,823.253168,-40798.090447
7,1,8,0,0,40808.261058,3,0.769091,0.434000,7,6.356248,...,-12.515604,1023.107528,425.402361,423.076839,615.581819,599.188316,1023.350440,668.629495,776.718449,-40799.076209
8,1,9,0,0,40807.072557,3,0.765658,0.434000,7,4.509453,...,5.367792,887.543682,418.298630,423.780285,621.653624,613.272610,934.256160,635.095521,864.717755,-40799.687999
9,1,10,0,0,40810.968183,3,0.866466,0.434000,15,6.653749,...,-7.472869,946.479921,445.077520,463.669147,704.925484,598.898386,1009.983146,649.586464,755.310112,-40802.849967


In [11]:
bokeh_traceplot(samples)

In [20]:
import holoviews as hv
hv.extension('bokeh')

In [8]:
samples_df = samples.to_dataframe()
samples_df[['alpha_1[1]', 'alpha_1[2]']].mean()

alpha_1[1]    510.415785
alpha_1[2]    825.924726
dtype: float64

## Validataion function

In [5]:
%%opts Scatter [logy=True, logx=True]
%%opts Curve (color='k')
x = np.logspace(0, 4, 500)
fc = (1 + (x/5E6) * np.exp(13.9))**-1

points = hv.Scatter(fc_data[fc_data['fold_change'] > 0], kdims=['max_repressors', 'date'], vdims=['fold_change']).groupby('date')
pred = hv.Curve((x, fc))

points * pred


DataError: Supplied data does not contain specified dimensions, the following dimensions were not found: ['max_repressors']

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html

In [None]:
# Assemble the sampling data dictionary. 
data_dict = dict(J_media=1, mch_media_idx=np.ones(len(fc_data)), mCh=fc_data['area_pix'] * fc_data['mean_mCherryJ_run=len(fluct_data['idx'].unique()), N=len(fluct_data),
                media_idx=np.ones(len(fluct_data)).astype(int), run_idx=fluct_data['idx'].values.astype(int),
                I_1=fluct_data['I_1'], I_2=fluct_data['I_2'])

# Sample!
samples = model.sampling(data_dict, iter=1000, chains=4)

In [None]:
theo = alt.Chart(cred_df).mark_area(opacity=0.5, color='dodgerblue').encode(
        x=alt.X('summed:Q', scale={'type':'log'}),
        y=alt.Y('min:Q', scale={'type':'log'}),
        y2=alt.Y('max:Q'))
points = alt.Chart(fluct_data).mark_point(filled=True, size=2, color='slategray').encode(
        x=alt.X('summed:Q', scale={'type':'log'}),
        y=alt.Y('fluct:Q', scale={'type':'log'})).interactive()
points + theo

###  Making a traceplot viewer in Bokeh

In [15]:
# Get a list of the parameters:
import bokeh.io
import bokeh.plotting
import datashader as ds
from holoviews.operation.datashader import datashade
import datashader.bokeh_ext
import holoviews as hv


def bokeh_traceplot(samples, varnames=None):
    params = samples.model_pars
    sample_values = samples.extract()
    palette = bokeh.palettes.Category10_10
    
    # Define the standard plot sizes.   
    pairs = []
    if varnames != None:
        iterator = varnames
    else:
        iterator = params
    for p in iterator:
        colors = itertools.cycle(palette)
        if len(np.shape(sample_values[p])) == 1:
            _samples = np.array([sample_values[p]]).T
        else:
            _samples = sample_values[p]
      
        dfs = []
        trace = bokeh.plotting.figure(plot_width=400, plot_height=200, 
                                      x_axis_label='sample number', y_axis_label=f'{p}',
                                     title=f'sampling history for {p}', background_fill_color='#ecedef') 

        dist = bokeh.plotting.figure(plot_width=400, plot_height=200, 
                                     x_axis_label=f'{p}', y_axis_label='ECDF',
                                    title=f'posterior distribution for {p}', background_fill_color='#ecedef')   
        
        # Add visual formatting
        trace.xgrid.grid_line_color = '#FFFFFF'
        trace.ygrid.grid_line_color = '#FFFFFF'
        dist.xgrid.grid_line_color = '#FFFFFF'
        dist.ygrid.grid_line_color = '#FFFFFF'    
        
        for i, color in zip(range(np.shape(_samples)[1]), colors): 
            # Extract the specific values. 
            _values = _samples[:, i]
            x, y = np.sort(_values), np.arange(0, len(_values), 1) / len(_values)
            _df = pd.DataFrame(np.array([x, y, 
                                         _values, np.arange(0, len(_values), 1)]).T, 
                               columns=['x', 'ecdf', 'samples', 'step_no'])            
            dist.line(_df['x'], _df['ecdf'], line_width=2, color=color)
            trace.line(_df['step_no'], _df['samples'], line_width=1, color=color) 
        pairs.append([dist, trace])  
    return bokeh.io.show(bokeh.layouts.gridplot(pairs))
bokeh_traceplot(samples)

AttributeError: 'DataFrame' object has no attribute 'model_pars'

In [19]:
samples

Inference for Stan model: anon_model_da28093df3b22a381b7b0ff8ec0dacd8.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

                   mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
tau_alpha          4.95    0.02   0.64   3.78   4.51   4.93   5.36   6.22    689    1.0
alpha_1[1]       514.77    1.58  26.04 466.04 496.91  513.8 531.76 564.78    273   1.03
alpha_1[2]       772.65    2.11  52.89 661.53 738.17 773.51 809.79 873.59    626    1.0
alpha_2_tilde[1] -19.85    0.29   6.04 -30.94 -24.31 -19.78 -15.63  -8.28    430   1.02
alpha_2_tilde[2]  -17.1    0.28   6.28 -29.15  -21.5 -17.19 -12.61  -4.86    490   1.02
alpha_2_tilde[3]  21.44     0.3   7.26   7.99  16.22  21.13  26.12  36.48    597    1.0
alpha_2_tilde[4]  -1.55    0.32   6.46 -14.47  -5.83  -1.35   2.89  10.95    415   1.01
alpha_2_tilde[5]  18.79     0.3   7.02   5.55  13.79  18.91  23.27  32.91    546   1.01
alpha_2_tilde[6]  25.6