# Combined Calibration Factor and Fold-Change Inference

In [1]:
import numpy as np
import pandas as pd
import bokeh.io
import altair as alt
import sys
sys.path.insert(0, '../../')
import mwc.bayes
import mwc.viz
import mwc.bayes
import mwc.stats
# Configure bokeh to render its figures inline in the notebook output cells.
bokeh.io.output_notebook()

In [3]:
# Load the compiled fluctuation and fold-change measurements.
fluct_data = pd.read_csv('../../data/compiled_fluctuations.csv')
fc_data = pd.read_csv('../../data/compiled_fold_change.csv')

# Isolate a single carbon source. `.copy()` detaches the subsets from the
# full tables so that new columns can be assigned without warnings.
fluct_data = fluct_data[fluct_data['carbon']=='glucose'].copy()
fc_data = fc_data[fc_data['carbon']=='glucose'].copy()

# Compute the mean background-subtracted autofluorescence for each channel
# from the 'auto' strain.
auto = fc_data[fc_data['strain']=='auto']
mean_auto_mch = np.mean(auto['mean_mCherry'] - auto['mCherry_bg_val'])
mean_auto_yfp = np.mean(auto['mean_yfp'] - auto['yfp_bg_val'])

# Perform necessary background subtraction for fluctuation measurements:
# remove the background value and the mean mCherry autofluorescence, then
# scale by the corresponding area column.
fluct_data['I_1_sub'] = (fluct_data['I_1'] - fluct_data['bg_val'] - mean_auto_mch) * fluct_data['area_1']
fluct_data['I_2_sub'] = (fluct_data['I_2'] - fluct_data['bg_val'] - mean_auto_mch) * fluct_data['area_2']

# Ensure positivity. Copy the filtered frame so the identifier columns added
# below are written to an independent frame (avoids SettingWithCopyWarning).
fluct_data = fluct_data[(fluct_data['I_1_sub'] >= 0) & (fluct_data['I_2_sub'] >= 0)].copy()

# Perform background subtraction for fluorescence measurements and scale by
# the area column, rounding to the nearest whole number.
fc_data['yfp_sub'] = np.round((fc_data['mean_yfp'] - fc_data['yfp_bg_val']) * fc_data['area_pix'])
fc_data['mch_sub'] = np.round((fc_data['mean_mCherry'] - fc_data['mCherry_bg_val']) * fc_data['area_pix'])

# Add relevant identifiers to each data set (1-based group numbers).
fluct_data['date_idx'] = fluct_data.groupby(['date']).ngroup() + 1
fluct_data['rep_idx'] = fluct_data.groupby(['date', 'run_no']).ngroup() + 1

# Compute the means for the fold-change data. `numeric_only=True` keeps the
# aggregation restricted to numeric columns; without it pandas >= 2.0 raises
# a TypeError on string columns such as 'carbon'.
mean_fc_data = fc_data.groupby(['date', 'atc_ngml', 'run_number', 'strain']).mean(numeric_only=True).reset_index()

# Drop the autofluorescence strain, which is no longer needed.
mean_fc_data = mean_fc_data[mean_fc_data['strain'] != 'auto'].copy()

# Enforce positivity of YFP and mCherry. Copy so the index columns below are
# assigned to an independent frame.
mean_fc_data = mean_fc_data[(mean_fc_data['yfp_sub'] >= 0) & (mean_fc_data['mch_sub'] >= 0)].copy()

# Add appropriate identifiers to mean data (1-based group numbers).
mean_fc_data['conc_idx'] = mean_fc_data.groupby(['atc_ngml', 'strain']).ngroup() + 1
mean_fc_data['rep_idx'] = mean_fc_data.groupby(['date', 'atc_ngml', 'run_number', 'strain']).ngroup() + 1

In [4]:
# Build the data passed to the Stan model: group counts, observation counts,
# 1-based group indices, and the measured intensities. Then hand the model
# file and data to the project's StanModel wrapper.
data_dict = dict(
    # Fluctuation data: number of (date, run) groups, rows, and group index.
    J_exp=fluct_data['rep_idx'].max(),
    N_fluct=len(fluct_data),
    index_1=fluct_data['rep_idx'],
    # Fold-change data: number of (concentration, strain) groups, number of
    # replicate groups, rows, and the concentration index of each row.
    J_conc=mean_fc_data['conc_idx'].max(),
    J_conc_exp=mean_fc_data['rep_idx'].max(),
    N_meas=len(mean_fc_data),
    index_2=mean_fc_data['conc_idx'],
    # Background-subtracted intensities from the fluctuation measurements.
    I_1=fluct_data['I_1_sub'],
    I_2=fluct_data['I_2_sub'],
    # Background-subtracted mean fluorescence for each channel.
    mcherry=mean_fc_data['mch_sub'],
    yfp=mean_fc_data['yfp_sub'],
)
model = mwc.bayes.StanModel('../stan/hierarchical_dilution_analysis.stan', data_dict)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_8c1f867e1bd278fee97b8863b7fab708 NOW.


Precompiled model not found. Compiling model...


  tree = Parsing.p_module(s, pxd, full_module_name)


finished!


In [5]:
# Sample the model with 4 chains of 1000 iterations each, using a raised
# adapt_delta of 0.95.
# NOTE(review): later cells index `samples[0]` and use it as the Stan fit
# object, so with return_df=True this presumably returns a (fit, dataframe)
# pair -- confirm against mwc.bayes.StanModel.sample.
samples = model.sample(iter=1000, chains=4, return_df=True, control=dict(adapt_delta=0.95))

Beginning sampling...




finished sampling!


In [10]:
# Plot the traces of the average repressor count, fold-change, and
# calibration-factor parameters to inspect the sampling.
model.traceplot(varnames=['avg_rep', 'fc', 'alpha_1'])

In [None]:
# TODO: unfinished cell -- the original comment ("Load the ...") was never completed.

In [75]:
def summarize_parameters(samples, parnames=None, mass_frac=0.95):
    """
    Summarizes all or a subset of parameters from a Stan model. 
    
    Parameters
    ----------
    samples: StanFit4Model object
        Sampling output from Stan model. Must provide `extract()`,
        `model_pars`, and `par_dims`.
    parnames: list of str, str, or None
        Names of the parameters to summarize. If None or empty, all 
        parameters are summarized and returned. A single name may be 
        given as a bare string.
    mass_frac: float [0, 1]
        The probability mass fraction for the HPD. Default is 
        the 95% credible region. 
        
    Returns
    -------
    summary_df: pandas DataFrame
        Dataframe of summarized parameters, one row per parameter 
        element, in the order the parameters are declared in the model.
        The columns are as follows:
            parameter = name of parameter in Stan model
            dimension = index (dimension) of the parameter, starting at 1
            mean = mean of samples
            median = median of samples
            mode = parameter value when the log posterior is maximized
            hpd_min = minimum bound of the highest probability density
                defined by the mass fraction.
            hpd_max = upper bound of the highest probability density
                defined by the mass fraction
            mass_fraction = the mass fraction used for the HPD

    Raises
    ------
    ValueError
        If `parnames` contains a name that is not a parameter of the model.

    Notes
    -----
    Only the first dimension of each parameter is iterated over, so 
    parameters with more than one dimension (e.g. matrices) are not 
    summarized element by element.
    """
    columns = ['parameter', 'dimension', 'mean', 'median', 'mode',
               'hpd_min', 'hpd_max', 'mass_fraction']

    # Extract the sampling information and find the draw with the highest
    # log posterior; its values are reported as the mode.
    fit = samples.extract()
    mode_ind = np.argmax(fit['lp__'])
    
    # Map each model parameter to the length of its first dimension.
    # Scalars report an empty dimension list and are treated as length 1.
    par_dims = {}
    for par, dims in zip(samples.model_pars, samples.par_dims):
        par_dims[par] = 1 if len(dims) == 0 else int(dims[0])

    # Restrict to the requested parameters, if any were given.
    if isinstance(parnames, str):
        parnames = [parnames]
    if parnames is not None and len(parnames) != 0:
        unknown = [p for p in parnames if p not in par_dims]
        if unknown:
            raise ValueError(
                'Unknown parameter name(s) {}. Available parameters: {}'.format(
                    unknown, list(par_dims.keys())))
        par_dims = {p: d for p, d in par_dims.items() if p in parnames}
    
    # Iterate through each parameter and compute the aggregate properties.
    # Rows are collected in a list and converted once at the end, rather 
    # than growing a DataFrame row by row.
    records = []
    for par, dim in par_dims.items():
        par_samples = np.asarray(fit[par])
        # Scalar parameters come back as a 1-D array of draws; add a 
        # trailing axis so they can be indexed like a vector parameter.
        if par_samples.ndim == 1:
            par_samples = par_samples[:, np.newaxis]
        for j in range(dim):
            trace = par_samples[:, j]
            hpd_min, hpd_max = mwc.stats.compute_hpd(trace, mass_frac=mass_frac)
            records.append({'parameter': par,
                            'dimension': j + 1,
                            'mean': np.mean(trace),
                            'median': np.median(trace),
                            'mode': trace[mode_ind],
                            'hpd_min': hpd_min,
                            'hpd_max': hpd_max,
                            'mass_fraction': mass_frac})

    df = pd.DataFrame(records, columns=columns)
    df['dimension'] = df['dimension'].astype(int) 
    return df
    
# Summarize every parameter of the fit; samples[0] is passed as the Stan fit
# object (the function calls .extract(), .model_pars and .par_dims on it).
summarize_parameters(samples[0])

Unnamed: 0,parameter,dimension,mean,median,mode,hpd_min,hpd_max,mass_fraction
0,tau_alpha,1,1.088144,0.909624,3.826329,0.338865,2.363615,0.95
1,log_alpha_1,1,6.328981,6.400358,5.391817,5.060884,7.257799,0.95
2,log_mch_1,1,6.980419,6.998666,6.933888,6.234442,7.586165,0.95
3,log_mch_1,2,9.785752,10.017630,10.544601,7.923298,11.390266,0.95
4,log_mch_1,3,10.588009,10.759599,11.123026,9.058743,11.813503,0.95
5,log_mch_1,4,11.079012,11.135544,11.489556,10.165364,11.793611,0.95
6,log_mch_1,5,11.388818,11.433056,11.597063,10.702295,12.034341,0.95
7,log_mch_1,6,11.752442,11.775303,11.614070,11.436443,12.182831,0.95
8,log_mch_1,7,11.707541,11.738954,11.733747,11.319413,12.139980,0.95
9,log_yfp_1,1,12.924753,12.938030,12.863645,12.545941,13.297339,0.95


In [69]:
# Check the shape of the alpha_1 draws after adding a trailing axis.
# The fit is read from `samples[0]` directly rather than through the alias
# `a`, which is only assigned in a later cell and so would be undefined when
# the notebook is run top to bottom.
x = samples[0].extract()['alpha_1']
np.shape(x[:, np.newaxis])

(2000, 1)

In [55]:
# Scratch check: display the alpha_1 draws held in `x` as a column (n, 1) array.
x[:, np.newaxis]

array([[ 939.95319742],
       [ 528.48435257],
       [  52.79030721],
       ...,
       [1227.01512455],
       [ 552.06647623],
       [ 960.08432638]])

In [61]:
# Scratch inspection of the fit: pair every model parameter name with the
# dimension list reported for it.
a = samples[0]
pars, dims = a.model_pars, a.par_dims
par_dims = dict(zip(pars, dims))

In [62]:
# Display the list of parameter names defined in the Stan model.
pars

['tau_alpha',
 'log_alpha_1',
 'log_mch_1',
 'log_yfp_1',
 'log_mch_sigma_1',
 'log_yfp_sigma_1',
 'log_alpha_2_raw',
 'log_alpha_2',
 'alpha_1',
 'alpha_2',
 'mch_sigma_1',
 'yfp_sigma_1',
 'mch_1',
 'yfp_1',
 'avg_rep',
 'fc']