# Reworking Inference Models for Efficiency

© 2018 The Authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

--- 

In [1]:
import sys
sys.path.insert(0, '../../')

import bebi103.viz
import bokeh.io
import bokeh.plotting
import numpy as np
import pandas as pd
import pystan
import tqdm

import mut.bayes
import mut.stats
import mut.thermo
import mut.viz
# Load the shared constants used to build every Stan data dictionary below
# (the cells read keys such as 'Nns', 'ep_AI', 'Ka', 'Ki', 'n_sites' and
# operator names from this mapping).
constants = mut.thermo.load_constants()
# Route Bokeh figures to inline notebook output.
bokeh.io.output_notebook()

The purpose of this notebook is to spruce up the inferential models to make them more efficient, as well as to develop new predictive checks and visualization techniques. 

## DNA Binding Energy Inference

In [2]:
# Read the compiled measurements into one DataFrame; every later section
# subsets this frame by its 'class' column.
data = pd.read_csv('../../data/csv/compiled_data.csv')

# Keep the rows whose class is 'DNA' and whose operator is 'O2'
# (all mutants and repressor counts in that subset are retained).
is_dna_class = data['class'] == 'DNA'
is_O2 = data['operator'] == 'O2'
DNA_data = data[is_dna_class & is_O2]

# Build the project wrapper around the DNA binding energy Stan program.
model = mut.bayes.StanModel('../stan/DNA_binding_energy_induction.stan')

Found precompiled model. Loading...
finished!


In [3]:
# Fit the DNA binding energy model independently to every
# (mutant, repressor copy number) group of DNA_data, collect the posterior
# samples and their summary statistics, and write both tables to disk.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DNA_data.groupby(['mutant', 'repressors'])):
    # The group key is (mutant, repressors); the operator is read from the data.
    mutant, repressors = g
    operator = d['operator'].unique()[0]

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own as a single "unique" set.
    # NOTE(review): the key below is lower-case 'ep_ai' while the other cells
    # in this notebook pass 'ep_AI' -- confirm it matches the data block of
    # DNA_binding_energy_induction.stan.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'],
                 'Nns': constants['Nns'],
                 'ep_ai': constants['ep_AI'],
                 'Ka': constants['Ka'],
                 'Ki': constants['Ki'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}

    # Sample! The wrapper returns the fit object and a DataFrame of samples.
    samples, samples_df = model.sample(data_dict=data_dict)

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant
    samples_df['operator'] = operator

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['ep_RA', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DNA_binding_energy_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DNA_binding_energy_summary.csv', index=False)
print('Finished!')

100%|██████████| 12/12 [00:10<00:00,  1.26it/s]


Finished!


In [4]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q21M') & 
                   (mutant_df['repressors']==60)]
p = bebi103.viz.corner(samples, vars=['ep_RA', 'sigma'])
bokeh.io.show(p)

## $K_A$ and $K_I$ Inference

In [8]:
# Restrict data to inducer mutants: keep the rows of `data` (loaded in an
# earlier cell) whose class is 'IND'.
IND_data = data[data['class']=='IND']

# Load the stan model. This rebinds `model`, replacing the model object
# created for the DNA binding energy section.
model = mut.bayes.StanModel('../stan/KaKi_fitting.stan')

Found precompiled model. Loading...
finished!


In [9]:
# Fit Ka and Ki independently to every (mutant, operator) group of the
# inducer-class data, collect the posterior samples and their summary
# statistics, and write both tables to disk.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # The group key is (mutant, operator); the repressor count is read from the data.
    mutant, operator = g
    repressors = d['repressors'].unique()[0]

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own. The DNA binding energy is looked up in
    # `constants` under the operator name.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'],
                 'Nns': constants['Nns'],
                 'ep_AI': constants['ep_AI'],
                 'ep_RA': constants[operator],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}

    # Sample! The wrapper returns the fit object and a DataFrame of samples.
    samples, samples_df = model.sample(data_dict=data_dict, control=dict(adapt_delta=0.99))

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers
    samples_df['operator'] = operator
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['Ka', 'Ki', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_only_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_only_summary.csv', index=False)
print('Finished!')

100%|██████████| 7/7 [00:39<00:00,  5.14s/it]


Finished!


In [12]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q294K') &
                    (mutant_df['operator']=='O2')]
                   
p = bebi103.viz.corner(samples, vars=['Ka', 'Ki', 'sigma'], datashade=True)
bokeh.io.show(p)

ValueError: Out of range float values are not JSON compliant

### $K_A$, $K_I$, and $\Delta\varepsilon_{AI}$ Inference

In [5]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_epAI_fitting.stan')

# Fit Ka, Ki and ep_AI independently to every (mutant, operator) group of the
# inducer-class data, collect the posterior samples and their summary
# statistics, and write both tables to disk.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # The group key is (mutant, operator); the repressor count is read from the data.
    mutant, operator = g
    repressors = d['repressors'].unique()[0]

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own. No 'ep_AI' entry is passed here; the samples
    # returned by the model carry an 'ep_AI' column instead.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'],
                 'Nns': constants['Nns'],
                 'ep_RA': constants[operator],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}

    # Sample! Longer chains and stricter sampler settings than the Ka/Ki-only fit.
    samples, samples_df = model.sample(data_dict=data_dict, iter=4000,
                                       control=dict(adapt_delta=0.995, max_treedepth=11))

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers
    samples_df['operator'] = operator
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_epAI_summary.csv', index=False)
print('Finished!')

  0%|          | 0/10 [00:00<?, ?it/s]

Found precompiled model. Loading...
finished!


100%|██████████| 10/10 [01:40<00:00,  9.35s/it]


Finished!


## $K_A$, $K_I$, and $R$ Inference

In [65]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_R_fitting.stan')

# Fit Ka, Ki and R independently to every (mutant, operator) group of the
# inducer-class data, collect the posterior samples and their summary
# statistics, and write both tables to disk.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # The group key is (mutant, operator); the repressor count is read from the data.
    mutant, operator = g
    repressors = d['repressors'].unique()[0]

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own. No 'R' entry is passed here; the samples
    # returned by the model carry an 'R' column instead.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA': constants[operator],
                 'ep_AI': constants['ep_AI'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}

    # Sample! The wrapper returns the fit object and a DataFrame of samples.
    samples, samples_df = model.sample(data_dict=data_dict,
                                       control=dict(adapt_delta=0.99))

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers. Note that 'repressors' is the count recorded in the
    # data, which is distinct from the sampled 'R' column.
    samples_df['operator'] = operator
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['Ka', 'Ki', 'R', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_R_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_R_summary.csv', index=False)

  0%|          | 0/10 [00:00<?, ?it/s]

Found precompiled model. Loading...
finished!


100%|██████████| 10/10 [00:37<00:00,  3.90s/it]


In [67]:
# Corner plot of the R / Ka / Ki / sigma samples for one mutant-operator pair.
is_Q294V = mutant_df['mutant'] == 'Q294V'
is_O2 = mutant_df['operator'] == 'O2'
samples = mutant_df[is_Q294V & is_O2]

p = bebi103.viz.corner(samples, vars=['R', 'Ka', 'Ki', 'sigma'])
bokeh.io.show(p)

## $R$, $K_A$, $K_I$, $\Delta\varepsilon_{AI}$, and $\Delta\varepsilon_{RA}$ Double Mutant Fitting

In [35]:
# Keep the rows of `data` whose class is 'DBL' and load the matching Stan model.
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/DBL_parameter_estimation.stan')

# Fit Ka, Ki, ep_RA and ep_AI independently to every (mutant, operator) group
# of the double-mutant data, collect the posterior samples and their summary
# statistics, and write both tables to disk.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # The group key is (mutant, operator); the repressor count is read from the data.
    mutant, operator = g
    repressors = d['repressors'].unique()[0]

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own.
    # NOTE(review): R is hard-coded to 260 here instead of being taken from
    # d['repressors'] as in the single-mutant cells -- confirm that every DBL
    # strain carries that copy number.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'R': 260,
                 'fc': d['fold_change']}

    # Sample! The wrapper returns the fit object and a DataFrame of samples.
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.95))

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers
    samples_df['operator'] = operator
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['Ka', 'Ki', 'ep_RA', 'ep_AI', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_complete_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_complete_summary.csv', index=False)


  0%|          | 0/8 [00:00<?, ?it/s][A

Found precompiled model. Loading...
finished!



 12%|█▎        | 1/8 [00:19<02:16, 19.48s/it][A
 25%|██▌       | 2/8 [00:42<02:04, 20.69s/it][A



 75%|███████▌  | 6/8 [04:29<01:42, 51.13s/it][A
 88%|████████▊ | 7/8 [04:50<00:42, 42.10s/it][A
100%|██████████| 8/8 [05:12<00:00, 36.18s/it][A
[A

In [34]:
# Rebuild the double-mutant Stan model with force_compile=True; the saved
# output below reports "Precompiled model not found. Compiling model...".
# NOTE(review): this cell's execution count (34) is lower than that of the
# fitting cell above (35), so it was run before the fit. On a top-to-bottom
# run it recompiles only after the fit has already finished -- consider moving
# or removing it.
model = mut.bayes.StanModel('../stan/DBL_parameter_estimation.stan', force_compile=True)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c7f7af9268cf215228ca293269b98d39 NOW.


Precompiled model not found. Compiling model...


  tree = Parsing.p_module(s, pxd, full_module_name)


In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1823:0,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpn3_zz32e/stanfit4anon_model_c7f7af9268cf215228ca293269b98d39_7645695681354271214.cpp:685:
  ^
/var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpn3_zz32e/stanfit4anon_model_c7f7af9268cf215228ca293269b98d39_7645695681354271214.cpp: In function ‘PyObject* __pyx_pf_71stanfit4anon_model_c7f7af9268cf215228ca293269b98d39_7645695681354271214_2_call_sampler(PyObject*, PyObject*, PyObject*, PyObject*)’:
     __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                                                                       ^
        .section __TEXT,__textcoal_nt,coalesced,pure_in

## Fitting $\Delta\varepsilon_{AI}$ to Double Mutants

In [33]:
# Keep the rows of `data` whose class is 'DBL', read back the DNA binding
# energy summary written by the DNA inference cell, and load the Stan model.
DBL_data = data[data['class']=='DBL']
DNA_summary = pd.read_csv('../../data/csv/DNA_binding_energy_summary.csv')
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

# Fit ep_AI independently to every (mutant, operator) group of the
# double-mutant data, using only the measurements taken at zero IPTG.
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data[DBL_data['IPTGuM']==0].groupby(['mutant', 'operator'])):
    # The group key is (mutant, operator); the repressor count is read from the data.
    mutant, operator = g
    repressors = d['repressors'].unique()[0]

    # Determine the value of the DNA binding energy to use. The lookup key is
    # the part of the mutant name before the first '-' (presumably the
    # DNA-binding mutation of the double mutant -- verify against the data).
    dna_mut = mutant.split('-')[0]
    ep_RA = DNA_summary[(DNA_summary['mutant']==dna_mut) & (DNA_summary['operator']==operator) &
                        (DNA_summary['repressors']==260) & (DNA_summary['parameter']=='ep_RA')]['mode'].values

    # Fail early with a readable message if the summary table does not hold
    # exactly one matching ep_RA mode, instead of handing an empty or
    # multi-valued array to the sampler.
    if len(ep_RA) != 1:
        raise ValueError(
            'Expected exactly one ep_RA mode for mutant {!r} on operator {!r}; found {}.'.format(
                dna_mut, operator, len(ep_RA)))

    # Assemble the data dictionary. J=1 and idx is all ones because each
    # group is fit on its own. ep_RA is passed as the length-1 array from the lookup.
    # NOTE(review): R is hard-coded to 260, matching the repressor count used
    # in the summary lookup above -- confirm it holds for every DBL strain.
    data_dict = {'J': 1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'R': 260,
                 'ep_RA': ep_RA,
                 'fc': d['fold_change']}

    # Sample! The wrapper returns the fit object and a DataFrame of samples.
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.99))

    # Drop the index suffix from the sample columns ('par[1]' -> 'par').
    # Parameter names without an index are left untouched; indexing the split
    # result unconditionally would raise IndexError for them.
    new_names = {}
    for name in samples.unconstrained_param_names():
        parts = name.split('.')
        if len(parts) > 1:
            new_names['{}[{}]'.format(parts[0], parts[1])] = parts[0]
    samples_df = samples_df.rename(columns=new_names)

    # Add identifiers
    samples_df['operator'] = operator
    samples_df['repressors'] = repressors
    samples_df['mutant'] = mutant

    # Compute the summarized dataframe for this group. It gets its own name so
    # that it does not shadow the concatenated `summary_df` built after the loop.
    group_summary = mut.stats.compute_statistics(
        samples_df[['ep_AI', 'sigma', 'lp__']].copy(), logprob_name='lp__')
    group_summary['repressors'] = repressors
    group_summary['mutant'] = mutant
    group_summary['operator'] = operator

    # Add to storage vector
    mutant_dfs.append(samples_df)
    summary_dfs.append(group_summary)

# Combine and save to disk
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_epAI_summary.csv', index=False)


  0%|          | 0/8 [00:00<?, ?it/s][A

Found precompiled model. Loading...
finished!



 12%|█▎        | 1/8 [00:01<00:07,  1.11s/it][A
 25%|██▌       | 2/8 [00:02<00:07,  1.27s/it][A

 50%|█████     | 4/8 [00:05<00:04,  1.22s/it][A

 75%|███████▌  | 6/8 [00:08<00:02,  1.38s/it][A
 88%|████████▊ | 7/8 [00:10<00:01,  1.66s/it][A
100%|██████████| 8/8 [00:11<00:00,  1.56s/it][A
[A

In [29]:
# Construct the ep_AI Stan model; the saved output below shows that this run
# compiled it from scratch ("Precompiled model not found. Compiling model...").
# NOTE(review): this repeats the StanModel construction already done at the
# top of the fitting cell above, and its execution count (29) is lower than
# that cell's (33) -- it looks like a leftover compile step and is redundant
# on a top-to-bottom run.
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_a99af664490f8579771c5877d8ad423d NOW.


Precompiled model not found. Compiling model...


  tree = Parsing.p_module(s, pxd, full_module_name)


In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1823:0,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmp4ppaq48t/stanfit4anon_model_a99af664490f8579771c5877d8ad423d_6966233200726887256.cpp:685:
  ^
/var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmp4ppaq48t/stanfit4anon_model_a99af664490f8579771c5877d8ad423d_6966233200726887256.cpp: In function ‘PyObject* __pyx_pf_71stanfit4anon_model_a99af664490f8579771c5877d8ad423d_6966233200726887256_2_call_sampler(PyObject*, PyObject*, PyObject*, PyObject*)’:
     __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                                                                       ^
        .section __TEXT,__textcoal_nt,coalesced,pure_in