# Reworking Inference Models for Efficiency

© 2018 The Authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

--- 

In [1]:
import sys
sys.path.insert(0, '../../')
import numpy as np
import pandas as pd
import pystan 
import bokeh.io
import bokeh.plotting
import tqdm
import mut.viz
import mut.thermo
import mut.bayes
import bebi103.viz
constants = mut.thermo.load_constants()
bokeh.io.output_notebook()

The purpose of this notebook is to spruce up the inferential models to be more efficient as well as develop new predictive checks and vizualization techniques. 

## DNA Binding Energy Inference

In [2]:
# Load the data. 
data = pd.read_csv('../../data/csv/compiled_data.csv')

# Restrict to a single mutant and operator
DNA_data = data[(data['class']=='DNA') & (data['operator']=='O2')]

# Load the stan model. 
model = mut.bayes.StanModel('../stan/DNA_binding_energy_induction.stan')

Found precompiled model. Loading...
finished!


In [3]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DNA_data.groupby(['mutant', 'repressors'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_ai': constants['ep_AI'],
                 'Ka': constants['Ka'],
                 'Ki': constants['Ki'], 
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict)
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['repressors'] = g[1]
    samples_df['mutant'] = g[0]
    samples_df['operator'] = d['operator'].unique()[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['ep_RA', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = g[1]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = d['operator'].unique()[0]
  
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DNA_binding_energy_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DNA_binding_energy_summary.csv', index=False)
print('Finished!')

100%|██████████| 12/12 [00:09<00:00,  1.49it/s]


Finished!


In [4]:
# Plot the contours. 


samples = mutant_df[(mutant_df['mutant']=='Q21M') & 
                   (mutant_df['repressors']==60)]
p = bebi103.viz.corner(samples, vars=['ep_RA', 'sigma'])
bokeh.io.show(p)

## $K_A$ and $K_I$ Inference

In [6]:
# Restrict data to inducer mutants
IND_data = data[data['class']=='IND']

# Load the stan model. 
model = mut.bayes.StanModel('../stan/KaKi_fitting.stan')

Found precompiled model. Loading...
finished!


In [6]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_AI': constants['ep_AI'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.99))
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['Ka', 'Ki', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
     
    # Add to storage vector
    mutant_dfs.append(samples_df)  
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_only_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_only_summary.csv', index=False)
print('Finished!')














100%|██████████| 12/12 [02:44<00:00, 11.25s/it][A
[A

Finished!


In [None]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q294K') &
                    (mutant_df['operator']=='O2')]
                   
p = bebi103.viz.corner(samples, vars=['Ka', 'Ki', 'sigma'], datashade=True)
bokeh.io.show(p)

### $K_A$, $K_I$, and $\Delta\varepsilon_{AI}$ Inference

In [7]:
model = mut.bayes.StanModel('../stan/KaKi_epAI_fitting.stan')

Found precompiled model. Loading...
finished!


In [8]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    print(g)
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=4000, 
                                       control=dict(adapt_delta=0.995, max_treedepth=11))
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
#
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_epAI_summary.csv', index=False)
print('Finished!')

  0%|          | 0/12 [00:00<?, ?it/s]

('F164T', 'O1')


  8%|▊         | 1/12 [00:46<08:35, 46.89s/it]

('F164T', 'O2')


 17%|█▋        | 2/12 [01:05<06:24, 38.49s/it]

('F164T', 'O3')


 25%|██▌       | 3/12 [01:08<04:11, 27.89s/it]

('Q294K', 'O1')


 33%|███▎      | 4/12 [01:29<03:25, 25.68s/it]

('Q294K', 'O2')


 42%|████▏     | 5/12 [01:44<02:36, 22.41s/it]

('Q294K', 'O3')


 50%|█████     | 6/12 [01:51<01:46, 17.76s/it]

('Q294R', 'O1')


 58%|█████▊    | 7/12 [02:01<01:17, 15.43s/it]

('Q294R', 'O2')


 67%|██████▋   | 8/12 [02:18<01:03, 15.93s/it]

('Q294R', 'O3')


 75%|███████▌  | 9/12 [02:33<00:46, 15.58s/it]

('Q294V', 'O1')


 83%|████████▎ | 10/12 [03:43<01:03, 31.95s/it]

('Q294V', 'O2')


 92%|█████████▏| 11/12 [03:57<00:26, 26.60s/it]

('Q294V', 'O3')


100%|██████████| 12/12 [04:09<00:00, 22.16s/it]


Finished!


    ## $K_A$, $K_I$, and $R$ inference

In [10]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_R_fitting.stan')

# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'ep_AI': constants['ep_AI'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict,
                                      control=dict(adapt_delta=0.99))

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'R', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_R_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_R_summary.csv', index=False)


  0%|          | 0/12 [00:00<?, ?it/s][A

Found precompiled model. Loading...
finished!



  8%|▊         | 1/12 [00:15<02:55, 15.94s/it][A
 17%|█▋        | 2/12 [00:23<02:14, 13.46s/it][A
 25%|██▌       | 3/12 [00:27<01:35, 10.66s/it][A
 33%|███▎      | 4/12 [00:37<01:23, 10.50s/it][A
 42%|████▏     | 5/12 [00:45<01:07,  9.58s/it][A
 50%|█████     | 6/12 [00:47<00:44,  7.46s/it][A
 58%|█████▊    | 7/12 [00:52<00:32,  6.53s/it][A
 67%|██████▋   | 8/12 [00:58<00:25,  6.46s/it][A
 75%|███████▌  | 9/12 [01:00<00:15,  5.13s/it][A
 83%|████████▎ | 10/12 [01:12<00:14,  7.07s/it][A

100%|██████████| 12/12 [01:20<00:00,  5.63s/it][A
[A

In [14]:
samples = mutant_df[(mutant_df['mutant']=='Q294V') &
                    (mutant_df['operator']=='O2')]
     
p = bebi103.viz.corner(samples, vars=['R', 'Ka', 'Ki', 'sigma'])
bokeh.io.show(p)

## $K_A$, $K_I$, $\Delta\varepsilon_{AI}$, and $\Delta\varepsilon_{RA}$ Double Mutant Fitting

In [13]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/DBL_parameter_estimation.stan')

# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA':DNA_summary[(DNA_summary['repressors']==260) & (DNA_summary['mutant']==g[0].split('-')[0]) & 
                                    (DNA_summary['operator']=='O2') & (DNA_summary['parameter']=='ep_RA')]['mode'].values,
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'R': 260,
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=8000, control=dict(adapt_delta=0.99))
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    samples_df['ep_RA'] = data_dict['ep_RA'][0]
    _df = samples_df[['ep_RA', 'Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()

    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_complete_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_complete_summary.csv', index=False)


  0%|          | 0/9 [00:00<?, ?it/s][A
[A

Found precompiled model. Loading...
finished!


NameError: name 'DNA_summary' is not defined

## $\Delta\varepsilon_{AI}$ to Double Mutants


In [16]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan', force_compile=True)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_a045af09162c634003ec1ef30bc69ca6 NOW.


Precompiled model not found. Compiling model...


  tree = Parsing.p_module(s, pxd, full_module_name)


In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1816:0,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18,
                 from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpyeax2bzo/stanfit4anon_model_a045af09162c634003ec1ef30bc69ca6_1090961852581845642.cpp:685:
  ^
/var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpyeax2bzo/stanfit4anon_model_a045af09162c634003ec1ef30bc69ca6_1090961852581845642.cpp: In function ‘PyObject* __pyx_pf_71stanfit4anon_model_a045af09162c634003ec1ef30bc69ca6_1090961852581845642_2_call_sampler(PyObject*, PyObject*, PyObject*, PyObject*)’:
     __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                                                                       ^
        .section __TEXT,__textcoal_nt,coalesced,pure_in

In [17]:
DBL_data = data[data['class']=='DBL']
DNA_summary = pd.read_csv('../../data/csv/DNA_binding_energy_summary.csv')
IND_summary = pd.read_csv('../../data/csv/KaKi_only_summary.csv')
Q294K_summary = pd.read_csv('../../data/csv/KaKi_epAI_summary.csv')


# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Determine the value of the DNA binding energy to use. 
    dna_mut, ind_mut = g[0].split('-')
    ep_RA = DNA_summary[(DNA_summary['mutant']==dna_mut) & (DNA_summary['operator']==g[1]) &
                       (DNA_summary['repressors']==260) & (DNA_summary['parameter']=='ep_RA')]['mode'].values[0]
    if ind_mut == 'Q294K':
        _ind_summary = Q294K_summary[(Q294K_summary['mutant']==ind_mut) & (Q294K_summary['operator']=='O2')]
    else:
        _ind_summary = IND_summary[(IND_summary['mutant']==ind_mut) & (IND_summary['operator']=='O2')]
        
    Ka = _ind_summary[_ind_summary['parameter']=='Ka']['mode'].values[0]
    Ki = _ind_summary[_ind_summary['parameter']=='Ki']['mode'].values[0]
    
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'R': 260,
                 'ep_RA': [ep_RA],
                 'Ka': [Ka],
                 'Ki': [Ki],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.99))
                                    

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['ep_AI', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_epAI_summary.csv', index=False)


  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:02<00:22,  2.87s/it][A
 22%|██▏       | 2/9 [00:06<00:21,  3.13s/it][A
 33%|███▎      | 3/9 [00:09<00:17,  3.00s/it][A
 44%|████▍     | 4/9 [00:12<00:14,  2.94s/it][A
 56%|█████▌    | 5/9 [00:14<00:11,  2.80s/it][A
 67%|██████▋   | 6/9 [00:17<00:08,  2.77s/it][A
 78%|███████▊  | 7/9 [00:22<00:07,  3.63s/it][A
 89%|████████▉ | 8/9 [00:29<00:04,  4.39s/it][A
100%|██████████| 9/9 [00:31<00:00,  3.66s/it][A
[A

In [18]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

Found precompiled model. Loading...
finished!


## Hill Fitting To Double Mutants


In [None]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/Hill_fitting.stan')
# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'c': d['IPTGuM'],
                 'foldchange': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000)
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['a', 'b', 'K', 'n', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_Hill_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_Hill_summary.csv', index=False)