# Reworking Inference Models for Efficiency

© 2018 The Authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

--- 

In [1]:
import sys
sys.path.insert(0, '../../')
import numpy as np
import pandas as pd
import pystan 
import bokeh.io
import bokeh.plotting
import tqdm
import mut.viz
import mut.thermo
import mut.bayes
import bebi103.viz
constants = mut.thermo.load_constants()
bokeh.io.output_notebook()

The purpose of this notebook is to spruce up the inferential models to be more efficient as well as develop new predictive checks and vizualization techniques. 

## DNA Binding Energy Inference

In [2]:
# Load the data. 
data = pd.read_csv('../../data/csv/compiled_data.csv')

# Restrict to a single mutant and operator
DNA_data = data[(data['class']=='DNA') & (data['operator']=='O2')]

# Load the stan model. 
model = mut.bayes.StanModel('../stan/DNA_binding_energy_induction.stan')

Found precompiled model. Loading...
finished!


In [117]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DNA_data.groupby(['mutant', 'repressors'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_ai': constants['ep_AI'],
                 'Ka': constants['Ka'],
                 'Ki': constants['Ki'], 
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict)
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['repressors'] = g[1]
    samples_df['mutant'] = g[0]
    samples_df['operator'] = d['operator'].unique()[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['ep_RA', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = g[1]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = d['operator'].unique()[0]
  
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DNA_binding_energy_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DNA_binding_energy_summary.csv', index=False)
print('Finished!')





  0%|          | 0/12 [00:00<?, ?it/s][A[A[A[A



  8%|▊         | 1/12 [00:00<00:08,  1.28it/s][A[A[A[A



 17%|█▋        | 2/12 [00:01<00:08,  1.17it/s][A[A[A[A



 25%|██▌       | 3/12 [00:02<00:07,  1.17it/s][A[A[A[A



 33%|███▎      | 4/12 [00:03<00:07,  1.10it/s][A[A[A[A



 42%|████▏     | 5/12 [00:04<00:06,  1.12it/s][A[A[A[A



 50%|█████     | 6/12 [00:05<00:05,  1.19it/s][A[A[A[A



 58%|█████▊    | 7/12 [00:06<00:04,  1.15it/s][A[A[A[A



 67%|██████▋   | 8/12 [00:06<00:03,  1.25it/s][A[A[A[A



 75%|███████▌  | 9/12 [00:07<00:02,  1.11it/s][A[A[A[A



 83%|████████▎ | 10/12 [00:08<00:01,  1.17it/s][A[A[A[A



 92%|█████████▏| 11/12 [00:09<00:00,  1.14it/s][A[A[A[A



100%|██████████| 12/12 [00:10<00:00,  1.16it/s][A[A[A[A



[A[A[A[A

Finished!


In [119]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q21M') & 
                   (mutant_df['repressors']==60)]
p = bebi103.viz.corner(samples, vars=['ep_RA', 'sigma'])
bokeh.io.show(p)

## $K_A$ and $K_I$ Inference

In [120]:
# Restrict data to inducer mutants
IND_data = data[data['class']=='IND']

# Load the stan model. 
model = mut.bayes.StanModel('../stan/KaKi_fitting.stan')

Found precompiled model. Loading...
finished!


In [121]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_AI': constants['ep_AI'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, control=dict(adapt_delta=0.99))
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['Ka', 'Ki', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
     
    # Add to storage vector
    mutant_dfs.append(samples_df)  
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_only_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_only_summary.csv', index=False)
print('Finished!')





  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



 12%|█▎        | 1/8 [00:03<00:23,  3.34s/it][A[A[A[A



 25%|██▌       | 2/8 [00:11<00:29,  4.86s/it][A[A[A[A



 38%|███▊      | 3/8 [00:22<00:32,  6.51s/it][A[A[A[A



 50%|█████     | 4/8 [00:24<00:21,  5.29s/it][A[A[A[A



 62%|██████▎   | 5/8 [00:38<00:24,  8.02s/it][A[A[A[A



 75%|███████▌  | 6/8 [00:58<00:23, 11.53s/it][A[A[A[A



 88%|████████▊ | 7/8 [01:02<00:09,  9.25s/it][A[A[A[A



100%|██████████| 8/8 [01:07<00:00,  8.04s/it][A[A[A[A



[A[A[A[A

Finished!


In [86]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q294K') &
                    (mutant_df['operator']=='O2')]
                   
p = bebi103.viz.corner(samples, vars=['Ka', 'Ki', 'sigma'], datashade=True)
bokeh.io.show(p)

ValueError: Out of range float values are not JSON compliant

### $K_A$, $K_I$, and $\Delta\varepsilon_{AI}$ Inference

In [122]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_epAI_fitting.stan')

# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    print(g)
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=4000, 
                                       control=dict(adapt_delta=0.995, max_treedepth=11))
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
#
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_epAI_summary.csv', index=False)
print('Finished!')





  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!
('F164T', 'O2')






 12%|█▎        | 1/8 [00:17<02:05, 17.87s/it][A[A[A[A

('Q294K', 'O1')






 25%|██▌       | 2/8 [00:36<01:48, 18.07s/it][A[A[A[A

('Q294K', 'O2')






 38%|███▊      | 3/8 [00:54<01:30, 18.18s/it][A[A[A[A

('Q294K', 'O3')






 50%|█████     | 4/8 [01:02<01:00, 15.03s/it][A[A[A[A

('Q294R', 'O1')






 62%|██████▎   | 5/8 [01:14<00:41, 13.97s/it][A[A[A[A

('Q294R', 'O2')






 75%|███████▌  | 6/8 [01:41<00:36, 18.13s/it][A[A[A[A

('Q294R', 'O3')






 88%|████████▊ | 7/8 [01:53<00:16, 16.20s/it][A[A[A[A

('Q294V', 'O2')






100%|██████████| 8/8 [02:15<00:00, 17.85s/it][A[A[A[A



[A[A[A[A

Finished!


## $K_A$, $K_I$, and $R$ inference

In [123]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_R_fitting.stan')

# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'ep_AI': constants['ep_AI'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict,
                                      control=dict(adapt_delta=0.99))

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'R', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_R_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_R_summary.csv', index=False)





  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!






 12%|█▎        | 1/8 [00:05<00:40,  5.72s/it][A[A[A[A



 25%|██▌       | 2/8 [00:14<00:39,  6.57s/it][A[A[A[A



 38%|███▊      | 3/8 [00:20<00:32,  6.55s/it][A[A[A[A







 62%|██████▎   | 5/8 [00:26<00:14,  4.80s/it][A[A[A[A



 75%|███████▌  | 6/8 [00:31<00:09,  4.76s/it][A[A[A[A



 88%|████████▊ | 7/8 [00:33<00:03,  3.96s/it][A[A[A[A



100%|██████████| 8/8 [00:38<00:00,  4.14s/it][A[A[A[A



[A[A[A[A

In [124]:
samples = mutant_df[(mutant_df['mutant']=='Q294V') &
                    (mutant_df['operator']=='O2')]
     
p = bebi103.viz.corner(samples, vars=['R', 'Ka', 'Ki', 'sigma'])
bokeh.io.show(p)

## $K_A$, $K_I$, $\Delta\varepsilon_{AI}$, and $\Delta\varepsilon_{RA}$ Double Mutant Fitting

In [125]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/DBL_parameter_estimation.stan')

# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA':DNA_summary[(DNA_summary['repressors']==260) & (DNA_summary['mutant']==g[0].split('-')[0]) & 
                                    (DNA_summary['operator']=='O2') & (DNA_summary['parameter']=='ep_RA')]['mode'].values,
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'R': 260,
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=8000, control=dict(adapt_delta=0.99))
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    samples_df['ep_RA'] = data_dict['ep_RA'][0]
    _df = samples_df[['ep_RA', 'Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()

    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_complete_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_complete_summary.csv', index=False)





  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!






 11%|█         | 1/9 [00:07<00:58,  7.33s/it][A[A[A[A



 22%|██▏       | 2/9 [00:15<00:53,  7.69s/it][A[A[A[A



 33%|███▎      | 3/9 [00:30<00:58,  9.78s/it][A[A[A[A



 44%|████▍     | 4/9 [01:39<02:17, 27.41s/it][A[A[A[A



 56%|█████▌    | 5/9 [01:57<01:39, 24.80s/it][A[A[A[A



 67%|██████▋   | 6/9 [03:07<01:55, 38.35s/it][A[A[A[A



 78%|███████▊  | 7/9 [03:14<00:57, 28.99s/it][A[A[A[A



 89%|████████▉ | 8/9 [03:27<00:24, 24.01s/it][A[A[A[A



100%|██████████| 9/9 [03:38<00:00, 20.08s/it][A[A[A[A



[A[A[A[A

## $\Delta\varepsilon_{AI}$ to Double Mutants


In [None]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan', force_compile=True)

In [5]:
DBL_data = data[data['class']=='DBL']
DNA_summary = pd.read_csv('../../data/csv/DNA_binding_energy_summary.csv')
IND_summary = pd.read_csv('../../data/csv/KaKi_epAI_summary.csv')


# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data[DBL_data['IPTGuM']==0].groupby(['mutant', 'operator'])):
    # Determine the value of the DNA binding energy to use. 
    dna_mut = g[0].split('-')[0]
    ep_RA = DNA_summary[(DNA_summary['mutant']==dna_mut) & (DNA_summary['operator']==g[1]) &
                       (DNA_summary['repressors']==260) & (DNA_summary['parameter']=='ep_RA')]['mode'].values[0]
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'R': 260,
                 'ep_RA': [ep_RA],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.99))
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['ep_AI', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_epAI_summary.csv', index=False)

100%|██████████| 9/9 [00:16<00:00,  1.58s/it]


In [128]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

Found precompiled model. Loading...
finished!


## Hill Fitting To Double Mutants


In [130]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/Hill_fitting.stan')
# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'c': d['IPTGuM'],
                 'foldchange': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000)
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['a', 'b', 'K', 'n', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_Hill_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_Hill_summary.csv', index=False)





  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!






 11%|█         | 1/9 [00:02<00:17,  2.21s/it][A[A[A[A



 22%|██▏       | 2/9 [00:06<00:19,  2.79s/it][A[A[A[A



 33%|███▎      | 3/9 [00:08<00:15,  2.55s/it][A[A[A[A



 44%|████▍     | 4/9 [00:10<00:12,  2.50s/it][A[A[A[A



 56%|█████▌    | 5/9 [00:13<00:10,  2.65s/it][A[A[A[A



 67%|██████▋   | 6/9 [00:16<00:07,  2.59s/it][A[A[A[A







 89%|████████▉ | 8/9 [00:20<00:02,  2.44s/it][A[A[A[A



100%|██████████| 9/9 [00:23<00:00,  2.54s/it][A[A[A[A



[A[A[A[A