# Reworking Inference Models for Efficiency

© 2018 The Authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

--- 

In [80]:
import sys
sys.path.insert(0, '../../')
import numpy as np
import pandas as pd
import pystan 
import bokeh.io
import bokeh.plotting
import tqdm
import mut.viz
import mut.thermo
import mut.bayes
import bebi103.viz
constants = mut.thermo.load_constants()
bokeh.io.output_notebook()

The purpose of this notebook is to spruce up the inferential models to be more efficient as well as develop new predictive checks and vizualization techniques. 

## DNA Binding Energy Inference

In [81]:
# Load the data. 
data = pd.read_csv('../../data/csv/compiled_data.csv')

# Restrict to a single mutant and operator
DNA_data = data[(data['class']=='DNA') & (data['operator']=='O2')]

# Load the stan model. 
model = mut.bayes.StanModel('../stan/DNA_binding_energy_induction.stan')

Found precompiled model. Loading...
finished!


In [82]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DNA_data.groupby(['mutant', 'repressors'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_ai': constants['ep_AI'],
                 'Ka': constants['Ka'],
                 'Ki': constants['Ki'], 
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict)
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['repressors'] = g[1]
    samples_df['mutant'] = g[0]
    samples_df['operator'] = d['operator'].unique()[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['ep_RA', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = g[1]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = d['operator'].unique()[0]
  
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DNA_binding_energy_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DNA_binding_energy_summary.csv', index=False)
print('Finished!')



  0%|          | 0/12 [00:00<?, ?it/s][A[A

  8%|▊         | 1/12 [00:00<00:08,  1.24it/s][A[A

 17%|█▋        | 2/12 [00:01<00:07,  1.32it/s][A[A

 25%|██▌       | 3/12 [00:02<00:07,  1.27it/s][A[A

 33%|███▎      | 4/12 [00:03<00:06,  1.26it/s][A[A

 42%|████▏     | 5/12 [00:03<00:05,  1.24it/s][A[A

 50%|█████     | 6/12 [00:04<00:05,  1.18it/s][A[A

 58%|█████▊    | 7/12 [00:05<00:04,  1.10it/s][A[A

 67%|██████▋   | 8/12 [00:06<00:03,  1.06it/s][A[A

 75%|███████▌  | 9/12 [00:08<00:03,  1.16s/it][A[A

 83%|████████▎ | 10/12 [00:09<00:01,  1.00it/s][A[A

 92%|█████████▏| 11/12 [00:09<00:00,  1.10it/s][A[A

100%|██████████| 12/12 [00:10<00:00,  1.21it/s][A[A

[A[A

Finished!


In [83]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q21M') & 
                   (mutant_df['repressors']==60)]
p = bebi103.viz.corner(samples, vars=['ep_RA', 'sigma'])
bokeh.io.show(p)

## $K_A$ and $K_I$ Inference

In [84]:
# Restrict data to inducer mutants
IND_data = data[data['class']=='IND']

# Load the stan model. 
model = mut.bayes.StanModel('../stan/KaKi_fitting.stan')

Found precompiled model. Loading...
finished!


In [85]:
# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_AI': constants['ep_AI'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, control=dict(adapt_delta=0.99))
    
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    
    # Compute the summarized dataframe
    _df = samples_df[['Ka', 'Ki', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
     
    # Add to storage vector
    mutant_dfs.append(samples_df)  
    summary_dfs.append(summary_df)   
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_only_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_only_summary.csv', index=False)
print('Finished!')



  0%|          | 0/7 [00:00<?, ?it/s][A[A

 14%|█▍        | 1/7 [00:03<00:20,  3.46s/it][A[A

 29%|██▊       | 2/7 [00:07<00:18,  3.76s/it][A[A

 43%|████▎     | 3/7 [00:10<00:13,  3.35s/it][A[A

 57%|█████▋    | 4/7 [00:20<00:16,  5.55s/it][A[A

 71%|███████▏  | 5/7 [00:31<00:13,  7.00s/it][A[A

 86%|████████▌ | 6/7 [00:33<00:05,  5.40s/it][A[A

100%|██████████| 7/7 [00:35<00:00,  4.44s/it][A[A

[A[A

Finished!


In [86]:
# Plot the contours. 
samples = mutant_df[(mutant_df['mutant']=='Q294K') &
                    (mutant_df['operator']=='O2')]
                   
p = bebi103.viz.corner(samples, vars=['Ka', 'Ki', 'sigma'], datashade=True)
bokeh.io.show(p)

ValueError: Out of range float values are not JSON compliant

### $K_A$, $K_I$, and $\Delta\varepsilon_{AI}$ Inference

In [115]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_epAI_fitting.stan')

# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    print(g)
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'R': d['repressors'], 
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=4000, 
                                       control=dict(adapt_delta=0.995, max_treedepth=11))
    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
#
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_epAI_summary.csv', index=False)
print('Finished!')





  0%|          | 0/7 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!
('F164T', 'O2')






 14%|█▍        | 1/7 [00:11<01:08, 11.49s/it][A[A[A[A

('Q294K', 'O1')






 29%|██▊       | 2/7 [00:26<01:02, 12.41s/it][A[A[A[A

('Q294K', 'O3')






 43%|████▎     | 3/7 [00:31<00:41, 10.32s/it][A[A[A[A

('Q294R', 'O1')






 57%|█████▋    | 4/7 [00:40<00:29,  9.90s/it][A[A[A[A

('Q294R', 'O2')






 71%|███████▏  | 5/7 [01:02<00:26, 13.42s/it][A[A[A[A

('Q294R', 'O3')






 86%|████████▌ | 6/7 [01:10<00:12, 12.04s/it][A[A[A[A

('Q294V', 'O2')






100%|██████████| 7/7 [01:28<00:00, 13.71s/it][A[A[A[A



[A[A[A[A

Finished!


## $K_A$, $K_I$, and $R$ inference

In [93]:
# Load the stan model
model = mut.bayes.StanModel('../stan/KaKi_R_fitting.stan')

# Loop through each DNA mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(IND_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA': constants[g[1]],
                 'ep_AI': constants['ep_AI'],
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict,
                                      control=dict(adapt_delta=0.99))

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['Ka', 'Ki', 'R', 'sigma', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/KaKi_R_samples.csv', index=False)
summary_df.to_csv('../../data/csv/KaKi_R_summary.csv', index=False)





  0%|          | 0/7 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!






 14%|█▍        | 1/7 [00:04<00:26,  4.46s/it][A[A[A[A



 29%|██▊       | 2/7 [00:10<00:24,  4.82s/it][A[A[A[A







 57%|█████▋    | 4/7 [00:15<00:11,  3.75s/it][A[A[A[A



 71%|███████▏  | 5/7 [00:19<00:07,  3.87s/it][A[A[A[A



 86%|████████▌ | 6/7 [00:21<00:03,  3.21s/it][A[A[A[A



100%|██████████| 7/7 [00:25<00:00,  3.53s/it][A[A[A[A



[A[A[A[A

In [94]:
samples = mutant_df[(mutant_df['mutant']=='Q294V') &
                    (mutant_df['operator']=='O2')]
     
p = bebi103.viz.corner(samples, vars=['R', 'Ka', 'Ki', 'sigma'])
bokeh.io.show(p)

## $K_A$, $K_I$, $\Delta\varepsilon_{AI}$, and $\Delta\varepsilon_{RA}$ Double Mutant Fitting

In [95]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/DBL_parameter_estimation.stan')

# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'ep_RA':DNA_summary[(DNA_summary['repressors']==260) & (DNA_summary['mutant']==g[0].split('-')[0]) & 
                                    (DNA_summary['operator']=='O2') & (DNA_summary['parameter']=='ep_RA')]['mode'].values,
                 'n_sites': constants['n_sites'],
                 'c': d['IPTGuM'],
                 'R': 260,
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=8000, control=dict(adapt_delta=0.99))
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    samples_df['ep_RA'] = data_dict['ep_RA'][0]
    _df = samples_df[['ep_RA', 'Ka', 'Ki', 'ep_AI', 'sigma', 'lp__']].copy()

    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_complete_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_complete_summary.csv', index=False)





  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A

Found precompiled model. Loading...
finished!






 12%|█▎        | 1/8 [00:05<00:39,  5.70s/it][A[A[A[A



 25%|██▌       | 2/8 [00:11<00:33,  5.60s/it][A[A[A[A







 50%|█████     | 4/8 [01:37<01:50, 27.72s/it][A[A[A[A



 62%|██████▎   | 5/8 [01:50<01:09, 23.26s/it][A[A[A[A



 75%|███████▌  | 6/8 [02:46<01:05, 32.95s/it][A[A[A[A



 88%|████████▊ | 7/8 [02:50<00:24, 24.51s/it][A[A[A[A



100%|██████████| 8/8 [03:00<00:00, 19.94s/it][A[A[A[A



[A[A[A[A

## $\Delta\varepsilon_{AI}$ to Double Mutants


In [97]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

Found precompiled model. Loading...
finished!


In [111]:
DBL_data = data[data['class']=='DBL']
DNA_summary = pd.read_csv('../../data/csv/DNA_binding_energy_summary.csv')
IND_summary = pd.read_csv('../../data/csv/KaKi_epAI_summary.csv')


# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data[DBL_data['IPTGuM']==0].groupby(['mutant', 'operator'])):
    # Determine the value of the DNA binding energy to use. 
    dna_mut = g[0].split('-')[0]
    ep_RA = DNA_summary[(DNA_summary['mutant']==dna_mut) & (DNA_summary['operator']==g[1]) &
                       (DNA_summary['repressors']==260) & (DNA_summary['parameter']=='ep_RA')]['mode'].values[0]
    if g[0].split('-')[1] == 'Q294K':
        ep_AI_single = IND_summary[(IND_summary['mutant']=='Q294K') & (IND_summary['operator']=='O2') &
                                  (IND_summary['parameter']=='ep_AI')]['mode'].values[0]
    else:
        ep_AI_single = constants['ep_AI']
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'R': 260,
                 'ep_RA': [ep_RA],
                 'ep_AI': [ep_AI_single],
                 'fc': d['fold_change']}
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000, control=dict(adapt_delta=0.99))
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['ep_int', 'sigma', 'lp__']].copy()
    samples_df['ep_RA'] = ep_RA
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_epAI_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_epAI_summary.csv', index=False)





  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



 12%|█▎        | 1/8 [00:01<00:11,  1.65s/it][A[A[A[A



[A[A[A[A

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
model = mut.bayes.StanModel('../stan/epAI_fitting.stan')

In [113]:
IND_summary[(IND_summary['mutant']=='Q294K') &]

Unnamed: 0,parameter,mode,hpd_min,hpd_max,repressors,mutant,operator
5,Ka,1193.984776,968.333659,1540.930143,260.0,Q294K,O1
6,Ki,453.118028,379.821658,570.107969,260.0,Q294K,O1
7,ep_AI,-3.484461,-3.537695,-3.442303,260.0,Q294K,O1
8,sigma,0.014315,0.012474,0.017672,260.0,Q294K,O1
9,lp__,255.227905,250.268951,255.226423,260.0,Q294K,O1
10,Ka,4884.856568,298.780441,9676.842692,260.0,Q294K,O3
11,Ki,4607.427219,1.333366,9432.022179,260.0,Q294K,O3
12,ep_AI,-7.788532,-21.714745,-4.588039,260.0,Q294K,O3
13,sigma,0.074205,0.063166,0.08507,260.0,Q294K,O3
14,lp__,200.381162,195.168029,200.339319,260.0,Q294K,O3


## Hill Fitting To Double Mutants


In [None]:
DBL_data = data[data['class']=='DBL']
model = mut.bayes.StanModel('../stan/Hill_fitting.stan')
# Loop through each mutant and perform the inference. 
mutant_dfs = []
summary_dfs = []
for g, d in tqdm.tqdm(DBL_data.groupby(['mutant', 'operator'])):
    # Assemble the data dictionary.
    data_dict = {'J':1,
                 'N': len(d),
                 'idx': np.ones(len(d)).astype(int),
                 'Nns': constants['Nns'],
                 'c': d['IPTGuM'],
                 'foldchange': d['fold_change']}
    
    # Sample!
    samples, samples_df = model.sample(data_dict=data_dict, iter=5000)
                                      

    # Get the parameter names and rename 
    parnames = samples.unconstrained_param_names()
    new_names = {'{}[{}]'.format(m.split('.')[0], m.split('.')[1]):'{}'.format(m.split('.')[0]) for m in parnames} 
    samples_df.rename(columns=new_names, inplace=True)
    
    # Add identifiers
    samples_df['operator'] = g[1]
    samples_df['repressors'] = d['repressors'].unique()[0]
    samples_df['mutant'] = g[0]
    _df = samples_df[['a', 'b', 'K', 'n', 'lp__']].copy()
    summary_df = mut.stats.compute_statistics(_df, logprob_name='lp__')
    summary_df['repressors'] = d['repressors'].unique()[0]
    summary_df['mutant'] = g[0]
    summary_df['operator'] = g[1]
    
    
    # Add to storage vector
    mutant_dfs.append(samples_df) 
    summary_dfs.append(summary_df)
    
# Combine and save to disk    
mutant_df = pd.concat(mutant_dfs, sort=False)
summary_df = pd.concat(summary_dfs, sort=False)
mutant_df.to_csv('../../data/csv/DBL_Hill_samples.csv', index=False)
summary_df.to_csv('../../data/csv/DBL_Hill_summary.csv', index=False)

In [None]:
DBL_data['IPTGuM'].unique()