# Connecting epistasis to $\Delta\varepsilon_{AI}$ 

In [2]:
import sys
import numpy as np
import pandas as pd
import bokeh.io
import pystan 
sys.path.insert(0, '../../')
import mut.thermo
import mut.stats
import mut.bayes
bokeh.io.output_notebook()

There appears to be epistasis between domains in the double mutants. We have estimated the DNA binding energy from the DNA mutants as well as the Ka/Ki from the inducer binding mutants. We have also fit each doulbe mutant individually. The idea is that for some of these epistatic mutants, we may be able to describe the epistasis through $\Delta\varepsilon_{AI}$. 

The procedure is as follows:

1. Assume that the parameters determined from the single mutants are correct. 
2. Using the double mutant induction profile, fit the allosteric energy.
3. Using these three parameters, predict the behavior of a double mutant with another operator. 

If we can accurately predict the new operator double mutant, we have appropriately described the epistatic interactions. In this notebook, I fit the $\Delta\varepsilon_{AI}$ from the Y20I double mutants as well as $\Delta\varepsilon_{RA}$ for Y20I to O1.

##  Loading the data

In [95]:
data = pd.read_csv('../../data/csv/compiled_data.csv')

# Isolate the data that we care about. 
O1_data = data[(data['operator'] == 'O1') & (data['mutant']=='Y20I') & (data['method'] == 'flow cytometry')]
dbl = data[(data['mutant'] == 'Y20I-F164T') & (data['method']=='flow cytometry')]

# Load the flat chains and compute the modes for each parameter. 
DNA_chains = pd.read_csv('../../data/mcmc/NB_emcee_mutants_DNA_strict.csv')
IND_chains = pd.read_csv('../../data/mcmc/NB_emcee_mutants_IND_strict.csv')

DNA_idx = np.argmax(DNA_chains['lnprobability'].values)
IND_idx = np.argmax(IND_chains['lnprobability'].values)

epr_O2 = DNA_chains['Y20I_eps_r'].values[DNA_idx]
ka = IND_chains['F164T_Ka'].values[IND_idx] * 1E-6
ki = IND_chains['F164T_Ki'].values[IND_idx] * 1E-6

# Define the wild-type parameters.
wt_ka = 139E-6
wt_ki = 0.53E-6
wt_epr = -13.9

## Fitting $\Delta\varepsilon_{RA}$

The first step is to fit the best binding energy for the single mutant to the O1 strain. It's best to do this in log fold-change. Below is `Stan` code to do just that 

In [15]:
model_code = """
data {
    int<lower=1> N; // Number of data points
    vector[N] R; // Vector of repressor copy numbers
    real<lower=0> ep_ai; // In units of KbT
    real<lower=0> Nns; // In units of basepairs
    vector[N] fc; // Observed fold-change.
    }

parameters { 
    real<lower=-50, upper=0> ep_r; 
    real<lower=1E-9> sigma;
    }
    
transformed parameters {
    vector[N] log_fc; 
    log_fc = log(fc);
    }
model {
    vector[N] mu; 
    
    // Define the priors
    ep_r ~ normal(0, 10);
    sigma ~ normal(0, 1); 
    for (i in 1:N) {
        mu[i] = 1 / (1 + (1 / 1 + exp(-ep_ai)) * (R[i] / Nns) * exp(-ep_r));
    }
    log_fc ~ normal(log(mu), sigma);
    }
"""

# Compile the model code.
model = pystan.StanModel(model_code=model_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_063cb31e4b5c6c89355af51894c6c374 NOW.


In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpvx0uluk8/stanfit4anon_model_063cb31e4b5c6c89355af51894c6c374_1854334351095088654.cpp:599:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1816:
 ^
    __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                   ~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpvx0uluk8/stanfit4anon_model_063cb31e4b5c6c89355af51894c6c374_1854334351095088654.cpp:603:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan_fit.hpp:22:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan/src/stan/services/diagnose/diagnose.hpp:10:

In [17]:
data_dict = {'N':len(O1_data), 'R':O1_data['repressors'].values,
             'ep_ai':4.5, 'Nns':4.6E6, 'fc':O1_data['fold_change']}
samples = model.sampling(data=data_dict, iter=10000, chains=4)
samples

  elif np.issubdtype(np.asarray(v).dtype, float):


Inference for Stan model: anon_model_063cb31e4b5c6c89355af51894c6c374.
4 chains, each with iter=10000; warmup=5000; thin=1; 
post-warmup draws per chain=5000, total post-warmup draws=20000.

             mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
ep_r       -12.23  3.6e-4   0.05 -12.33 -12.27 -12.23  -12.2 -12.14  16984    1.0
sigma        0.28  2.4e-4   0.03   0.23   0.26   0.28    0.3   0.35  15344    1.0
log_fc[0]    -4.5 1.3e-151.8e-15   -4.5   -4.5   -4.5   -4.5   -4.5      2    nan
log_fc[1]   -3.72 6.3e-168.9e-16  -3.72  -3.72  -3.72  -3.72  -3.72      2    nan
log_fc[2]   -1.88 3.1e-164.4e-16  -1.88  -1.88  -1.88  -1.88  -1.88      2    nan
log_fc[3]   -2.56 3.1e-164.4e-16  -2.56  -2.56  -2.56  -2.56  -2.56      2    nan
log_fc[4]   -3.67 9.4e-161.3e-15  -3.67  -3.67  -3.67  -3.67  -3.67      2    nan
log_fc[5]   -1.77 4.7e-166.7e-16  -1.77  -1.77  -1.77  -1.77  -1.77      2    nan
log_fc[6]   -3.62 9.4e-161.3e-15  -3.62  -3.62  -3.62  -3.62  -3.62    

## Fit $\Delta\varepsilon_{AI}$ to the double mutant

In [92]:
model_code = """
    data { 
        int<lower=1> N; // Number of data points.
        vector[N] R; // Repressor copy numbers
        real ep_r; // Binding energy
        real ka; // in M
        real ki; // in M
        real n; // Number of allosteric binding sites.
        real Nns; // Number of nonspecific binding sites.
        vector[N] c; // Concentration of effector in M
        vector[N] fc; // Observed fold-change
        }
        
    parameters {
        real ep_ai;
        real<lower=0> sigma;
    }
    
    transformed parameters {
        vector[N] fc_log;
        fc_log = log(fc);
    }
    
    model {
        vector[N] mu;
        vector[N] p_act;
        
        // Set the priors
        sigma ~ normal(0, 10);
        
        for (i in 1:N) {
            p_act[i] = (1 +c[i] / ka)^n / ((1 + c[i]/ka)^n + exp(-ep_ai) * (1 + c[i]/ki)^n);
            mu[i] = 1 / (1 + p_act[i] * (R[i] / Nns) * exp(-ep_r));
        }
        
        fc_log ~ normal(log(mu), sigma);
    }
    
"""

epai_model = pystan.StanModel(model_code=model_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_fa32eee3853496d69f0110b8784faf73 NOW.


In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmp301uag3r/stanfit4anon_model_fa32eee3853496d69f0110b8784faf73_16790826060848306.cpp:599:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1816:
 ^
    __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                   ~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmp301uag3r/stanfit4anon_model_fa32eee3853496d69f0110b8784faf73_16790826060848306.cpp:603:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan_fit.hpp:22:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan/src/stan/services/diagnose/diagnose.hpp:10:
In 

In [96]:

data_dict = {'N':len(dbl), 'R':dbl['repressors'], 'ep_r':epr_O2, 'ka':ka, 'ki':ki,
            'n':2, 'Nns':4.6E6, 'c':dbl['IPTGuM']/1E6, 'fc':dbl['fold_change'].values}
dbl_samples = epai_model.sampling(data=data_dict, iter=10000, chains=4)
dbl_chains = mut.bayes.chains_to_dataframe(dbl_samples)
dbl_stats = mut.stats.compute_statistics(dbl_chains)

  elif np.issubdtype(np.asarray(v).dtype, float):


In [97]:
# Compute the best-fit curve.
c_range = np.logspace(-10, -2, 500)
ep_ai = dbl_stats[dbl_stats['parameter']=='ep_ai']['mode'].values[0]
arch = mut.thermo.SimpleRepression(ep_r=epr_O2, R=260, ka=ka, ki=ki, ep_ai=ep_ai,
                                  effector_conc=c_range)
fc = arch.fold_change()
arch = mut.thermo.SimpleRepression(ep_r=epr_O2, R=260, ka=ka, ki=ki, ep_ai=4.5,
                                  effector_conc=c_range)
fc2 = arch.fold_change()

# Set up the plot.
p = bokeh.plotting.figure(plot_width=600, plot_height=400, x_axis_type='log', 
                   x_axis_label='IPTG (M)', y_axis_label='fold-change')

# Plot the theory curve.
_ = p.line(c_range, fc, color='dodgerblue', legend='fit')
_ = p.line(c_range, fc2, color='tomato', legend='prediction')

# Plot the data.
grouped = pd.DataFrame(dbl.groupby('IPTGuM').apply(mut.stats.compute_mean_sem)).reset_index()
p.circle(grouped['IPTGuM']/1E6, grouped['mean'], color='dodgerblue', legend='data')
p.legend.location = 'top_left'
bokeh.io.show(p)

In [39]:
dbl_stats[dbl_stats['parameter']=='ep_ai'].values[0][1:]

array([-1.8407895017482487, -3.1397304842781497, -1.073006631892643],
      dtype=object)

In [41]:
ep

array([[-1.8407895017482487, -1.8407895017482487, -1.8407895017482487,
        ..., -1.8407895017482487, -1.8407895017482487,
        -1.8407895017482487],
       [-3.1397304842781497, -3.1397304842781497, -3.1397304842781497,
        ..., -3.1397304842781497, -3.1397304842781497,
        -3.1397304842781497],
       [-1.073006631892643, -1.073006631892643, -1.073006631892643, ...,
        -1.073006631892643, -1.073006631892643, -1.073006631892643]],
      dtype=object)