# Bayesian Parameter Estimation on MBL Data
## Nathan Belliveau, Griffin Chure, Manuel Razo
**July 7, 2016**

In [54]:
import numpy as np
import pandas as pd
import emcee
import scipy.special
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import warnings
import seaborn as sns
import corner
rc = {'lines.linewidth': 1.5,
      'axes.labelsize' : 14,
      'axes.titlesize' : 18,
      'axes.facecolor' : 'EBEBEB',
      'axes.edgecolor' : '000000',
      'axes.linewidth' : 0.75,
      'axes.frameon' : True,
      'xtick.labelsize' : 11,
      'ytick.labelsize' : 11,
      'font.family' : 'Droid Sans',
      'grid.linestyle' : ':',
      'grid.color' : 'a6a6a6',
      'mathtext.fontset' : 'stixsans',
      'mathtext.sf' : 'sans'}
plt.rc('text.latex', preamble=r'\usepackage{sfmath}')
plt.rc('mathtext', fontset='stixsans', sf='sans') 
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
sns.set_palette("deep", color_codes=True)

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#------------------------------------------------------------------------------- 
%matplotlib notebook

Our data consists of a set of fold-change in gene expression values from a set of experiments conducted at Woods Hole, MA at MBL Physiology. The purpose of this notebook is to perform a two-parameter fit on the active and inactive dissociation constants of the Lac repressor to IPTG. Using Bayes' rule, we can say that

$$
\begin{align}
P(\{a\} \vert D, I) &\propto P(D \vert \{a\}, I) P(\{a\} \vert I) \\
& \propto P(D \vert K_a, K_i, I) P(K_a \vert I) P(K_i \vert I)
\end{align}
$$

where $\{a\}$ is the set of dissociation constants for the active and inactive repressors ($K_a$ and $K_i$ repsectively), $D$ are our data, and $I$ is the prior information we have with regards to the system. Note that in this case we are considering that the two dissociation constants are independent parameters. This is probably not correct as it would make logical sense that the ability to bind inducer in the active vs inactive state uses a similar binding pocket and are therefore related. For now, it is not obvious how these two parameters are related mathematically, so we will treat them as completely independent. We can be more explicity with the functional form of our posterior as follows: 

$$
P(K_a, K_i \vert D, I) \propto \prod\limits_{i=1}^N \exp\left\{\frac{fc_i - fc(R, IPTG, \Delta\epsilon_r, K_a, K_i, \epsilon_{ai}))^2}{\sigma_i}\right\}\frac{1}{K_a^{max} - K_a^{min}}\cdot \frac{1}{K_i^{max} - K_i^{min}}\cdot \frac{1}{\sigma_i}
$$

where we have taken a uniform prior on the two dissociation constants. Using some of our physical intution, we known that a lower bound for the dissociation constant is that of the strongest non-covalent bond (biotin-streptavidin) and the upper bound would be Van der Waals interactions. 

## Defining the log posterior

Below, we define the functions needed to perform our parameter estimation.

In [16]:
# define a funciton to compute the fold change as a funciton of IPTG
def pact(IPTG, Ka=88.4E-3, Ki=4.71E-3, epsilon=4):
    '''
    Returns the probability of a repressor being active as described by the MWC
    model.
    Parameter
    ---------
    IPTG : array-like.
        concentrations of inducer on which to evaluate the function
    Ka, Ki : float.
        Dissociation constant of the active and inactive states respectively
    epsilon : float.
        energy difference between the active and the inactive state
    Returns
    -------
    pact : float.
        probability of a repressor of being in the active state. Active state is
        defined as the state that can bind to the DNA.
    '''
    pact = (1 + IPTG / Ka)**2 / ((1 + IPTG / Ka)**2 + np.exp(-epsilon) * (1 + IPTG / Ki)**2)
    return pact



In [17]:
def fold_change(IPTG, Ka=88.4E-3, Ki=4.71E-3, epsilon=4, R=11, epsilon_r=-13.9):
    '''
    Returns the gene expression fold change according to the thermodynamic model
    with the extension that takes into account the effect of the inducer.
    Parameter
    ---------
    IPTG : array-like.
        concentrations of inducer on which to evaluate the function
    Ka, Ki : float.
        Dissociation constant of the active and inactive states respectively
    epsilon : float.
        energy difference between the active and the inactive state
    R : array-like.
        repressor copy number for each of the strains. The length of this array
        should be equal to the IPTG array. If only one value of the repressor is
        given it is asssume that all the data points should be evaluated with
        the same repressor copy number
    epsilon_r : array-like
        repressor binding energy. The length of this array
        should be equal to the IPTG array. If only one value of the binding
        energy is given it is asssume that all the data points 
        should be evaluated with the same repressor copy number
        
    Returns
    -------
    fold-change : float.
        gene expression fold change as dictated by the thermodynamic model.
   '''
    return 1 / (1 + 2 * R / 5E6 * pact(IPTG, Ka, Ki, epsilon) * \
            (1 + np.exp(-epsilon)) * np.exp(-epsilon_r))
 

In [64]:
def log_likelihood(param, IPTG, fc, R, ep_r, ep_ai):
    """
    Computes the log likelihood.
    """
   
    #Unpack the parameters.
    Ka, Ki, sigma = param
    #compute the fc.
    fc_theo = fold_change(IPTG, Ka=Ka, Ki=Ki, epsilon=ep_ai, R=R, epsilon_r=ep_r) 
   
    log_like =  -0.5*(np.sum((fc_theo - fc)**2 * sigma**-1))
    return log_like
   
def log_post(param, IPTG, fc, R, ep_r, ep_ai, Kmin, Kmax):
    Ka, Ki, sigma = param
    if (Ka > Kmax) | (Ka < Kmin):
        return -np.inf
    elif (Ki > Kmax) | (Ki < Kmin):
        return -np.inf
    elif sigma < 0:
        return -np.inf
    else:
        return -(len(IPTG) + 1)*np.log(sigma) + log_likelihood(param, IPTG, fc, R, ep_r, ep_ai) 

    

In [65]:
#Load the data. 
data = pd.read_csv('../../data/mbl_2016/MBL_fc_data.csv')
rbs = data[(data.strain=='1027') & (data.epsilon_r==-13.9)]

#Define the packed values for simplicity
Kmin = 1E-15
Kmax = 1
pack = np.array([rbs.IPTG, rbs.fold_change, 130, -13.9, 4.5, Kmin, Kmax])


#Define the parameters for emcee
n_dim = 3 
n_walkers = 500
n_burn = 300
n_steps = 500 
p0 = np.empty((n_walkers, n_dim))
p0[:,0] = np.random.uniform(Kmin, Kmax, n_walkers)
p0[:,1] = np.random.uniform(Kmin, Kmax, n_walkers)
p0[:,2] = np.random.uniform(1E-6, 1, n_walkers)




In [66]:
#Call the sampler. 
sampler = emcee.EnsembleSampler(n_walkers, n_dim, log_post, args=(rbs.IPTG, rbs.fold_change, 130, -13.9, 4.5, Kmin, Kmax), threads=8)

In [67]:
#Do the burn in 
pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)

In [68]:
_ = sampler.run_mcmc(pos, n_steps)

In [69]:
samples = sampler.chain[:,:,:].reshape((-1,n_dim))


In [73]:
samples

array([[ 0.58896316,  0.48544322,  0.12343538],
       [ 0.35456981,  0.37954561,  0.12310684],
       [ 0.35456981,  0.37954561,  0.12310684],
       ..., 
       [ 0.20773079,  0.00084382,  0.00165744],
       [ 0.21214262,  0.00084122,  0.00166324],
       [ 0.21436022,  0.00083033,  0.00178367]])

In [74]:
np.shape(samples)

(250000, 3)

In [76]:
samples_reshape = sampler.chain[:,:,:].reshape((-1, n_dim))

In [77]:
samples_reshape

array([[ 0.58896316,  0.48544322,  0.12343538],
       [ 0.35456981,  0.37954561,  0.12310684],
       [ 0.35456981,  0.37954561,  0.12310684],
       ..., 
       [ 0.20773079,  0.00084382,  0.00165744],
       [ 0.21214262,  0.00084122,  0.00166324],
       [ 0.21436022,  0.00083033,  0.00178367]])

In [83]:
p0

array([[ 0.65132569,  0.7929273 ,  0.98272455],
       [ 0.93983884,  0.48795775,  0.81597552],
       [ 0.53437583,  0.72984207,  0.73750498],
       ..., 
       [ 0.48738613,  0.78263787,  0.5625746 ],
       [ 0.07665559,  0.91544482,  0.78989955],
       [ 0.52663802,  0.57856563,  0.04789557]])

In [85]:
plt.plot(sampler.flatchain[100:10000,0], sampler.flatchain[100:10000,1], 'bo')


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1217c47b8>]