# Parameter Estimation of Inducer Mutants

(c) 2017 the authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).  

In [1]:
# For operating system interaction
import os
import glob
import datetime
import sys

# For loading .pkl files.
import pickle

# For scientific computing
import numpy as np
import pandas as pd
import scipy.special

# Library to perform MCMC sampling
import emcee

# Import custom utilities
import mwc_induction_utils as mwc

# Useful plotting libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import corner
mwc.set_plotting_style()

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline (only use with static plots (non-Bokeh))
%config InlineBackend.figure_format = 'svg'

# Date stamp (YYYYMMDD) of the day on which the script is run
today = '{:%Y%m%d}'.format(datetime.datetime.today())


For the inducer mutants we want to fit the Ka and Ki values:
1. Fit Ka and Ki for each inducer mutant: wt, Q294K, Q294V, and F164T.
2. Use priors on the wt ea, the wt ei, the repressor copy number R, and the wt O2 binding energy.


### Initializing walkers automatically.

To make the sampling more robust we define a function that initializes the MCMC walkers automatically. The original design finds the MAP of the MWC parameters using the [non-linear regression](https://github.com/RPGroup-PBoC/mwc_induction/blob/master/code/analysis/non_linear_regression.ipynb) function we previously defined and starts the walkers around that region; in the implementation below the walkers for the MWC parameters are instead drawn around the prior means. The walkers for the mean repressor copy number and the binding energy are likewise initialized around their expected values.

Begin by defining some data processing functions, log posterior, and MCMC walker initialization function (hidden):

In [314]:
def log_likelihood(param, param_idx, unique_var, data, epsilon=4.5):
    '''
    Sum-of-squares part of the Gaussian log-likelihood of the fold-change.

    Only the residual term -sum((theory - data)**2) / (2 sigma**2) is
    accumulated here; no sigma-dependent normalization is added.

    Parameters
    ----------
    param : array-like
        Position of one walker. The entries are laid out as
        param[0 : n_mut]                    : ea, one per entry of
                                              unique_var[1]
        param[n_mut : param_idx[0] - 1]     : ei, one per entry of
                                              unique_var[1]
        param[param_idx[0] - 1]             : sigma, width of the Gaussian
                                              likelihood
        param[param_idx[0] : param_idx[1]]  : repressor copy numbers, one
                                              per entry of unique_var[0]
        param[param_idx[1] : param_idx[2]]  : repressor binding energy
        where n_mut = len(unique_var[1]).
    param_idx : array-like.
        Cumulative positions that split `param` as described above.
    unique_var : list.
        unique_var[0] holds the unique repressor copy numbers and
        unique_var[1] the unique labels found in column 4 of `data`
        (the mutant names in this notebook).
    data : array-like.
        2D array with one measurement per row. Columns read here:
        data[:, 0] : measured fold-change
        data[:, 1] : inducer concentration handed to mwc.fold_change_log
        data[:, 2] : repressor copy number, matched against unique_var[0]
        data[:, 4] : label, matched against unique_var[1]
    epsilon : float.
        Energetic difference between the active and inactive state.

    Returns
    -------
    log likelihood probability (0 when there is nothing to loop over)
    '''
    n_mut = len(unique_var[1])
    sigma_pos = param_idx[0] - 1

    # Split the walker position into its named pieces
    ea = param[:n_mut]
    ei = param[n_mut:sigma_pos]
    sigma = param[sigma_pos]
    rep = param[param_idx[0]:param_idx[1]]
    eps_r = param[param_idx[1]:param_idx[2]]

    total = 0
    # Every (repressor, label) combination is evaluated with its own
    # ea / ei / repressor copy number
    for i, r in enumerate(unique_var[0]):
        for j, eps in enumerate(unique_var[1]):
            rows = (data[:, 2]==r) & (data[:, 4]==eps)
            block = data[rows, :]
            # theoretical fold-change for the inducer concentrations in
            # this block
            predicted = mwc.fold_change_log(block[:, 1],
                                            ea[j], ei[j], epsilon,
                                            rep[i], eps_r)
            residuals = predicted - block[:, 0]
            total = total - np.sum(residuals**2) / 2 / sigma**2

    return total

def log_prior(param, param_idx, unique_var, data, epsilon=4.5):
    '''
    Computes the log-prior probability.

    The hard bounds are checked first so that a rejected walker position
    returns immediately without looping over the data.

    Parameters
    ----------
    param : array-like
        Position of one walker. The entries are laid out as
        param[0 : n_mut]                    : ea, one per entry of
                                              unique_var[1]
        param[n_mut : param_idx[0] - 1]     : ei, one per entry of
                                              unique_var[1]
        param[param_idx[0] - 1]             : sigma, width of the Gaussian
                                              likelihood
        param[param_idx[0] : param_idx[1]]  : repressor copy numbers, one
                                              per entry of unique_var[0]
        param[param_idx[1] : param_idx[2]]  : repressor binding energy
        where n_mut = len(unique_var[1]).
    param_idx : array-like.
        Cumulative positions that split `param` as described above.
    unique_var : list.
        unique_var[0] holds the unique repressor copy numbers and
        unique_var[1] the unique labels found in column 4 of `data`
        (the mutant names in this notebook).
    data : array-like.
        2D array with one measurement per row. Columns read here:
        data[:, 2] : repressor copy number (mean of the Gaussian prior on
                     the fitted copy number)
        data[:, 3] : delta_repressors (standard deviation of that prior)
        data[:, 4] : label, matched against unique_var[1]
    epsilon : float.
        Not used by the prior; accepted so that the signature matches the
        likelihood and the posterior.

    Returns
    -------
    log prior probability. -np.inf if any repressor copy number is <= 0,
    if sigma <= 0, or if any ea or ei lies outside the open interval
    (-15, 15).
    '''
    # unpack parameters
    n_mut = len(unique_var[1])
    ea = param[0:n_mut]
    ei = param[n_mut:(param_idx[0]-1)]
    sigma = param[param_idx[0]-1] # MWC parameters
    rep = param[param_idx[0]:param_idx[1]] # Repressor copy numbers
    eps_r = param[param_idx[1]:param_idx[2]] # Repressor energies

    # check the bounds on the parameters before doing any other work
    if np.any(rep <= 0) or (sigma <= 0):
        return -np.inf

    if np.any(np.abs(ea) >= 15) or np.any(np.abs(ei) >= 15):
        return -np.inf

    # Labels that receive the informative Gaussian prior on ea and ei
    ea_ei_prior_labels = ('wt', 'Q21M', 'Q21A', 'Y20I')
    # Labels for which the Gaussian prior on the binding energy is added
    eps_r_prior_labels = ('wt', 'Q294K', 'Q294V', 'F164T')

    # Accumulate the log prior (named lnp so it does not shadow the function)
    lnp = 0
    # loop through the parameters to fit in order to compute the appropriate
    # log prior
    for i, r in enumerate(unique_var[0]):
        for j, eps in enumerate(unique_var[1]):
            # add in prior for repressors: one Gaussian term per data row
            # of this (repressor, label) block
            data_block = data[(data[:, 2]==r) & (data[:, 4]==eps), :]
            lnp -= np.sum((rep[i] - data_block[:, 2])**2 / \
                          2 / data_block[:, 3]**2)

            # add in prior for ea and ei
            if eps in ea_ei_prior_labels:
                lnp -= (ea[j] + 5.33)**2 / 2 / 0.06**2
                lnp -= (ei[j] - 0.31)**2 / 2 / 0.06**2

            # add in prior for wild-type eps
            # NOTE(review): this term (and the ea/ei term above) is added
            # once per matching (repressor, label) combination, so the
            # shared eps_r prior is counted several times when more than
            # one listed label is present -- confirm this weighting is
            # intended.
            if eps in eps_r_prior_labels:
                lnp -= np.sum((eps_r + 13.6)**2 / 2 / 0.1**2)

    return lnp

def log_post(param, param_idx, unique_var, data, epsilon=4.5):
    '''
    Computes the log posterior probability.

    Parameters
    ----------
    param : array-like
        Position of one walker. The entries are laid out as
        param[0 : n_mut]                    : ea, one per entry of
                                              unique_var[1]
        param[n_mut : param_idx[0] - 1]     : ei, one per entry of
                                              unique_var[1]
        param[param_idx[0] - 1]             : sigma, width of the Gaussian
                                              likelihood
        param[param_idx[0] : param_idx[1]]  : repressor copy numbers, one
                                              per entry of unique_var[0]
        param[param_idx[1] : param_idx[2]]  : repressor binding energy
        where n_mut = len(unique_var[1]).
    param_idx : array-like.
        Cumulative positions that split `param` as described above.
    unique_var : list.
        unique_var[0] holds the unique repressor copy numbers and
        unique_var[1] the unique labels found in column 4 of `data`.
    data : array-like.
        2D array with one measurement per row, forwarded unchanged to
        `log_prior` and `log_likelihood`. Its number of rows sets the power
        of sigma in the normalization term.
    epsilon : float.
        Energetic difference between the active and inactive state.

    Returns
    -------
    The log posterior probability, or -np.inf when the prior rejects the
    position.
    '''
    lnp = log_prior(param, param_idx, unique_var, data, epsilon)
    # Check before computing the likelihood if one of the boundaries set by
    # the prior was not satisfied. If that is the case don't waste time
    # computing the likelihood and return -inf
    if lnp == -np.inf:
        return lnp

    # Only sigma is needed here; the other parameters are unpacked inside
    # the prior and the likelihood themselves.
    sigma = param[param_idx[0]-1]

    # len(data) powers of 1/sigma normalize the Gaussian likelihood; the
    # extra power is presumably a 1/sigma (Jeffreys) prior on sigma --
    # TODO confirm.
    return -(len(data) + 1) * np.log(sigma)\
            + log_likelihood(param, param_idx, unique_var, data, epsilon)\
            + lnp

In [315]:
def init_walkers_ind(df, n_walkers, unique_var, param_idx, rep_frac_sd=0.2):
    '''
    Initialize walkers according to however many dimensions will be explored
    by the MCMC.

    Every walker is drawn at random: ea ~ N(-5.33, 1), ei ~ N(0.31, 1),
    sigma ~ U(1E-5, 0.2), each repressor copy number ~ N(r, r*rep_frac_sd)
    and the binding energy ~ N(-13.6, 0.1). A MAP estimate from a
    non-linear regression is not used.

    Parameters
    ----------
    df : pandas DataFrame
        Data frame containing the data that will be used for fitting the
        parameters. It is not read by this function and is kept only for
        interface compatibility.
    n_walkers : int
        Number of walkers for the MCMC.
    unique_var : list
        unique_var[0] holds the unique mean repressor copy numbers and
        unique_var[1] the unique labels (mutants). Their lengths set the
        number of dimensions the walkers explore.
    param_idx : array-like
        Cumulative positions inside the parameter vector:
        param_idx[0] - 1 is the column of sigma, the repressor copy numbers
        start at param_idx[0], and param_idx[2] - 1 is the column of the
        binding energy.
    rep_frac_sd : float, optional
        Standard deviation of the repressor copy number draws expressed as a
        fraction of the mean copy number. Default 0.2.

    Returns
    -------
    p0, n_dim : tuple
        p0 is the (n_walkers, n_dim) array of starting positions and n_dim
        the number of dimensions used for the MCMC execution.

    Raises
    ------
    ValueError
        If a repressor copy number in unique_var[0] is not positive, since
        no valid starting position exists for it.
    '''
    n_mut = len(unique_var[1])
    n_rep = len(unique_var[0])

    # ea and ei for every label, sigma, one copy number per repressor
    # strain and one binding energy
    n_dim = 1 + 2*n_mut + n_rep + 1

    # Initialize walkers
    p0 = np.empty((n_walkers, n_dim))
    for j in range(n_mut):
        p0[:, j] = np.random.normal(-5.33, 1, n_walkers) # ea
        p0[:, j + n_mut] = np.random.normal(0.31, 1, n_walkers) # ei

    p0[:, param_idx[0]-1] = np.random.uniform(1E-5, 0.2, n_walkers) # sigma

    # loop through the repressors
    for i, r in enumerate(unique_var[0]):
        if r <= 0:
            raise ValueError(
                'repressor copy numbers must be positive to initialize '
                'the walkers, got {}'.format(r))
        rep_num = np.random.normal(r, r*rep_frac_sd, n_walkers)
        # A copy number <= 0 has zero prior probability, so clamping to 0
        # would start the walker in a forbidden region. Redraw instead.
        invalid = rep_num <= 0
        while np.any(invalid):
            rep_num[invalid] = np.random.normal(r, r*rep_frac_sd,
                                                np.sum(invalid))
            invalid = rep_num <= 0
        p0[:, param_idx[0]+i] = rep_num

    # binding energy
    p0[:, param_idx[2]-1] = np.random.normal(-13.6, 0.1, n_walkers)

    return p0, n_dim

In [2]:
def mcmc_pre_process_ind(df):
    """
    Pre-process the tidy DataFrame to prepare it for the MCMC. This is done
    separately from the log-posterior calculation to speed up the process
    avoiding parsing the DataFrame every evaluation of the posterior.

    Parameters
    ----------
    df : pandas DataFrame.
        A tidy pandas DataFrame that contains at least the following
        columns:
        fold_change : the experimental fold-change.
        IPTGuM : concentration of the inducer in micromolar.
        repressors : the mean repressor copy number in copies per cell.
        delta_repressors : the experimental standard deviation on the mean
        repressor copy number.
        mutant : label of the mutant.

    Returns
    -------
    [rep_unique, eps_unique] : list
        A list whose first element is the sorted array of the unique mean
        repressor copy numbers found in the DataFrame.
        The second element is the sorted array of unique mutant labels also
        found in the DataFrame.
        This is used by the MCMC function to determine how many dimensions
        the walkers should walk in.
    param_idx : array-like.
        Cumulative sum of [2 * n_mutants + 1, n_repressors, 1]. The first
        2 * n_mutants positions of the parameter vector hold one ea and one
        ei per mutant, followed by sigma, the error associated with the
        Gaussian likelihood, at position param_idx[0] - 1. The repressor
        copy numbers occupy param_idx[0]:param_idx[1] and the binding
        energy occupies param_idx[1]:param_idx[2].
    data : array-like.
        Numpy array sorted by repressors, mutant and IPTGuM with the
        following columns:
        data[:, 0] : fold_change
        data[:, 1] : IPTGuM
        data[:, 2] : repressors
        data[:, 3] : delta_repressors
        data[:, 4] : mutant
    """
    # List the unique variables
    rep_unique = np.sort(df.repressors.unique())
    eps_unique = np.sort(df.mutant.unique())

    # Depending on the number of parameters determine the indexes of the
    # parameters to fit
    param_idx = np.cumsum([2*len(eps_unique) + 1, len(rep_unique), 1])

    # Sort the data frame such that the log-posterior function can
    # automatically compute the log probability with the right parameters
    # for each data point
    df_sort = df.sort_values(['repressors', 'mutant', 'IPTGuM'])
    data = np.array(df_sort[['fold_change', 'IPTGuM',
                             'repressors', 'delta_repressors', 'mutant']])
    return [rep_unique, eps_unique], param_idx, data

## Load in the data

In [318]:
# Load all of the 2018 flow data. The file list is sorted so that the row
# order of the combined frame does not depend on the file system.
flow_files = sorted(
    glob.glob('../../processing/2018*IND*flow*/output/*fold_change.csv'))
if not flow_files:
    raise FileNotFoundError('No fold-change files were found under '
                            '../../processing/2018*IND*flow*/output/')
dfs = [pd.read_csv(f, comment='#') for f in flow_files]
flow_data = pd.concat(dfs, axis=0)

# Keep only measurements with a fold-change inside [-0.2, 1.3]
flow_data = flow_data[flow_data['fold_change'].between(-0.2, 1.3)]
# Drop the mutants that are not part of this analysis
flow_data = flow_data[~flow_data.mutant.isin(['Q21A-Q294K', 'Q294R'])]

# Now we remove the autofluorescence and delta values and keep operator O2
df_ind = flow_data[~flow_data.mutant.isin(['auto', 'delta'])
                   & (flow_data.operator == 'O2')]

# Restart index (the previous index is kept as the column 'index')
df_ind = df_ind.reset_index()

# List the error sources as described by Garcia & Phillips PNAS 2011.
delta_R = {'HG104':2, 'R60':10, 'R124':15, 'R260':20, 'R1220':80,
               'RBS1L':170}

# Add the error column to the data frame. Fail loudly on a strain without a
# listed error instead of silently inserting missing values.
unknown_strains = set(df_ind['strain']) - set(delta_R)
if unknown_strains:
    raise KeyError('No repressor error listed for strain(s): {}'.format(
        sorted(unknown_strains, key=str)))
df_ind['delta_repressors'] = df_ind['strain'].map(delta_R)

df_ind.head()

Unnamed: 0.1,index,Unnamed: 0,date,username,mutant,operator,strain,IPTGuM,mean_FITC_H,repressors,fold_change,delta_repressors
0,2,2,20180409,nbellive,wt,O2,R260,0.0,5209.473763,260.0,-0.004128,20
1,3,3,20180409,nbellive,Q294V,O2,R260,0.0,5186.974267,260.0,-0.005374,20
2,4,4,20180409,nbellive,F164T,O2,R260,0.0,5325.49404,260.0,0.002297,20
3,7,7,20180409,nbellive,wt,O2,R260,0.1,5187.629503,260.0,0.012561,20
4,8,8,20180409,nbellive,Q294V,O2,R260,0.1,5060.321179,260.0,0.00562,20


In [319]:
# Arrange the data and the parameter bookkeeping the posterior expects
unique_var, param_idx, data = mcmc_pre_process_ind(df_ind)

# Sampler settings: number of walkers, burn-in steps and production steps
n_walkers = 50
n_burn = 500
n_steps = 8000

# Starting position of every walker and the dimensionality of the problem
p0, n_dim = init_walkers_ind(df_ind, n_walkers, unique_var, param_idx)

# Call the sampler. The last entry of `args` is passed to log_post as epsilon.
sampler = emcee.EnsembleSampler(
    n_walkers, n_dim, log_post,
    args=(param_idx, unique_var, data, 4.5),
    threads=6)

In [320]:
# Set to False to skip the (slow) sampling and reuse previously saved chains
sample = True
if sample:
    # Do the burn in; the burn-in samples are not stored
    print('Performing the burn-in')
    pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)
    # Perform the real MCMC starting from the last burn-in position
    print('Performing the MCMC')
    _ = sampler.run_mcmc(pos, n_steps)
    # Save the flattened chain and its log-probabilities. The context
    # managers close the files even if pickling fails.
    with open('../../../data/mcmc/NB_emcee_mutants_IND.pkl', 'wb') as output:
        pickle.dump(sampler.flatchain, output)
    with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob.pkl',
              'wb') as output:
        pickle.dump(sampler.flatlnprobability, output)
    


Performing the burn-in
Performing the MCMC


In [None]:
# NOTE: pickle can execute arbitrary code while loading; only open files
# produced by this analysis.

# Load the flat-chain
with open('../../../data/mcmc/NB_emcee_mutants_IND.pkl','rb') as file:
    gauss_flatchain = pickle.load(file)

# Load the log-probability of every sample in the flat-chain. It is written
# to its own file by the sampling step and is needed to locate the MAP.
with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob.pkl','rb') as file:
    gauss_flatlnprobability = pickle.load(file)

# Draw the corner plot
# fig = corner.corner(gauss_flatchain, bins=50, plot_contours=False,
#                     rasterized=True)

In [None]:
# Column labels in the order of the sampler dimensions: one '_ka' and one
# '_ki' column per mutant, sigma, one column per repressor copy number and
# the binding energy. The copy numbers are converted to strings explicitly
# instead of relying on numpy's implicit number-to-string promotion.
index = ([mut + '_ka' for mut in unique_var[1]]
         + [mut + '_ki' for mut in unique_var[1]]
         + ['sigma']
         + [str(rep) for rep in unique_var[0]]
         + ['eps'])
df_mcmc = pd.DataFrame(gauss_flatchain, columns=index)

# Add exp(-value) columns for every sampled '_ka' / '_ki' column
# (presumably the dissociation constants, given value = -ln K -- confirm)
for col in list(df_mcmc.columns):
    if col.endswith('_ka'):
        df_mcmc[col[:-2]+'Ka'] = np.exp(-df_mcmc[col])
    elif col.endswith('_ki'):
        df_mcmc[col[:-2]+'Ki'] = np.exp(-df_mcmc[col])

# redefine the index with the new entries
index = df_mcmc.columns

#################

# Position of the sample with the highest log-probability.
# gauss_flatlnprobability must hold the log-probabilities that belong to
# gauss_flatchain.
max_idx = np.argmax(gauss_flatlnprobability, axis=0)
# Obtain the MAP for each parameter as a one-column data frame named mode
param_fit_IND = df_mcmc.iloc[max_idx, :].to_frame(name='mode')

# Distance from the mode to each end of the 95% hpd, one row per parameter.
# The rows are collected first and the frame is built once.
hpd_rows = [np.abs(mwc.hpd(df_mcmc[column], 0.95) -
                   param_fit_IND.loc[column, 'mode'])
            for column in df_mcmc]
param_hpd_IND = pd.DataFrame(hpd_rows, index=df_mcmc.columns,
                             columns=['hpd_min', 'hpd_max'])

# Combine the data frames into a single data frame
param_fit_IND = pd.concat([param_fit_IND, param_hpd_IND], axis=1)
param_fit_IND.round(3)



In [None]:
# Write the parameter summary (mode, hpd_min, hpd_max) to a CSV file in the
# current working directory
param_fit_IND.to_csv('param_fit_IND.csv')