# Parameter Estimation of Inducer Mutants

(c) 2017 the authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).  

In [1]:
# For operating system interaction
import os
import glob
import datetime
import sys

# For loading .pkl files.
import pickle

# library for calculating stats and plotting
sys.path.insert(0,'../../../')
import mut.stats as stats
import mut.viz as viz
viz.plotting_style()

# For scientific computing
import numpy as np
import pandas as pd
import scipy.special

# Library to perform MCMC sampling
import emcee

# load in posterior distribution function,
# and scripts to handle the data
sys.path.insert(0, 'logpost/')
import logpost_ind as mcmc_utils

# Useful plotting libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import corner

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline (only use with static plots (non-Bokeh))
%config InlineBackend.figure_format = 'svg'

# Date stamp (YYYYMMDD) for the day the script is run; strftime already
# returns a str, so no extra conversion is needed.
today = datetime.datetime.today().strftime('%Y%m%d')

For the inducer mutants we want to fit the Ka and Ki values. We assume that the DNA binding energy of each mutant is identical to that of the wild-type strain. Note that the wild-type strain is also included in the fit, and that priors on the parameters are used for this strain only.

# Load in the data

In [2]:
# Collect every 2018 IND flow fold-change file and stack them into one frame.
# Lines starting with '#' in the CSV files are treated as comments.
flow_files = glob.glob('../../processing/2018*IND*flow*/output/*fold_change.csv')
dfs = [pd.read_csv(path, comment='#') for path in flow_files]
flow_data = pd.concat(dfs, axis=0)

# Keep only fold-change values inside the closed interval [-0.2, 1.3].
flow_data = flow_data[flow_data['fold_change'].between(-0.2, 1.3)]

# Drop the Q21A-Q294K mutant. A stricter variant that also dropped Q294R is
# kept here, disabled, for reference:
# flow_data = flow_data[(flow_data.mutant != 'Q21A-Q294K') & (flow_data.mutant != 'Q294R')]
flow_data = flow_data[flow_data['mutant'] != 'Q21A-Q294K']

# Remove the autofluorescence ('auto') and 'delta' rows, keep only the O2
# operator, and restart the index (the previous index is retained as an
# 'index' column because drop=True is not passed).
df_ind = flow_data[
    ~flow_data['mutant'].isin(['auto', 'delta'])
    & (flow_data['operator'] == 'O2')
].reset_index()

df_ind.head()

Unnamed: 0.1,index,IPTGuM,Unnamed: 0,date,fold_change,mean_FITC_H,mutant,operator,repressors,run_no,strain,username
0,2,0.0,2,20180409,-0.004128,5209.473763,wt,O2,260.0,,R260,nbellive
1,3,0.0,3,20180409,-0.005374,5186.974267,Q294V,O2,260.0,,R260,nbellive
2,4,0.0,4,20180409,0.002297,5325.49404,F164T,O2,260.0,,R260,nbellive
3,7,0.1,7,20180409,0.012561,5187.629503,wt,O2,260.0,,R260,nbellive
4,8,0.1,8,20180409,0.00562,5060.321179,Q294V,O2,260.0,,R260,nbellive


# Set up the MCMC

In [3]:
# Convert the tidy data frame into the pieces the log-posterior consumes.
unique_var, param_idx, data = mcmc_utils.mcmc_pre_process_ind(df_ind)

# Sampler settings: number of walkers, burn-in steps, and production steps.
n_walkers = 50
n_burn = 500
n_steps = 8000

# Initial walker positions and the dimensionality of the parameter space.
p0, n_dim = mcmc_utils.init_walkers_ind(df_ind, n_walkers, unique_var, param_idx)

# Build the ensemble sampler. The trailing 4.5 in `args` is forwarded to
# log_post as its last extra argument; its meaning is defined in logpost_ind
# and is not visible here -- TODO confirm.
sampler = emcee.EnsembleSampler(
    n_walkers,
    n_dim,
    mcmc_utils.log_post,
    args=(param_idx, unique_var, data, 4.5),
    threads=6,
)

In [4]:
# Inspect the unique values returned by the pre-processing step.
unique_var

[array([ 260.]),
 array(['F164T', 'Q294K', 'Q294R', 'Q294V', 'wt'], dtype=object)]

# Run the MCMC

In [5]:
# Toggle for the sampling run. Leave as False to skip sampling and reuse the
# pickled chains written by an earlier run.
sample = False
if sample:
    # Burn-in: advance the walkers from p0 without storing the chain.
    print('Performing the burn-in')
    pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)
    # Production run, started from the final burn-in positions.
    print('Performing the MCMC')
    _ = sampler.run_mcmc(pos, n_steps)
    # Persist the flat chain and the flat log-probabilities. The context
    # managers guarantee each file is closed even if pickling raises.
    with open('../../../data/mcmc/NB_emcee_mutants_IND_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatchain, output)
    with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatlnprobability, output)
    


# Reload the chains, save as .csv, and calculate properties

In [None]:
# Read back the pickled flat chain and its log-probabilities.
# NOTE: pickle can execute arbitrary code on load; only open files that were
# produced by this notebook.
with open('../../../data/mcmc/NB_emcee_mutants_IND_strict.pkl', 'rb') as chain_file:
    gauss_flatchain = pickle.load(chain_file)
with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob_strict.pkl', 'rb') as lnprob_file:
    gauss_flatlnprobability = pickle.load(lnprob_file)

# # Draw the corner plot
# fig = corner.corner(gauss_flatchain, bins=50, plot_contours=False,
#                     rasterized=True, labels=df_mcmc.columns[:11])

In [8]:
# Label the flat-chain columns: the ka entry of every mutant, then the ki
# entry of every mutant, then sigma.
# NOTE(review): this assumes the sampler's parameter order matches this
# layout -- verify against logpost_ind.
index = np.concatenate([[mut + '_ka' for mut in unique_var[1]],
                        [mut + '_ki' for mut in unique_var[1]],
                        ['sigma']])
df_mcmc = pd.DataFrame(gauss_flatchain, columns=index)
df_mcmc['lnprobability'] = gauss_flatlnprobability

# For each sampled '<mutant>_ka' / '<mutant>_ki' column add a companion
# '<mutant>_Ka' / '<mutant>_Ki' column holding exp(-value).
# Match on the suffix rather than on a substring so that a mutant label that
# happens to contain 'ka' or 'ki' cannot be picked up by mistake, and iterate
# over a snapshot of the columns because new ones are added inside the loop.
for col in list(df_mcmc.columns):
    if col.endswith('_ka'):
        df_mcmc[col[:-3] + '_Ka'] = np.exp(-df_mcmc[col])
    elif col.endswith('_ki'):
        df_mcmc[col[:-3] + '_Ki'] = np.exp(-df_mcmc[col])

# Redefine the index with the new entries, then save as a csv file.
index = df_mcmc.columns
df_mcmc.to_csv('../../../data/mcmc/NB_emcee_mutants_IND_strict.csv')
df_mcmc.head()

Unnamed: 0,F164T_ka,Q294K_ka,Q294R_ka,Q294V_ka,wt_ka,F164T_ki,Q294K_ki,Q294R_ki,Q294V_ki,wt_ki,...,F164T_Ka,Q294K_Ka,Q294R_Ka,Q294V_Ka,wt_Ka,F164T_Ki,Q294K_Ki,Q294R_Ki,Q294V_Ki,wt_Ki
0,-5.189203,-4.761934,3.30786,-6.544055,-4.814836,0.354271,0.187983,6.953925,-1.33539,0.55497,...,179.325501,116.971976,0.036594,695.099363,123.326559,0.701684,0.828629,0.000955,3.801477,0.57409
1,-5.189203,-4.761934,3.30786,-6.544055,-4.814836,0.354271,0.187983,6.953925,-1.33539,0.55497,...,179.325501,116.971976,0.036594,695.099363,123.326559,0.701684,0.828629,0.000955,3.801477,0.57409
2,-5.189203,-4.761934,3.30786,-6.544055,-4.814836,0.354271,0.187983,6.953925,-1.33539,0.55497,...,179.325501,116.971976,0.036594,695.099363,123.326559,0.701684,0.828629,0.000955,3.801477,0.57409
3,-5.189203,-4.761934,3.30786,-6.544055,-4.814836,0.354271,0.187983,6.953925,-1.33539,0.55497,...,179.325501,116.971976,0.036594,695.099363,123.326559,0.701684,0.828629,0.000955,3.801477,0.57409
4,-5.180909,-4.747569,3.967065,-6.561438,-4.829148,0.369224,0.195044,7.592595,-1.329575,0.549222,...,177.844471,115.303634,0.018929,707.288325,125.104288,0.69127,0.822798,0.000504,3.779436,0.577399


Also calculate summary properties (mode and HPD bounds) of each parameter

In [8]:
# Summarize every column of the chain, using the 'lnprobability' column as the
# log-probability, and write the summary table next to the notebook.
param_fit_IND = stats.compute_statistics(
    df_mcmc,
    logprob_name='lnprobability',
)
param_fit_IND.to_csv('param_fit_IND_strict.csv')
param_fit_IND.head()

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,F164T_ka,-5.048085,-5.80366,-4.567683
1,Q294K_ka,-4.145398,-5.027519,-2.81763
2,Q294R_ka,6.302842,2.654775,11.49721
3,Q294V_ka,-6.490333,-7.18432,-5.863952
4,wt_ka,-4.916185,-5.046639,-4.82535


In [None]:
# Display the full summary table (not just the head shown earlier).
param_fit_IND