# Parameter Estimation of Inducer Mutants

(c) 2017 the authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).  

In [14]:
# For operating system interaction
import os
import glob
import datetime
import sys

# For loading .pkl files.
import pickle

# library for calculating stats and plotting
sys.path.insert(0,'../../../')
import mut.stats as stats
import mut.viz as viz
viz.plotting_style()

# For scientific computing
import numpy as np
import pandas as pd
import scipy.special

# Library to perform MCMC sampling
import emcee

# load in posterior distribution function,
# and scripts to handle the data
sys.path.insert(0, 'logpost/')
import logpost_ind as mcmc_utils

# Useful plotting libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import corner

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline (only use with static plots (non-Bokeh))
%config InlineBackend.figure_format = 'svg'

# Date stamp (YYYYMMDD) of the day the notebook is executed
today = datetime.date.today().strftime('%Y%m%d')

For the inducer mutants we want to fit the $K_A$ and $K_I$ values. We assume that the DNA binding energy is identical to that of the wild-type strain. Note that the wild-type strain is also included in the fit, and that priors are placed on the parameters of this strain only.

# Load in the data

In [5]:
# Collect the fold-change summaries of every 2018 IND flow run.
# glob returns paths in a filesystem-dependent order, so sort them to make
# the row order of the assembled data frame reproducible across machines.
flow_files = sorted(
    glob.glob('../../processing/2018*IND*flow*/output/*fold_change.csv'))
if not flow_files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        'No fold-change CSV files matched the flow-data pattern; check the '
        'working directory and that the processing output exists.')
dfs = [pd.read_csv(f, comment='#') for f in flow_files]
flow_data = pd.concat(dfs, axis=0)

# Keep only measurements whose fold-change lies within [-0.2, 1.3].
flow_data = flow_data[(flow_data['fold_change'] >= -0.2) & (flow_data['fold_change'] <= 1.3)]

# Drop the Q21A-Q294K double mutant. (An earlier variant of this filter
# also excluded Q294R; that mutant is kept here.)
flow_data = flow_data[(flow_data.mutant != 'Q21A-Q294K')]

# Remove the autofluorescence ('auto') and 'delta' samples and restrict the
# data to the O2 operator.
df_ind = flow_data[(flow_data.mutant != 'auto') & (flow_data.mutant != 'delta') & (flow_data.operator == 'O2')]

# Restart the index; the previous index is retained as an 'index' column.
df_ind = df_ind.reset_index()

df_ind.head()

Unnamed: 0.1,index,Unnamed: 0,date,username,mutant,operator,strain,IPTGuM,mean_FITC_H,repressors,fold_change
0,2,2,20180409,nbellive,wt,O2,R260,0.0,5209.473763,260.0,-0.004128
1,3,3,20180409,nbellive,Q294V,O2,R260,0.0,5186.974267,260.0,-0.005374
2,4,4,20180409,nbellive,F164T,O2,R260,0.0,5325.49404,260.0,0.002297
3,7,7,20180409,nbellive,wt,O2,R260,0.1,5187.629503,260.0,0.012561
4,8,8,20180409,nbellive,Q294V,O2,R260,0.1,5060.321179,260.0,0.00562


# Set up the MCMC

In [6]:
# Turn the tidy data frame into the inputs handed to the log posterior
unique_var, param_idx, data = mcmc_utils.mcmc_pre_process_ind(df_ind)

# Sampler settings: number of walkers, burn-in steps and production steps
n_walkers = 50
n_burn = 500
n_steps = 8000

# Initial walker positions and the number of sampled dimensions
p0, n_dim = mcmc_utils.init_walkers_ind(df_ind, n_walkers, unique_var, param_idx)

# Build the ensemble sampler around the log posterior.
# NOTE(review): the trailing 4.5 in `args` is forwarded to log_post as-is;
# its meaning is not visible in this notebook -- confirm in logpost_ind.
sampler = emcee.EnsembleSampler(
    n_walkers,
    n_dim,
    mcmc_utils.log_post,
    args=(param_idx, unique_var, data, 4.5),
    threads=6,
)

In [10]:
# Inspect the unique values returned by mcmc_pre_process_ind
unique_var

[array([ 260.]), array(['F164T', 'Q294R', 'Q294V', 'wt'], dtype=object)]

# Run the MCMC

In [8]:
# Set to True to re-run the sampler and overwrite the pickled chains on disk.
sample = False
if sample:
    # Burn-in: advance the walkers without storing the chain
    print('Performing the burn-in')
    pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)
    # Production run, started from the final burn-in positions
    print('Performing the MCMC')
    _ = sampler.run_mcmc(pos, n_steps)
    # Context managers guarantee the files are closed even if pickling fails.
    with open('../../../data/mcmc/NB_emcee_mutants_IND_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatchain, output)
    with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatlnprobability, output)
    


# Re-load in the chains, save as .csv, and calculate properties

In [11]:
# Read the flattened chain and its log-probabilities back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# written by this notebook.
with open('../../../data/mcmc/NB_emcee_mutants_IND_strict.pkl', 'rb') as chain_file:
    gauss_flatchain = pickle.load(chain_file)
with open('../../../data/mcmc/NB_emcee_mutants_IND_lnprob_strict.pkl', 'rb') as lnprob_file:
    gauss_flatlnprobability = pickle.load(lnprob_file)

# Draw the corner plot
# fig = corner.corner(gauss_flatchain, bins=50, plot_contours=False,
#                     rasterized=True, labels=df_mcmc.columns[:9])

In [12]:
# Label the chain columns: one '<mutant>_ka' and one '<mutant>_ki' per
# mutant, followed by 'sigma'.
index = np.concatenate([[mut + '_ka' for mut in unique_var[1]],
                        [mut + '_ki' for mut in unique_var[1]],
                        ['sigma']])
df_mcmc = pd.DataFrame(gauss_flatchain, columns=index)
df_mcmc['lnprobability'] = gauss_flatlnprobability

# Add exp(-x) transformed copies of the sampled columns:
# '<mutant>_ka' -> '<mutant>_Ka' and '<mutant>_ki' -> '<mutant>_Ki'.
# Match the exact suffix instead of a substring so a mutant whose name
# happens to contain 'ka' or 'ki' is never transformed by mistake, and
# iterate over a snapshot because columns are added inside the loop.
transformed_suffix = {'_ka': 'Ka', '_ki': 'Ki'}
for col in list(df_mcmc.columns):
    for suffix, new_suffix in transformed_suffix.items():
        if col.endswith(suffix):
            df_mcmc[col[:-2] + new_suffix] = np.exp(-df_mcmc[col])

# Redefine the index with the new entries
index = df_mcmc.columns
df_mcmc.to_csv('../../../data/mcmc/NB_emcee_mutants_IND_strict.csv')
df_mcmc.head()

Unnamed: 0,F164T_ka,Q294R_ka,Q294V_ka,wt_ka,F164T_ki,Q294R_ki,Q294V_ki,wt_ki,sigma,lnprobability,F164T_Ka,Q294R_Ka,Q294V_Ka,wt_Ka,F164T_Ki,Q294R_Ki,Q294V_Ki,wt_Ki
0,-5.281382,3.657958,-6.242459,-4.947591,0.512318,7.198407,-1.142863,0.527601,0.095899,500.632684,196.641356,0.025785,514.121089,140.835315,0.599105,0.000748,3.135733,0.590019
1,-5.228552,3.873654,-6.021892,-4.951449,0.541855,7.391771,-1.01461,0.528976,0.094579,498.670528,186.522545,0.020782,412.357911,141.379681,0.581668,0.000616,2.758287,0.589208
2,-5.226594,3.958676,-5.997452,-4.956814,0.549975,7.477284,-0.999318,0.531263,0.094992,498.086426,186.157644,0.019088,402.401982,142.140252,0.576964,0.000566,2.716428,0.587862
3,-5.224804,3.976337,-5.972709,-4.954572,0.555403,7.496781,-0.984118,0.532619,0.094923,497.534398,185.824678,0.018754,392.567708,141.821954,0.573841,0.000555,2.67545,0.587065
4,-5.224804,3.976337,-5.972709,-4.954572,0.555403,7.496781,-0.984118,0.532619,0.094923,497.534398,185.824678,0.018754,392.567708,141.821954,0.573841,0.000555,2.67545,0.587065


We also compute summary statistics (the mode and the HPD interval bounds) for each parameter.

In [15]:
# Summarize every column of the chain; the resulting table lists the mode
# and the hpd_min / hpd_max bounds per parameter.
param_fit_IND = stats.compute_statistics(
    df_mcmc,
    logprob_name='lnprobability',
)

# Store the summary next to the notebook and preview the first rows
param_fit_IND.to_csv('param_fit_IND_strict.csv')
param_fit_IND.head()

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,F164T_ka,-5.084633,-5.605459,-4.657013
1,Q294R_ka,5.011149,3.280695,11.493241
2,Q294V_ka,-6.429937,-6.991734,-5.97955
3,wt_ka,-4.962322,-5.061275,-4.848975
4,F164T_ki,0.498831,0.319338,0.693452
