# Parameter Estimation of Double Mutants

(c) 2017 the authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).  

In [9]:
# For operating system interaction
import os
import glob
import datetime
import sys

# For loading .pkl files.
import pickle

# library for calculating stats and plotting
sys.path.insert(0,'../../../')
import mut.stats as stats
import mut.viz as viz
viz.plotting_style()

# For scientific computing
import numpy as np
import pandas as pd
import scipy.special

# Library to perform MCMC sampling
import emcee

# load in posterior distribution function,
# and scripts to handle the data
sys.path.insert(0, 'logpost/')
import logpost_dbl as mcmc_utils

# Useful plotting libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import corner

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline (only use with static plots (non-Bokeh))
%config InlineBackend.figure_format = 'svg'

# Date stamp (YYYYMMDD) of the day the notebook is run; strftime already
# returns a string.
today = datetime.datetime.now().strftime('%Y%m%d')

# Load in the data



In [2]:
# Load all of the 2018 flow data. glob returns matches in a
# filesystem-dependent order, so the list is sorted to keep the row order of
# the concatenated DataFrame reproducible across runs and machines.
flow_pattern = '../../processing/2018*flow*/output/*fold_change.csv'
flow_files = sorted(glob.glob(flow_pattern))
if not flow_files:
    # pd.concat would otherwise fail with an opaque "No objects to
    # concatenate"; report the actual problem instead.
    raise FileNotFoundError('No fold-change files matched ' + repr(flow_pattern))
dfs = [pd.read_csv(f, comment='#') for f in flow_files]
flow_data = pd.concat(dfs, axis=0)

# Keep only measurements whose fold-change lies within [-0.2, 1.3].
flow_data = flow_data[(flow_data['fold_change'] >= -0.2) &
                      (flow_data['fold_change'] <= 1.3)]

# Drop the mutants that are excluded from this analysis (matched by exact
# name). NOTE(review): these look like the single mutants, leaving the wild
# type and the double mutants -- confirm.
excluded_mutants = ['Q21A', 'Q21M', 'Y20I', 'F164T', 'Q294R', 'Q294V']
flow_data = flow_data[~flow_data['mutant'].isin(excluded_mutants)]

# Remove the autofluorescence ('auto') and 'delta' control strains and keep
# only the O2 operator. flow_data itself is left unchanged by this step.
df = flow_data[(~flow_data['mutant'].isin(['auto', 'delta'])) &
               (flow_data['operator'] == 'O2')]

# Restart the index; the previous (per-file) index is kept as an 'index' column.
df = df.reset_index()

df.head()

Unnamed: 0.1,index,IPTGuM,Unnamed: 0,date,fold_change,mean_FITC_H,mutant,operator,repressors,strain,username
0,2,0.0,2,20180409,-0.004128,5209.473763,wt,O2,260.0,R260,nbellive
1,7,0.1,7,20180409,0.012561,5187.629503,wt,O2,260.0,R260,nbellive
2,12,5.0,12,20180409,0.025843,5390.200563,wt,O2,260.0,R260,nbellive
3,17,10.0,17,20180409,0.039235,5642.45378,wt,O2,260.0,R260,nbellive
4,22,25.0,22,20180409,0.100567,6613.374107,wt,O2,260.0,R260,nbellive


# Set up the MCMC

In [4]:
# Turn the tidy DataFrame into the objects the log-posterior works with.
unique_var, param_idx, data = mcmc_utils.mcmc_pre_process_all(df)

# Sampler settings: number of walkers, burn-in steps and production steps.
n_walkers, n_burn, n_steps = 80, 500, 8000

# Starting positions of the walkers and the number of sampled parameters.
p0, n_dim = mcmc_utils.init_walkers_all(df, n_walkers, unique_var, param_idx)

# Extra positional arguments handed to mcmc_utils.log_post by the sampler.
# NOTE(review): the meaning of the trailing 4.5 is defined in logpost_dbl --
# presumably a fixed model constant; confirm.
log_post_args = (param_idx, unique_var, data, 4.5)

# Instantiate the ensemble sampler.
# NOTE(review): `threads` is the emcee 2.x keyword -- verify against the
# installed emcee version.
sampler = emcee.EnsembleSampler(
    n_walkers, n_dim, mcmc_utils.log_post,
    args=log_post_args,
    threads=6)

# Run the MCMC

In [7]:
# Sampling only runs when `sample` is set to True; otherwise the previously
# pickled chains on disk are left untouched and re-used by the cells below.
sample = False
if sample:
    # Burn-in: advance the walkers without storing the chain, keeping only
    # their final positions as the starting point of the production run.
    print('Performing the burn-in')
    pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)

    # Production run, starting from the burned-in positions.
    print('Performing the MCMC')
    _ = sampler.run_mcmc(pos, n_steps)

    # Persist the flattened chain and its log-probabilities. Context managers
    # guarantee the files are closed even if pickling raises.
    with open('../../../data/mcmc/NB_emcee_mutants_dbl_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatchain, output)
    with open('../../../data/mcmc/NB_emcee_mutants_dbl_lnprob_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatlnprobability, output)
    

# Reload the chains, save as .csv, and calculate properties

In [12]:
# Read back the flattened chain and the matching log-probabilities. These are
# the pickle files written by the sampling cell of this notebook; pickle must
# only ever be used on trusted files such as these.
with open('../../../data/mcmc/NB_emcee_mutants_dbl_strict.pkl', 'rb') as chain_file:
    gauss_flatchain = pickle.load(chain_file)

with open('../../../data/mcmc/NB_emcee_mutants_dbl_lnprob_strict.pkl', 'rb') as lnprob_file:
    gauss_flatlnprobability = pickle.load(lnprob_file)

# # # Draw the corner plot
# fig = corner.corner(gauss_flatchain[:,[9,19,30]], bins=50, plot_contours=False,
#                     rasterized=True)

Now, let's create a pandas DataFrame out of the MCMC chains.

In [13]:
# Column labels for the flat chain: one '<mutant>_ka' per mutant, then one
# '<mutant>_ki' per mutant, then 'sigma', then one '<mutant>_eps' per mutant.
# NOTE(review): this assumes the chain columns are laid out in exactly this
# order by the sampler set-up -- confirm against logpost_dbl.
index = np.concatenate([[mut + '_ka' for mut in unique_var[1]],
                        [mut + '_ki' for mut in unique_var[1]],
                        ['sigma'],
                        [mut + '_eps' for mut in unique_var[1]]])
df_mcmc = pd.DataFrame(gauss_flatchain, columns=index)

# Add derived columns K = exp(-k) for every sampled ka / ki column, named
# '<mutant>_Ka' / '<mutant>_Ki'. The parameter type is identified by the label
# suffix rather than by a substring search, so a mutant name that happens to
# contain 'ka' or 'ki' cannot be mistaken for one of these columns.
# Iterating over df_mcmc.columns is safe here: the columns present at the start
# of the loop are the ones visited, the newly added ones are not.
for col in df_mcmc.columns:
    if col.endswith('_ka'):
        df_mcmc[col[:-2] + 'Ka'] = np.exp(-df_mcmc[col])
    elif col.endswith('_ki'):
        df_mcmc[col[:-2] + 'Ki'] = np.exp(-df_mcmc[col])

# Attach the log-probability of every sample.
df_mcmc['lnprobability'] = gauss_flatlnprobability

# Redefine the index with the new entries
index = df_mcmc.columns
df_mcmc.head()

Unnamed: 0,Q21A-F164T_ka,Q21A-Q294K_ka,Q21A-Q294V_ka,Q21M-F164T_ka,Q21M-Q294K_ka,Q21M-Q294V_ka,Y20I-F164T_ka,Y20I-Q294K_ka,Y20I-Q294V_ka,wt_ka,...,Q21A-Q294K_Ki,Q21A-Q294V_Ki,Q21M-F164T_Ki,Q21M-Q294K_Ki,Q21M-Q294V_Ki,Y20I-F164T_Ki,Y20I-Q294K_Ki,Y20I-Q294V_Ki,wt_Ki,lnprobability
0,-4.850053,-8.831757,-3.11711,-5.520402,3.66343,-5.397003,-1.022655,-3.983686,-9.832767,-4.86827,...,0.049969,0.104044,1.984529,8.534525,5.272571,0.000791,0.708159,0.100764,0.735372,1012.207501
1,-4.850053,-8.831757,-3.11711,-5.520402,3.66343,-5.397003,-1.022655,-3.983686,-9.832767,-4.86827,...,0.049969,0.104044,1.984529,8.534525,5.272571,0.000791,0.708159,0.100764,0.735372,1012.207501
2,-4.850053,-8.831757,-3.11711,-5.520402,3.66343,-5.397003,-1.022655,-3.983686,-9.832767,-4.86827,...,0.049969,0.104044,1.984529,8.534525,5.272571,0.000791,0.708159,0.100764,0.735372,1012.207501
3,-4.850053,-8.831757,-3.11711,-5.520402,3.66343,-5.397003,-1.022655,-3.983686,-9.832767,-4.86827,...,0.049969,0.104044,1.984529,8.534525,5.272571,0.000791,0.708159,0.100764,0.735372,1012.207501
4,-4.850053,-8.831757,-3.11711,-5.520402,3.66343,-5.397003,-1.022655,-3.983686,-9.832767,-4.86827,...,0.049969,0.104044,1.984529,8.534525,5.272571,0.000791,0.708159,0.100764,0.735372,1012.207501


In [14]:
# Save the chain DataFrame (all of its columns) as a CSV file. The row index
# is written too, as the first, unnamed column (pandas default).
df_mcmc.to_csv('../../../data/mcmc/NB_emcee_mutants_DBL_strict.csv')

In [15]:
# Summarize the chain with the project's stats.compute_statistics helper,
# telling it that 'lnprobability' is the log-probability column. The displayed
# result has one row per parameter with 'mode', 'hpd_min' and 'hpd_max' columns.
param_fit = stats.compute_statistics(df_mcmc, logprob_name='lnprobability')
# NOTE(review): this file is written to the current working directory, unlike
# the other outputs of this notebook, which go to ../../../data/mcmc/ --
# confirm this is intended.
param_fit.to_csv('param_fit_DBL_strict.csv')
param_fit.head()

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,Q21A-F164T_ka,-5.681384,-14.965988,-3.607922
1,Q21A-Q294K_ka,-9.226069,-14.999859,4.067214
2,Q21A-Q294V_ka,-6.302701,-7.511868,-2.355237
3,Q21M-F164T_ka,-5.033623,-6.375734,-4.621659
4,Q21M-Q294K_ka,-6.937593,-8.102362,5.667185
