# Parameter Estimation - fitting inducer and DNA binding energies for each single mutant strain

(c) 2017 the authors. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).  

In [1]:
# For operating system interaction
import os
import glob
import datetime
import sys

# For loading .pkl files.
import pickle

# library for calculating stats and plotting
sys.path.insert(0,'../../../')
import mut.stats as stats
import mut.viz as viz
viz.plotting_style()

# For scientific computing
import numpy as np
import pandas as pd
import scipy.special

# Library to perform MCMC sampling
import emcee

# load in posterior distribution function,
# and scripts to handle the data
sys.path.insert(0, 'logpost/')
import logpost_singles_global as mcmc_utils

# Useful plotting libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import corner

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline (only use with static plots (non-Bokeh))
%config InlineBackend.figure_format = 'svg'

# Date stamp (YYYYMMDD) for the day this notebook is executed
today = datetime.date.today().strftime('%Y%m%d')

Fit all single-mutant data.
1. Fit new DNA binding energies and inducer Ka/Ki, where we assume that both the binding energy (DNA mutants) and the inducer Ka/Ki (inducer mutants) can change.
2. Use priors for wild-type strain only.


## Load in the data

In [2]:
# ---- Flow cytometry ----
# Collect every 2018 flow fold-change table; lines starting with '#' in
# those files are treated as comments and skipped.
flow_files = glob.glob('../../processing/2018*flow*/output/*fold_change.csv')
dfs = [pd.read_csv(path, comment='#') for path in flow_files]
flow_data = pd.concat(dfs, axis=0)

# Keep measurements whose fold-change lies in [-0.2, 1.3] (bounds included).
flow_data = flow_data[flow_data['fold_change'].between(-0.2, 1.3)]

# Discard the nine double mutants; only single mutants are fit here.
flow_data = flow_data[~flow_data['mutant'].isin([
    'Q21A-Q294V', 'Q21A-Q294K', 'Q21A-F164T',
    'Q21M-Q294V', 'Q21M-Q294K', 'Q21M-F164T',
    'Y20I-Q294V', 'Y20I-Q294K', 'Y20I-F164T',
])]

# Drop the Q21M rows measured at 0 uM IPTG; every other row is kept.
flow_data = flow_data[~((flow_data['mutant'] == 'Q21M')
                        & (flow_data['IPTGuM'] == 0.0))]

# ---- Microscopy ----
# All microscopy rows are assigned an IPTG concentration of 0.
mic_files = glob.glob('../../processing/2018*microscopy*/output/*fold_change.csv')
dfs = [pd.read_csv(path) for path in mic_files]
mic_data = pd.concat(dfs, axis=0).assign(IPTGuM=0.0)

# ---- Combined data set ----
df_all = pd.concat([flow_data, mic_data], ignore_index=True)

# Restrict to the O2 operator and remove the autofluorescence ('auto') and
# 'delta' entries. Q294R is left out as well for now, since it doesn't show
# a response to IPTG.
df_all = df_all[~df_all['mutant'].isin(['auto', 'delta', 'Q294R'])
                & (df_all['operator'] == 'O2')]

# Renumber the rows; the previous row labels are kept as an 'index' column.
df_all = df_all.reset_index()

df_all.head()

Unnamed: 0.1,index,IPTGuM,Unnamed: 0,date,fold_change,mean_FITC_H,mean_YFP,mutant,operator,repressors,run_number,strain,username
0,2,0.0,2.0,20180409,-0.004128,5209.473763,,wt,O2,260.0,,R260,nbellive
1,3,0.0,3.0,20180409,-0.005374,5186.974267,,Q294V,O2,260.0,,R260,nbellive
2,4,0.0,4.0,20180409,0.002297,5325.49404,,F164T,O2,260.0,,R260,nbellive
3,7,0.1,7.0,20180409,0.012561,5187.629503,,wt,O2,260.0,,R260,nbellive
4,8,0.1,8.0,20180409,0.00562,5060.321179,,Q294V,O2,260.0,,R260,nbellive


Define functions for data preprocessing and MCMC walker initialization (hidden below):

## Setup MCMC

In [3]:
# Sampler settings: number of walkers, burn-in steps, and production steps.
n_walkers, n_burn, n_steps = 60, 500, 8000

# Convert the tidy DataFrame into the structures consumed by the
# log-posterior.
unique_var, param_idx, data = mcmc_utils.mcmc_pre_process_all(df_all)

# Starting positions for every walker, plus the parameter-space dimension.
p0, n_dim = mcmc_utils.init_walkers_all(df_all, n_walkers, unique_var, param_idx)

# Build the ensemble sampler around the project log-posterior.
# NOTE(review): the trailing 4.5 in `args` is passed straight through to
# `mcmc_utils.log_post`; its meaning is not visible here -- confirm.
sampler = emcee.EnsembleSampler(
    n_walkers,
    n_dim,
    mcmc_utils.log_post,
    args=(param_idx, unique_var, data, 4.5),
    threads=6,
)

In [4]:
# Set `sample = True` to re-run the MCMC and overwrite the pickled chains;
# leave it False to skip sampling and rely on results already on disk.
sample = False
if sample:
    # Do the burn in. The burn-in steps are not stored, so the production
    # run below starts from `pos` with an empty chain.
    print('Performing the burn-in')
    pos, prob, state = sampler.run_mcmc(p0, n_burn, storechain=False)
    # Perform the real MCMC
    print('Performing the MCMC')
    _ = sampler.run_mcmc(pos, n_steps)

    # NOTE(review): these files are named '..._global_...' while the cell
    # that loads the chain reads '..._globalb_...'. A fresh run therefore
    # does not feed the load step -- confirm which name is intended.
    # Context managers guarantee the files are closed even if pickling fails.
    with open('../../../data/mcmc/NB_emcee_mutants_global_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatchain, output)
    with open('../../../data/mcmc/NB_emcee_mutants_global_lnprob_strict.pkl', 'wb') as output:
        pickle.dump(sampler.flatlnprobability, output)
    

In [5]:
# Read the pickled sampler output back from disk: first the flattened chain,
# then the matching flattened log-probability values.
with open('../../../data/mcmc/NB_emcee_mutants_globalb_strict.pkl', 'rb') as file:
    gauss_flatchain = pickle.load(file)

with open('../../../data/mcmc/NB_emcee_mutants_globalb_lnprob_strict.pkl', 'rb') as file:
    gauss_flatlnprobability = pickle.load(file)

# # Draw the corner plot
# fig = corner.corner(gauss_flatchain[:,[4,10,14,15,16,17,21]], bins=50, plot_contours=False,
#                     rasterized=True)

Now, let's create a Pandas DataFrame out of the MCMC chains.

In [6]:
# Label the flat-chain columns: one `_ka` and one `_ki` per mutant, then a
# single `sigma`, then one `_eps` per mutant.
# NOTE(review): this ordering is assumed to match the parameter layout used
# by `mcmc_utils` when building the chain -- confirm.
index = np.concatenate([[mut + '_ka' for mut in unique_var[1]],
                        [mut + '_ki' for mut in unique_var[1]],
                        ['sigma'],
                        [mut + '_eps' for mut in unique_var[1]]])
df_mcmc = pd.DataFrame(gauss_flatchain, columns=index)

# For every `<mutant>_ka` / `<mutant>_ki` column add `<mutant>_Ka` /
# `<mutant>_Ki` = exp(-value). Matching on the suffix (rather than a
# substring) keeps a mutant name that happens to contain 'ka' or 'ki' from
# triggering a bogus conversion. Iterate over a snapshot of the columns
# because new ones are added inside the loop.
for col in list(df_mcmc.columns):
    if col.endswith('_ka'):
        df_mcmc[col[:-2] + 'Ka'] = np.exp(-df_mcmc[col])
    elif col.endswith('_ki'):
        df_mcmc[col[:-2] + 'Ki'] = np.exp(-df_mcmc[col])

# Attach the log-probability of each sample.
df_mcmc['lnprobability'] = gauss_flatlnprobability

# Redefine the index with the new entries
index = df_mcmc.columns
df_mcmc.to_csv('../../../data/mcmc/NB_emcee_mutants_global_strict.csv')
df_mcmc.head()

Unnamed: 0,F164T_ka,Q21A_ka,Q21M_ka,Q294V_ka,Y20I_ka,wt_ka,F164T_ki,Q21A_ki,Q21M_ki,Q294V_ki,...,Q294V_Ka,Y20I_Ka,wt_Ka,F164T_Ki,Q21A_Ki,Q21M_Ki,Q294V_Ki,Y20I_Ki,wt_Ki,lnprobability
0,-12.230971,-10.221831,-4.460114,-10.930849,-12.821329,-5.268982,1.612366,-0.341327,0.793752,-1.71386,...,55873.714009,370026.302003,194.218086,0.199415,1.406814,0.452145,5.550342,0.247325,0.758254,2111.84728
1,-12.220681,-10.232002,-4.453098,-10.963599,-12.833103,-5.268368,1.608288,-0.339782,0.798742,-1.696923,...,57733.855012,374408.439628,194.098868,0.20023,1.404641,0.449895,5.457128,0.246978,0.756607,2112.77231
2,-12.238648,-10.255956,-4.45177,-11.059689,-12.878563,-5.266549,1.597368,-0.334136,0.801573,-1.678435,...,63556.764463,391822.182487,193.746222,0.202429,1.396734,0.448623,5.357168,0.242354,0.756088,2114.733373
3,-12.238648,-10.255956,-4.45177,-11.059689,-12.878563,-5.266549,1.597368,-0.334136,0.801573,-1.678435,...,63556.764463,391822.182487,193.746222,0.202429,1.396734,0.448623,5.357168,0.242354,0.756088,2114.733373
4,-12.238648,-10.255956,-4.45177,-11.059689,-12.878563,-5.266549,1.597368,-0.334136,0.801573,-1.678435,...,63556.764463,391822.182487,193.746222,0.202429,1.396734,0.448623,5.357168,0.242354,0.756088,2114.733373


Compute summary statistics (the mode and the HPD bounds) for each parameter.

In [7]:
# Summarize the chain with the project helper, telling it which column holds
# the log-probability. Presumably this yields one row per parameter with its
# mode and HPD bounds -- confirm against `mut.stats`.
param_fit = stats.compute_statistics(df_mcmc, logprob_name='lnprobability')
# NOTE: written to the current working directory, unlike the chain outputs,
# which are stored under ../../../data/mcmc/.
param_fit.to_csv('param_fit_global_strict.csv')
param_fit.head()

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,F164T_ka,-5.211714,-5.955974,-4.504796
1,Q21A_ka,-4.717772,-14.490424,-4.433006
2,Q21M_ka,-5.093962,-5.223736,-4.817517
3,Q294V_ka,-6.497979,-13.641256,-5.837362
4,Y20I_ka,-3.831764,-14.600878,-3.378044
