# Analysis of DNA binding mutants 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pystan
import glob
import bokeh.io
import bokeh.plotting
import bokeh.palettes
import sys
sys.path.insert(0, '../../')
import mut.bayes
import mut.viz
import mut.stats
import mut.thermo
bokeh.io.output_notebook()
colors = mut.viz.pub_style()
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
# Load all of the 2018 flow data.
flow_files = glob.glob('../processing/2018*flow*/output/*fold_change.csv')
dfs = [pd.read_csv(f, comment='#') for f in flow_files]
flow_data = pd.concat(dfs, axis=0)
flow_data = flow_data[(flow_data['fold_change'] >= -0.2) & (flow_data['fold_change'] <= 1.2)]

# Load the microscopy data
mic_files = glob.glob('../processing/2018*microscopy*/output/*fold_change.csv')
dfs = [pd.read_csv(f) for f in mic_files]
mic_data = pd.concat(dfs, axis=0)

In [3]:
# Isolate the leakiness data
leakiness = flow_data[(flow_data['IPTGuM'] ==0) & (flow_data['mutant'] != 'Q21M') &
                     (flow_data['mutant'] != 'wt') & (flow_data['mutant'] != 'auto') 
                     & (flow_data['mutant'] != 'delta') & (flow_data['mutant'] != np.nan) &
                     (flow_data['mutant'] != 'Q294K') & (flow_data['mutant'] != 'Q294V') &
                     (flow_data['mutant'] != 'F164T')].copy()
leakiness.drop('Unnamed: 0', axis=1, inplace=True)
leakiness.rename(columns={'mean_FITC_H':'mean_intensity'}, inplace=True)
leakiness.loc[:, 'method'] = 'flow'
mic_data.loc[:, 'method'] = 'microscopy'
merged = pd.concat([leakiness, mic_data], ignore_index=True)
merged = merged[(merged['mutant'] != 'none')]

In [4]:
# Plot the data points. 
p1 = bokeh.plotting.figure(x_axis_label='repressor copy number',
                          y_axis_label='fold-change',
                          x_axis_type='log', y_axis_type='log',
                          plot_width=600, plot_height=400)

grouped = merged.groupby(['mutant'])
colors = bokeh.palettes.Accent8
i = 0
for g, d in grouped:
    _grouped = d.groupby('repressors')
    for _g, _d in _grouped:
        _d = _d.mean()
        p1.circle(_d['repressors'], _d['fold_change'], legend=g,
                     color=colors[i])
    i += 1
bokeh.io.show(p1)


In [5]:
# Include IDs for heirarchical model.
merged.loc[:, 'id'] = 1
merged.loc[merged['mutant'] == 'Q21A', 'id'] = 2
merged.loc[merged['mutant'] == 'Q21M', 'id'] = 3

In [6]:
merged.head()

Unnamed: 0,IPTGuM,date,fold_change,mean_YFP,mean_intensity,method,mutant,operator,repressors,run_number,strain,username,id
0,0.0,20180322,0.761336,,12694.551352,flow,Q21A,O2,60.0,,R60,gchure,2
1,0.0,20180322,0.279661,,9191.802448,flow,Q21A,O2,260.0,,R260,gchure,2
2,0.0,20180322,0.63268,,11758.961989,flow,Q21A,O2,124.0,,R124,gchure,2
3,0.0,20180322,0.155053,,8285.651273,flow,Q21A,O2,1220.0,,R1220,gchure,2
4,0.0,20180123,0.197116,,5387.374666,flow,Q21A,O2,260.0,,R260,gchure,2


In [7]:
def assemble_StanModelCode(model_file, function_file):
    """Returns a string of the stan model code from a model and function file"""
    lines = []
    files = [function_file, model_file]
    for f in files:
        with open(f, 'r') as file:
            out = file.read().splitlines()
            for line in out:
                lines.append(line) 
    model_code = """\n"""
    for line in lines:
        model_code += line + '\n'
    return model_code

In [8]:
model_code = assemble_StanModelCode('../stan/heirarchical_epR_fit.stan', '../stan/functions.stan')
sm = pystan.StanModel(model_code=model_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_31d86c808853debbc6ac3a765ef665db NOW.


In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpx06fd_3p/stanfit4anon_model_31d86c808853debbc6ac3a765ef665db_4146333505491980958.cpp:599:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1816:
 ^
    __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                   ~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpx06fd_3p/stanfit4anon_model_31d86c808853debbc6ac3a765ef665db_4146333505491980958.cpp:603:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan_fit.hpp:22:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan/src/stan/services/diagnose/diagnose.hpp:10:

In [9]:
merged = merged[(merged['fold_change'] >= -0.2) & (merged['fold_change'] <= 1.3)]
# Assemble the data dictionary. 
ka = 139
ka_sigma = 30
ki = 0.53
ki_sigma = 0.04
data_dict = {'J':3, 'N':len(merged), 'trial':merged['id'].values,
            'R':merged['repressors'].values, 'n_ns':4.6E6, 'ka':ka, 'ki':ki,
            'ep_AI':4.5, 'n_sites':2, 'c':np.zeros(len(merged)), 'fc':merged['fold_change'].values}

# Sample the distribution.
samples = sm.sampling(data=data_dict, iter=5000, chains=4)

  elif np.issubdtype(np.asarray(v).dtype, float):


In [10]:
def chains_to_dataframe(fit):
    data = fit.extract()
    keys = list(data.keys())
    varnames = [k for k in keys if 'lp__' not in k]
    samples = {}
    for i, key in enumerate(varnames):
        # Get the shape.
        dim = np.shape(data[key])
        if len(dim) == 2:
            for j in range(dim[-1]):
                samples['{}.{}'.format(key, j+1)] = data[key][:, j]
    
        else:
            samples[key] = data[key]
            
    # compute the log_post. 
    new_keys = samples.keys()
    logp = []
    for j in range(dim[0]):
        logp.append(fit.log_prob([samples[k][j] for k in fit.unconstrained_param_names()]))
        
    samples['logp'] = logp
    return pd.DataFrame(samples)
    

In [13]:
df = chains_to_dataframe(samples)

df.rename(columns={'ep_R.1':'epR_Y20I', 'ep_R.2':'epR_Q21A', 'ep_R.3':'epR_Q21M'},
         inplace=True)

In [14]:
stats = mut.stats.compute_statistics(df)

  return getattr(obj, method)(*args, **kwds)


In [15]:
stats

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,epR_Y20I,-9.714586,-9.909059,-9.438792
1,epR_Q21A,-10.250761,-10.46538,-10.087569
2,epR_Q21M,-15.057135,-15.243548,-14.989021
3,sigma.1,0.081784,0.083843,0.159151
4,sigma.2,0.061891,0.06576,0.125032
5,sigma.3,0.001826,0.000972,0.003326


In [16]:
# Plot the fits. 
rep_range = np.logspace(0, 4, 500)
muts = merged['mutant'].unique()
ep_R = [stats[stats['parameter']=='epR_{}'.format(m)]['mode'].values[0] for m in muts]
ep_R_low = [stats[stats['parameter']=='epR_{}'.format(m)]['hpd_min'].values[0] for m in muts]
ep_R_high = [stats[stats['parameter']=='epR_{}'.format(m)]['hpd_max'].values[0] for m in muts]
meshed_R, meshed_ep = np.meshgrid(rep_range, ep_R)
meshed_R_low, meshed_ep_high = np.meshgrid(rep_range, ep_R_high)
meshed_R_high, meshed_ep_low = np.meshgrid(rep_range, ep_R_low)
arch = mut.thermo.SimpleRepression(R=meshed_R, ep_r=meshed_ep, ep_ai=4.5, ka=139, ki=0.53,
                                  n_sites=int(2), n_ns=4.6E6, effector_conc=0)
arch_high = mut.thermo.SimpleRepression(R=meshed_R_high, ep_r=meshed_ep_high, ep_ai=4.5, ka=139, ki=0.53,
                                  n_sites=int(2), n_ns=4.6E6, effector_conc=0)
arch_low = mut.thermo.SimpleRepression(R=meshed_R_low, ep_r=meshed_ep_low, ep_ai=4.5, ka=139, ki=0.53,
                                  n_sites=int(2), n_ns=4.6E6, effector_conc=0)
fc = arch.fold_change()
fc_high = arch_high.fold_change()
fc_low = arch_low.fold_change()
# Set up the figure. 
color_cycle = ['tomato', 'dodgerblue', 'slategrey']
color_dict = {i: j for i, j in zip(muts, color_cycle)}
p = bokeh.plotting.figure(plot_width=600, plot_height=400, x_axis_type='log',
                         y_axis_type='log', x_axis_label='repressor copy number',
                         y_axis_label='fold-change')
for i in range(3):
    p.line(rep_range, fc[i, :], color=color_cycle[i], line_width=2, 
           legend='{} = {:0.1f} +{:0.1f} -{:0.1f} k_BT'.format(muts[i], ep_R[i], 
                    np.abs(ep_R[i]) - np.abs(ep_R_high[i]), 
                    np.abs(ep_R_low[i]) - np.abs(ep_R[i])), alpha=0.5)
    band_x = np.append(rep_range, rep_range[::-1])
    band_y = np.append(fc_low[i, ], fc_high[i, ::-1])
    p.patch(band_x, band_y, color=color_cycle[i], alpha=0.3)
    
    
# Plot the data points. 
grouped = merged.groupby(['mutant', 'repressors'])
for g, d in grouped:
    p.circle(g[1], d['fold_change'].mean(), color=color_dict[g[0]])
p.legend.location = 'bottom_left'    
bokeh.io.show(p)

###  Plotting the titration curves.

In [17]:
c_range = np.logspace(-2, 4, 500)
reps = flow_data['repressors'].unique()
reps = np.sort(reps[reps > 0])

# 
c, ep, R = np.meshgrid(c_range, ep_R, reps) 
c_low, ep_low, R_low = np.meshgrid(c_range, ep_R_low, reps) 
c_high, ep_high, R_high = np.meshgrid(c_range, ep_R_high, reps) 
fc = mut.thermo.SimpleRepression(R=R, ep_r=ep, effector_conc=c, ep_ai=4.5,
                                ka=ka, ki=ki, n_sites=2, n_ns=4.6E6).fold_change()
fc_low = mut.thermo.SimpleRepression(R=R_low, ep_r=ep_low, effector_conc=c_low, ep_ai=4.5,
                                ka=ka, ki=ki, n_sites=2, n_ns=4.6E6).fold_change()   
fc_high = mut.thermo.SimpleRepression(R=R_high, ep_r=ep_high, effector_conc=c_high, ep_ai=4.5,
                                ka=ka, ki=ki, n_sites=2, n_ns=4.6E6).fold_change()

fcs = [fc, fc_low, fc_high]

rep_color = bokeh.palettes.Category10_5
rep_dict = {i:j for i, j in zip(reps, rep_color)}
canvas = []
for i, m in enumerate(muts):
    _p = bokeh.plotting.figure(plot_width=350, plot_height=250, x_axis_type='log',
                               x_axis_label='IPTG [µM]', y_axis_label='fold-change',
                              title=m)
    for j, R in enumerate(reps):
        if i == 0:
            legend = str(R)
        else:
            legend = False
        _p.line(c_range, fc[i, :, j], color=rep_dict[R], line_width=1, alpha=0.5,
               legend=legend)
        band_x = np.append(c_range, c_range[::-1])
        band_y = np.append(fc_low[i,:, j], fc_high[i, ::-1, j])
        _p.patch(band_x, band_y, color=rep_dict[R], alpha=0.3)
   
        
        # Plot the data.
        mut_flow = flow_data[(flow_data['mutant'] == m) & (flow_data['repressors'] == R)]
        grouped = mut_flow.groupby('IPTGuM')
        for g, d in grouped:
            mean_fc = d['fold_change'].mean()
            sem_fc = d['fold_change'].std() / np.sqrt(len(d))
            
            _p.circle(g, mean_fc, color=rep_dict[R], size=2)
            _p.line((g, g), (mean_fc - sem_fc, mean_fc + sem_fc), color=rep_dict[R])
            
        
    if i==0:
        _p.legend.location = 'bottom_right'
    canvas.append(_p)
    
layout = bokeh.layouts.gridplot([[canvas[0], canvas[1]], [canvas[2]]])
bokeh.io.show(layout)

In [108]:
imp.reload(mut.thermo)

<module 'mut.thermo' from '../../mut/thermo.py'>

##  Computing the properties.

In [144]:
# assemble the data ignoring Q21M
dna_muts = flow_data[(flow_data['mutant'] == 'Q21M') | (flow_data['mutant']=='Y20I') | 
                    (flow_data['mutant'] == 'Q21A')]
dna_muts = dna_muts[(dna_muts['mutant'] != 'Q21M') | (dna_muts['IPTGuM'] > 0)]
mic_data['IPTGuM'] = 0

merged_global = pd.concat([mic_data[mic_data['mutant']=='Q21M'], dna_muts])

# Restrict to only the useful information. 
merged_global = merged_global[['IPTGuM', 'mutant', 'repressors', 'fold_change']]

idx = {'Q21M':1, 'Q21A':2, 'Y20I':3}
for k in idx.keys():
    merged_global.loc[merged_global['mutant']==k, 'id'] = int(idx[k])
    
merged_global = merged_global[(merged_global['fold_change'] >= -0.2) & (merged_global['fold_change'] <= 1.3)]

In [151]:
# Compute  and plot the properties. 

p1 = bokeh.plotting.figure(plot_width=600, plot_height=300, x_axis_type='log',
                          y_axis_type='log', x_axis_label='repressors per cell',
                          y_axis_label='leakiness')
p2 = bokeh.plotting.figure(plot_width=600, plot_height=300, x_axis_type='log',
                           x_axis_label='repressors per cell',
                          y_axis_label='saturation')
p3 = bokeh.plotting.figure(plot_width=600, plot_height=300, x_axis_type='log',
                          x_axis_label='repressors per cell',
                          y_axis_label='dynamic range')

canvas = [p1, p2, p3]
rep_range = np.logspace(0, 4, 500)
color_palette = bokeh.palettes.viridis(5)
for i, m in enumerate(ep_R):
 
    # mesh the bounds and instantiate the architecture.
    low = ep_R_low[i]
    high = ep_R_high[i]
    vals = np.array([m, low, high])
    r, ep = np.meshgrid(rep_range, vals)
    
    arch = mut.thermo.SimpleRepression(R=r, ep_r=ep, ka=ka, ki=ki, ep_ai=4.5,
                                      effector_conc=0, n_sites=2)
    
    leakiness = arch.leakiness()
    saturation = arch.saturation()
    dyn_range = arch.dynamic_range()
    properties = [leakiness, saturation, dyn_range]
    for j in range(3):
        canvas[j].line(rep_range, properties[j][0, :], color=color_palette[i])
        bandx = np.append(rep_range, rep_range[::-1])
        bandy = np.append(properties[j][1, :], properties[j][2, ::-1])
        canvas[j].patch(bandx, bandy, color=color_palette[i], alpha=0.4)
    

# Plot the data.
for i, m in enumerate(muts):
    _d = merged_global[merged_global['mutant']==m]
    
    # Plot the leakiness.
    grouped = _d.groupby(['repressors', 'IPTGuM'])
    leak, sat = {}, {}
    for g, d in grouped:
        mean_fc = d['fold_change'].mean()
        sem_fc = d['fold_change'].std() / np.sqrt(len(d))
        if g[1] == 0.0:     
            p1.circle(g[0], mean_fc, color=color_palette[i], legend=m)
            p1.multi_line((g[0], g[0]), (mean_fc - sem_fc, mean_fc + sem_fc),
                        color=color_palette[i])
            leak[g[0]] = mean_fc 
        if g[1] == 5000.0:
            p2.circle(g[0], mean_fc, color=color_palette[i])
            p2.multi_line((g[0], g[0]), (mean_fc - sem_fc, mean_fc + sem_fc),
                        color=color_palette[i])           
            sat[g[0]] = mean_fc
    for j, r in enumerate(leak.keys()):
        p3.circle(r, sat[r] - leak[r], color=color_palette[i])
p1.legend.location = 'bottom_left' 
layout = bokeh.layouts.column(p1, p2, p3)
bokeh.io.show(layout)

In [136]:
dna_muts[dna_muts['mutant']=='Q21M']

Unnamed: 0.1,IPTGuM,Unnamed: 0,date,fold_change,mean_FITC_H,mutant,operator,repressors,strain,username


In [152]:
# Assemble the stan model. 
model_code = assemble_StanModelCode('../stan/heirarchical_global_fit.stan', '../stan/functions.stan')
sm = pystan.StanModel(model_code=model_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_66e72a35206c68612d300a91b3c62f4a NOW.


In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpcrjm2uzy/stanfit4anon_model_66e72a35206c68612d300a91b3c62f4a_6484565692495752769.cpp:599:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:18:
In file included from /anaconda3/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1816:
 ^
    __pyx_t_12 = ((__pyx_t_9 != __pyx_v_fitptr->param_names_oi().size()) != 0);
                   ~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /var/folders/2q/lvh2zsws3lxckq8xtkn_84z80000gn/T/tmpcrjm2uzy/stanfit4anon_model_66e72a35206c68612d300a91b3c62f4a_6484565692495752769.cpp:603:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan_fit.hpp:22:
In file included from /anaconda3/lib/python3.6/site-packages/pystan/stan/src/stan/services/diagnose/diagnose.hpp:10:

In [None]:
# Assemble the data dictionary.
data_dict = {'J': 3, 'N': len(merged_global), 'R':merged_global['repressors'],
            'c':merged_global['IPTGuM'], 'trial':merged_global['id'].astype(int), 'n_ns':4.6E6,
            'ep_ai':4.5, 'n_sites':2, 'fc':merged_global['fold_change']}
global_fit = sm.sampling(data=data_dict, iter=5000, chains=4)

  elif np.issubdtype(np.asarray(v).dtype, float):


In [None]:
global_df = chains_to_dataframe(global_fit)
global_fit_stats = mut.stats.compute_statistics(global_df)

In [72]:
global_fit_stats

Unnamed: 0,parameter,mode,hpd_min,hpd_max
0,ep_R.1,-15.114247,-15.256739,-14.998845
1,ep_R.2,-10.272071,-10.385623,-10.143083
2,ep_R.3,-9.913431,-10.050989,-9.689704
3,ka.1,-0.312568,-19.689744,19.050694
4,ka.2,4.752119,4.452565,5.398163
5,ka.3,3.58894,3.333014,4.131543
6,ki.1,-1.355897,-19.777112,19.554096
7,ki.2,-0.068061,-0.209412,0.145194
8,ki.3,-0.89208,-1.040883,-0.564086
9,sigma.1,0.001493,0.000953,0.003353
