# Exploratory Data Analysis
---

In [80]:
import numpy as np
import pandas as pd
import imp
import mwc.viz
import mwc.stats
imp.reload(mwc.stats)
import bokeh.io
import bokeh.layouts
import bokeh.plotting
from bokeh.models import Dropdown, RadioButtonGroup, ColumnDataSource, Slider, Band
bokeh.io.output_notebook()
colors, colors_list = mwc.viz.bokeh_theme()

As of July 7, 2019, Zofii and I have collected nearly all of the data for both the carbon source variation as well as the temperature ranges. The point of this notebook is to perform some cursory exploratory data analysis with respect to the fold-change in gene expression as well as the growth rate depdent expression parameters. Ultimately, I will want to examine $\Delta F$ as a means to identify *which* parameters are changing. To begin, let's write a small Bokeh application that allows exploration of the the dilution data 

In [81]:
# Load the data
flucts = pd.read_csv('../../data/analyzed_fluctuations.csv')

def dilution_doc(doc):
    # Define the interactions
    button = RadioButtonGroup(labels=['carbon source variation', 'temperature variation'], active=0)
    drop1 = Dropdown(label='select carbon source', menu=[('glucose', 'glucose'), ('glycerol', 'glycerol'), ('acetate', 'acetate')], value='glucose')
    drop2 = Dropdown(label='select temperature', menu=[('42° C', '42'), ('37° C', '37'), ('32° C', '32')], value='37')
    bins = Slider(value=50, start=5, end=250, step=5, title='events per bin', bar_color=colors['purple'])

    # Define the figure canvas and layouts
    p = bokeh.plotting.figure(x_axis_type='log',y_axis_type='log', 
                             x_axis_label='I\u2081 + I\u2082',
                             y_axis_label='(I\u2081 - I\u2082)\u00b2',
                             width=600, height=400)
    row = bokeh.layouts.row(drop1, drop2)
    lay = bokeh.layouts.column(button, row, bins, p)

    # Define the source
    fluct_source = ColumnDataSource(dict(summed=[], fluct=[]))
    fit_source = ColumnDataSource(dict(I_range=[], fit=[]))
    bin_source = ColumnDataSource(dict(binned_summed=[], binned_fluct=[], fluct_low=[], fluct_high=[]))
    I_tot_range = np.logspace(2, 6, 200)
    def update():
        if button.active == 0:
            temp = 37
            carbon = drop1.value

        elif button.active == 1:
            temp = int(drop2.value)
            carbon = 'glucose'

        # Isolate the dataset 
        _df = flucts[(flucts['carbon']==carbon) & (flucts['temp']==temp)]
        alpha = _df['alpha_mu'].values[0]

        # Get the number of bins
        binning = mwc.stats.bin_by_events(_df, bins.value)
        p.title.text = f'{carbon}, {temp}°C, no antibiotic'
        fluct_source.data = dict(summed=_df['summed'], fluct=_df['fluct'])
        fit_source.data = dict(I_range=I_tot_range, fit=alpha * I_tot_range)
        bin_source.data = dict(binned_summed=binning['summed'], binned_fluct=binning['fluct'],
                              fluct_low=binning['fluct'] - binning['fluct_sem'],
                              fluct_high = binning['fluct'] + binning['fluct_sem'])


    p.circle(x='summed', y='fluct', color='black', size=1, alpha=0.5, legend='division', source=fluct_source)

    binned_uncert = Band(base='binned_summed', lower='fluct_low', upper='fluct_high', fill_alpha=0.85, fill_color=colors['light_purple'],
                        source=bin_source, level='underlay') 
    p.add_layout(binned_uncert)
    p.circle(x='binned_summed', y='binned_fluct', line_color=colors['purple'], fill_color='white', 
             legend='binned divisions', source=bin_source, line_width=2, size=8)
    p.line(x='I_range', y='fit', color=colors['orange'], line_width=2, legend='best fit', source=fit_source)
    p.legend.location = 'top_left'
    for v in [drop1, drop2, bins]:
        v.on_change('value', lambda attr, old, new: update())
    update()
    doc.add_root(lay)

bokeh.io.show(dilution_doc, notebook_handle=True)


The data looks okay, some of the fits are not spectacular. There also appears to be limited data for the 32°C data set so I may need to spend some more time collecting data for that particular condition. I'm also not sure to what degree I should trust the "binned" data. 

As a sanity check, let's make sure that the glucose dilution measurements make sense with our prescribed theory.  

In [89]:
# Load the fold-change data and isolate the glucose 37 data
fc = pd.read_csv('../../data/analyzed_foldchange.csv')
g37 = fc[(fc['carbon']=='glucose') & (fc['temp']==37)].copy()
g37['repressors'] = np.round(g37['repressors'])
grouped = g37.groupby(['repressors'])['fold_change'].agg(('mean', 'sem')).reset_index()

rep_range = np.logspace(0, 3, 200)
# Compute the theoretical curve
theo = mwc.model.SimpleRepression(R=rep_range, ep_r=-13.9, ka=139, ki=0.53, effector_conc=0, ep_ai=4.5).fold_change()

In [90]:
p = bokeh.plotting.figure(x_axis_type='log', y_axis_type='log')
p.circle(grouped['repressors'], grouped['mean'], size=1, alpha=0.5, color='black')
p.segment(x0=grouped['repressors'], x1=grouped['repressors'], y0=grouped['mean']-grouped['sem'], y1=grouped['mean'] + grouped['sem'])
p.line(rep_range, theo)

bokeh.io.show(p)

In [84]:
grouped.reset_index()

Unnamed: 0,index,repressors,mean,sem
