In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st

import re #regex

import bebi103

import bokeh.io
import bokeh.plotting
import bokeh_catplot

bokeh.io.output_notebook()

%load_ext blackcellmagic

### Munging
Load the data from Brewster, pre-tidied by Manuel, and drop the spurious column that was the index in the CSV.

In [None]:
# Read the pre-tidied counts and discard the stray "Unnamed: 0" column in
# one chained expression, so the cell yields the cleaned frame directly.
df_fish = pd.read_csv("../../data/jones_brewster_2014.csv").drop(
    columns="Unnamed: 0"
)

In [None]:
# Preview the first few rows of the loaded table.
df_fish.head()

Let's take a quick look at everything we've got.

In [None]:
# Horizontal box plot of `mRNA_cell`, grouped by the `experiment` label.
plot_kwargs = dict(
    x_axis_label="counts",
    y_axis_label="expt",
    width=500,
    height=1000,
    horizontal=True,
)
p = bokeh_catplot.box(
    data=df_fish,
    cats="experiment",
    val="mRNA_cell",
    **plot_kwargs,
)
bokeh.io.show(p)

Wait, what are all the experiment labels in the dataset?

In [None]:
# Distinct experiment labels, returned already sorted (no in-place sort step).
raw_expt_labels = np.sort(df_fish['experiment'].unique())
raw_expt_labels

Huh, is this the complete dataset, with constitutive promoters _and_ LacI regulated measurements? Then what is in the regulated file?

In [None]:
# Load the separate "regulated" file and list the experiment labels it holds.
df_reg = pd.read_csv("../../data/jones_brewster_regulated_2014.csv")
reg_labels = pd.unique(df_reg['experiment'])
reg_labels

Uuuuuh, what? Are those duplicates of what's in the main dataset?

In [None]:
# For each label, print the row count in the regulated file and then in the
# main file, so the two numbers can be compared side by side.
for expt_label in ('O3_10ngmL', 'Oid_0p5ngmL'):
    for frame in (df_reg, df_fish):
        print(len(frame[frame['experiment'] == expt_label]))

I think the contents of the regulated file either duplicate or are a subset of the contents of the other file... Let's write a quick test function.

In [None]:
def check_counts_subset(subset_series, total_series):
    """Check that ``subset_series`` is a multiset subset of ``total_series``.

    Every distinct value in ``subset_series`` must also appear in
    ``total_series``, and must not appear more often in the subset than
    it does in the total.

    Parameters
    ----------
    subset_series : array_like
        Values claimed to be contained in ``total_series``.
    total_series : array_like
        Values the subset is checked against.

    Returns
    -------
    None
        Returns silently when the check passes.

    Raises
    ------
    AssertionError
        If a value of ``subset_series`` is missing from ``total_series``,
        or occurs more often in ``subset_series`` than in ``total_series``.
        Raised explicitly (not via ``assert``) so the check still runs when
        Python is started with ``-O``.
    """
    subset_vals, subset_counts = np.unique(subset_series, return_counts=True)
    total_vals, total_counts = np.unique(total_series, return_counts=True)

    for val, n_in_subset in zip(subset_vals, subset_counts):
        # np.unique returns sorted values, so one binary search both tests
        # membership and locates the matching entry of total_counts.
        idx = np.searchsorted(total_vals, val)
        if idx >= len(total_vals) or total_vals[idx] != val:
            raise AssertionError("%r not found in total_series" % (val,))
        if n_in_subset > total_counts[idx]:
            raise AssertionError(
                "More occurrences of %r in subset_series than in total_series!"
                % (val,)
            )

In [None]:
# Valid multiset subsets: these calls must return without raising.
check_counts_subset([0, 1], [0, 1, 1])
check_counts_subset([0, 0, 1, 2, 3, 3, 3], [0, 0, 1, 2, 3, 4, 4, 3, 3])

# Non-subsets: each of these must be rejected with an AssertionError.
# They are executed (rather than left commented out) so the failure path of
# the checker is exercised on every run of the notebook.
_rejected_cases = [
    ([0, 1, 2], [0, 1, 1]),  # 2 never appears in the total
    ([0, 1, 2, 3], [0, 1, 2, 4]),  # 3 never appears in the total
    ([0, 0, 1, 2, 3, 3], [0, 0, 1, 2, 4, 3]),  # 3 appears twice vs. once
]
for _subset, _total in _rejected_cases:
    try:
        check_counts_subset(_subset, _total)
    except AssertionError:
        continue
    raise RuntimeError(
        "check_counts_subset wrongly accepted %r as a subset of %r"
        % (_subset, _total)
    )

Seems to work. Now use it for reals.

In [None]:
# Run the subset check on the mRNA counts of each experiment that appears
# in both files; an AssertionError here means the regulated file holds
# counts the main file does not.
for shared_label in ("Oid_0p5ngmL", "O3_10ngmL"):
    reg_counts = df_reg.loc[df_reg["experiment"] == shared_label, 'mRNA_cell']
    fish_counts = df_fish.loc[df_fish["experiment"] == shared_label, 'mRNA_cell']
    check_counts_subset(reg_counts, fish_counts)

No assertions were raised, so the mRNA counts in the regulated file are in fact a subset of those in the full dataframe (at least for the two experiments checked).
So, I dunno what happened with the regulated file, but I think we can ignore it and work only with the main file.

#### Energies
Next, let's get the energies from the supplement of the Brewster/Jones 2012 paper.

In [None]:
# Load the energies table from the 2012 data file and preview its first rows.
df_energies = pd.read_csv("../../data/brewster_jones_2012.csv")
df_energies.head()

Are all the promoters in the 2012 dataset in the 2014 fish dataset? These are the only constitutive promoters I'm interested in.

In [None]:
# Compute the distinct experiment labels once (instead of once per promoter
# name) and test each 2012 promoter name against that set.
fish_experiments = set(df_fish.experiment.unique())
all(item in fish_experiments for item in df_energies.Name)

#### Splitting into regulated & constitutive data
Some of these datasets are not of interest right now, so let's split the experiment labels into separate groups for easier downstream handling. The regulated datasets all start with `O` (O1, O2, O3, or Oid); everything else doesn't. From that everything else, grab the ones that we have energies for, and set aside the rest. Use regex to parse.

In [None]:
# Build the loop-invariant helpers once: the compiled "starts with 'O'"
# pattern and the set of promoter names we have energies for.
starts_with_O = re.compile('^O')
energy_names = set(df_energies.Name)

# put all strings that start w/ 'O' in one list
regulated_labels = [label for label in raw_expt_labels if starts_with_O.match(label)]
# and put all the others in another list
other_labels = [label for label in raw_expt_labels if not starts_with_O.match(label)]
# from that, split out those we have energies for...
constitutive_labels = [label for label in other_labels if label in energy_names]
# ...and those we don't
leftover_labels = [label for label in other_labels if label not in energy_names]
leftover_labels

Without more metadata, I don't really know what to do with the leftover labels data, e.g., what good does the aTc concentration do me if I don't know what promoter it was for?

### Parameter estimation

#### Chi-by-eye to sanity check

UV5, 5DL10, and 5DL20 look like good candidates for a closer look; all have decent non-zero expression, and they look different from each other.

In [None]:
# Keep only the three promoters chosen for a closer look. A membership mask
# replaces the query string that was stitched together with backslash
# continuations inside a string literal (which breaks on trailing whitespace).
df_slice = df_fish[df_fish['experiment'].isin(['UV5', '5DL10', '5DL20'])]

df_slice['experiment'].unique()

Now that we've got a more manageable set, let's make ECDFs and chi-by-eye with a negative binomial. The `scipy.stats` convention is `cdf(k, n, p, loc=0)`, where $k$ is the number of failures seen before the $n$-th success, $n$ is the number of successes we're waiting for, and $p$ is the probability of success on each trial.

In [None]:
# Staircase ECDFs of the three selected experiments.
p = bokeh_catplot.ecdf(
    data=df_slice, cats='experiment', val='mRNA_cell', style='staircase'
)

# compute upper bound for theoretical CDF plots
u_bound = max(df_slice['mRNA_cell'])
x = np.arange(u_bound + 1)

# Hand-picked negative binomial guesses as (n, p, extra line kwargs); the
# first curve is drawn with the default line color.
nbinom_guesses = (
    (5, 0.2, {}),
    (3, 0.4, {"color": 'orange'}),
    (.3, 0.26, {"color": 'green'}),
)
for n_successes, p_success, line_kwargs in nbinom_guesses:
    p.line(x, st.nbinom.cdf(x, n_successes, p_success), **line_kwargs)

bokeh.io.show(p)

Ok, we're in the neighborhood, so next let's treat the model more carefully with Stan.