In [None]:
import copy
import re #regex

import numpy as np
import pandas as pd

import cmdstanpy
import arviz as az

import bebi103
import bokeh_catplot

import bokeh.io
import bokeh.plotting
# import bokeh.models.mappers
# import bokeh.palettes

import holoviews as hv
import holoviews.operation.datashader
hv.extension('bokeh')
bebi103.hv.set_defaults()

import panel as pn
pn.extension()

bokeh.io.output_notebook()

%load_ext blackcellmagic
%load_ext autoreload
%autoreload 2

Load data and parse into convenient dataframes.

In [None]:
# Read the raw counts and discard the 'Unnamed: 0' column
# (presumably an index that was written into the CSV -- TODO confirm).
df_fish = pd.read_csv("../../data/jones_brewster_2014.csv").drop(
    columns="Unnamed: 0"
)

raw_expt_labels = df_fish["experiment"].unique()

# The regulated experiments are exactly those whose label begins with 'O'.
regulated_labels = [label for label in raw_expt_labels if re.match('^O', label)]

# Regulated data: sorted by experiment label, with a fresh 0..N-1 index.
df_reg = (
    df_fish[df_fish["experiment"].isin(regulated_labels)]
    .sort_values(by=["experiment"])
    .reset_index(drop=True)
)

# Constitutive (UV5) data, kept separate from the regulated experiments.
df_UV5 = df_fish[df_fish["experiment"] == "UV5"]

In [None]:
# Peek at the first rows of the regulated-experiment dataframe.
df_reg.head()

For the regulated data, the `experiment` label combines operator sequence and aTc concentration into a single string, e.g., `Oid_2ngmL`. This is useless for Stan, so let's create new columns containing index codes for operator & aTc concentrations separately, and then attach those new columns to the regulated dataframe.

In [None]:
# Integer code for each operator name, i.e. the part of the experiment
# label that precedes the first underscore.
op_lookup = {
    'O1': 1,
    'O2': 2,
    'O3': 3,
    'Oid': 4,
}


def get_op(label):
    """Return the operator index code for an experiment label.

    Parameters
    ----------
    label : str
        Experiment label of the form ``<operator>_<aTc>``, e.g.
        ``'Oid_2ngmL'``.

    Returns
    -------
    int
        The code stored in ``op_lookup`` for the operator part of the label.

    Raises
    ------
    KeyError
        If the text before the first underscore is not a key of
        ``op_lookup``.
    """
    pieces = re.split('_', label)
    operator_name = pieces[0]
    return op_lookup[operator_name]

# Integer code for each aTc concentration tag, i.e. the part of the
# experiment label that follows the first underscore.
atc_lookup = {
    '0p5ngmL': 1,
    '1ngmL': 2,
    '2ngmL': 3,
    '10ngmL': 4,
}


def get_conc(label):
    """Return the aTc-concentration index code for an experiment label.

    Parameters
    ----------
    label : str
        Experiment label of the form ``<operator>_<aTc>``, e.g.
        ``'Oid_2ngmL'``.

    Returns
    -------
    int
        The code stored in ``atc_lookup`` for the concentration part of the
        label.

    Raises
    ------
    IndexError
        If the label contains no underscore.
    KeyError
        If the second underscore-separated field is not a key of
        ``atc_lookup``.
    """
    pieces = re.split('_', label)
    conc_tag = pieces[1]
    return atc_lookup[conc_tag]

In [None]:
# Translate each experiment label into separate integer codes for the
# operator and for the aTc concentration.
op_series = df_reg['experiment'].apply(get_op).rename('op_idx')
aTc_series = df_reg['experiment'].apply(get_conc).rename('aTc_idx')

# Attach the codes as columns. `assign` overwrites `op_idx` / `aTc_idx` if
# they already exist, so this cell can be re-run safely; joining the two
# series instead fails on a second run because the columns are already there.
df_reg = df_reg.assign(op_idx=op_series, aTc_idx=aTc_series)

In [None]:
# Row count for operator code 4 ('Oid') at aTc code 1 ('0p5ngmL'), selected
# through the new index columns.
len(df_reg.query("op_idx == 4 and aTc_idx == 1"))

In [None]:
# Same count, selected through the original combined label; it should match
# the count obtained from the index columns.
int((df_reg['experiment'] == 'Oid_0p5ngmL').sum())

Cool. That seems to have worked.

Let's repeat the previous analysis but more principled: define a binding E for each operator, a lacI copy number for each aTc concentration, and a burst freq & size (so 10 model parameters total). Fit UV5 plus _all_ the repressed data together, rather than 1 by 1. This will be a stronger test: can the burst parameters agree with constitutive UV5 and still accommodate all the repressed data, and do we recover binding energies that look remotely plausible?

#### Prior predictive check

In [None]:
# Inputs for the prior predictive Stan program: only the sample sizes and the
# per-measurement index codes. The observed counts (`mRNA`, `mRNA_uv5`) and
# the `ppc` flag are not passed here; they are added to a copy of this dict
# when the data for posterior sampling are assembled.
data_prior_pred = {
    "N": len(df_reg),
    "N_uv5": len(df_UV5),
    "aTc_idx": df_reg["aTc_idx"].values,
    "op_idx": df_reg["op_idx"].values,
}

In [None]:
# NOTE(review): presumably clears previously built Stan files under ./stan/
# so the model below is rebuilt from source -- confirm against the bebi103 docs.
bebi103.stan.clean_cmdstan(path='./stan/')
# Stan program used only to draw prior predictive samples.
sm_prior_pred = cmdstanpy.CmdStanModel(stan_file="stan/simple_rep_means_v02_prior_pred.stan")
# Uncomment to inspect the Stan source:
# print(sm_prior_pred.code())

In [None]:
# Draw prior predictive samples. `fixed_param=True` selects Stan's
# fixed-parameter sampler (NOTE(review): this assumes the prior predictive
# program generates all of its draws itself -- confirm against the .stan file).
# The seed is pinned so that Restart & Run All reproduces the same draws.
prior_pred_samples = sm_prior_pred.sample(
    data=data_prior_pred,
    fixed_param=True,
    sampling_iters=1000,
    seed=31415,
#     output_dir="./stan/stan_samples",  # uncomment to keep the sampler output files
)

In [None]:
# Convert to ArviZ InferenceData
# The same CmdStanPy fit is passed as both `posterior` and `prior`; the
# variables listed in `prior_predictive` are the ones read back later as
# prior_pred_samples.prior_predictive[...].
# NOTE(review): this rebinds `prior_pred_samples` from the sampler output to
# the converted object, so re-running this cell on its own would feed the
# converted object back into az.from_cmdstanpy. Re-run the sampling cell first.
prior_pred_samples = az.from_cmdstanpy(
    posterior=prior_pred_samples,
    prior=prior_pred_samples,
    prior_predictive=['mRNA_uv5', 'mRNA']
)

First, the prior predictive check for UV5 alone. I made the prior fairly tight so that it enforces the results of the earlier inference:

In [None]:
# Prior predictive ECDF envelope for the UV5 counts, with the measured UV5
# ECDF drawn on top of it as an orange staircase.
p = bokeh_catplot.ecdf(
    data=df_UV5,
    val='mRNA_cell',
    style='staircase',
    palette=('#FF9900',),
    p=bebi103.viz.predictive_ecdf(
        prior_pred_samples.prior_predictive['mRNA_uv5'],
        frame_height=250,
        frame_width=350,
        discrete=True,
        percentiles=(95, 90, 75, 50),
        x_axis_label='mRNA counts',
        x_axis_type='log',
    ),
)
bokeh.io.show(p)

I wanna make sure I haven't done anything stupid such that priors exclude the actual data. So make a panel explorer that plots prior predictive ECDFs for each experiment w/ data overlaid, and a slider to choose expt.

In [None]:
# Slider for stepping through the regulated experiments in sorted label
# order, starting at the O1 / 0.5 ng/mL trial.
aTc_op_selector = pn.widgets.DiscreteSlider(
    value="O1_0p5ngmL",
    options=sorted(df_reg["experiment"].unique()),
    name="trial",
)

@pn.depends(aTc_op_selector.param.value)
def prior_pred_explorer(expt_label):
    """Plot the prior predictive ECDF for one experiment with the data on top.

    Re-evaluated by Panel whenever the value of ``aTc_op_selector`` changes.

    Parameters
    ----------
    expt_label : str
        Value of the ``experiment`` column that selects the trial to show.

    Returns
    -------
    The figure returned by ``bokeh_catplot.ecdf``: the predictive ECDF
    envelope for the selected rows with the measured ``mRNA_cell`` ECDF
    overlaid as an orange staircase.
    """
    # Boolean mask over the rows of df_reg. It is also used to pick the matching
    # entries along `mRNA_dim_0` of the prior predictive samples.
    in_expt = (df_reg["experiment"] == expt_label).values

    envelope = bebi103.viz.predictive_ecdf(
        prior_pred_samples.prior_predictive["mRNA"].sel(mRNA_dim_0=in_expt),
        frame_height=250,
        frame_width=350,
        discrete=True,
        percentiles=(95, 90, 75, 50),
        x_axis_label="mRNA counts",
        x_axis_type="log",
    )

    # Overlay the measured counts for this experiment on the same figure.
    return bokeh_catplot.ecdf(
        data=df_reg[in_expt],
        val="mRNA_cell",
        style="staircase",
        palette=("#FF9900",),
        p=envelope,
    )

# Lay out the explorer plot and the experiment slider in one row, with a
# width-15 spacer between them.
pn.Row(prior_pred_explorer, pn.Spacer(width=15), aTc_op_selector)

Mostly look fine.

#### Posterior sampling

In [None]:
# Posterior sampling takes everything the prior predictive run used, plus the
# observed counts and the `ppc` flag (set to 0 here).
data = dict(
    copy.deepcopy(data_prior_pred),
    mRNA=df_reg["mRNA_cell"].values.astype(int),
    mRNA_uv5=df_UV5["mRNA_cell"].values.astype(int),
    ppc=0,
)

# Model used for posterior sampling.
bebi103.stan.clean_cmdstan(path='./stan/')
sm = cmdstanpy.CmdStanModel(stan_file="stan/simple_rep_means_v02.stan")
# print(sm.code())
# print(sm.code())

In [None]:
# Sample the posterior on 4 cores and convert the result to ArviZ
# InferenceData. The seed is pinned so that Restart & Run All reproduces the
# same draws.
posterior_samples = sm.sample(data=data, cores=4, seed=31415)
posterior_samples = az.from_cmdstanpy(posterior_samples)

In [None]:
# Run bebi103's sampler diagnostic checks on the posterior samples.
bebi103.stan.check_all_diagnostics(posterior_samples)

So far so good. Corner plots?

In [None]:
# Corner plot of the first three op_E entries against the first three log_R
# entries. The commented-out `pars` lines are alternative parameter subsets;
# swap one in to view a different set of pairwise marginals.
bokeh.io.show(
    bebi103.viz.corner(
        posterior_samples,
#         pars=["op_E[0]", "log_R[0]", "op_E[1]", "log_R[1]", "alpha", "b",],
        pars=["op_E[0]", "op_E[1]", "op_E[2]", "log_R[0]", "log_R[1]","log_R[2]",],
#         pars=["op_E[1]", "op_E[2]", "op_E[3]", "log_R[1]","log_R[2]", "log_R[3]", ],
        alpha=0.1,
        xtick_label_orientation=np.pi / 4,
    )
)

As feared, the concentration scaling factors are totally degenerate with the binding energies.
It's tempting to say this doesn't matter, this model just doesn't work, and move to the harder model & infer rates. But I think this is foolish: I expect the posterior predictives to fail, but despite that I still ought to be able to get better posterior shrinkage than this. I blame the degeneracy between copy number & affinity, which won't go away in the harder version; if anything it'll be worse. So if I can't tame it here, it won't be any easier there, where I don't even know what to expect and can't use tight priors to help stabilize things...

One test would be to throw out a few of the trials, i.e., does it help to remove the data that are clearly way overdispersed and/or too repressed? Not that those data are bad: they might actually be the _most_ useful for inferring kinetics, but since those distributions are clearly not negative binomial, it's hard to trust the inferences produced from shoving a square peg into a round hole.


So I think fitting for $R$ isn't gonna work. We need to specify it a priori, and I think the best way is with a very informative prior rather than a point estimate.

Table S3 in Jones/Brewster 2014 quotes lacI copy # of 0.21, 5.9, and 50 for aTc concentrations of 0.5, 2, & 10 ng/mL, respectively. My guesstimate inference from taking binding energies as known & fitting $R$ gave the following $R$ values:

|     | 0.5 ng/ml | 1 ng/ml | 2 ng/ml | 10 ng/ml |
| --- | --------- | ------- | ------- | -------- |
| Oid | 0.03      | 1.5     | 11      | -        |
| O1  | 0.1       | 6       | 30-35   | -        |
| O2  | 0.5-1     | 6-8     | 25-35   | 70-100   |
| O3  | -         |  -      | > 25    | > 30?    |


So let's take

|     | 0.5 ng/ml | 1 ng/ml | 2 ng/ml | 10 ng/ml |
| --- | --------- | ------- | ------- | -------- |
| R   | 0.1       | 2       | 10      | 50       |
| ln R| -2.3      | 0.7     | 2.3     | 3.9      |

which is not far from the peaks I was using before, but now let's make the width much tighter, say 0.25 ln units.

Ok, a highly informative prior on $R$ does in fact work to make inferences of $\Delta\epsilon$s identifiable, and the values are in the neighborhood though not in perfect agreement with the canonical values. Maybe I should turn this around: put highly informative priors on the energies, and infer the repressor copy numbers for each aTc? Then I can use that knowledge as a prior for fitting the nonequilibrium model of repression.