#  Simulating the dilution experiment. 

In [2]:
import numpy as np
import pandas as pd
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import scipy.special

# Display graphics in this notebook
bokeh.io.output_notebook()

## Setting up the problem.  

## Generating a data set

In [13]:
# Seed the RNG for reproducibility. 
np.random.seed(666)  # number of the beast

# Define some parameters. 
alpha = 150 # in arbitrary units
int_range = alpha * np.logspace(0, 4, 100)  # I_tot range for the calculation.
min_Ntot = 1  # Minimum number of proteins. Essentially leakiness.
max_divisions = 50 # number of division events recorded for each intensity. 
partition_prob = 0.5 # 
alpha_std =  10
I_tot_std = 0.05 # Fraction o f intensity.
# Set up a DataFrame to store the relevant values.
df = pd.DataFrame(columns=['seeded_alpha', 'simulated_alpha', 'alpha_std', 'N_1',
                           'N_2', 'I_1', 'I_2'])

# Loop through each intensity value. 
for i, I_tot in enumerate(int_range):
    # Loop through each division event. 
    random_div_num = np.random.choice(np.arange(0, max_divisions,1))
    for div in range(random_div_num):
        # Set the noise in the alpha value. 
        alpha_noise = np.random.normal(alpha, 10)  # Add some gaussian norm. 
        
        # Apply noise to the intensity.
        I_tot = np.random.normal(I_tot, I_tot_std * I_tot)
        
        # Compute the number that are partitioned. 
        N_tot = int(I_tot / alpha_noise)
        if N_tot < min_Ntot:
            N_tot = int(min_Ntot)
        N_1 = np.sum(np.random.rand(N_tot) < partition_prob).astype(int)
        
        # Compute the square difference. 
        square_diff = (alpha_noise * (N_1 - N_tot + N_1))**2
        
        # Add everything to the DataFrame.
        df = df.append({'seeded_alpha': alpha, 'simulated_alpha': alpha_noise,
                        'alpha_std': alpha_std, 'N_1': N_1, 'N_2': N_1 - N_tot,
                        'I_1': N_1 * alpha_noise, 'I_2': (N_tot - N_1) * alpha_noise,
                        }, ignore_index=True)
 
# Save the DataFrame as a csv file. 
df.to_csv('../../data/other/simulated_dilution.csv')

In [7]:
# Set the figure. 
p = bokeh.plotting.figure()
p = bokeh.charts.Scatter(df, x='I_tot', y='square_diff', height=500,
                         width=600, xlabel='mother cell intensity [a.u]',
                         ylabel='squared intensity difference [a.u.]', 
                         color='black', marker='circle')
p.background_fill_color = '#E3DCD'
p.grid.grid_line_color = 'white'
p.grid.grid_line_dash = 'dotted'
bokeh.models.BasicTickFormatter(use_scientific=False)
bokeh.io.show(p)

##  A bayesian approach

In both the Brewster & Rosenfeld papers, the calibration factor was computed by binning the division event data and then fitting the function. Obviously, the number of bins chosen and the bin centers will introduce some degree of bias into the end result. Brewster and Franz got away with this because they computed the calibration factor as a function of events per bin, and found that at high events per bin the calibration factor reached a plateau. They arbitrarily chose some value in this range for allo f their calculations. As far as I can tell, the binning used in Rosenfeld *et al.* is unkown and belongs to the sands of time. 

Rather than doing the same approach, I can try to use a Bayesian approach to estimate withe parameter without the bias of binning. We can begin by writing down Bayes's theorem as

$$
P(\alpha, \sigma, N_1, N_\text{tot} \vert I_1, I_2) \propto P(I_1, I_2 \vert \alpha, \sigma, N_1, N_\text{tot}) P(\alpha, \sigma, N_\text{tot})P(N_1 \vert N_\text{tot}),
$$

where $\alpha$ is the calibration factor, $N_1$ is the number of proteins in daughter cell 1, $I_1$ and $I_2$ are the intensities for daughter 1 and daughter 2, and $\sigma$ is the uncertainty in our system. 

<br/>
### Likelihoods

There are three likelihoods in this system. We can say that for the intensities, $I_1$ and $I_2$, the values will be Gaussian distributed about some mean value, allow us to write

$$
P(I_1 \vert \alpha, \sigma, N_1, N_\text{tot}) = {1 \over \sqrt{2 \pi \sigma^2}}\exp\left[-{(I_1 - \alpha N_1)^2 \over 2\sigma^2}\right]
$$
and

$$P(I_2 \vert \alpha, \sigma, N_1, N_\text{tot}) = {1 \over \sqrt{2 \pi \sigma^2}}\exp\left[-{(I_2 - \alpha(N_\text{tot} - N_1))^2 \over 2\sigma^2}\right].
$$

Our last likelihood is for the nubmer of proteins in daughter 1. This, as the crux of this method, must be binomially distributed giving us a likelihood of 

$$
P(N_1 \vert N_\text{tot}) = {{N_\text{tot}}\choose{N_1}}{1 \over 2}^{N_\text{tot}},
$$

assuming that the proteins are randomly distributed, $p= 1/2$.
<br />
### Priors

As $\sigma$ is the only scale parameter in our posteriror, we can give it a Jeffrey's prior

$$
P(\sigma) = {1 \over \sigma}.
$$

The other priors can be of any value, and therefore have uniform priors. However, $N_1$ and $N_2$ must be discrete since these are tangible physical objects. Intensity can be continuous, however. 

### Full posterior

With all of the pieces in hand, we can write the full posteriro probability distribution for a set of $i$ division events as

$$

\begin{align}
P(\alpha, \sigma, &\{N_1, N_\text{tot}\} \vert \{I_1, I_2\}) \propto \\ 
&{1 \over \sigma} \left(\prod\limits_{i}^{N_\text{div}}{{N_{i, \text{tot}}}\choose{N_{i,1}}}2^{-N_{i, \text{tot}}}\right)\\
&\left({1 \over \sqrt{2 \pi \sigma^2}}\right)^{N_\text{div}}\left(\exp\left[{1 \over 2\sigma^2}\sum\limits_{i}^{N_\text{div}}\left((I_{i, 1} - \alpha N_{i,1})^2 + (I_{i, 2} - \alpha(N_{i,\text{tot}} - N_{i, 1}))^2\right)\right]\right).
\end{align}

$$

With this in hand, we can code the posterior.

In [34]:
def neg_log_post(p, I_1, I_2):
    
    # Ensure that the parameters are physical.
    if (p < 0).any():
        return -np.inf
    
    # Unpack the parameters. 
    alpha, n_1, n_tot, sigma = p
    n_div = len(n1)
    n_1 = int(n_1)
    n_tot = int(n_tot)
    
    # Make sure the numbers are correct. 
    if n_1 > n_tot:
        return -np.inf
    
    # Compute the prior for sigma.
    log_prior = -np.log(sigma)
    
    # Compute the likelihood for n_tot.
    log_like_ntot = np.sum(scipy.special.gammaln(n_tot+1) -\
                           scipy.special.gammaln(n_1 +1) -\
                          scipy.special.gammaln(n_tot - n_1 + 1))
    
    log_like_int = -n_div * np.log(sigma) +\
                    np.log(np.sum((I_1 - alpha * n_1)**2 +\
                                  (I_2 - alpha * (n_tot - n_1))))
        
    return -(log_like_ntot + log_like_int) 

In [54]:
# Set the MCMC parameters
n_div = len(df['N_1'])
n_walkers = len(df['N_1'])
n_dim = 2 * len(df['N_1'])
n_burn = 3000
seed_alpha = df['seed_alpha'].unique()[0]

# Sprinkle around our starting positions. 
p = np.empty((n_walkers, n_dim))
p[:, 0] = seed_alpha + np.random.uniform(-seed_alpha * 0.1,\
                                        seed_alpha * 0.1, n_walkers)
p[:, 1:len(df['N_1'])] = df['N_1'] + np.random.uniform(-0.1, 0.1, n_walkers) 
p[:, len(df['N_1']):] = df['N_tot'] + np.random.uniform(-0.1, 0.1, n_walkers) 
np.random.uniform(df['I_tot'] / df['seed_alpha'])
n_2 = np.random.uniform(df['I_tot'] / dt['seed_alpha'])

ValueError: could not broadcast input array from shape (2627) into shape (2627,2626)

In [47]:
alpha_opt

150.0