#  Logistic Regression

In [3]:
import numpy as np
import bokeh.io
import bokeh.plotting
import mscl.mcmc
import mscl.plotting
import pymc3 as pm
import pandas as pd
# Configure Bokeh so that figures passed to `bokeh.io.show` render inline in the notebook.
bokeh.io.output_notebook()

## A Bayesian Approach

Ultimately, we would like to estimate the best-fit parameter values for the coefficients $\beta_0, \ldots, \beta_n$. Here we consider a single predictor, so the coefficients are $\beta_0$ and $\beta_1$. For a given data set of survival measurements $\mathbf{s}$, we can use Bayes' theorem to write our posterior as 

$$
\begin{align}
g(\beta_0, \beta_1\, \vert\, \mathbf{s}) &= {f(\mathbf{s}\, \vert\, \beta_0, \beta_1)g(\beta_0, \beta_1)\over f(\mathbf{s})}\\
&\propto f(\mathbf{s}\, \vert \, \beta_0, \beta_1)g(\beta_0, \beta_1)
\end{align}
$$

where the proportionality results from ignoring the marginal likelihood $f(\mathbf{s})$, which does not depend on the parameters.

As our data are binary (each $s_i \in \{0, 1\}$), the likelihood function for each cell can be modeled as a Bernoulli trial with a probability of success (survival) $p$. The likelihood for $N$ different cells can be written as 

$$
f(\mathbf{s}\, \vert \, \beta_0, \beta_1) = \prod\limits_{i=1}^N p_i(\beta_0, \beta_1, x_i)^{s_i}\left(1 - p_i(\beta_0, \beta_1, x_i)\right)^{1 - s_i}.
$$

Our primary assumption for logistic regression is that $p$ is the logistic function,

$$
p_i = {1 \over 1 + \exp\left[-\beta_0 - \beta_1 x_i\right]}
$$

Plugging this into our likelihood function yields

$$
f(\mathbf{s}\, \vert \, \beta_0, \beta_1) = \prod\limits_i^N(1 + \exp\left[-\beta_0 - \beta_1x_i\right])^{-s_i}(1 - (1 + \exp\left[-\beta_0 - \beta_1x_i\right])^{-1})^{1 - s_i}.
$$

 

In [9]:
# Calibration constant (arbitrary units) that converts a rescaled intensity
# into a channel number.
# NOTE(review): presumably the fluorescence attributed to a single channel --
# confirm the value and record its provenance.
CHANNEL_AU = 4258

# Read the compiled shock measurements (lines beginning with '#' are skipped)
# and append the calibrated channel number as a new column.
data = pd.read_csv('../data/csv/compiled_shock_data.csv', comment='#').assign(
    num_channel=lambda df: df['rescaled_intensity'] / CHANNEL_AU)
data.head()

Unnamed: 0,area,date,exposure_ms,flow_rate,intensity,mean_bg,rbs,survival,rescaled_intensity,num_channel
0,5.6576,20170525,25,0.01,217327.199203,103629.096813,sd1,True,454792.409561,106.808927
1,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528
2,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528
3,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528
4,4.5824,20170525,25,0.01,168015.510516,103629.096813,sd1,True,257545.65481,60.485123


In [100]:
# Empirical cumulative distribution of the measured areas, plotted to help
# choose an area cutoff.
area_x = np.sort(data['area'])
area_y = np.arange(0, len(area_x), 1) / len(area_x)
p = mscl.plotting.boilerplate()
p.scatter(area_x, area_y)
bokeh.io.show(p)

# Keep only measurements with area below 10.
# NOTE(review): the cutoff of 10 is presumably chosen from the ECDF above to
# isolate single cells -- confirm the value and its units.
# Take an explicit copy: assigning a new column to a bare slice of `data`
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# `data` itself is modified.
single_cells = data.loc[data['area'] < 10].copy()

# Scale the channel number by the area to obtain a per-cell total.
# NOTE(review): assumes `num_channel` is an areal density -- confirm.
single_cells['tot_channel'] = single_cells['num_channel'] * single_cells['area']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [101]:
# Preview the first rows of the area-filtered frame, including the derived `tot_channel` column.
single_cells.head()

Unnamed: 0,area,date,exposure_ms,flow_rate,intensity,mean_bg,rbs,survival,rescaled_intensity,num_channel,tot_channel
0,5.6576,20170525,25,0.01,217327.199203,103629.096813,sd1,True,454792.409561,106.808927,604.282183
1,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528,560.856018
2,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528,560.856018
3,5.7344,20170525,25,0.01,207743.08108,103629.096813,sd1,True,416455.937067,97.805528,560.856018
4,4.5824,20170525,25,0.01,168015.510516,103629.096813,sd1,True,257545.65481,60.485123,277.167029


In [102]:
with pm.Model() as model:
    # Define the formula: regress survival on the total channel count using
    # the Binomial GLM family.
    # NOTE(review): `survival` is boolean. patsy may expand a boolean response
    # into one indicator column per level -- confirm which level the fitted
    # coefficients actually describe before interpreting their sign.
    pm.glm.GLM.from_formula('survival ~ tot_channel', data=single_cells,
                            family=pm.families.Binomial())
    # Fix the seed so the posterior draws are reproducible on a fresh
    # Restart-and-Run-All; without it every run yields different samples.
    trace = pm.sample(draws=5000, tune=5000, njobs=4, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
100%|██████████| 10000/10000 [00:37<00:00, 269.48it/s]


In [103]:
# Convergence diagnostics: print the Gelman-Rubin statistic for each sampled
# parameter (values close to 1 indicate the chains agree), then draw the
# trace plots with KDE marginals.
rhat = pm.gelman_rubin(trace)
print(rhat)
_ = mscl.plotting.traceplot(trace, dist_type='kde')

{'Intercept': 1.000306213388976, 'tot_channel': 1.0004740736135718}


In [104]:
# Convert the sampled trace into a DataFrame; the plotting cell reads its
# 'Intercept' and 'tot_channel' columns.
trace_df = mscl.mcmc.trace_to_dataframe(trace, model)
# Summarize the samples per parameter; the plotting cell looks up the 'mode'
# value of each row by its 'parameter' label.
# NOTE(review): the full set of statistics returned by compute_statistics is
# not visible from this notebook -- confirm against the mscl source.
stats = mscl.mcmc.compute_statistics(trace_df)

In [105]:
# Split the single-cell measurements by survival outcome; the plotting cell
# iterates over these groups.
surv_grouped = single_cells.groupby(by='survival')
# Group the summary statistics by parameter name.
stats_grouped = stats.groupby(by='parameter')

In [108]:
# Plot the data.
p = mscl.plotting.boilerplate(width=600, height=500,
                             x_axis_label='channels per cell',
                             y_axis_label='probability of survival')

for g, d in surv_grouped:
    if g is True:
        y = np.ones(len(d)) 
    else:
        y = np.zeros(len(d))
    
    p.scatter(d['tot_channel'], y=y, color='slategray', alpha=0.5)
  
# Plot the probability curve.
beta0=stats.loc[stats['parameter']=='Intercept']['mode'].values[0]
beta1=stats.loc[stats['parameter']=='tot_channel']['mode'].values[0]
chan_range = np.linspace(0, 1500, 1000)
logit = beta0 + beta1 * chan_range
prob = (1 + np.exp(logit))**-1
p.line(chan_range, prob, color='dodgerblue')
cred_region = np.zeros((2, len(chan_range)))
for i, c in enumerate(chan_range):
    prob = (1 + np.exp(trace_df['Intercept'] + trace_df['tot_channel'] * c))**-1
    cred_region[:, i] = mscl.mcmc.compute_hpd(prob, mass_frac=0.95)
mscl.plotting.fill_between(p, chan_range, cred_region[0, :], cred_region[1, :],
                          color='dodgerblue', alpha=0.3)

bokeh.io.show(p)