# Comparing bulk and single-cell survival probabilities

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
sys.path.insert(0, '../../')
import mscl.stats
import mscl.plotting
import scipy.spatial
import sklearn.metrics
import tqdm
colors = mscl.plotting.set_plotting_style()
%matplotlib inline

In [2]:
# Read the MCMC summary table and pull out the logistic-regression constants.
stats = pd.read_csv('../../data/csv/complete_mcmc_stats.csv')
# Get the statistics only for the fast shock: take the median of the first
# row whose `parameter` entry matches each coefficient name.
beta_0 = stats.loc[stats['parameter'] == 'beta_0__0', 'median'].values[0]
beta_1 = stats.loc[stats['parameter'] == 'beta_1__0', 'median'].values[0]

## Bulk experiment simulations

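Two competing factors affect the bulk measurement: the proximity of cells on the plate (cells closer than a minimum distance cannot be told apart as separate colonies) and the width of the channel-count distribution. We consider each of them separately.  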

In [3]:
def plate_cells(n_cells, spot_area):
    """
    Scatter cells uniformly at random over a spot on an agar plate.

    Parameters
    ----------
    n_cells : int
        Number of cells to place.
    spot_area : float
        Upper bound of the uniform draw. It is applied to each coordinate
        independently, so every coordinate lies in [0, spot_area).

    Returns
    -------
    coords : 2d-array
        Array with 2 rows and `n_cells` columns; each column holds the
        coordinate pair of one cell.
    """
    layout = (2, n_cells)
    return np.random.uniform(low=0, high=spot_area, size=layout)
    

def proximity_filter(coords, min_distance=500):
    """
    Computes pairwise distances between cell coordinates and returns the
    minimum number of cells that are resolvable from any single cell.

    Parameters
    ----------
    coords : 2d-array
        Array of x, y coordinates with 2 rows and N columns (one column per
        cell), i.e. the layout returned by `plate_cells`.
    min_distance : int or float
        Minimum distance between cells to be considered different colonies.

    Returns
    -------
    num_colonies : int
        For every cell, the number of cells lying at least `min_distance`
        away is counted; the smallest of these counts is returned. Returns 0
        when no cells are supplied.
    dist : 2d-array
        The N x N matrix of pairwise Euclidean distances.

    Notes
    -----
    The full distance matrix is materialized, so memory grows as N**2.

    NOTE(review): the distance of a cell to itself is 0, so a cell never
    counts itself when `min_distance` > 0. N mutually distant cells therefore
    give N - 1 and a single cell gives 0 -- confirm this is the intended
    counting rule.
    """
    # One row per cell, which is the layout the distance routine expects.
    points = np.asarray(coords, dtype=float).T

    # Taking the minimum over zero cells is undefined; report no colonies.
    if points.shape[0] == 0:
        return 0, np.zeros((0, 0))

    # A single code path handles every N. The earlier block-wise branch could
    # not be reached with a 2 x N input and failed for any other layout.
    dist = scipy.spatial.distance.cdist(points, points)
    far_neighbors = np.sum(dist >= min_distance, axis=0)
    num_colonies = np.min(far_neighbors)

    return num_colonies, dist
    
def plate_dilution_factors(survivors, spot_area, min_distance=500):
    """
    Perform computational plating for a set of serial dilutions.

    Parameters
    ----------
    survivors : iterable of int
        Number of cells present in each dilution.
    spot_area : float
        Forwarded to `plate_cells` as the extent of the spot.
    min_distance : int or float
        Forwarded to `proximity_filter` as the minimum distance between
        cells to be considered different colonies.

    Returns
    -------
    detected_colonies : list
        One colony count per entry of `survivors`. Dilutions containing no
        cells are skipped and reported as 0.
    dist : 2d-array or int
        The distance matrix returned by `proximity_filter` for the last
        dilution, or 0 if that dilution held no cells or `survivors` is
        empty.
    """
    detected_colonies = []
    # Defined up front so that an empty `survivors` still returns a value.
    dist = 0
    for samp in survivors:
        if samp != 0:
            coords = plate_cells(samp, spot_area)
            num_colonies, dist = proximity_filter(coords, min_distance)
        else:
            # Nothing to plate for this dilution.
            num_colonies, dist = 0, 0
        detected_colonies.append(num_colonies)
    return detected_colonies, dist

In [4]:
def determine_survivors(chan_mu, chan_sigma, n_cells, beta_0, beta_1):
    """
    Given statistics of channel distribution, compute and return number of
    survivors after shock as predicted by logistic regression.

    Parameters
    ----------
    chan_mu, chan_sigma : float
        The mean and standard deviation for channel expression distribution.
    n_cells : int
        The total number of cells exposed to the osmotic shock. Whole-number
        floats such as 6E5 are accepted and converted to int.
    beta_0, beta_1 : float
        The coefficients of the logistic regression.

    Returns
    -------
    n_survivors : int
        The total number of surviving cells.
    survivors : 1d-array
        Array of channel counts for the surviving cells.
    """
    # NumPy requires an integer array size.
    n_cells = int(n_cells)

    # Generate the channel distribution (normal draws truncated to integers).
    cells = np.random.normal(loc=chan_mu, scale=chan_sigma, size=n_cells).astype(int)

    # Compute the survival probability for each cell given a channel number.
    # Truncated normal draws can be zero or negative, for which the log is
    # -inf or NaN. Those values are propagated on purpose and the floating
    # point warnings are silenced. A NaN probability always fails the
    # comparison below, and for beta_1 > 0 a zero count gives probability 0.
    with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
        logit = beta_0 + beta_1 * np.log(cells)
        p_s = (1 + np.exp(-logit))**-1

        # Apply the shock and determine the survivors.
        coin_flips = np.random.random(n_cells) < p_s
    n_survivors = np.sum(coin_flips).astype(int)

    # Determine which cells survived.
    survivors = cells[coin_flips]
    return n_survivors, survivors

In [5]:
def serial_dilution(n_cells, dilution_factor):
    """
    Perform a serial dilution from a pool of n_cells.

    Each cell is carried into the next dilution independently with
    probability equal to the ratio of the previous dilution factor to the
    current one (1 / factor for the first step). Cells carried forward are
    removed from the count of the dilution they were taken from.

    Parameters
    ----------
    n_cells : int
        The total number of cells at the beginning of the serial dilution.
        Whole-number floats such as 6E5 are accepted and converted to int.
    dilution_factor : number or sequence of numbers
        The dilution factors to be applied, in order. A single number is
        treated as a one-step dilution.

    Returns
    -------
    diluted_cells : list of int
        The number of cells left in each dilution, one entry per factor.
    """
    # Accept a bare number as a one-element series.
    if np.isscalar(dilution_factor):
        dilution_factor = [dilution_factor]

    # NumPy requires an integer array size.
    n_cells = int(n_cells)

    diluted_cells = []
    previous_factor = 1
    for d in dilution_factor:
        # Probability that a cell of the previous step is carried over.
        p = previous_factor / d
        n_cells = np.sum(np.random.random(n_cells) < p)
        if diluted_cells:
            # The transferred cells leave the dilution they came from.
            diluted_cells[-1] -= n_cells
        diluted_cells.append(n_cells)
        previous_factor = d
    return diluted_cells

In [6]:
def compute_probability(df):
    """
    Summarize the apparent survival probability at each dilution factor.

    Parameters
    ----------
    df : pandas DataFrame
        Must contain the columns 'dilution_factor', 'experiment' (with the
        values 'shock' and 'control') and 'survivors'.

    Returns
    -------
    prob_df : pandas DataFrame
        One row per dilution factor with the columns 'prob_mean', 'prob_err',
        'dilution_factor', 'mean_control_cells' and 'mean_exp_cells'. Rows
        containing NaN (for example when a control count is zero) are
        dropped.
    """
    columns = ['prob_mean', 'prob_err', 'dilution_factor',
               'mean_control_cells', 'mean_exp_cells']
    rows = []
    # Group on a plain column name so that each key is a scalar.
    for factor, group in df.groupby('dilution_factor'):
        shock = group[group['experiment'] == 'shock']['survivors'].values
        control = group[group['experiment'] == 'control']['survivors'].values

        # Shock and control counts are paired element-wise in row order.
        # Zero control counts give inf/NaN ratios; these are removed by the
        # dropna below, so the floating point warnings are silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = shock / control
            mean_prob = np.mean(ratio)
            # Standard error over the ratios actually present in this group,
            # rather than a module-level simulation count.
            sem_prob = np.std(ratio) / np.sqrt(len(ratio))

        rows.append({'prob_mean': mean_prob,
                     'prob_err': sem_prob,
                     'dilution_factor': factor,
                     'mean_control_cells': np.mean(control),
                     'mean_exp_cells': np.mean(shock)})

    prob_df = pd.DataFrame(rows, columns=columns)
    prob_df.dropna(inplace=True)
    return prob_df

In [7]:
# Perform the experiment.
n_simulations = 5
chan_mu = 500
chan_sigma = 200
n_cells = int(6E5) # of cells in 2µL at OD0.3 (an int, as it is used as an array size)
spot_area = 5E3 # In pixels with a width of 2µm
min_distance = 250 # Min distance in units of 2µm
# NOTE(review): these values replace the beta_0 / beta_1 read from the MCMC
# statistics file earlier in the notebook -- confirm that is intended.
beta_0 = -8
beta_1 = 1.5
dilution_factor = [1E1, 1E2, 1E3, 1E4, 1E5, 1E6]
experiments = ['control', 'shock']
dfs = []
for n in tqdm.tqdm(range(n_simulations)):
    for exp in experiments:
        if exp == 'control':
            # Without a shock every cell survives.
            cells = np.random.normal(chan_mu, chan_sigma, n_cells).astype(int)
            n_surv = n_cells
        else:
            n_surv, cells = determine_survivors(chan_mu, chan_sigma, n_cells, beta_0, beta_1)

        # Perform the serial dilutions.
        dilution = serial_dilution(n_surv, dilution_factor)
        print(dilution)
        spots, dist = plate_dilution_factors(dilution, spot_area, min_distance=min_distance)

        # Assemble a dataframe with one row per dilution factor.
        _df = pd.DataFrame(np.array([spots, dilution_factor]).T,
                          columns = ['survivors', 'dilution_factor'])
        _df.loc[:, 'experiment'] = exp
        _df.loc[:, 'simulation'] = n + 1
        dfs.append(_df)

# Combine all simulations and summarize the survival probability.
df = pd.concat(dfs, ignore_index=True)
prob_df = compute_probability(df)



[53944, 5412, 503, 57, 4, 0]




[40248, 4066, 366, 32, 3, 1]


 20%|██        | 1/5 [03:09<12:39, 189.88s/it]

[53953, 5414, 509, 50, 6, 3]
[39960, 3961, 388, 39, 2, 0]


 40%|████      | 2/5 [05:56<08:55, 178.35s/it]

[54340, 5371, 544, 53, 3, 2]
[40238, 3920, 404, 45, 3, 1]


 60%|██████    | 3/5 [08:38<05:45, 172.88s/it]

[53757, 5383, 535, 47, 6, 2]
[40101, 3987, 402, 43, 4, 0]


 80%|████████  | 4/5 [11:00<02:45, 165.14s/it]

[54002, 5377, 547, 47, 7, 0]
[40177, 3882, 390, 52, 3, 1]


100%|██████████| 5/5 [13:54<00:00, 166.97s/it]


In [8]:
# Recompute the per-dilution survival summary from the pooled simulation
# results in `df`.
prob_df = compute_probability(df)



In [10]:
# Display the summary table.
prob_df

Unnamed: 0,prob_mean,prob_err,dilution_factor,mean_control_cells,mean_exp_cells
0,0.743206,0.001051,10.0,53498.4,39760.0
1,0.73425,0.004506,100.0,5327.2,3911.6
2,0.736928,0.008046,1000.0,515.8,380.0
3,0.831844,0.087453,10000.0,48.0,39.2
4,0.56,0.124508,100000.0,4.2,2.0


In [11]:
# Plate the first entry of the most recent `dilution` list; the result has
# 2 rows and one column per cell.
coords = plate_cells(dilution[0], spot_area)

In [12]:
# NOTE(review): `coords` comes from plate_cells and has 2 rows and one column
# per cell. Passed without a transpose, the two rows are treated as two
# samples, which is why the result below has only two entries.
# proximity_filter passes `coords.T` instead -- confirm which was intended.
np.sum(sklearn.metrics.pairwise.euclidean_distances(coords) >= min_distance, axis=0)

array([1, 1])