# Comparing fluorescence distributions of Flow Cytometry and Microscopy

In [15]:
# Import the standard dependencies.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import pandas as pd
import glob

# For gaussian gating.
import scipy.stats

# Import custom written utilities.
import mwc_induction_utils as mwc

# Set the plotting environment: select the interactive notebook backend,
# apply the project-wide plotting style from mwc_induction_utils, and then
# switch seaborn to its 'paper' context.
%matplotlib notebook
mwc.set_plotting_style()
sns.set_context('paper')



It would be nice to have a figure that shows how the distributions of fluorescence compare between flow cytometry and microscopy. Since we have saved many CSV files from each method, we can easily generate this kind of plot. 

### Displaying example gating procedure

We will start by showing an output of a standard flow cytometry run, which is the forward scatter vs side scatter in the FITC channel. Below, we define the functions used for the unsupervised gating.

In [2]:
def fit_2D_gaussian(df, x_val='FSC-A', y_val='SSC-A', log=False):
    '''
    Fit a 2D Gaussian to two columns of a DataFrame and return the mean and
    covariance matrix of the fit. The fit itself is delegated to
    mwc.fit_bivariate_normal (called with robust=True); this function only
    converts the widths and angle it returns into a covariance matrix.

    Parameters
    ----------
    df : DataFrame.
        dataframe containing the data to which the distribution is fit
    x_val, y_val : str.
        names of the dataframe columns used as the two dimensions
    log : bool.
        if True, the log10 of both columns is fit instead of the raw values

    Returns
    -------
    mu : tuple.
        (x, y) location of the best-fit bivariate normal, passed through
        unchanged from mwc.fit_bivariate_normal
    cov : 2 x 2 array
        covariance matrix.
        cov[0, 0] = variance of the x_val column
        cov[1, 1] = variance of the y_val column
        cov[0, 1] = cov[1, 0] = covariance of the data
    '''
    # Select the two columns, optionally on a log10 scale.
    columns = (df[x_val], df[y_val])
    if log:
        columns = tuple(np.log10(column) for column in columns)
    x, y = columns

    # Fit the 2D Gaussian distribution using the astroML-derived function.
    mu, sigma_1, sigma_2, alpha = mwc.fit_bivariate_normal(x, y, robust=True)

    # Rotate the principal-axis standard deviations by the angle alpha to get
    # the variances and covariance in the (x, y) frame.
    cos_a = np.cos(alpha)
    sin_a = np.sin(alpha)
    var_x = (sigma_1 * cos_a) ** 2 + (sigma_2 * sin_a) ** 2
    var_y = (sigma_1 * sin_a) ** 2 + (sigma_2 * cos_a) ** 2
    cov_xy = (sigma_1 ** 2 - sigma_2 ** 2) * sin_a * cos_a

    # Assemble the symmetric covariance matrix.
    cov = np.array([[var_x, cov_xy], [cov_xy, var_y]])

    return mu, cov

def gauss_interval(df, mu, cov, x_val='FSC-A', y_val='SSC-A', log=False):
    '''
    Computes the statistic
    (x - µ)' Σ⁻¹ (x - µ)
    (the squared Mahalanobis distance from µ under covariance Σ) for each
    row of the df columns x_val and y_val.

    Parameters
    ----------
    df : DataFrame.
        dataframe containing the data on which to evaluate the statistic.
        It is not modified.
    mu : array-like.
        (x, y) location of bivariate normal
    cov : 2 x 2 array
        covariance matrix
    x_val, y_val : str.
        name of the dataframe columns to be used in the function
    log : bool.
        indicate if the log10 of the data should be used or not. mu and cov
        must be expressed on the same scale as the (possibly logged) data.

    Returns
    -------
    statistic_gauss : array-like.
        1D array of length len(df) containing, for each row, the result of
        the linear algebra operation (x - µ)' Σ⁻¹ (x - µ)

    Raises
    ------
    NameError
        if the covariance matrix is singular (determinant exactly zero).
    '''
    # Determine that the covariance matrix is not singular. NameError is kept
    # (rather than a more fitting exception type) so that existing callers
    # catching it keep working.
    cov = np.asarray(cov, dtype=float)
    if np.linalg.det(cov) == 0:
        raise NameError("The covariance matrix can't be singular")

    # Work on a float copy of the two columns. Without the explicit float
    # dtype, integer-valued columns would silently truncate the centered
    # values when log=False.
    values = np.asarray(df[[x_val, y_val]], dtype=float)
    if log:
        values = np.log10(values)

    # Center the data: each row becomes [x - mu_x, y - mu_y].
    centered = values - np.array([mu[0], mu[1]], dtype=float)

    # compute the inverse of the covariance matrix
    inv_sigma = np.linalg.inv(cov)

    # Evaluate the quadratic form for every row at once instead of looping
    # over the rows in Python.
    return np.einsum('ij,jk,ik->i', centered, inv_sigma, centered)

Now we can simply apply it to our tidy dataframe and extract fluorescence information.

In [312]:
# IPTG concentrations (in µM, matching the 'uMIPTG' tag used in the CSV file
# names) for which flow cytometry data are loaded.
IPTG_range = [0, 0.1, 10, 25, 50, 75, 250, 500, 1000, 5000]

def gate(data, alpha=0.5):
    '''
    Label every event in `data` as selected (gate = 1) or discarded
    (gate = 0) using a 2D Gaussian fit to log10(FSC-A) vs log10(SSC-A).

    Parameters
    ----------
    data : DataFrame.
        flow cytometry events; the default 'FSC-A' and 'SSC-A' columns of
        fit_2D_gaussian and gauss_interval are used.
    alpha : float.
        quantile of the chi-squared distribution (2 degrees of freedom) used
        as the cut-off. Events whose statistic is at or below
        chi2.ppf(alpha, 2) are selected; for exactly Gaussian data this
        would keep a fraction alpha of the events.

    Returns
    -------
    DataFrame containing the selected rows followed by the discarded rows,
    with a 'gate' column inserted at position 0. Each event appears at most
    once. Rows whose statistic is NaN (e.g. from the log10 of a negative
    scatter value) satisfy neither comparison and are left out.
    '''
    mu, cov = fit_2D_gaussian(data, log=True)
    interval_arr = gauss_interval(data, mu, cov, log=True)

    # Compute the cut-off once. The discarded mask uses a strict inequality so
    # that an event lying exactly on the threshold is not emitted twice.
    threshold = scipy.stats.chi2.ppf(alpha, 2)
    idx = interval_arr <= threshold
    disc = interval_arr > threshold

    data_idx = data[idx].copy()
    data_disc = data[disc].copy()
    data_idx.insert(0, 'gate', 1)
    data_disc.insert(0, 'gate', 0)
    return pd.concat([data_idx, data_disc])

# Directory with the example titration. Each CSV file name contains the strain
# name and the IPTG concentration followed by 'uMIPTG'.
flow_dir = '../../data/flow/csv/RBS1027_titration_example/'
strains = ['RBS1027', 'delta', 'auto']

dfs = []
for conc in IPTG_range:
    strain_cells = []
    for strain in strains:
        pattern = flow_dir + '*' + strain + '*_' + str(conc) + 'uMIPTG*.csv'

        # Sort the matches so the chosen file does not depend on the order in
        # which the file system lists the directory, and fail with a clear
        # message instead of an IndexError when nothing matches.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError('No flow cytometry file matches ' + pattern)

        # Gate the events and tag them with the strain they came from.
        cells = gate(pd.read_csv(matches[0], comment='#'))
        cells.insert(0, 'strain', strain)
        strain_cells.append(cells)

    # Combine the three strains and tag them with the IPTG concentration.
    z = pd.concat(strain_cells)
    z.insert(0, 'IPTG', conc)
    dfs.append(z)

# Single tidy DataFrame with 'IPTG', 'strain', and 'gate' as leading columns.
df = pd.concat(dfs)

In [313]:
# Inspect the events recorded at the highest IPTG concentration (5000 µM).
df[df.IPTG==5000]


Unnamed: 0.1,IPTG,strain,gate,Unnamed: 0,HDR-T,FSC-A,FSC-H,FSC-W,SSC-A,SSC-H,SSC-W,FITC-A,FITC-H,FITC-W
0,5000.0,RBS1027,1,0,7.102775,11578.198242,8809.545898,172265.140625,43564.191406,35164.011719,162383.234375,26918.236328,15808.499023,223185.453125
3,5000.0,RBS1027,1,3,20.858418,11144.457031,6231.107910,234424.812500,36760.433594,25685.892578,187584.046875,21544.771484,17146.062500,164697.656250
4,5000.0,RBS1027,1,4,33.953560,12418.647461,9485.478516,171603.046875,41850.902344,35853.101562,152998.796875,18619.746094,11894.882812,205174.562500
6,5000.0,RBS1027,1,6,46.914684,12842.103516,9797.620117,171800.921875,38132.859375,26755.728516,186806.734375,24623.818359,18822.308594,171471.687500
8,5000.0,RBS1027,1,8,51.777504,5927.338379,6876.572266,112978.976562,20926.787109,19112.236328,143516.218750,13343.317383,8094.677246,216059.921875
10,5000.0,RBS1027,1,10,58.621822,8127.052246,5558.326172,191645.640625,35580.902344,19017.492188,245230.015625,16852.804688,11552.003906,191216.250000
12,5000.0,RBS1027,1,12,64.633331,9670.845703,6410.966309,197720.125000,32248.650391,27833.533203,151863.406250,7004.816406,8792.202148,104426.093750
14,5000.0,RBS1027,1,14,98.902786,12101.709961,9538.519531,166293.656250,39369.332031,29976.439453,172142.421875,33712.832031,26940.011719,164024.000000
17,5000.0,RBS1027,1,17,111.969208,11298.025391,8093.546387,182967.359375,30303.298828,21124.013672,188028.375000,20310.279297,17928.500000,148484.750000
20,5000.0,RBS1027,1,20,123.092422,9475.600586,7123.107910,174360.109375,26760.416016,21901.892578,160147.859375,30727.697266,26738.062500,150629.484375


We can generate a figure which shows a large majority of the measured cells in black and those that are considered in blue. Note that the limits on this plot are slightly changed for aesthetic reasons.

In [371]:
# Palette for the figure; entry 4 is swapped for a dusty purple.
colors = sns.color_palette('colorblind', n_colors=8)
colors[4] = sns.xkcd_palette(['dusty purple'])[0]

# Take the RBS1027 events measured at 1000 µM IPTG and split them by gate.
sel_dat = df[(df.strain == 'RBS1027') & (df.IPTG == 1000)]
discarded = sel_dat[sel_dat.gate == 0]
selected = sel_dat[sel_dat.gate == 1]

# Draw the discarded events first so the selected ones sit on top of them.
plt.figure()
ax = plt.gca()
ax.plot(discarded['FSC-A'], discarded['SSC-A'], 'k.',
        rasterized=True, label='discarded cells')
ax.plot(selected['FSC-A'], selected['SSC-A'], '.', color=colors[3],
        rasterized=True, alpha=0.5, label='selected cells')

# Fix formatting.
ax.legend(loc='upper left', fontsize=15, markerscale=2)
ax.set_xlabel('forward scatter (a.u.)', fontsize=20)
ax.set_ylabel('side scatter (a.u.)', fontsize=20)
ax.set_xscale('log')
ax.set_yscale('log')
ax.tick_params(labelsize=18)

# Restrict bounds for aesthetic reasons.
ax.set_xlim([1E3, 1E5])
ax.set_ylim([1E3, 3E5])

# Save the figure.
ax.figure.tight_layout()
ax.figure.savefig('/Users/gchure/Dropbox/mwc_induction/figures/supplementary_figures/flow_cloud.pdf',
                  bbox_inches='tight')

<IPython.core.display.Javascript object>

### Examining fluorescence distributions. 

We would like to compare the fluorescence distributions between microscopy and flow cytometry. We can begin by loading the important CSV files for the microscopy example.

In [315]:
# Load the preprocessed single-cell microscopy intensities. The columns used
# below are 'date', 'rbs', 'IPTG_uM', and 'mean_intensity'.
mic_cells = pd.read_csv('../../data/RBS1027_O2_microscopy_cell_intensities.csv')

We will show the fluorescence intensities for RBS1027 O2 at 250µM IPTG.

In [316]:
# Keep the mean intensities of the RBS1027 cells imaged on 20161019 at
# 250 µM IPTG.
is_target = ((mic_cells['date'] == 20161019)
             & (mic_cells['rbs'] == 'RBS1027')
             & (mic_cells['IPTG_uM'] == 250))
mic_selected = mic_cells[is_target]['mean_intensity']

# Histogram of the selected intensities, annotated with the number of cells.
plt.figure()
ax = plt.gca()
ax.hist(mic_selected, bins=25, color='r', alpha=0.75, histtype='stepfilled')
ax.set_xlabel('mean pixel intensity', fontsize=20)
ax.set_ylabel('counts', fontsize=20)
n_cells_label = 'N = {0}'.format(len(mic_selected))
ax.text(0.72, 0.8, n_cells_label, transform=ax.transAxes, fontsize=24)
ax.ticklabel_format(style='sci', axis='x', scilimits=(0,3))
ax.tick_params(labelsize=18)

# Save the figure.
ax.figure.tight_layout()
ax.figure.savefig('/Users/gchure/Dropbox/mwc_induction/figures/supplementary_figures/microscopy_distribution.pdf',
                  bbox_inches='tight')

<IPython.core.display.Javascript object>

Now we can do the same for the flow cytometry data.

In [379]:
# Two stacked panels sharing the x axis: RBS1027 on top, delta below.
fig, ax = plt.subplots(2, 1, figsize=(9, 5), sharex=True)
rbs_colors = sns.color_palette('Blues', n_colors=7)
delta_colors = sns.color_palette('Greens', n_colors=7)
plot_range = [0, 10, 25, 75, 1000]

# One entry per panel: (axis, strain, palette, marker height, marker edge width).
panels = [(ax[0], 'RBS1027', rbs_colors, 1.3E-4, 3),
          (ax[1], 'delta', delta_colors, 9.5E-5, 2)]

for i, conc in enumerate(plot_range):
    for axis, strain, palette, marker_y, edge_width in panels:
        # FITC-A intensities of the gated cells for this strain and concentration.
        intensity = df[(df.gate==1) & (df.strain==strain) & (df.IPTG==conc)]['FITC-A']

        # NOTE: `normed` was removed in Matplotlib 3.1; on newer versions the
        # equivalent keyword is `density=True`.
        axis.hist(intensity, bins=200, color=palette[i], alpha=0.6,
                  histtype='stepfilled', normed=True, linewidth=2)

        # Mark the mean of the distribution with a triangle above the histogram.
        axis.plot(np.mean(intensity), marker_y, 'v', markersize=10,
                  markeredgecolor=palette[i], markerfacecolor='w',
                  markeredgewidth=edge_width)

# Shared tick and x-range formatting.
y_ticks = np.array([0, 0.5, 1, 1.4])*1E-4
x_limits = np.array([-1, 6]) * 1E4
for axis in ax:
    axis.set_yticks(y_ticks)
    axis.ticklabel_format(style='sci', axis='both', scilimits=(0, 3))
    axis.tick_params(labelsize=18)
    axis.set_xlim(x_limits)

# Panel-specific y range and repressor label.
for axis, y_max, label in zip(ax, [1.4E-4, 1.05E-4], ['R > 0', 'R = 0']):
    axis.set_ylim([0, y_max])
    axis.text(0.9, 0.85, label, fontsize=18, transform=axis.transAxes)

ax[1].set_xlabel('total cell intensity (a.u.)', fontsize=18)
fig.text(0.05, 0.5, 'frequency', fontsize=18, ha='center', rotation='vertical')
fig.savefig('/Users/gchure/Dropbox/mwc_induction/Figures/example_distributions.svg', bbox_inches='tight')

<IPython.core.display.Javascript object>

In [318]:
# Mean FITC-A intensity of the gated cells for every strain and IPTG
# concentration; these means are the inputs to the fold-change computation.
# DataFrame.groupby is used because the top-level pd.groupby function has been
# removed from pandas.
grouped = df[df.gate==1].groupby(['strain', 'IPTG'])['FITC-A'].mean()
grouped

strain   IPTG  
RBS1027  0.0        3695.989297
         0.1        3647.257660
         10.0       5024.582621
         25.0       9348.880618
         50.0      13051.849078
         75.0      16200.652576
         250.0     21596.144741
         500.0     21469.415060
         1000.0    20899.082751
         5000.0    18372.970072
auto     0.0        3438.455568
         0.1        3348.332064
         10.0       3412.436135
         25.0       3337.535968
         50.0       3305.481954
         75.0       3278.220405
         250.0      3388.739312
         500.0      3421.416702
         1000.0     3451.569957
         5000.0     3334.038579
delta    0.0       21777.464637
         0.1       22756.350605
         10.0      23026.521639
         25.0      23136.675673
         50.0      23687.706778
         75.0      23355.899591
         250.0     24818.306583
         500.0     24135.880934
         1000.0    22839.004795
         5000.0    20986.819432
Name: FITC-A, dtype: float64

In [338]:
# (Leftover interactive help lookup for sns.rugplot removed; it is not part
# of the analysis.)

In [319]:
# Fold-change at each IPTG concentration: the RBS1027 mean relative to the
# delta mean, after subtracting the mean of the 'auto' strain from both
# (presumably the autofluorescence control).
background = grouped['auto']
signal = grouped['RBS1027'] - background
reference = grouped['delta'] - background
fc = signal / reference
fc

IPTG
0.0       0.014043
0.1       0.015402
10.0      0.082193
25.0      0.303616
50.0      0.478180
75.0      0.643622
250.0     0.849639
500.0     0.871275
1000.0    0.899939
5000.0    0.851930
Name: FITC-A, dtype: float64

In [380]:
# Plot the measured fold-change against the IPTG concentration, converted
# from µM to M.
plt.figure()
ax = plt.gca()
iptg_molar = np.array(IPTG_range)/1E6
ax.plot(iptg_molar, fc, 'o', markeredgecolor='r', markeredgewidth=2,
        markersize=7, markerfacecolor='w')

# Axis ranges and scale.
ax.set_xlim([1E-8,1E-2])
ax.margins(0.02)
ax.set_xscale('log')
ax.set_xlabel('[IPTG] (M)', fontsize=18)
ax.set_ylabel('fold-change', fontsize=18)
ax.set_ylim([0, 1.1])

# Annotate the panel with the repressor copy number and binding energy.
ax.text(0.01, 0.9, 'R = 260', fontsize=18, transform=ax.transAxes)
ax.text(0.01, 0.8, r'$\Delta\varepsilon_{RA} = -13.9\, k_BT$', fontsize=18,
        transform=ax.transAxes)
ax.tick_params(labelsize=15)

# Save the figure.
ax.figure.tight_layout()
ax.figure.savefig('/Users/gchure/Dropbox/mwc_induction/Figures/example_titration.eps',
                  bbox_inches='tight')

<IPython.core.display.Javascript object>

To emphasize how similar these results are, we will plot the fold-change from three independent experiments using both flow cytometry and microscopy, keeping the same color scheme as shown above. 

In [9]:
# Load the microscopy fold-change measurements.
mic_fc = pd.read_csv('../../data/RBS1027_O2_microscopy_foldchange.csv')

# Candidate flow cytometry data sets, sorted so that the row order of the
# combined DataFrame does not depend on how the file system lists the files.
flow_glob = sorted(glob.glob('../../data/201608*O2*.csv'))

# Data sets named in the ignore file (one file name per line) are excluded.
with open('../../data/datasets_ignore.csv') as f:
    ignored_sets = ['../../data/' + line.rstrip() for line in f]
ignored_lookup = set(ignored_sets)

# Decide which files to keep before reading anything, so that files which
# would be discarded anyway (ignored sets, or paths containing 'r2') are
# never parsed.
kept_files = [entry for entry in flow_glob
              if entry not in ignored_lookup and 'r2' not in entry]
flow_fc = pd.concat([pd.read_csv(entry, comment='#') for entry in kept_files],
                    axis=0)

# Keep only the RBS1027 strain.
flow_fc = flow_fc[flow_fc['rbs']=='RBS1027']

With the data sets in hand, we can plot the points and prediction on the same axes.

In [10]:
# Plot the prediction together with the flow cytometry and microscopy data.
# Define the necessary parameters.
epa = -np.log(141E-6)  # presumably -log of the active-state inducer dissociation constant (M) -- confirm
epi = -np.log(0.56E-6)  # presumably -log of the inactive-state inducer dissociation constant (M) -- confirm
epr = -13.9  # In units of kBT
iptg = np.logspace(-8, -2, 1000)  # 1000 concentrations, log-spaced from 1E-8 to 1E-2
R = np.array([130])  # Number of lac tetramers per cell.

# Generate the theoretical fold change.
# NOTE: this rebinds `fc`, which earlier held the measured fold-change.
fc = mwc.fold_change_log(iptg, epa, epi, 4.5, R, epr)

# Generate the plot.
plt.figure()

# Both data sets store the inducer concentration in a column named 'IPTG_uM',
# while the x axis and the theory curve are in M, so both are divided by 1E6.
# The microscopy series was previously plotted without this conversion, which
# would place its points outside the x range set below.
# NOTE(review): confirm the microscopy fold-change file really stores µM.
plt.plot(flow_fc['IPTG_uM']/1E6, flow_fc['fold_change_A'], 'bo', markersize=10,
        alpha=0.75)
plt.plot(mic_fc['IPTG_uM']/1E6, mic_fc['fold_change'], 'ro', markersize=10,
        alpha=0.75)
plt.plot(iptg, fc,'k-', linewidth=3)
plt.xscale('log')

# Legend entries follow the order in which the three series were drawn.
legend = plt.legend(['flow cytometry', 'microscopy', 'flow cytometry prediction'], loc='upper left',
          title='method', fontsize=14)
plt.setp(legend.get_title(),fontsize=18)
plt.xlabel('[IPTG] (M)', fontsize=20)
plt.ylabel('fold-change', fontsize=20)
plt.tick_params(labelsize=18)
plt.ylim([-0.01, 1.02])
plt.xlim([5E-8,1E-2])

# Save the figure.
plt.tight_layout()
plt.savefig('/Users/gchure/Dropbox/mwc_induction/figures/supplementary_figures/microscopy_flow_prediction_plot.pdf',
           bbox_inches='tight')

<IPython.core.display.Javascript object>