# Using datashader to explore the large data sets from the Flow Cytometer

In [2]:
import os
import glob
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy

# Package to estimate densities
from sklearn.neighbors.kde import KernelDensity
from sklearn.grid_search import GridSearchCV

# Import matplotlib stuff for plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Seaborn, useful for graphics
import seaborn as sns

# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.mpl
import bokeh.plotting

# Matplotlib rc overrides shared by the Seaborn context and style calls below.
# Colors are written as bare hex strings (no leading '#').
rc = {
    # line and axes geometry
    'lines.linewidth': 2,
    'axes.linewidth': 1.2,
    # font sizes
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    # colors
    'axes.facecolor': 'F4F3F6',
    'axes.edgecolor': '000000',
    # grid appearance
    'grid.linestyle': ':',
    'grid.color': 'a6a6a6',
}
sns.set_context(context='notebook', rc=rc)
sns.set_style(style='darkgrid', rc=rc)
sns.set_palette(palette='deep', color_codes=True)

# Datashader to plot lots of datapoints
import datashader as ds
from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display

# Set up Bokeh for inline viewing.
# bokeh.plotting.output_notebook is the very same function re-exported from
# bokeh.io, so a single call is enough.
bokeh.io.output_notebook()

# Automatic thresholding of the data

Finding a reliable automatic thresholding of the data is not easy. So far our criterion has been based on plotting 5% of the data on a `FSC-A` vs `SSC-A` scatter plot and choosing arbitrary cutoffs for the data points.

But, as we will show using Python's amazing `datashader` library for plotting large data sets, this undersampling can be quite misleading.

Let's first read an example data set

In [3]:
# define variables to use over the script
date = 20160725
username = 'mrazomej'

# list the directory with the data
datadir = '../../../data/flow/csv/'
# os.listdir returns entries in arbitrary order; sort them so that the
# positional pick below is reproducible across machines and kernel restarts
files = np.array(sorted(os.listdir(datadir)))
# keep only the csv files whose name contains the experiment date
csv_bool = np.array([str(date) in f and 'csv' in f for f in files],
                    dtype=bool)
files = files[csv_bool]

# use the second matching file (by sorted name) as the example data set
df_example = pd.read_csv(os.path.join(datadir, files[1]))

Now in order to use `datashader` in an interactive `Bokeh` plot let's define a `base_plot` function to initialize a Bokeh plot.

In [4]:
def base_plot(df, x_col, y_col, log=False):
    """
    Initialize an empty, styled Bokeh figure whose axes span two columns
    of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame holding the columns to be plotted.
    x_col, y_col : str
        Names of the columns that define the x and y axes. They are also
        used verbatim as the axis labels.
    log : bool, default False
        If True the axis ranges are computed on the log10 of the columns,
        otherwise on the values as given.

    Returns
    -------
    p : Bokeh figure
        The object returned by `bokeh.plotting.figure`, with pan, wheel
        zoom, box zoom and reset tools, ready to receive glyphs or images.
    """
    def _finite_range(values):
        # Zeros become -inf under log10 (and data handed in already
        # log-transformed may carry them too); an infinite bound would make
        # the axis range unusable, so only finite entries set the limits.
        finite = values[np.isfinite(values)]
        return (finite.min(), finite.max())

    # Define the range to plot checking if it is a log scale or not
    if log:
        x_range = _finite_range(np.log10(df[x_col]))
        y_range = _finite_range(np.log10(df[y_col]))
    else:
        x_range = _finite_range(df[x_col])
        y_range = _finite_range(df[y_col])
    # Initialize the Bokeh plot
    p = bokeh.plotting.figure(
        x_range=x_range,
        y_range=y_range,
        tools='pan,wheel_zoom,box_zoom,reset',
        plot_width=500,
        plot_height=500,
    )
    # Add all the features to the plot
    p.xgrid.grid_line_color = '#a6a6a6'
    p.ygrid.grid_line_color = '#a6a6a6'
    p.ygrid.grid_line_dash = [6, 4]
    p.xgrid.grid_line_dash = [6, 4]
    p.xaxis.axis_label = x_col
    p.yaxis.axis_label = y_col
    p.xaxis.axis_label_text_font_size = '15pt'
    p.yaxis.axis_label_text_font_size = '15pt'
    p.background_fill_color = '#F4F3F6'
    return p

With this in hand we define a simple function that takes one of our data frames and plots whichever columns we want on a 2-D scatter plot.

In [5]:
def ds_plot(df, x_col, y_col, log=False):
    """
    Build the Bokeh figure and the datashader pipeline for a 2-D scatter
    of two columns of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame containing the columns to plot.
    x_col, y_col : str
        Names of the columns shown on the x and y axes.
    log : bool, default False
        If True both columns are log10-transformed before plotting.

    Returns
    -------
    (figure, pipeline)
        The figure created by `base_plot` and the `ds.Pipeline` built on
        the same two-column data.
    """
    columns = [x_col, y_col]
    # Keep only the two plotted columns, log10-transformed on request
    data = np.log10(df[columns]) if log else df[columns]
    # The values are already transformed at this point, so base_plot is
    # called with its default log=False
    figure = base_plot(data, x_col, y_col)
    pipeline = ds.Pipeline(data, ds.Point(x_col, y_col))
    return figure, pipeline

Now let's plot the channels we've been using for the thresholding, i.e. forward and side scattering area.
But since we are using `datashader` we can plot **all of the data points** rather than the 5% we use with `matplotlib`.

In [6]:
# Figure + pipeline for every event of the example file, with both
# channels log10-transformed by ds_plot
p, pipeline = ds_plot(df_example, 'FSC-A', 'SSC-A', log=True)
# Last expression of the cell: the InteractiveImage is the cell's output
InteractiveImage(p, pipeline)

Let's now apply the same thresholds **we arbitrarily set by eye** on a log-log plot and plot the data.

In [7]:
# Gate limits [lower, upper] for each scattering channel
fsc_range = [5E3, 2E4]
ssc_range = [1E4, 6E4]

# Keep only the events lying strictly inside both windows
df_thresh = df_example[
    (df_example['SSC-A'] > ssc_range[0])
    & (df_example['FSC-A'] > fsc_range[0])
    & (df_example['SSC-A'] < ssc_range[1])
    & (df_example['FSC-A'] < fsc_range[1])
]


In [8]:
# Same view restricted to the thresholded events, both channels
# log10-transformed by ds_plot
p, pipeline = ds_plot(df_thresh, 'FSC-A', 'SSC-A', log=True)
# Last expression of the cell: the InteractiveImage is the cell's output
InteractiveImage(p, pipeline)

By this criterion it does seem that we captured the "relevant population only". But what if we were to draw this same plot on a linear scale rather than on a log-log scale?

In [9]:
# Thresholded events again, this time without the log10 transform
p, pipeline = ds_plot(df_thresh, 'FSC-A', 'SSC-A', log=False)
# Last expression of the cell: the InteractiveImage is the cell's output
InteractiveImage(p, pipeline)

Aha! From this perspective it seems that we didn't really capture the body of the population but we left a huge tail on the higher end.

And on top of that, these thresholds are set from experiment to experiment in a very arbitrary way, which is prone to mistakes and day-to-day variability.

Could we find a way to get rid of the "supervised" thresholding and find an automatic way to "set a gate" on our population?

# KDE gate

We will try to perform a Kernel Density Estimation to get a sense of where most of the data-point density lies.

In [40]:
# Cross-validated search for the KDE bandwidth on the untransformed
# FSC-A / SSC-A values: 30 evenly spaced candidates between 0.1 and 1.0.
# NOTE(review): the recorded result below sits on the upper edge of the
# searched interval -- confirm whether the interval should be widened.
grid = GridSearchCV(
    estimator=KernelDensity(),
    param_grid=dict(bandwidth=np.linspace(0.1, 1.0, 30)),
    cv=5,  # 5-fold cross-validation
)
grid.fit(df_example.loc[:, ['FSC-A', 'SSC-A']])
print(grid.best_params_)

{'bandwidth': 1.0}


In [15]:
# The two scattering channels, as measured and log10-transformed
data = df_example.loc[:, ['FSC-A', 'SSC-A']]
log_data = data.apply(np.log10)
# NOTE(review): the estimator is fit on the untransformed values with its
# default settings; log_data is not used for the fit -- confirm intended.
kde = KernelDensity().fit(X=data)

In [12]:
# 200 log-spaced points per channel spanning the observed data range
# (np.logspace takes base-10 exponents, hence the log10 bounds)
x_grid = np.logspace(start=log_data['FSC-A'].min(),
                     stop=log_data['FSC-A'].max(),
                     num=200)
y_grid = np.logspace(start=log_data['SSC-A'].min(),
                     stop=log_data['SSC-A'].max(),
                     num=200)
xx, yy = np.meshgrid(x_grid, y_grid, indexing='xy')
# TODO: evaluate the density on the mesh. Earlier draft, kept for reference:
# pdf = np.exp(kde.score_samples(grid))
# NOTE(review): in this notebook `grid` names the GridSearchCV object, not
# the mesh points, so the draft cannot be enabled as written.

In [18]:
# IPython help lookup (`?` syntax) for the scoring method; interactive aid only
kde.score_samples?