# Using datashader to explore the large data sets from the Flow Cytometer

In [6]:
import os
import glob
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy
# Import matplotlib stuff for plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Seaborn, useful for graphics
import seaborn as sns

# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.mpl
import bokeh.plotting

# Preferred Seaborn/matplotlib settings for notebook figures.
rc = {
    # line and axes appearance
    'lines.linewidth': 2,
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'axes.facecolor': 'F4F3F6',
    'axes.edgecolor': '000000',
    'axes.linewidth': 1.2,
    # tick labels
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    # grid
    'grid.linestyle': ':',
    'grid.color': 'a6a6a6',
}
# The same dictionary is handed to both the context and the style call.
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
sns.set_palette('deep', color_codes=True)

# Datashader to plot lots of datapoints
import datashader as ds
from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display

# Set up Bokeh for inline viewing. A single call is enough:
# bokeh.plotting.output_notebook is the same function re-exported from
# bokeh.io, so calling both only loads BokehJS a second time.
bokeh.io.output_notebook()

Let's first read an example data set.

In [7]:
# Define the variables used throughout the notebook
date = 20160725
username = 'mrazomej'

# Directory with the data
datadir = '../../../data/flow/csv/'

# Keep only the CSV files whose name contains the date. The listing is sorted
# because os.listdir returns entries in arbitrary order, which would make the
# choice of the "first" file below irreproducible between runs or machines.
files = np.array(sorted(
    f for f in os.listdir(datadir)
    if str(date) in f and f.endswith('.csv')
))

# Fail with an explicit message instead of an opaque IndexError further down
if files.size == 0:
    raise IOError(
        'No CSV file containing {} found in {}'.format(date, datadir))

# Read the first matching file as the example data set
df_example = pd.read_csv(os.path.join(datadir, files[0]))

Now let's define a `base_plot` function to initialize a Bokeh plot.

In [29]:
def base_plot(df, x_col, y_col, plot_width=500, plot_height=500):
    """
    Initialize an empty, styled Bokeh figure spanning two data frame columns.

    Parameters
    ----------
    df : pandas DataFrame
        Data frame containing the columns `x_col` and `y_col`.
    x_col, y_col : str
        Names of the columns shown on the x and y axis. They set both the
        axis ranges (column minimum to column maximum) and the axis labels.
    plot_width, plot_height : int, default 500
        Size forwarded to `bokeh.plotting.figure`.

    Returns
    -------
    p : Bokeh figure
        Figure with pan, wheel-zoom, box-zoom and reset tools, dashed grey
        grid lines, labeled axes and a light background. No glyphs are added.
    """
    # The axis ranges cover the full extent of each column
    x_range = (df[x_col].min(), df[x_col].max())
    y_range = (df[y_col].min(), df[y_col].max())

    p = bokeh.plotting.figure(
        x_range=x_range,
        y_range=y_range,
        tools='pan,wheel_zoom,box_zoom,reset',
        plot_width=plot_width,
        plot_height=plot_height,
    )

    # Dashed grey grid on both axes
    p.xgrid.grid_line_color = '#a6a6a6'
    p.ygrid.grid_line_color = '#a6a6a6'
    p.ygrid.grid_line_dash = [6, 4]
    p.xgrid.grid_line_dash = [6, 4]

    # Axis labels are the column names
    p.xaxis.axis_label = x_col
    p.yaxis.axis_label = y_col
    p.xaxis.axis_label_text_font_size = '15pt'
    p.yaxis.axis_label_text_font_size = '15pt'

    p.background_fill_color = '#F4F3F6'
    return p

Finally, we define a simple function that takes one of our data frames and builds the Bokeh figure and the datashader pipeline for the columns we want.

In [30]:
def ds_plot(df, x_col, y_col):
    """
    Build the Bokeh figure and the datashader pipeline for two columns.

    Parameters
    ----------
    df : pandas DataFrame
        Data frame containing the columns to display.
    x_col, y_col : str
        Names of the columns used for the x and y axis.

    Returns
    -------
    (figure, pipeline) : tuple
        The figure returned by `base_plot` and a `ds.Pipeline` built from
        `df` with a `ds.Point` glyph on the two columns.
    """
    figure = base_plot(df, x_col, y_col)
    point_glyph = ds.Point(x_col, y_col)
    shader_pipeline = ds.Pipeline(df, point_glyph)
    return figure, shader_pipeline

In [31]:
# Build the figure and the datashader pipeline for the 'FSC-A' and 'FITC-A'
# columns of the example data set.
p, pipeline = ds_plot(df=df_example, x_col='FSC-A', y_col='FITC-A')
# The bare last expression is what the cell displays.
InteractiveImage(p, pipeline)