## Testing heatmaps for the visualisation of Genome Detective output

Date: 2018-05-15  
Author: Sam Nooij

In [4]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.sampledata.les_mis import data

#Try these as test input
input_file = "../results/3_1_results.csv"
input_df = pd.read_csv(input_file)

#Check if the correct file was imported:
input_df

Unnamed: 0,Assignment,# Contigs,Mapped # Reads,Coverage (%),Mapped depth <br/>of Coverage,NT Identity (%),AA Identity (%),Contigs
0,Norovirus,1.0,542592,99.827082,11929.1,79.3604,84.7965,>1_20682_AAGAGGCACCTAGAGT_L001_R1_001_00000000...
1,uncultured crAssphage,12.0,78949,78.068305,171.9,96.3036,95.3261,>1_20682_AAGAGGCACCTAGAGT_L001_R1_001_00000000...


In [5]:
samples = ["Run3-sample1"]
assignments = list(input_df['Assignment'])
data = input_df

colors = ["#550000"]
#The current colour is dark red or brownish, like the faeces from which we sequenced viruses

assignment = []
sample = []
color = []
load = []
alpha = []

for s in samples:
    for a in assignments:
        assignment.append(a)
        sample.append(s)
        current_load = data["Mapped # Reads"][len(assignment) - 1]
        load.append(data["Mapped # Reads"][len(assignment) - 1])
        color.append(colors[0])
        max_load = max(data["Mapped # Reads"])
        alpha.append(min(current_load / float(max_load), 0.9) + 0.1)
        #let the alpha be the fraction of the highest read count ("max_load")
        
source = ColumnDataSource(
         data = dict(assignment=assignment, sample=sample, color=color, load=load, alpha=alpha)
)

output_notebook()

NameError: name 'output_notebook' is not defined

In [6]:
TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

p = figure(title = "Viruses in run 3-1, classified by Genome Detective",
          x_range = list(samples),
          y_range = list(reversed(assignments)), #reverse to order 'from top to bottom'
          x_axis_location = "above",
          toolbar_location="right",
          tools = TOOLS)

p.plot_width = 500
p.plot_height = 500
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "10pt"
p.title_text_font_size = "14pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

p.rect("sample", "assignment", 1, 1, source=source,
       color="color", alpha="alpha", line_color=None)

p.select_one(HoverTool).tooltips = [
    ('Sample', "@sample"),
    ('Taxon' , "@assignment"),
    ('Number of reads', "@load"),
]

#output_file('heatmap_test.html', title="Testing Genome Detective heatmaps in Jupyter+Bokeh")

show(p)

AttributeError: unexpected attribute 'title_text_font_size' to Figure, possible attributes are above, aspect_scale, background_fill_alpha, background_fill_color, below, border_fill_alpha, border_fill_color, css_classes, disabled, extra_x_ranges, extra_y_ranges, h_symmetry, height, hidpi, inner_height, inner_width, js_event_callbacks, js_property_callbacks, layout_height, layout_width, left, lod_factor, lod_interval, lod_threshold, lod_timeout, match_aspect, min_border, min_border_bottom, min_border_left, min_border_right, min_border_top, name, outline_line_alpha, outline_line_cap, outline_line_color, outline_line_dash, outline_line_dash_offset, outline_line_join, outline_line_width, output_backend, plot_height, plot_width, renderers, right, sizing_mode, subscribed_events, tags, title, title_location, toolbar, toolbar_location, toolbar_sticky, v_symmetry, width, x_range, x_scale, y_range or y_scale

## Expand to more samples

In [2]:
import glob

file_list = glob.glob("../results/*_results.csv")
file_list = sorted(file_list)

#Collect a list of run numbers and sample numbers
sample_ids = []

for file_path in file_list:
    file_name = file_path.split('/')[-1]
    sample_id = file_name[:file_name.index("_results.csv")]
    run_id = sample_id.split('_')[0]
    sample_number = sample_id.split('_')[1:]
    if type(sample_number) == type(["list"]):
        sample_number = '_'.join(sample_number)
    else:
        pass
    #If there are multiple parts to a sample number, separated by an underscore,
    # it is now turned from a list into a string.
    sample_ids.append("%s-%s" % (run_id, sample_number))
    
sample_ids = sorted(sample_ids)

#In case I need a link between the file names and the IDs as I stored them now:
id_mapping = {}

for index in xrange(0, len(sample_ids)):
    id_mapping[sample_ids[index]] = file_list[index]
    
print id_mapping

{'5-5061600092_S2': '../results/5_5061600092_S2_results.csv', '4-D': '../results/4_D_results.csv', '3-1': '../results/3_1_results.csv', '6-A1': '../results/6_A1_results.csv'}


### Create separate heatmaps for multiple result csv files

In [3]:
for results_file in file_list:
    results_df = pd.read_csv(results_file)
    
    samples = [results_file]
    assignments = list(results_df['Assignment'])
    data = results_df
    
    colors = ["#550000"]
    #The current colour is dark red or brownish, like the faeces from which we sequenced viruses

    assignment = []
    sample = []
    color = []
    load = []
    alpha = []

    for s in samples:
        for a in assignments:
            assignment.append(a)
            sample.append(s)
            current_load = data["Mapped # Reads"][len(assignment) - 1]
            load.append(data["Mapped # Reads"][len(assignment) - 1])
            color.append(colors[0])
            max_load = max(data["Mapped # Reads"])
            alpha.append(min(current_load / float(max_load), 0.9) + 0.1)
            #let the alpha be the fraction of the highest read count ("max_load")

    source = ColumnDataSource(
             data = dict(assignment=assignment, sample=sample, color=color, load=load, alpha=alpha)
    )
    
    TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

    p = figure(title = "Viruses classified by Genome Detective",
              x_range = list(samples),
              y_range = list(reversed(assignments)), #reverse to order 'from top to bottom'
              x_axis_location = "above",
              toolbar_location="right",
              tools = TOOLS)

    p.plot_width = 500
    p.plot_height = 400
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "10pt"
    p.title_text_font_size = "14pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = np.pi/3

    p.rect("sample", "assignment", 1, 1, source=source,
           color="color", alpha="alpha", line_color=None)

    p.select_one(HoverTool).tooltips = [
        ('Sample', "@sample"),
        ('Taxon' , "@assignment"),
        ('Number of reads', "@load"),
    ]

    #output_file('heatmap_test.html', title="Testing Genome Detective heatmaps in Jupyter+Bokeh")

    show(p)

NameError: name 'pd' is not defined

In [63]:
import glob

def create_concatenated_dataframe(file_wildcard, folder):
    """
    Input: a filename with wildcard,
            and a folder name
    Output: One concatenated dataframe of all the input files
    """
    #Step 1: create a list of the files using glob
    file_list = glob.glob("%s%s" % (folder, file_wildcard))
    file_list = sorted(file_list)
    
    file_suffix = file_wildcard.lstrip("*")
    
    #Step 2: open the files as dataframe, remove "Contigs" column and add sample IDs
    
    df_list = []
    for results_file in file_list:
        results_df = pd.read_csv(results_file)
        results_df = results_df.drop("Contigs", axis = 1) #remove unnecessary (and long!) column
        sample_id = results_file[:results_file.index(file_suffix)].split('/')[-1]
        results_df["sample"] = sample_id
        df_list.append(results_df)

    #Step 3: concatenate the dataframes
    super_df = pd.concat(df_list, ignore_index=True)
    
    return(super_df)

assignment_df = create_concatenated_dataframe(file_wildcard = "*_results.csv", folder = "../results/")

                  Assignment  # Contigs  Mapped # Reads  Coverage (%)  \
0                  Norovirus        1.0          542592     99.827082   
1      uncultured crAssphage       12.0           78949     78.068305   
2                  Norovirus        1.0            5541    100.914554   
3   Pepper mild mottle virus        3.0             157     74.123014   
4                  Norovirus        1.0         4599864     99.960096   
5             Parechovirus A        1.0         1707242     98.557431   
6               Equartevirus        1.0           83629     99.448992   
7              Enterovirus B        1.0            2304     89.822709   
8              Enterovirus A        1.0         1212499     99.487387   
9                  Norovirus        1.0          257515    100.705513   
10            Parechovirus A        1.0           94027     99.523680   
11             Enterovirus B        3.0             246     61.347950   
12  Escherichia virus phiV10        5.0            

In [100]:
samples = list(assignment_df["sample"])
assignments = list(assignment_df['Assignment'])
loads = list(assignment_df["Mapped # Reads"])

colors = ["#550000"]
#The current colour is dark red or brownish, like the faeces from which we sequenced viruses

max_load = max(loads)
alphas = [ min( x / float(max_load), 0.9) + 0.1 for x in loads]

source = ColumnDataSource(
         data = dict(samples=samples, assignments=assignments, colors=colors, loads=loads, alphas=alphas)
)

output_notebook()

In [101]:
TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

p = figure(title = "Viruses classified by Genome Detective",
          x_range = list(sorted(set(samples))),
          y_range = list(reversed(sorted(set(assignments)))), #reverse to order 'from top to bottom'
          x_axis_location = "above",
          toolbar_location="right",
          tools = TOOLS)

p.plot_width = 800
p.plot_height = 600
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "12pt"
p.title_text_font_size = "16pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/4

p.rect("samples", "assignments", 1, 1, source=source,
       color="colors", alpha="alphas", line_color=None)

p.select_one(HoverTool).tooltips = [
    ('Sample', "@samples"),
    ('Taxon' , "@assignments"),
    ('Number of reads', "@loads"),
]

output_file('../results/Assignment_heatmap.html', title="Genome Detective classified viruses")

show(p)

In [98]:
discovery_df = create_concatenated_dataframe(file_wildcard="*_discovery.csv", folder = "../results/")

samples = list(discovery_df["sample"])
assignments = list(discovery_df['Assignment'])
loads = list(discovery_df["Mapped # Reads"])

colors = ["#550000"]
#The current colour is dark red or brownish, like the faeces from which we sequenced viruses

max_load = max(loads)
alphas = [ min( x / float(max_load), 0.9) + 0.1 for x in loads]

source = ColumnDataSource(
         data = dict(samples=samples, assignments=assignments, colors=colors, loads=loads, alphas=alphas)
)

TOOLS = "hover, save, pan, box_zoom, wheel_zoom, reset"

p = figure(title = "Viruses discovered by Genome Detective",
          x_range = list(sorted(set(samples))),
          y_range = list(reversed(sorted(set(assignments)))), #reverse to order 'from top to bottom'
          x_axis_location = "above",
          toolbar_location="right",
          tools = TOOLS)

p.plot_width = 800
p.plot_height = 800
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "12pt"
p.title_text_font_size = "16pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/4

p.rect("samples", "assignments", 1, 1, source=source,
       color="colors", alpha="alphas", line_color=None)

p.select_one(HoverTool).tooltips = [
    ('Sample', "@samples"),
    ('Taxon' , "@assignments"),
    ('Number of reads', "@loads"),
]

output_file('../results/Discovery_heatmap.html', title="Genome Detective discovered viruses")

show(p)

In [102]:
?output_notebook()

'_discovery.csv'


### A little, simple example of Bokeh plotting, just to check if it works at all
https://bokeh.pydata.org/en/0.11.1/docs/user_guide/notebook.html

In [7]:
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

N = 4000
x = np.random.random(size=N) * 100
y = np.random.random(size=N) * 100
radii = np.random.random(size=N) * 1.5
colors = ["#%02x%02x%02x" % (r, g, 150) for r, g in zip(np.floor(50+2*x), np.floor(30+2*y))]

output_notebook()

In [8]:
p = figure()
p.circle(x, y, radius=radii, fill_color=colors, fill_alpha=0.6, line_color=None)

<bokeh.models.renderers.GlyphRenderer at 0x7f28592cfe10>

In [9]:
show(p)