In [2]:
import numpy as np

import pandas as pd

#import altair as alt

import bokeh
import bokeh.plotting
from bokeh.plotting import ColumnDataSource
from bokeh.models import LabelSet
from bokeh.models import FuncTickFormatter


import bokeh.io
bokeh.io.output_notebook()

In [4]:
# Read the two peak-area workbooks into DataFrames. `comment='#'` is handed to
# pandas so that text following a '#' in a sheet is treated as a comment.
# NOTE(review): the "_old" workbook is presumably an earlier version of the same table - confirm.
df_old = pd.read_excel('191015_Deuterium_Transfer_Peak_Areas_old.xlsx', comment='#')
df = pd.read_excel('191015_Deuterium_Transfer_Peak_Areas.xlsx', comment='#')

In [5]:
# Preview the first rows of the current table to check which columns were parsed
df.head()

Unnamed: 0,Species,WellNumber,Peak ID,Ret Time,Start Tm,End Tm,m/z,Area,Area Percent,Height,Height Percent,A/H,Type,Outlier
0,L,1,C18,7.436,7.41,7.455,TIC,4607279,98.37,5178529,98.76,0.89,L,N
1,L,1,D50C24,9.499,9.475,9.525,TIC,55596,1.19,56532,1.08,0.98,L,N
2,L,1,D62C30,12.354,12.33,12.375,TIC,20795,0.44,8683,0.17,2.39,L,N
3,S,1,C18,7.436,7.41,7.455,TIC,4594536,99.85,4961362,99.89,0.93,L,N
4,S,1,D50C24,9.499,9.475,9.525,TIC,5398,0.12,4955,0.1,1.09,L,N


In [6]:
#function to normalize the peak area of each peak using the area of the C18 peak of the same sample
def normalize_area(data, c18_amount_ng=25):
    """Add 'Normalized Area' and 'Hydrocarbon amount' columns to ``data``.

    The table is expected to be laid out as consecutive, equally sized blocks
    of rows, one block per sample, each containing exactly one row whose
    'Peak ID' is 'C18'. Every 'Area' is divided by the C18 'Area' of its own
    block, and the result is scaled by the amount of C18 that was injected.

    The number of peaks per sample is inferred from the data (rows divided by
    the number of C18 rows) rather than fixed at three, and the C18 areas are
    matched to rows by position, so the frame does not need a default
    0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Peak ID' and 'Area'. Modified in place.
    c18_amount_ng : float, optional
        Amount of C18 injected per sample, in ng. Defaults to 25
        (1 microliter of a 25 ng/microliter solution of C18 in hexane).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two columns added. If both columns
        are already present the frame is returned untouched.

    Raises
    ------
    ValueError
        If the rows cannot be split into equally sized samples with one C18
        peak each (no C18 rows, or a row count that is not a multiple of the
        number of C18 rows).
    """
    #check to make sure that the dataframe hasn't already been normalized
    if {'Normalized Area', 'Hydrocarbon amount'}.issubset(data.columns):
        return data

    is_c18 = (data['Peak ID'] == 'C18').to_numpy()
    n_rows = len(data)
    n_samples = int(is_c18.sum())

    #refuse to guess when the layout is inconsistent: a silent mismatch would
    #normalize peaks against the C18 area of a different sample
    if n_rows and (n_samples == 0 or n_rows % n_samples):
        raise ValueError(
            "cannot normalize: %d rows cannot be split into equal samples "
            "with one C18 peak each (%d C18 rows found)" % (n_rows, n_samples)
        )
    peaks_per_sample = n_rows // n_samples if n_samples else 0

    #one C18 area per row, repeated once for every peak of the sample it belongs to
    c18_areas = np.repeat(data.loc[is_c18, 'Area'].to_numpy(), peaks_per_sample)

    #divide by position (plain arrays) so the result does not depend on the index labels
    data['Normalized Area'] = data['Area'].to_numpy() / c18_areas

    #hydrocarbon amount in ng: normalized area times the injected amount of C18
    data['Hydrocarbon amount'] = data['Normalized Area'] * c18_amount_ng

    return data

In [7]:
# Add the 'Normalized Area' and 'Hydrocarbon amount' columns to both tables.
# normalize_area returns its input unchanged when both columns already exist,
# so re-running this cell does not normalize twice.
df_old = normalize_area(df_old)
df = normalize_area(df)

In [8]:
#function to create an identifier for each sample in the order (Liometopum or not : Beetle : Type)
def add_identifier(data):
    """Return ``data`` with an 'Identifier' column describing each row's sample.

    The identifier is built from three pieces, concatenated in this order:

    1. ``'L'`` if the row's 'Species' is ``'L'`` (a Liometopum run), else nothing;
    2. the non-'L' species found in the same well ('WellNumber');
    3. the row's 'Type'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'WellNumber', 'Species' and 'Type'.

    Returns
    -------
    pandas.DataFrame
        A new, merged frame carrying the 'Identifier' column. If the column
        already exists, ``data`` itself is returned unchanged.

    Notes
    -----
    The well-to-beetle lookup is attached with an inner merge on
    'WellNumber': rows from wells without any non-'L' species are dropped,
    and a well listing several distinct non-'L' species yields one output
    row per species.
    """
    #check if identifier column already exists
    if 'Identifier' in data.columns:
        return data

    #one row per (well, beetle species): which beetle was used in each well
    beetle_by_well = (
        data.loc[data['Species'] != 'L', ['WellNumber', 'Species']]
        .rename(columns={'Species': 'Identifier'})
        .drop_duplicates()
    )

    #create identifier column, populating it with the beetle species used in each well
    data = data.merge(beetle_by_well, on='WellNumber')

    #leading 'L' for Liometopum rows; built from the merged frame itself so it is
    #always aligned with the rows it is added to
    liometopum_prefix = data['Species'].eq('L').map({True: 'L', False: ''})

    #final identifier: optional 'L' prefix + beetle species + suffix from the 'Type' column
    data['Identifier'] = liometopum_prefix + data['Identifier'] + data['Type']

    return data

In [9]:
# Attach the 'Identifier' column to both tables.
# add_identifier returns its input as-is when the column already exists.
df_old = add_identifier(df_old)
df = add_identifier(df)

In [10]:
def box_and_whisker(data,plt,x_vals,y_vals):
    """Draw one box-and-whisker glyph per category of ``x_vals`` onto ``plt``.

    For every group of ``data[x_vals]`` the quartiles of ``data[y_vals]`` are
    computed. The box spans Q1..Q3 with a line at the median. Each whisker
    ends at the Tukey fence (Q3 + 1.5*IQR or Q1 - 1.5*IQR), pulled in to the
    group's maximum or minimum when the fence lies beyond the data.

    Only the ``y_vals`` column is summarised, so other columns of ``data``
    (including non-numeric ones) do not take part in the quantile computation.
    Individual points beyond the fences are not drawn by this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the columns named by ``x_vals`` and ``y_vals``.
    plt : object
        Figure-like object exposing ``segment`` and ``vbar`` glyph methods
        (the notebook passes a bokeh figure).
    x_vals : str
        Name of the grouping (category) column.
    y_vals : str
        Name of the numeric column to summarise.

    Returns
    -------
    object
        ``plt``, after the glyphs have been added.
    """
    grouped = data.groupby(x_vals)[y_vals]
    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1

    # whisker ends: the fences, but never further out than the observed extremes
    upper = np.minimum(q3 + 1.5*iqr, grouped.max())
    lower = np.maximum(q1 - 1.5*iqr, grouped.min())

    # plain lists, all in the same (sorted group key) order
    categories = q1.index.tolist()
    q1, q2, q3 = q1.tolist(), q2.tolist(), q3.tolist()
    upper, lower = upper.tolist(), lower.tolist()

    # stems
    plt.segment(x0=categories, y0=upper, x1=categories, y1=q3, line_color="black")
    plt.segment(x0=categories, y0=lower, x1=categories, y1=q1, line_color="black")

    # boxes (two unfilled bars that share the median as a common edge)
    plt.vbar(x=categories, width=0.7, top=q2, bottom=q3, fill_color=None, line_color="black")
    plt.vbar(x=categories, width=0.7, top=q1, bottom=q2, fill_color=None, line_color="black")

    # whisker caps (zero-height bars are simpler than segments)
    plt.vbar(x=categories, top=lower, bottom=lower, width=0.2, line_color="black")
    plt.vbar(x=categories, top=upper, bottom=upper, width=0.2, line_color="black")

    return plt

In [11]:
# Keep only the rows of the D50C24 peak from the current table
plotting_df = df.loc[df['Peak ID'] == 'D50C24']

# Categories placed on the x axis, in display order.
# (The 'LDU', 'DU', 'LDL', 'DL' categories are not plotted in this figure.)
Identifiers = [
    'LSU', 'SU', 'LSL', 'SL',
    'LPU', 'PU', 'LPL', 'PL',
]

# Tick text for every category; the 'L'-prefixed categories get an empty label
Labels = {
    'LSU': '', 'SU': 'Sceptobius Control',
    'LSL': '', 'SL': 'Sceptobius Treated',
    'LPU': '', 'PU': 'Platyusa Control',
    'LPL': '', 'PL': 'Platyusa Treated',
    'LDU': '', 'DU': 'Dalotia Control',
    'LDL': '', 'DL': 'Dalotia Treated',
}

# Palette handed to factor_cmap together with Identifiers
colors = [
    '#494949', '#5F56FF', '#494949', '#5F56FF',
    '#494949', '#C42F2F', '#494949', '#C42F2F',
    '#494949', '#832161', '#494949', '#832161',
]

# Fixed seed: the horizontal offset drawn per well below is then repeatable
np.random.seed(666)

p = bokeh.plotting.figure(
    plot_width=800,
    plot_height=600,
    title='C24',
    x_range=Identifiers,
    y_axis_label='ng D50C24',
    y_axis_type='log',
)

# Summary boxes first, individual measurements on top
p = box_and_whisker(plotting_df, p, 'Identifier', 'Hydrocarbon amount')

# One line + set of markers per well, all shifted sideways by the same random offset
for well in np.unique(plotting_df['WellNumber']):
    in_well = plotting_df['WellNumber'] == well
    source = ColumnDataSource(data={
        'identifier': plotting_df['Identifier'].loc[in_well].values,
        'hydrocarbon amount': plotting_df['Hydrocarbon amount'].loc[in_well].values,
    })

    # offset drawn uniformly from [-0.25, 0.25)
    offsetVal = (np.random.rand(1)[0] - 0.5) * 0.5

    p.line(
        bokeh.transform.dodge('identifier', offsetVal, range=p.x_range),
        'hydrocarbon amount',
        source=source,
        color='black',
        alpha=0.3,
    )
    p.circle(
        bokeh.transform.dodge('identifier', offsetVal, range=p.x_range),
        'hydrocarbon amount',
        source=source,
        color=bokeh.transform.factor_cmap('identifier', colors, Identifiers),
        alpha=0.6,
        size=7,
    )

# No grid lines in either direction
p.xgrid.visible = False
p.ygrid.visible = False

# Replace the raw category names on the x axis by the text stored in Labels
p.xaxis.formatter = FuncTickFormatter(code="""
var labels = %s;
return labels[tick];
""" % Labels)

bokeh.io.show(p)

In [12]:
# Keep only the rows of the D50C24 peak from the "old" table
plotting_df = df_old.loc[df_old['Peak ID'] == 'D50C24']

# Categories placed on the x axis, in display order.
# (The 'LDU', 'DU', 'LDL', 'DL' categories are not plotted in this figure.)
Identifiers = [
    'LSU', 'SU', 'LSL', 'SL',
    'LPU', 'PU', 'LPL', 'PL',
]

# Tick text for every category; the 'L'-prefixed categories get an empty label
Labels = {
    'LSU': '', 'SU': 'Sceptobius Control',
    'LSL': '', 'SL': 'Sceptobius Treated',
    'LPU': '', 'PU': 'Platyusa Control',
    'LPL': '', 'PL': 'Platyusa Treated',
    'LDU': '', 'DU': 'Dalotia Control',
    'LDL': '', 'DL': 'Dalotia Treated',
}

# Palette handed to factor_cmap together with Identifiers
colors = [
    '#494949', '#5F56FF', '#494949', '#5F56FF',
    '#494949', '#C42F2F', '#494949', '#C42F2F',
    '#494949', '#832161', '#494949', '#832161',
]

# Fixed seed: the horizontal offset drawn per well below is then repeatable
np.random.seed(666)

p = bokeh.plotting.figure(
    plot_width=800,
    plot_height=600,
    title='C24',
    x_range=Identifiers,
    y_axis_label='ng D50C24',
    y_axis_type='log',
)

# Summary boxes first, individual measurements on top
p = box_and_whisker(plotting_df, p, 'Identifier', 'Hydrocarbon amount')

# One line + set of markers per well, all shifted sideways by the same random offset
for well in np.unique(plotting_df['WellNumber']):
    in_well = plotting_df['WellNumber'] == well
    source = ColumnDataSource(data={
        'identifier': plotting_df['Identifier'].loc[in_well].values,
        'hydrocarbon amount': plotting_df['Hydrocarbon amount'].loc[in_well].values,
    })

    # offset drawn uniformly from [-0.25, 0.25)
    offsetVal = (np.random.rand(1)[0] - 0.5) * 0.5

    p.line(
        bokeh.transform.dodge('identifier', offsetVal, range=p.x_range),
        'hydrocarbon amount',
        source=source,
        color='black',
        alpha=0.3,
    )
    p.circle(
        bokeh.transform.dodge('identifier', offsetVal, range=p.x_range),
        'hydrocarbon amount',
        source=source,
        color=bokeh.transform.factor_cmap('identifier', colors, Identifiers),
        alpha=0.6,
        size=7,
    )

# No grid lines in either direction
p.xgrid.visible = False
p.ygrid.visible = False

# Replace the raw category names on the x axis by the text stored in Labels
p.xaxis.formatter = FuncTickFormatter(code="""
var labels = %s;
return labels[tick];
""" % Labels)

bokeh.io.show(p)