In [16]:
import pandas as pd
import numpy as np

from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper
from bokeh.plotting import figure
from bokeh.io import show, output_notebook, output_file
from bokeh.palettes import Category10_5, Category20_16

### Basics of Bokeh

In [2]:
# bokeh basics
# Create a blank figure with labels
p = figure(plot_width = 400, plot_height = 400, 
           title = 'Example Glyphs',
           x_axis_label = 'X', y_axis_label = 'Y')

# Example data
squares_x = [1, 3, 4, 5, 8]
squares_y = [8, 7, 3, 1, 10]
circles_x = [9, 12, 4, 3, 15]
circles_y = [8, 4, 11, 6, 10]

# Add squares glyph
p.square(squares_x, squares_y, size = 12, color = 'navy', alpha = 0.6)
# Add circle glyph
p.circle(circles_x, circles_y, size = 12, color = 'red')

# Set to output the plot in the notebook
output_notebook()
# Show the plot
show(p)

In [47]:
# Read the data from a csv into a dataframe
flights = pd.read_csv('data/flights.csv', index_col=0)
# Summary stats for the column of interest
flights['arr_delay'].describe()

count    327346.000000
mean          6.895377
std          44.633292
min         -86.000000
25%         -17.000000
50%          -5.000000
75%          14.000000
max        1272.000000
Name: arr_delay, dtype: float64

In [4]:
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [5]:
"""Bins will be five minutes in width, so the number of bins is (length of interval / 5). 
Limit delays to [-60, +120] minutes using the range."""
arr_hist, edges = np.histogram(flights['arr_delay'], bins = int(180/5), range = [-60, 120])

# Put the information in a dataframe
delays = pd.DataFrame({'flights': arr_hist, 'left': edges[:-1], 'right': edges[1:]})

  keep = (tmp_a >= first_edge)
  keep &= (tmp_a <= last_edge)


In [6]:
delays.head()

Unnamed: 0,flights,left,right
0,276,-60.0,-55.0
1,636,-55.0,-50.0
2,1394,-50.0,-45.0
3,2820,-45.0,-40.0
4,5339,-40.0,-35.0


In [7]:
# Create the blank plot
p = figure(plot_height = 600, plot_width = 600, title = 'Histogram of Arrival Delays', 
           x_axis_label = 'Delay (min)]', y_axis_label = 'Number of Flights')

# Add a quad glyph
p.quad(bottom=0, top=delays['flights'], left=delays['left'], right=delays['right'], 
       fill_color='red', line_color='black')

# Show the plot
show(p)

### Adding Interactivity

In [8]:
arr_df = pd.DataFrame({'count': arr_hist, 'left': edges[:-1], 'right': edges[1:]})
arr_df['f_count'] = ['%d flights' % count for count in arr_df['count']]
arr_df['f_interval'] = ['%d to %d minutes' % (left, right) for left, right in zip(arr_df['left'], 
                                                                                  arr_df['right'])]

arr_df.head()

Unnamed: 0,count,left,right,f_count,f_interval
0,276,-60.0,-55.0,276 flights,-60 to -55 minutes
1,636,-55.0,-50.0,636 flights,-55 to -50 minutes
2,1394,-50.0,-45.0,1394 flights,-50 to -45 minutes
3,2820,-45.0,-40.0,2820 flights,-45 to -40 minutes
4,5339,-40.0,-35.0,5339 flights,-40 to -35 minutes


In [9]:
# Import the ColumnDataSource class
from bokeh.models import ColumnDataSource
# Convert dataframe to column data source
src = ColumnDataSource(arr_df)
src.data.keys()

dict_keys(['index', 'count', 'left', 'right', 'f_count', 'f_interval'])

In [10]:
def style(p):
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.xaxis.axis_label_text_font_size = '12pt'
    #p.xaxis.major_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    #p.yaxis.major_label_text_font_size = '12pt'
    
    return p

In [11]:
# Set up the figure same as before
p = figure(plot_width = 600, plot_height = 600, title = 'Histogram of Arrival Delays', 
           x_axis_label = 'Delay(min)', y_axis_label = 'Flights')

# Add a quad glyph with source this time
p.quad(bottom=0, top='count', left='left', right='right', source=src, 
       fill_color='red', line_color='black', fill_alpha = 0.75, 
       hover_fill_alpha = 1.0, hover_fill_color = 'navy')

# Add style to the plot
styled_p = style(p)

# Add a hover tool referring to the formatted columns
hover = HoverTool(tooltips = [('Delay', '@f_interval'),
                              ('Count', '@f_count')])

# Add the hover tool to the graph
styled_p.add_tools(hover)

# Show the plot
show(styled_p)

In [12]:
output_file('hist.html')
show(p)

### Part II Bokeh

In [35]:
carrier_names = pd.read_csv('data/airlines.csv')
carrier_names.head()

Unnamed: 0,carrier,name
0,9E,Endeavor Air Inc.
1,AA,American Airlines Inc.
2,AS,Alaska Airlines Inc.
3,B6,JetBlue Airways
4,DL,Delta Air Lines Inc.


In [48]:
flights = flights.merge(carrier_names, how = 'left', on = 'carrier')

# Group by the carrier to find the most common
carrier_nums = flights.groupby('carrier')['year'].count().sort_values(ascending=False)

# Subset to the 8 most common carriers
flights = flights[flights['carrier'].isin(carrier_nums.index[:5])]

# Subset to only [-1, +2] hour delays
flights = flights[(flights['arr_delay'] >= -60) & (flights['arr_delay'] <= 120)]
flights.head()

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,name
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00,United Air Lines Inc.
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,United Air Lines Inc.
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00,American Airlines Inc.
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00,JetBlue Airways
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,Delta Air Lines Inc.


In [49]:
by_carrier = pd.DataFrame(columns=['proportion', 'left', 'right', 
                                   'f_proportion', 'f_interval',
                                   'name', 'color'])

# Iterate through all the carriers
for i, carrier_name in enumerate(flights['name'].unique()):
    
    # Subset to the carrier
    subset = flights[flights['name'] == carrier_name]
    
    # Create a histogram with 5 minute bins
    arr_hist, edges = np.histogram(subset['arr_delay'], bins = int(240/5), range = [-120, 120])
    
    # Divide the counts by the total to get a proportion
    arr_df = pd.DataFrame({'proportion': arr_hist / np.sum(arr_hist), 'left': edges[:-1], 'right': edges[1:] })
    
    # Format the proportion 
    arr_df['f_proportion'] = ['%0.5f' % proportion for proportion in arr_df['proportion']]
    
    # Format the interval
    arr_df['f_interval'] = ['%d to %d minutes' % (left, right) for left, right in zip(arr_df['left'], arr_df['right'])]
    
    # Assign the carrier for labels
    arr_df['name'] = carrier_name
    
    # Color each carrier differently
    arr_df['color'] = Category10_5[i]

    # Add to the overall dataframe
    by_carrier = by_carrier.append(arr_df)
    
# Overall dataframe
by_carrier = by_carrier.sort_values(['name', 'left'])

In [50]:
by_carrier.head()

Unnamed: 0,proportion,left,right,f_proportion,f_interval,name,color
0,0.0,-120.0,-115.0,0.0,-120 to -115 minutes,American Airlines Inc.,#ff7f0e
1,0.0,-115.0,-110.0,0.0,-115 to -110 minutes,American Airlines Inc.,#ff7f0e
2,0.0,-110.0,-105.0,0.0,-110 to -105 minutes,American Airlines Inc.,#ff7f0e
3,0.0,-105.0,-100.0,0.0,-105 to -100 minutes,American Airlines Inc.,#ff7f0e
4,0.0,-100.0,-95.0,0.0,-100 to -95 minutes,American Airlines Inc.,#ff7f0e


In [51]:
by_carrier_src = ColumnDataSource(by_carrier)

In [52]:
# Create the figure
p = figure(plot_height = 600, plot_width = 800, title = 'Histogram of Airline Delays by Carrier',
          x_axis_label = 'Arrival Delay (min)', y_axis_label = 'Proportion')

# Add the quad glpyh with the source by carrier
p.quad(bottom = 0, left = 'left', right = 'right', top = 'proportion',
       fill_color = 'color',  legend = 'name', source = by_carrier_src,
      fill_alpha = 0.6, hover_fill_alpha = 1.0, hover_fill_color = 'color')

# Create the hover tool
hover = HoverTool(tooltips = [('Carrier', '@name'),
                              ('Proportion', '@f_proportion'),
                              ('Delay', '@f_interval')],
                 mode = 'vline')

# Add styling and hover tool
styled_p = style(p)
styled_p.add_tools(hover)
show(styled_p)

