<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Preparation-for-Plotting" data-toc-modified-id="Data-Preparation-for-Plotting-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preparation for Plotting</a></span></li><li><span><a href="#Total-Deaths-by-Borough" data-toc-modified-id="Total-Deaths-by-Borough-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Total Deaths by Borough</a></span></li><li><span><a href="#Aggregate-plots" data-toc-modified-id="Aggregate-plots-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Aggregate plots</a></span></li></ul></div>

# Pedestrian Deaths in NYC

Data Visualization for pedestrians and cyclists killed by motor vehicles in NYC 2012 - 2020

In [1]:
import pandas as pd
import numpy as np
import datetime
from bokeh.plotting import figure, output_notebook, show, gmap
from bokeh.models import CategoricalColorMapper, ColumnDataSource, Legend, Span
from bokeh.models import CheckboxGroup, HoverTool, GMapOptions, FactorRange
from bokeh.layouts import widgetbox, row, column
from bokeh.io import curdoc, push_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import colorblind

In [2]:
import json
with open('keys.json') as f:
    keys = json.load(f)
    google_api_key = keys['google_map']

In [3]:
df = pd.read_csv('peds_death_data', index_col=0)
df.head()

Unnamed: 0,borough,year,month,day,date,location,latitude,longitude,cross_street_name,off_street_name,on_street_name,vehicle_type_code1,contributing_factor_vehicle_1,number_of_cyclist_killed,number_of_pedestrians_killed,borough_gps
0,STATEN ISLAND,2020,5,4,2020-05-22,"{'type': 'Point', 'coordinates': [-74.1672, 40...",40.602074,-74.1672,,SIGNS ROAD,ARLENE STREET,Pick-Up Truck,Failure to Yield,1,0,STATEN ISLAND
1,QUEENS,2020,5,0,2020-05-18,"{'type': 'Point', 'coordinates': [-73.827286, ...",40.704857,-73.827286,124-50 METROPOLITAN AVENUE,,,Box Truck,View Obstructed/Limited,0,1,QUEENS
2,QUEENS,2020,3,5,2020-03-14,"{'type': 'Point', 'coordinates': [-73.89384, 4...",40.760437,-73.89384,,30 AVENUE,74 STREET,SUV / Station Wagon,Driver Distraction,0,1,QUEENS
3,,2020,3,1,2020-03-17,,,,,,NEW ENGLAND THRUWAY,Tractor Truck,Unspecified,0,1,
4,BROOKLYN,2020,4,1,2020-04-28,"{'type': 'Point', 'coordinates': [-73.95166, 4...",40.643063,-73.95166,,CLARENDON ROAD,ROGERS AVENUE,Bus,Pedestrian/Cyclist Error,1,0,BROOKLYN


## Data Preparation for Plotting

In [4]:
df.borough.value_counts(dropna=False)

QUEENS           338
BROOKLYN         334
MANHATTAN        229
BRONX            146
NaN               65
STATEN ISLAND     55
NOT NYC           31
Name: borough, dtype: int64

In [5]:
df = df.drop(df[(df.borough == 'NOT NYC') | (df.borough.isna() == True)].index)
df = df.drop(['borough_gps', 'location'], axis=1)

In [6]:
df.borough.value_counts(dropna=False)

QUEENS           338
BROOKLYN         334
MANHATTAN        229
BRONX            146
STATEN ISLAND     55
Name: borough, dtype: int64

In [7]:
df['total_deaths'] = df.number_of_cyclist_killed+df.number_of_pedestrians_killed
df['month_year'] = pd.to_datetime(df['date']).dt.to_period('M')

In [8]:
#source = ColumnDataSource(df)
output_notebook()

## Total Deaths by Borough

See `test_plots.py` for interactive html plot. Run `bokeh serve --show test_plots.py` in terminal for use. 

In [9]:
colorblind['Colorblind'][3][0]

'#0072B2'

In [10]:
sorted(df.borough.unique().tolist())

['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']

In [11]:
source = ColumnDataSource(
    df.groupby(['month_year', 'borough']).sum() \
    ['total_deaths'].reset_index())
color_mapper = CategoricalColorMapper(factors=df.borough.unique().tolist(),
                                      palette=colorblind['Colorblind'][5])
checkbox = CheckboxGroup(labels=df.borough.unique().tolist(),
                             active=[0,1,2,3,4])


In [12]:
p = figure(title = "Total Deaths by Month 2012 - 2020",
           x_axis_label = "Time", 
           y_axis_label = "Number of Deaths", 
           x_axis_type='datetime', 
           plot_width = 800, 
           toolbar_location = 'above',
           tools='box_select, box_zoom, reset')
p.circle(x='month_year', y='total_deaths', 
         selection_color="blue", 
         nonselection_fill_color='gray',
         nonselection_alpha=0.2, 
         size=10,
         source=source, 
         color=dict(field='borough', transform=color_mapper),
         legend='borough', 
         hover_fill_color='red',
         hover_alpha=0.5,
         hover_line_color='white')
hover_glyph = p.circle(x='month_year', y='total_deaths',
                       source=source, size=11, alpha=0,
                       hover_fill_color='red', hover_alpha=0.5)
tooltips = [('Borough', '@borough'), 
            ('Date', '@month_year{%Y-%m}'), 
            ('Deaths', '@total_deaths')]
p.add_tools(HoverTool(tooltips = tooltips,  
                       mode='vline', 
                       renderers=[hover_glyph],
                       formatters={'month_year':'datetime', }))
show(p)

## Aggregate plots

In [13]:
df.head()

Unnamed: 0,borough,year,month,day,date,latitude,longitude,cross_street_name,off_street_name,on_street_name,vehicle_type_code1,contributing_factor_vehicle_1,number_of_cyclist_killed,number_of_pedestrians_killed,total_deaths,month_year
0,STATEN ISLAND,2020,5,4,2020-05-22,40.602074,-74.1672,,SIGNS ROAD,ARLENE STREET,Pick-Up Truck,Failure to Yield,1,0,1,2020-05
1,QUEENS,2020,5,0,2020-05-18,40.704857,-73.827286,124-50 METROPOLITAN AVENUE,,,Box Truck,View Obstructed/Limited,0,1,1,2020-05
2,QUEENS,2020,3,5,2020-03-14,40.760437,-73.89384,,30 AVENUE,74 STREET,SUV / Station Wagon,Driver Distraction,0,1,1,2020-03
4,BROOKLYN,2020,4,1,2020-04-28,40.643063,-73.95166,,CLARENDON ROAD,ROGERS AVENUE,Bus,Pedestrian/Cyclist Error,1,0,1,2020-04
5,STATEN ISLAND,2020,3,5,2020-03-14,40.62474,-74.12785,,DUBOIS AVENUE,EGBERT AVENUE,SUV / Station Wagon,Alcohol Involvement,0,1,1,2020-03


In [14]:
year_df_main = df[(df['year']!= 2012) & (df['year']!= 2020)]

In [15]:
year_df = year_df_main.groupby(['year']).sum()['total_deaths'].reset_index()
source = ColumnDataSource(year_df)
y = figure(title = "Total Deaths by Year 2013 - 2019",
           x_axis_label = "Years", 
           y_axis_label = "Number of Deaths", 
           plot_width = 800,
           plot_height = 300,
           toolbar_location = None,
           tools="")
y.line(x='year', y='total_deaths', 
       source=source, 
       color = colorblind["Colorblind"][3][0])
tooltips = [('Deaths', '@total_deaths')]
y.add_tools(HoverTool(tooltips = tooltips))

show(y)

In [16]:
crash_df = df.groupby(['contributing_factor_vehicle_1']).sum()[['total_deaths']] \
          .reset_index().sort_values(['total_deaths'], ascending=True)
crash_df = crash_df.loc[crash_df["total_deaths"] >= 5]
source = ColumnDataSource(crash_df)
v = figure(y_range=crash_df.contributing_factor_vehicle_1.unique(),
           plot_width=500, plot_height=300,
           title="Contributing Factors to Incident",
           toolbar_location=None, tools="") 
v.hbar(y='contributing_factor_vehicle_1',
       right='total_deaths',
       height=0.9, source=source,
       line_color = 'white',
       hover_fill_color='red',
       hover_alpha=1.0,
       hover_line_color='gray')
tooltips = [('Deaths', '@total_deaths')]
v.add_tools(HoverTool(tooltips = tooltips))

show(v)

In [17]:
crash_df = df.groupby(['vehicle_type_code1']).sum()[['total_deaths']] \
          .reset_index().sort_values(['total_deaths'], ascending=True)
crash_df = crash_df.loc[crash_df["total_deaths"] >= 5]
crash_df.vehicle_type_code1 = crash_df.vehicle_type_code1.apply(lambda x: x.title())
y_range = crash_df.vehicle_type_code1.unique()
source = ColumnDataSource(crash_df)
v = figure(y_range=y_range,
           plot_width=500, plot_height=300,
           title="Vehicle Type in Incident",
           toolbar_location=None, tools="") 
v.hbar(y='vehicle_type_code1',
       right='total_deaths',
       height=0.9, source=source,
       line_color = 'white',
       hover_fill_color='red',
       hover_alpha=1.0,
       hover_line_color='gray')
tooltips = [('Deaths', '@total_deaths')]
v.add_tools(HoverTool(tooltips = tooltips))

show(v)

In [18]:
pop_df = pd.read_csv('pop_borough', index_col=0)
pop_df = pop_df[pop_df.year!=2012]
pop_df = pop_df.groupby(['borough', 'year']).sum()['population'] \
          .reset_index().sort_values(['year'], ascending=True)
pop_df['year_mean_pop'] = pop_df.groupby('year').transform('mean')

crash_df = year_df_main.groupby(['borough', 'year']).sum()[['total_deaths']] \
          .reset_index().sort_values(['year'], ascending=True)
crash_df['year_mean_deaths'] = crash_df.groupby(["year"]).transform('mean')


In [23]:
# Grouped bar charts in bokeh are non-trivial
totals = []
pops = []
year_avg_deaths =[]
year_avg_pops = []
data = {}
pop_data = {}

boros = crash_df.borough.unique().tolist()
years = crash_df.year.unique().tolist()
year_avg_death = crash_df.year_mean_deaths.tolist()[::5]
year_avg_pop = pop_df.year_mean_pop.tolist()[::5]

for i in range(5):
    year_avg_deaths.extend(year_avg_death)
    year_avg_pops.extend(year_avg_pop)

for i in years: # dict; 'year':[total_1, total_2, ..]
    counts = crash_df[crash_df.year == i] \
                .sort_values('borough').total_deaths.tolist()
    populations = pop_df[pop_df.year == i] \
                .sort_values('borough').population.tolist()
    entry_c = {i:counts}
    entry_p = {i:populations}
    data.update(entry_c)
    pop_data.update(entry_p)
    
for i in range(5): # list of ordered seq of totals
    for k, v in data.items():
        totals.append(v[i])
for i in range(5): # list of ordered seq of populations
    for k, v in pop_data.items():
        pops.append(v[i])
        
percentage_pop = tuple([round((total/pop)*100000,2) \
                        for pop, total in zip(pops, 
                                              totals)])
avg_percent_pop = tuple([round((total/avg_pop)*100000,2) \
                         for avg_pop, total in zip(year_avg_pops,
                                                   year_avg_deaths)])

totals = tuple(totals) # bokeh needs tuples strings for grouped bars
#list of tuples [ ('Bronx', '2013'), ('brooklyn', '2013')...('Queens', '2018') ]
x = [(str(year), boro) for boro in boros for year in years]

boro_source = ColumnDataSource(data=dict(x=x, 
                                         total=totals,
                                         avg_total=year_avg_deaths,
                                         pop_percent=percentage_pop,
                                         avg_percent=avg_percent_pop))
x_range= FactorRange(*x)
b = figure(x_range=x_range, 
           plot_width=800, plot_height= 300,
           title="Deaths per Borough by Year",
           toolbar_location=None, tools="")
b.vbar(x='x', top='total',
       width=0.9, source=boro_source,
       line_color = 'white',
       hover_fill_color='red',
       hover_alpha=1.0,
       hover_line_color='gray',
       fill_color=factor_cmap('x',
                              palette=colorblind['Colorblind'][7],
                              factors=[str(year) for year in years],
                              start=0, end=1))

b.y_range.start = 0
b.x_range.range_padding = 0.1
b.xaxis.major_label_orientation = 1
b.xgrid.grid_line_color = None

tooltips = [('Deaths', '@total'),
           ('Average Yearly Deaths', '@avg_total')]
b.add_tools(HoverTool(tooltips = tooltips))


b1 = figure(x_range=x_range, 
           plot_width=800, plot_height= 300,
           title="Percentage Deaths per 100,000",
           toolbar_location=None, tools="")
b1.vbar(x='x', top='pop_percent',
       width=0.9, source=boro_source,
       line_color = 'white',
       hover_fill_color='red',
       hover_alpha=1.0,
       hover_line_color='gray',
       fill_color=factor_cmap('x',
                              palette=colorblind['Colorblind'][7],
                              factors=[str(year) for year in years],
                              start=0, end=1))
b1.y_range.start = 0
b1.x_range.range_padding = 0.1
b1.xaxis.major_label_orientation = 1
b1.xgrid.grid_line_color = None

tooltips = [('Percent Deaths', '@pop_percent'),
            ('Yearly Avg', '@avg_percent')]
b1.add_tools(HoverTool(tooltips = tooltips))

layout = column(b,b1)
show(layout)


In [20]:
def make_victim_column(row):
    if row.number_of_pedestrians_killed > 1 & \
        row.number_of_cyclist_killed == 0:
        return 'Pedestrian'
    if row.number_of_cyclist_killed > 1 & \
        row.number_of_pedestrians_killed == 0:
        return 'Cyclist'
    else: return 'Both'

In [21]:
map_df = df.dropna(subset=['latitude', 'longitude'])
map_df['victim'] = df.apply(lambda x: make_victim_column(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
map_options = GMapOptions(lat=40.737, 
                          lng=-73.990, 
                          map_type="roadmap", zoom=15)

color_mapper = CategoricalColorMapper(factors=["Pedestrian", "Cyclist"],
                                      palette=[colorblind['Colorblind'][5][1],
                                              colorblind['Colorblind'][5][0]])

hover_map = HoverTool(tooltips = [('Date', '@month_year{%Y-%m}'), 
                                  ('Deaths', '@total_deaths'),
                                  ('Vehicle Type', '@vehicle_type_code1'),
                                  ('Cause', '@contributing_factor_vehicle_1')],   
                       formatters={'month_year':'datetime', })

g = gmap(google_api_key=google_api_key, 
         map_options=map_options, 
         title="NYC Pedestrian and Cyclists Deaths 2012 - 2020", 
         plot_width=800, 
         toolbar_location = 'above')

map_source = ColumnDataSource(map_df)
g.circle(x='longitude', y='latitude',
         size=10, 
         fill_alpha=1.0, 
         color=dict(field='victim', transform=color_mapper),
         legend='victim',
         source=map_source)
g.legend.location = "top_left"
g.add_tools(hover_map)
show(g)