In [4]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import bokeh
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.io import show, output_notebook, output_file
from bokeh.models import Legend

In [47]:
# Retrieve "Motor Vehicle Collisions - Crashes" data
url = 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'
# df = pd.read_csv(url)
print(df.shape)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


(1673457, 29)


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,08/20/2012,20:30,BROOKLYN,11236.0,40.636931,-73.911081,POINT (-73.9110805 40.6369309),FLATLANDS AVENUE,EAST 83 STREET,,...,Driver Inattention/Distraction,,,,144939,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,,,
1,09/11/2012,21:20,,,,,,CROSS BRONX SERVICE ROAD SOUTH,WESTCHESTER AVENUE,,...,Unspecified,,,,81300,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
2,08/30/2012,12:57,QUEENS,11101.0,40.749022,-73.954446,POINT (-73.9544459 40.7490224),45 AVENUE,5 STREET,,...,Unspecified,,,,239267,VAN,UNKNOWN,,,
3,09/13/2012,13:30,,,40.693802,-73.811554,POINT (-73.8115536 40.6938016),,,,...,Unspecified,,,,2999970,LIVERY VEHICLE,SMALL COM VEH(4 TIRES),,,
4,08/20/2012,12:00,BROOKLYN,11224.0,40.579804,-73.971834,POINT (-73.9718339 40.5798042),NEPTUNE AVENUE,WEST 5 STREET,,...,Unspecified,,,,113804,PASSENGER VEHICLE,UNKNOWN,,,


In [50]:
print(df.columns)

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')


In [203]:
# Convert date column to datetime
df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE']) 

In [204]:
# Set date interval
start_date = '2020-01-01'
end_date = '2020-04-25'

interval_data = df.copy()

# Change df to only include data for the dates within the interval
interval = (interval_data['CRASH DATE'] > start_date) & (interval_data['CRASH DATE'] <= end_date)

# The interval variable is used to set the new dates in the 
# interval_data DataFrame, so that only the ones with interval=true
# will be used.
interval_data = interval_data.loc[interval]

In [205]:
print(df.shape, interval_data.shape)

(1673457, 29) (41255, 29)


In [206]:
# Create dataframe to use for bokeh plots, containing number of pedestrians, 
# cyclists, and motorists injured or killed in different time series
time_series_df = interval_data[['CRASH DATE', 'CRASH TIME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED']].copy()

In [207]:
# Create column to store hour of day
time_series_df['HOUR OF DAY'] = pd.to_datetime(time_series_df['CRASH TIME']).dt.hour.copy()

In [208]:
# Create column to store day of the week
time_series_df['WEEKDAY'] = pd.to_datetime(time_series_df['CRASH DATE']).dt.dayofweek.copy()

In [209]:
# Create column to store hour of the week
time_series_df['HOUR OF WEEK'] = time_series_df['WEEKDAY'] * 24 + time_series_df['HOUR OF DAY']

In [210]:
# Create column to store month 
time_series_df['MONTH'] = pd.to_datetime(time_series_df['CRASH DATE']).dt.month.copy()

In [211]:
# Merge number of injured and dead for pedestrians, persons, cyclists, and motorists
time_series_df['PEDESTRIANS'] = time_series_df['NUMBER OF PEDESTRIANS INJURED'] + time_series_df['NUMBER OF PEDESTRIANS KILLED']
time_series_df['PERSONS'] = time_series_df['NUMBER OF PERSONS INJURED'] + time_series_df['NUMBER OF PERSONS KILLED']
time_series_df['CYCLISTS'] = time_series_df['NUMBER OF CYCLIST INJURED'] + time_series_df['NUMBER OF CYCLIST KILLED']
time_series_df['MOTORISTS'] = time_series_df['NUMBER OF MOTORIST INJURED'] + time_series_df['NUMBER OF MOTORIST KILLED']

In [212]:
# Drop not needed columns
time_series_df = time_series_df.drop(['CRASH DATE', 'CRASH TIME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED'], axis=1)

In [213]:
# Reset indices after drop
time_series_df.reset_index(drop=True, inplace = True)

In [215]:
# The PERSON column is of type float, change to integer
time_series_df['PERSONS'] = time_series_df['PERSONS'].astype(int)
time_series_df.head()

Unnamed: 0,HOUR OF DAY,WEEKDAY,HOUR OF WEEK,MONTH,PEDESTRIANS,PERSONS,CYCLISTS,MOTORISTS
0,0,0,0,4,0,0,0,0
1,2,0,2,4,0,1,0,1
2,15,5,135,4,0,1,0,1
3,18,5,138,4,0,0,0,0
4,5,5,125,4,0,0,0,0


In [237]:
# Prepare for Bokeh Plots
# Call output_notebook() to display bokeh in the notebook
output_notebook()

time_series_df = time_series_df.sort_values(by='HOUR OF DAY', ascending=True)
unique_hours = time_series_df['HOUR OF DAY'].unique().tolist()

# Range hours from 1-24 instead of 0-23 
unique_hours = [x+1 for x in unique_hours]
# Make string of unique hours
str_unique_hours = [str(x) for x in unique_hours]

# Make list to hold names for accidents
accident_types = time_series_df.columns[4:]

# Append empty string to ensure correct display later
str_unique_hours.append('')

Index(['PEDESTRIANS', 'PERSONS', 'CYCLISTS', 'MOTORISTS'], dtype='object')


In [236]:
# Plot for "HOUR IN DAY"
p = figure(plot_height = 400, plot_width = 850, 
           x_range = FactorRange(factors=str_unique_hours),
           title = 'Accidents per Hour of the Day', 
           x_axis_label = 'Hour of the Day',
           y_axis_label = 'Relative Frequency')

In [None]:
# Define color palette for the plot. Using seaborn, a color palette
# can be made in a range of the number of crime types
palette = sns.color_palette("hls", len(unique_crime_types))
# Convert the palette to hex numbers instead of rgb,
# since bokeh expects hex
pal = palette.as_hex()

# Create variable bar to store the bar objects and 
# items list crimes and bar objects. This will be used for the legends
bar = {}
items = [] 

# Loop over the crime types in unique_crime_types
for indx, i in enumerate(accident_types):
    # Create a vbar object for each crime which stores
    # bars for each hour (x) and normalized values for
    # number of crimes (y)
    bar[i] = p.vbar(x='Hour', 
                    muted_alpha=0.03, 
                    fill_alpha=0.7,  
                    line_color=(0,0,0,0.0), 
                    muted=True, 
                    width=0.6, 
                    top=i, 
                    color=pal[indx], 
                    muted_color=pal[indx], 
                    source=source)
    # Append the crime type string i and the
    # bar object containing the bars for the crime type
    items.append((i, [bar[i]])) 