In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas
import numpy as np
import requests
import gmaps
import os
import json
import seaborn as sns
import sys
sys.path.insert(0, 'Data')
from config import g_key
from shapely.geometry import Point
%matplotlib inline

#Store Part I results into DataFrame
#Load the data to a DataFrame
accident_data = pd.read_csv("Data/accident_data.csv", encoding="utf-8")
accident_data.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [42]:
# Data cleanup

# Filters out dates containing 2016, 2017, & 2019 -- too much data in dataset to handle with laptops
accident_data_revised = accident_data[~accident_data["Start_Time"].str.contains("2016")]
accident_data_revised = accident_data[~accident_data["Start_Time"].str.contains("2017")]
accident_data_revised = accident_data[~accident_data["Start_Time"].str.contains("2019")]


In [43]:
# Splits the Start_Time column into individual columns
accident_data_revised[["Date", "Time"]] = accident_data_revised["Start_Time"].str.split(expand=True)

In [44]:
# Renames columns to be more readable
accident_data_revised = accident_data_revised.rename(columns={"Start_Lat":"Lat", "Start_Lng":"Lng", "Weather_Condition":"Weather"})

# Filters and rearranges dataset to display most useful columns
accident_data_revised = accident_data_revised[["Date", "Time", "Lat", "Lng", "City", "State", "County", "Weather", "Temperature(F)", "Severity"]]

# Iterates through all column data to find NaN values and adds them to a list
accident_data_revised.columns[accident_data_revised.isna().any()].tolist()
# Output: ['City', 'Weather', 'Temperature(F)']
accident_data_revised.head(10)

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity
0,2016-02-08,05:46:00,39.865147,-84.058723,Dayton,OH,Montgomery,Light Rain,36.9,3
1,2016-02-08,06:07:59,39.928059,-82.831184,Reynoldsburg,OH,Franklin,Light Rain,37.9,2
2,2016-02-08,06:49:27,39.063148,-84.032608,Williamsburg,OH,Clermont,Overcast,36.0,2
3,2016-02-08,07:23:34,39.747753,-84.205582,Dayton,OH,Montgomery,Mostly Cloudy,35.1,3
4,2016-02-08,07:39:07,39.627781,-84.188354,Dayton,OH,Montgomery,Mostly Cloudy,36.0,2
5,2016-02-08,07:44:26,40.10059,-82.925194,Westerville,OH,Franklin,Light Rain,37.9,3
6,2016-02-08,07:59:35,39.758274,-84.230507,Dayton,OH,Montgomery,Overcast,34.0,2
7,2016-02-08,07:59:58,39.770382,-84.194901,Dayton,OH,Montgomery,Overcast,34.0,3
8,2016-02-08,08:00:40,39.778061,-84.172005,Dayton,OH,Montgomery,Mostly Cloudy,33.3,2
9,2016-02-08,08:10:04,40.10059,-82.925194,Westerville,OH,Franklin,Light Rain,37.4,3


In [45]:
# Combine Lat/Lng and City/State
accident_data_revised['Coordinates'] = accident_data_revised['Lat'].astype(str).str.zfill(2) + ', ' + accident_data_revised['Lng'].astype(str).str.zfill(3)

accident_data_revised['Location'] = accident_data_revised['City'] + ", " + accident_data_revised['State']
accident_data_revised.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location
0,2016-02-08,05:46:00,39.865147,-84.058723,Dayton,OH,Montgomery,Light Rain,36.9,3,"39.865147, -84.058723","Dayton, OH"
1,2016-02-08,06:07:59,39.928059,-82.831184,Reynoldsburg,OH,Franklin,Light Rain,37.9,2,"39.92805900000001, -82.831184","Reynoldsburg, OH"
2,2016-02-08,06:49:27,39.063148,-84.032608,Williamsburg,OH,Clermont,Overcast,36.0,2,"39.063148, -84.032608","Williamsburg, OH"
3,2016-02-08,07:23:34,39.747753,-84.205582,Dayton,OH,Montgomery,Mostly Cloudy,35.1,3,"39.747753, -84.20558199999998","Dayton, OH"
4,2016-02-08,07:39:07,39.627781,-84.188354,Dayton,OH,Montgomery,Mostly Cloudy,36.0,2,"39.627781, -84.188354","Dayton, OH"


In [77]:
# Add column to dataframe showing frequency of accidents at specific coordinates
accident_data_revised['frequency'] = accident_data_revised['Coordinates'].map(accident_data_revised['Coordinates'].value_counts())
accident_data_revised.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
457574,2017-07-04,17:45:58,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.0,3,"37.808498, -122.366852","San Francisco, CA",570
316465,2017-02-10,19:58:27,37.808498,-122.366852,San Francisco,CA,San Francisco,Scattered Clouds,55.9,3,"37.808498, -122.366852","San Francisco, CA",570
1149572,2018-05-19,07:52:20,37.808498,-122.366852,San Francisco,CA,San Francisco,Overcast,54.0,3,"37.808498, -122.366852","San Francisco, CA",570
37996,2016-05-10,15:12:47,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.0,3,"37.808498, -122.366852","San Francisco, CA",570
314691,2017-01-31,16:17:06,37.808498,-122.366852,San Francisco,CA,San Francisco,Mostly Cloudy,57.0,3,"37.808498, -122.366852","San Francisco, CA",570


In [78]:
# Group coordinates by accident frequency
accident_data_sorted = accident_data_revised.sort_values(by ='frequency' , ascending=False)
accident_data_sorted.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
457574,2017-07-04,17:45:58,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.0,3,"37.808498, -122.366852","San Francisco, CA",570
1065898,2018-06-08,14:53:01,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.9,3,"37.808498, -122.366852","San Francisco, CA",570
889771,2018-09-10,14:24:29,37.808498,-122.366852,San Francisco,CA,San Francisco,Clear,73.0,3,"37.808498, -122.366852","San Francisco, CA",570
2056,2016-07-04,13:25:57,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,69.1,3,"37.808498, -122.366852","San Francisco, CA",570
20990,2016-09-19,17:42:14,37.808498,-122.366852,San Francisco,CA,San Francisco,Scattered Clouds,75.9,3,"37.808498, -122.366852","San Francisco, CA",570


In [81]:
# Find unique coordinates per frequency (this will delete some rows of data)
coordinates_by_frequency = accident_data_sorted.drop_duplicates(subset=['Coordinates'])
coordinates_by_frequency.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
457574,2017-07-04,17:45:58,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.0,3,"37.808498, -122.366852","San Francisco, CA",570
84746,2016-10-17,22:02:10,33.941364,-118.096634,Downey,CA,Los Angeles,Clear,64.9,3,"33.941364, -118.096634","Downey, CA",561
1318094,2018-02-10,14:44:55,42.476501,-83.111794,Royal Oak,MI,Oakland,Light Snow,23.5,3,"42.476501, -83.11179399999997","Royal Oak, MI",534
159336,2016-07-18,16:07:17,33.744976,-84.390343,Atlanta,GA,Fulton,Mostly Cloudy,91.9,3,"33.744976, -84.390343","Atlanta, GA",529
1458849,2017-12-21,16:44:01,34.858925,-82.259857,Greenville,SC,Greenville,Overcast,55.0,2,"34.858925, -82.259857","Greenville, SC",494


In [84]:
# Find which coordinates had the most accidents (top 100)
top_US_df = coordinates_by_frequency.nlargest(100,'frequency')
top_US_df.head()
#top_US_df.iloc[0:100]

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
457574,2017-07-04,17:45:58,37.808498,-122.366852,San Francisco,CA,San Francisco,Partly Cloudy,66.0,3,"37.808498, -122.366852","San Francisco, CA",570
84746,2016-10-17,22:02:10,33.941364,-118.096634,Downey,CA,Los Angeles,Clear,64.9,3,"33.941364, -118.096634","Downey, CA",561
1318094,2018-02-10,14:44:55,42.476501,-83.111794,Royal Oak,MI,Oakland,Light Snow,23.5,3,"42.476501, -83.11179399999997","Royal Oak, MI",534
159336,2016-07-18,16:07:17,33.744976,-84.390343,Atlanta,GA,Fulton,Mostly Cloudy,91.9,3,"33.744976, -84.390343","Atlanta, GA",529
1458849,2017-12-21,16:44:01,34.858925,-82.259857,Greenville,SC,Greenville,Overcast,55.0,2,"34.858925, -82.259857","Greenville, SC",494


In [98]:
#Configure gmaps
gmaps.configure(api_key=g_key)

In [100]:
#Use the Lat and Lng as locations and frequency as the weight.
locations = top_US_df[["Lat", "Lng"]]

#Drop any rows will null values
frequency = top_US_df['frequency'].astype(float)
maxfrequency = frequency.max()                  

fig = gmaps.figure()

#Add Heatmap layer to map
heatmap_layer = gmaps.heatmap_layer(locations, weights=frequency,
                                    dissipating=False, max_intensity=maxfrequency,
                                    point_radius=1)
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [102]:
#Add markers to map.
worst_five = [
    {'name': 'San Francisco, CA', 'location': (37.808498, -122.366852), 'frequency': 570},
    {'name': 'Downey, CA', 'location': (33.941364, -118.096634), 'frequency': 561},
    {'name': 'Royal Oak, MI', 'location': (42.476501, -83.11179399999997), 'frequency': 534},
    {'name': 'Atlanta, GA', 'location': (33.744976, -84.390343), 'frequency': 529},
    {'name': 'Greenville, SC', 'location': (34.858925, -82.259857), 'frequency': 494}
]

worst_locations = [city['location'] for city in worst_five]
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Number frequency</dt><dd>{frequency}</dd>
</dl>
"""
city_info = [info_box_template.format(**city) for city in worst_five]

marker_layer = gmaps.marker_layer(worst_locations, info_box_content=city_info)
fig = gmaps.figure()
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [111]:
# filter accident_data_sorted to include only NC data

NC_accident_data = accident_data_sorted.query('State=="NC"')
NC_accident_data.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
1450004,2017-12-18,10:57:51,35.766506,-78.735367,Raleigh,NC,Wake,Mostly Cloudy,53.1,3,"35.766506, -78.73536700000002","Raleigh, NC",306
1509566,2017-11-20,12:21:37,35.814922,-78.605362,Raleigh,NC,Wake,Clear,53.1,3,"35.814922, -78.605362","Raleigh, NC",291
815225,2018-10-05,23:41:30,35.751511,-78.701134,Raleigh,NC,Wake,Overcast,73.9,3,"35.751511, -78.701134","Raleigh, NC",257
1298009,2018-02-01,08:51:45,35.825504,-78.621483,Raleigh,NC,Wake,Mostly Cloudy,39.9,3,"35.82550399999999, -78.621483","Raleigh, NC",253
1227955,2018-04-27,21:07:05,35.835407,-78.669601,Raleigh,NC,Wake,Partly Cloudy,61.0,3,"35.83540700000001, -78.669601","Raleigh, NC",238


In [131]:
#Use the Lat and Lng as locations and frequency as the weight.
locations = NC_accident_data[["Lat", "Lng"]]

#Drop any rows will null values
frequency = NC_accident_data['frequency'].astype(float)
maxfrequency = frequency.max()                  

fig = gmaps.figure()

#Add Heatmap layer to map
heatmap_layer = gmaps.heatmap_layer(locations, weights=frequency,
                                    dissipating=False, max_intensity=maxfrequency,
                                    point_radius=0.12)
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [119]:
# filter accident_data_sorted to include only Wake County data

Wake_accident_data = accident_data_sorted.query('County=="Wake"')
Wake_accident_data.head()

Unnamed: 0,Date,Time,Lat,Lng,City,State,County,Weather,Temperature(F),Severity,Coordinates,Location,frequency
1450004,2017-12-18,10:57:51,35.766506,-78.735367,Raleigh,NC,Wake,Mostly Cloudy,53.1,3,"35.766506, -78.73536700000002","Raleigh, NC",306
1509566,2017-11-20,12:21:37,35.814922,-78.605362,Raleigh,NC,Wake,Clear,53.1,3,"35.814922, -78.605362","Raleigh, NC",291
815225,2018-10-05,23:41:30,35.751511,-78.701134,Raleigh,NC,Wake,Overcast,73.9,3,"35.751511, -78.701134","Raleigh, NC",257
1298009,2018-02-01,08:51:45,35.825504,-78.621483,Raleigh,NC,Wake,Mostly Cloudy,39.9,3,"35.82550399999999, -78.621483","Raleigh, NC",253
1227955,2018-04-27,21:07:05,35.835407,-78.669601,Raleigh,NC,Wake,Partly Cloudy,61.0,3,"35.83540700000001, -78.669601","Raleigh, NC",238


In [124]:
#Use the Lat and Lng as locations and frequency as the weight.
locations = Wake_accident_data[["Lat", "Lng"]]

#Drop any rows will null values
frequency = Wake_accident_data['frequency'].astype(float)
maxfrequency = frequency.max()                  

fig = gmaps.figure()

#Add Heatmap layer to map
heatmap_layer = gmaps.heatmap_layer(locations, weights=frequency,
                                    dissipating=False, max_intensity=maxfrequency,
                                    point_radius=0.010)
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [136]:
#Add markers to map.
worst_five = [
    {'name': 'I-40WB Exit 239A to Cary and Asheboro', 'location': (35.766506, -78.73536700000002), 'frequency': 306},
    {'name': 'Capital Blvd at I-440 merge', 'location': (35.814922, -78.605362), 'frequency': 291},
    {'name': 'Gorman St at I-40 in SW Raleigh', 'location': (35.751511, -78.701134), 'frequency': 257},
    {'name': 'I-440WB at Wake Forest Rd', 'location': (35.82550399999999, -78.621483), 'frequency': 253},
    {'name': 'I-440WB at Glenwood Ave', 'location': (35.83540700000001, -78.669601), 'frequency': 238}
]

worst_locations = [marker['location'] for marker in worst_five]
info_box_template = """
<dl>
<dt>Name</dt><dd>{name}</dd>
<dt>Number frequency</dt><dd>{frequency}</dd>
</dl>
"""
marker_info = [info_box_template.format(**marker) for marker in worst_five]

marker_layer = gmaps.marker_layer(worst_locations, info_box_content=marker_info)
fig = gmaps.figure()
fig.add_layer(marker_layer)
fig

Figure(layout=FigureLayout(height='420px'))