In [1]:
"""
goal: create clean dataset for 2018 - 2021 crashes
steps:
1. import dataset
2. remove crashes that occurred before 2018
3. remove crashes missing latitude and/or longitude (will be qa/qc'd)
4. remove crashes with incorrect latitude and/or longitude
5. export clean dataset as a geojson file and csv
"""

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Import Data

In [None]:
# import libraries
import geopandas as gpd # for geospatial data manipulation
import gspread # to connect with google sheets
import pandas as pd # for data manipulation
import yaml # to parse the keys.yaml settings file
from df2gspread import df2gspread as d2g # to interact with google sheets

# columns kept from the raw crash export, in the order they should appear
CRASH_COLUMNS = [
    'report_number', 'crash_year', 'crash_date_time',
    'first_harmful_event', 'total_number_of_vehicles', 'total_number_of_persons',
    'latitude', 'longitude', 'on_street_road_highway', 'street_address_number',
    'from_intersection_of',
]

# create dataframe of crashes in leon county, narrowed to the columns above
df = pd.read_csv("./leon-events.csv")[CRASH_COLUMNS]

In [None]:
# read the YAML settings file (the Google Sheets export reads "spreadsheet_key" from it)
# - the context manager closes the file handle as soon as parsing is done
# - safe_load builds only plain Python objects; a bare yaml.load(stream) with no
#   Loader argument is rejected by PyYAML 6+
with open("./keys.yaml") as yaml_file:
    parsed_yaml_file = yaml.safe_load(yaml_file)

# Filter Data

In [2]:
# select crashes that occurred on 1/1/2018 or later (crash_year >= 2018)
# and renumber the remaining rows from 0
is_2018_or_later = df['crash_year'] >= 2018
df2018 = df[is_2018_or_later].reset_index(drop=True)

In [3]:
# collect the rows that are missing a latitude and/or a longitude
# -- these will be addressed in qa/qc
lacks_coordinate = df2018[['latitude', 'longitude']].isna().any(axis=1)
df2018missing = df2018[lacks_coordinate]

In [115]:
# export the rows with missing coordinates to Google Sheets for QA/QC
from oauth2client.service_account import ServiceAccountCredentials

# API scopes requested for the service account (Sheets feeds + Drive)
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# build the service-account credentials from the local key file; the scopes
# have to be handed over here, otherwise the credentials are created unscoped
credentials = ServiceAccountCredentials.from_json_keyfile_name('./gsCredentials.json', scope)
gc = gspread.authorize(credentials)

# connect to the Google Sheet (key comes from the parsed YAML settings)
spreadsheet_key = parsed_yaml_file["spreadsheet_key"]
wks_name = 'missing-2018'

# send data to the Google Sheet; row_names=True also uploads the dataframe index
d2g.upload(df2018missing, spreadsheet_key, wks_name, credentials=credentials, row_names=True)

<Worksheet 'all-2018' id:1962341349>

# Clean Data

In [8]:
# create clean dataset: keep rows that have both coordinates and are not on the
# manual exclusion list below

# report numbers excluded by hand
# NOTE(review): presumably the crashes with incorrect latitude/longitude -- confirm
excluded_report_numbers = ['87845295', '24211781', '24339312']

# compare report numbers as text: read_csv parses an all-digit column as a
# number, and a numeric column never matches the string ids above, which would
# silently leave the excluded crashes in the dataset. The trailing ".0" is
# stripped in case the column was parsed as float.
report_number_text = (
    df2018['report_number']
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
)

has_coordinates = df2018['latitude'].notna() & df2018['longitude'].notna()
is_excluded = report_number_text.isin(excluded_report_numbers)
df2018clean = df2018[has_coordinates & ~is_excluded]

# Export Data

In [11]:
# convert to geodataframe: one point per crash, x = longitude, y = latitude
crash_points = gpd.points_from_xy(x=df2018clean['longitude'], y=df2018clean['latitude'])
gdf2018clean = gpd.GeoDataFrame(df2018clean, geometry=crash_points)

In [14]:
# write the geodataframe to disk using the GeoJSON driver
geojson_path = "./crash-data/clean/all-2018to2021.geojson"
gdf2018clean.to_file(geojson_path, driver='GeoJSON')

In [15]:
# write the clean dataframe to disk as csv
# (pandas' default index=True also writes the row index as the first column)
csv_path = "./crash-data/clean/all-2018to2021.csv"
df2018clean.to_csv(csv_path)

# Misc

In [19]:
# non-null value counts per column for each first_harmful_event code,
# ordered so the code with the most report numbers comes first
event_counts = df2018.groupby('first_harmful_event').count()
event_counts.sort_values(by="report_number", ascending=False)

# first_harmful_event codes for the most frequent events:
# 14 - motor vehicle in transport (another car)
# 15 - parked motor vehicle
# 39 - "Other Fixed Object (wall, building, tunnel, etc.)"
# 18 - Other Non-Fixed Object
# 32 - Tree (standing)
# 10 - Pedestrian
# 36 - "Other Post, Pole or Support"
# 11 - pedalcycle

Unnamed: 0_level_0,report_number,crash_year,crash_date_time,total_number_of_vehicles,total_number_of_persons
first_harmful_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14.0,27991,27991,27991,27991,27991
15.0,3541,3541,3541,3541,3541
39.0,1017,1017,1017,1017,1017
18.0,735,735,735,735,735
32.0,614,614,614,614,614
10.0,559,559,559,559,559
36.0,473,473,473,473,473
13.0,408,408,408,408,408
24.0,406,406,406,406,406
25.0,352,352,352,352,352
