In [1]:
"""
goal: create clean dataset for 2018 - 2021 crashes -- "events" and "drivers"
steps:
1. import dataset
2. remove crashes that occured before 2018
3. remove crashes missing latitude and/or longitude (will be qa/qc'd)
4. remove crashes with incorrect latitude and/or longitude
5. export clean dataset as a geojson file and csv
"""

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Events Data

## Import Data

In [5]:
# import libraries
import pandas as pd # for data manipulation
import gspread # to connect with google sheets
from df2gspread import df2gspread as d2g # to interact with google sheets
import geopandas as gpd # for geospatial data manipulation
import os # for file navigation
from datetime import date # for age calculations

# create dataframe of crashes in leon county
df = pd.read_csv("../data/leon-events.csv")[['report_number', 'crash_year', 'crash_date_time',
                                       'first_harmful_event', 'total_number_of_vehicles', 'total_number_of_persons',
                                       'latitude', 'longitude', 'on_street_road_highway', 'street_address_number',
                                       'from_intersection_of']]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
# read YAML file
import yaml
yaml_file = open("../keys.yaml")
parsed_yaml_file = yaml.load(yaml_file)

  parsed_yaml_file = yaml.load(yaml_file)


## Filter Data

In [10]:
# select crashes that occured on 1/1/2018 or later
df2018 = df.query(" crash_year >= 2018").reset_index(drop=True)

In [11]:
# remove the rows with missing lat/lon data -- these will be addressed in qa/qc
df2018missing = df2018[df2018['latitude'].isna() | df2018['longitude'].isna()]

In [8]:
# export to Google Sheets for QA/QC
from oauth2client.service_account import ServiceAccountCredentials

# access and store credentials
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('./gsCredentials.json')
gc = gspread.authorize(credentials)

# connect to the Google Sheet
spreadsheet_key = parsed_yaml_file["spreadsheet_key"]
wks_name = 'missing-2018'

# send data to the Google Sheet
d2g.upload(df2018missing, spreadsheet_key, wks_name, credentials=credentials, row_names=True)

<Worksheet 'missing-2018' id:557274010>

## Clean Data

In [12]:
# create clean dataset
df2018clean = df2018[~df2018['latitude'].isna() & ~df2018['longitude'].isna() & ~df2018['report_number'].isin(['87845295', '24211781', '24339312'])]

## Export Data

In [10]:
# convert to geodataframe
gdf2018clean = gpd.GeoDataFrame(df2018clean, geometry=gpd.points_from_xy(df2018clean.longitude, df2018clean.latitude))

In [11]:
# export to geojson
gdf2018clean.to_file("./crash-data/clean/all-2018to2021.geojson", driver='GeoJSON')

In [12]:
# export to csv
df2018clean.to_csv("./crash-data/clean/all-2018to2021.csv")

## Misc

In [15]:
# df2018.groupby('first_harmful_event').count().sort_values(ascending=False, by="report_number")
# df2018.groupby('crash_year').count().sort_values(ascending=False, by="report_number")

# first harmful event
# 14 - motor vehicle in transport (another car)
# 15 - parked motor vehicle
# 39 - "Other Fixed Object (wall, building, tunnel, etc.)"
# 18 - Other Non-Fixed Object
# 32 - Tree (standing)
# 10 - Pedestrian
# 36 - "Other Post, Pole or Support"
# 11 - pedalcycle

In [52]:
df2018.query(" first_harmful_event == 11.0 ")

Unnamed: 0,report_number,crash_year,crash_date_time,first_harmful_event,total_number_of_vehicles,total_number_of_persons,latitude,longitude,on_street_road_highway,street_address_number,from_intersection_of
309,85579687,2018,2018-01-29 18:54,11.0,1,2,30.440920,-84.373347,AENON LANE,,COUNTY ROAD 1581
1256,86964380,2018,2018-01-29 08:31,11.0,1,2,30.432184,-84.293783,PINELLAS ST,,EUGENIA ST
1395,86964519,2018,2018-02-03 16:22,11.0,1,4,30.462420,-84.291845,Gibbs Dr,,MONTICELLO DR
1671,86964796,2018,2018-02-13 09:00,11.0,1,2,30.447932,-84.310350,Chapel Dr,,W CALL ST
1911,86965037,2018,2018-02-22 06:10,11.0,1,2,30.444585,-84.272194,N FRANKLIN BLVD,,E TENNESSEE ST
...,...,...,...,...,...,...,...,...,...,...,...
37614,90115367,2021,2021-05-08 20:50,11.0,1,2,30.457587,-84.365054,W Tennessee St,,
37661,90115422,2021,2021-05-11 15:34,11.0,1,2,30.435613,-84.301625,W Gaines St,,STADIUM DR E
37820,90115595,2021,2021-05-22 19:46,11.0,1,2,30.475012,-84.363089,Capital Circle NW,,COMMONWEALTH BLVD
38029,90115822,2021,2021-06-04 21:00,11.0,1,3,30.431116,-84.292616,CONKLIN ST,,RATTLER VIEW CT


In [56]:
test_df.groupby('crash_year').count().sort_values(ascending=False, by="report_number")
#.query(" report_number == 90115367 ")
# pd.options.display.max_columns = 999

# driver_distraction_code
# driver_condition_code
# address
# city
# zip code
# date of birth
# sex
# helmet_code
# air_bag_deployment_code
# ejected
# suspected_alcohol_use_code
# suspected_drug_use_code
# injurty_severity
# ems_transport_type
# driver_action_1

# how many visitors/out-of-towners get into car accidents?

Unnamed: 0_level_0,report_number,vehicle_number,person_number,recommend_re_exam,driver_license_state,dl_expiration_date,driver_license_type,driver_distraction_code,driver_condition_code,required_endorsements,vision_obstructed,insurance_company,city,state,zip_code,date_of_birth,sex,helmet_code,restraint_system_code,eye_protection_code,air_bag_deployment_code,ejected,suspected_alcohol_use_code,alc_tested_code,alc_test_type_code,alc_test_results,blood_alcohol_content,suspected_drug_use_code,drug_tested_code,drug_test_type_code,drug_test_results,injury_severity,ems_transport_type,ems_name,ems_run_number,injured_taken_location,seat_position_code,row_position_code,other_position_code,driver_action_1,driver_action_2,driver_action_3,driver_action_4
crash_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
2018,1329812,1329812,1329812,1134582,1285358,1271135,1290593,1320563,1319423,1011907,1296655,1303474,1305706,1302528,1300361,1284898,1309237,337510,1295624,795041,1293698,1315753,1319572,504084,11254,11003,6991,1319245,498962,3894,3724,1302700,1305247,123672,75720,96159,1318754,1317533,714258,1307302,53119,16583,10773


# Drivers data
report number -- there could be multiple drivers under one report number

## Import Data

In [69]:
import os

In [70]:
# define path where the driver tables are stored
path = "../data/raw/"

# create empty list
li = []

# for loop -- loop through each file and save the data
for filename in os.listdir(path):
    temp_df = pd.read_csv(os.path.join(path, filename), index_col=None, header=0) \
    .drop(['dl_number', 'policy_number', 'first_name', 'middle_name', 'last_name', 'suffix', 'phone_number', 'address'], axis=1)
    li.append(temp_df)
    
# create dataframe for the drivers table, 2018 - 2021
driver_df = pd.concat(li, axis=0, ignore_index=True)

## Filter Data

In [63]:
# create list of record numbers from the events table
id_list = list(df2018['report_number'])

In [72]:
# remove rows from the list that don't match the ones from the events table
driver_df = driver_df.query(" report_number in @id_list")

## Clean Data

In [83]:
# change date of birth type to date


In [84]:
# create age of driver column

In [78]:
# create determine age function
def calculateAge(birthDate):
    today = date.today()
    age = today.year - birthDate.year - ((today.month, today.day) < (birthDate.month, birthDate.day))
 
    return age