## Crash Data Wrangling Jupyter Notebook

**Authors:** Eric Englin, Meredith Raymer, and Sophie Kaye

**Date:** 6/23/22

**Purpose:** This notebook will clean pre-combined IMARS data and filter data tables for data attributes to be used in analysis. The smaller cleaned tables are then combined to create a dataframe that is joinable with the reciprocal CDS dataset.

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point, LineString, Polygon

In [None]:
# All relative paths used below (./IMARS, ./shapefiles, output CSVs) are resolved from this folder.
# NOTE(review): machine-specific absolute path - update it before running on another computer.
myworkingdirectory = r"C:\Users\Sophie.Kaye\Desktop\NPS Safety"
os.chdir(myworkingdirectory)

In [None]:
# Folder holding the raw IMARS inputs, relative to the working directory
path = './IMARS'

# Names of everything in the input folder (files and sub-folders alike)
files = os.listdir(path)

# Print the folder contents as a quick check that the expected input files are present
for f in files:
    print(f)

In [None]:
# Open the workbook once and parse all four sheets in a single call instead of
# re-reading the same Excel file for every sheet. Passing a list to sheet_name
# makes read_excel return a dict of DataFrames keyed by sheet name.
imars_workbook = "./IMARS/IMARS 2012 - 2021.xlsx"
imars_sheets = pd.read_excel(
    imars_workbook,
    sheet_name=["Location", "Person", "Vehicle", "Classification"],
)
imars_crash = imars_sheets["Location"]                # crash location table
imars_passenger = imars_sheets["Person"]              # person table
imars_vehicle = imars_sheets["Vehicle"]               # vehicle table
imars_crash_details = imars_sheets["Classification"]  # crash classification table

In [None]:
imars_crash.shape

In [None]:
imars_passenger.shape

In [None]:
imars_vehicle.shape

In [None]:
imars_crash_details.shape

In [None]:
# note that crash_details and crash don't have the same number of entries to begin with

In [None]:
imars_passenger.columns

In [None]:
imars_crash.columns

In [None]:
imars_crash_details.columns

In [None]:
imars_vehicle.columns

# General Data Cleaning
Note that some crash records are missing park, region, and/or roadway values, with no apparent pattern to which records are affected.

### CHECK FOR MISSING RECORD NUMBERS

In [None]:
imars_crash.shape

In [None]:
imars_crash.dropna(subset=['IMARS_Record_No']).shape

In [None]:
# Keep only crash rows that carry a record number; it is the join key for every other table
imars_crash = imars_crash.loc[imars_crash['IMARS_Record_No'].notna()]

In [None]:
imars_passenger.shape

In [None]:
imars_passenger.dropna(subset=['IMARS_Record_No']).shape

In [None]:
# Keep only person rows that carry a record number; rows without one cannot be tied to a crash
imars_passenger = imars_passenger.loc[imars_passenger['IMARS_Record_No'].notna()]

In [None]:
imars_vehicle.shape

In [None]:
imars_vehicle.dropna(subset=['IMARS_Record_No']).shape

In [None]:
# Keep only vehicle rows that carry a record number; rows without one cannot be tied to a crash
imars_vehicle = imars_vehicle.loc[imars_vehicle['IMARS_Record_No'].notna()]

In [None]:
imars_crash_details.shape

In [None]:
imars_crash_details.dropna(subset=['IMARS_Record_No']).shape

In [None]:
# Keep only classification rows that carry a record number; rows without one cannot be tied to a crash
imars_crash_details = imars_crash_details.loc[imars_crash_details['IMARS_Record_No'].notna()]

### REMOVE DUPLICATES

In [None]:
# Where a record number appears more than once, keep only its first row
repeated_crash_record = imars_crash.duplicated(subset=['IMARS_Record_No'])
imars_crash = imars_crash[~repeated_crash_record]
imars_crash.shape

In [None]:
# Where a record number appears more than once, keep only its first row
repeated_details_record = imars_crash_details.duplicated(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details[~repeated_details_record]
imars_crash_details.shape

In [None]:
# note that crash and crash details still don't have the same number of entries (although they did with the older archived input data...)

### REVERT FLIPPED LAT/LONG

In [None]:
# A latitude whose magnitude exceeds 70 is treated as a longitude that was entered in the
# wrong field, so the two coordinate values are swapped back for those rows.
need_revert = imars_crash['Latitude'].abs().gt(70)
swapped_coords = imars_crash.loc[need_revert, ['Longitude', 'Latitude']].to_numpy()
imars_crash.loc[need_revert, ['Latitude', 'Longitude']] = swapped_coords

### ADJUST SIGNS TO CORRECT HEMISPHERE

In [None]:
# Force every latitude to be positive and every longitude to be negative.
# NOTE(review): this presumes all crashes lie in the northern and western hemispheres;
# confirm that no park unit with southern latitudes or eastern longitudes is in scope.
lat_is_negative = imars_crash['Latitude'] < 0
imars_crash.loc[lat_is_negative, 'Latitude'] = -imars_crash.loc[lat_is_negative, 'Latitude']
lon_is_positive = imars_crash['Longitude'] > 0
imars_crash.loc[lon_is_positive, 'Longitude'] = -imars_crash.loc[lon_is_positive, 'Longitude']

# Add Parks to Crash Table

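**Note:** IMARS does not record a park unit for every crash. Where the park is missing, it is assigned from the Latitude and Longitude fields in the imars_crash dataset (via a spatial join with park boundaries) or, when coordinates are also missing, identified manually from other location fields such as the road name.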

In [None]:
imars_crash.shape

In [None]:
imars_crash.dropna(subset=['Park']).shape

In [None]:
# Crashes whose park is already identified in the source data
imars_crash_parks = imars_crash[imars_crash['Park'].notna()]
imars_crash_parks.shape

In [None]:
imars_crash.loc[imars_crash['Park'].isnull()==False].shape

In [None]:
# 12205 of 15130 crash entries have the park already explicitly identified

In [None]:
imars_crash.loc[imars_crash['Park'].isnull()].shape

In [None]:
# 2925 of the 15130 crash entries have no park identification

In [None]:
# Crashes with no park recorded but with both coordinates present: candidates for the spatial join
park_missing = imars_crash['Park'].isna()
both_coords_present = imars_crash[['Latitude', 'Longitude']].notna().all(axis=1)
imars_crash_coords = imars_crash.loc[park_missing & both_coords_present]
imars_crash_coords.shape

In [None]:
# of the 2925 crash entries without park identification, 591 have lat/long coordinates from which park can be assigned using the shapefile
# the sum of crashes with pre-identified parks and crashes that can be assigned a park using lat/long coordinates 
# results in the following total of possible usable crash entries after the spatial join:
12205+591

In [None]:
# Narrow step by step: no park -> also no latitude -> also no longitude
imars_crash_noparks = imars_crash[imars_crash['Park'].isna()]
imars_crash_noparks_nolat = imars_crash_noparks[imars_crash_noparks['Latitude'].isna()]
imars_crash_noparks_nocoords = imars_crash_noparks_nolat[imars_crash_noparks_nolat['Longitude'].isna()]
imars_crash_noparks_nocoords.shape

In [None]:
# 2334 of the 15130 crash entries have neither a park identification nor lat/long coordinates

In [None]:
# Free-text location fields that may still reveal the park for a crash lacking both park and coordinates
locator_columns = ['Linked_Common_Name','Linked_Address','NEAR_route_street_road_name',
                   'At_Intersecting_route_street_road_name']
mask = imars_crash_noparks_nocoords[locator_columns].notna()
# keep a crash when at least one of those fields is populated
imars_noparks_nocoords_someinfo = imars_crash_noparks_nocoords[mask.any(axis=1)]
imars_noparks_nocoords_someinfo.shape

In [None]:
# of the 2334 crash entries without park identification or lat/long coordinates, 109 have other identifiable information 
# (e.g., roadway name) which could be used to identify the park name
# print file containing only this subset of crashes for manual park identification
imars_noparks_nocoords_someinfo.to_csv("./IMARS_noparks_nocoords_someinfo.csv",index=False)

## spatial join to assign park names

In [None]:
# Turn the park-less crashes that have coordinates into point geometries.
# points_from_xy takes x first, so Longitude is passed before Latitude.
imars_crash_coords_geo=gpd.GeoDataFrame(imars_crash_coords, geometry=gpd.points_from_xy(imars_crash_coords.Longitude, 
                                                                             imars_crash_coords.Latitude))

In [None]:
# Park boundary polygons used to assign a park unit to crashes that only have coordinates
filename = "./shapefiles/NPS_-_Land_Resources_Division_Boundary_and_Tract_Data_Service.geojson"
# Hand the path straight to geopandas so it opens and closes the file itself;
# the previous open() call left a file handle that was never closed.
parks = gpd.read_file(filename)

In [None]:
# The points were built from raw lat/long columns and carry no CRS; label them as EPSG:4326
# so they can be joined with the park polygons.
# NOTE(review): assumes the IMARS coordinates really are WGS84 lat/long - confirm.
imars_crash_coords_geo.crs = "EPSG:4326"

In [None]:
# Put the park polygons in the same CRS as the crash points (EPSG:4326)
parks = parks.to_crs(epsg=4326)
# Grow every park polygon by 0.01 so crashes just outside a boundary are still matched.
# Because the CRS is geographic, the buffer distance is in degrees, not metres.
# NOTE(review): geopandas warns that buffering in a geographic CRS is inexact - confirm
# that 0.01 degrees is an acceptable tolerance for this analysis.
parks['geometry']=parks['geometry'].buffer(0.01)

# Left join keeps every crash point; a point with no intersecting park gets nulls in the
# park columns, and a point intersecting several buffered parks yields one row per park.
imars_crash_coords_geo_withparknames=gpd.sjoin(imars_crash_coords_geo,parks,how="left", predicate='intersects')
imars_crash_coords_geo_withparknames.head()

In [None]:
# A crash that intersected more than one park polygon appears once per match after the
# spatial join; keep only the first row for each record number.
# NOTE(review): "first" is whatever order the join returned, not necessarily the best match.
imars_crash_coords_geo_withparknames2 = imars_crash_coords_geo_withparknames.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_coords_geo_withparknames2.shape

In [None]:
# all 591 crash entries were processed in the spatial join

In [None]:
# Drop the geometry column and go back to a plain pandas DataFrame
imars_crash_withparknames = pd.DataFrame(imars_crash_coords_geo_withparknames2.drop(columns='geometry'))
# Fill the (previously empty) Park column with the unit code of the matched park polygon;
# crashes that matched no polygon stay null here.
imars_crash_withparknames['Park']= imars_crash_withparknames['UNIT_CODE']
imars_crash_withparknames.shape

In [None]:
# Keep only the crashes for which the spatial join actually found a park
park_was_assigned = imars_crash_withparknames['Park'].notna()
imars_crash_withparknames_NoDupsorNulls = imars_crash_withparknames[park_was_assigned]
imars_crash_withparknames_NoDupsorNulls.shape

In [None]:
# of the 591 crash entries without parks identified, 505 now have parks assigned from spatial join
# so we expect the total ("expanded") dataset to be:
12205+505

In [None]:
imars_crash_expanded = pd.concat([imars_crash_parks,imars_crash_withparknames_NoDupsorNulls])
imars_crash_expanded.shape

In [None]:
# safety check: every row of the combined dataset must have a park; discard any that do not
imars_crash_expanded = imars_crash_expanded[imars_crash_expanded['Park'].notna()]
imars_crash_expanded.shape

In [None]:
# safety check: each crash record number may appear only once in the combined dataset (first row wins)
already_seen = imars_crash_expanded.duplicated(subset=['IMARS_Record_No'])
imars_crash_expanded = imars_crash_expanded[~already_seen]
imars_crash_expanded.shape

In [None]:
# open file Chris populated in which park names were manually added to crash data containing identifiable info (e.g., road name)
imars_new_parknames = pd.read_csv("./IMARS/IMARS_noparks_nocoords_someinfo_parks_filled.csv")

In [None]:
# keep only the manually reviewed crashes for which a park name could be filled in
imars_new_parknames = imars_new_parknames[imars_new_parknames['Park'].notna()]
imars_new_parknames.shape

In [None]:
# 80 of 109 crashes were able to have park names manually added using road names and should be combined with crash data 
# containing park names from original data and spatial assignment for a total of 12790 crashes in the final dataset

In [None]:
imars_crash_expanded = pd.concat([imars_crash_expanded,imars_new_parknames])
imars_crash_expanded.shape

In [None]:
# safety check after adding the manually identified parks: one row per crash record number (first row wins)
repeated_after_manual_fill = imars_crash_expanded.duplicated(subset=['IMARS_Record_No'])
imars_crash_expanded = imars_crash_expanded[~repeated_after_manual_fill]
imars_crash_expanded.shape

## Create new region column
The imars_crash table currently contains two columns with region information (`Region` and `REGION`), but neither is fully populated. This section creates a new region column (`RGN`) and populates it for every crash by looking up the crash's park name in a park-to-region lookup table.

In [None]:
# load the park lookup table and rename its key/region columns to the names used in this notebook
lookup_renames = {'UNIT_CODE':'Park','REGION':'RGN'}
park_info = pd.read_csv("./crash database mapping/Park_Info_Table.csv").rename(columns=lookup_renames)
park_info.columns

In [None]:
imars_crash_expanded.columns

In [None]:
imars_crash_expanded.shape

In [None]:
# add RGN column from lookup table to IMARS crash database, joining the two datasets based on park name
# (left join: every crash is kept; crashes whose park is absent from the lookup get a null RGN)
imars_crash_expanded = pd.merge(imars_crash_expanded, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
# NOTE(review): drop_duplicates() only removes rows identical in every column, so a park listed
# twice in the lookup table with different RGN values would still add rows - compare the shape
# printed below with the shape before the merge.
imars_crash_expanded = imars_crash_expanded.drop_duplicates() 
imars_crash_expanded.shape

In [None]:
# check to see if any crashes were not assigned a region
imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()].shape

In [None]:
#no_region = imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()]
#no_region['Park'].value_counts()

In [None]:
imars_crash_expanded['RGN'].value_counts()

In [None]:
# the new RGN column supersedes the two partially populated region columns, so discard them
obsolete_region_columns = ['Region', 'REGION']
imars_crash_expanded = imars_crash_expanded.drop(obsolete_region_columns, axis=1)

In [None]:
imars_crash_expanded.shape

# Filter for Necessary Fields, Group by IMARS_RECORD_NO

## Passenger table

In [None]:
imars_passenger.columns

In [None]:
# one person per passenger data entry
imars_passenger['NUM_OCC'] = 1
# rename column to match CDS column name
imars_passenger['INCID_NO'] = imars_passenger['IMARS_Record_No']

In [None]:
imars_passenger.shape

In [None]:
# Slim person table: crash id plus the per-person occupant count.
# .copy() makes this an independent dataframe, so the flag columns added to it later are
# written to the slim table itself rather than to a slice of imars_passenger
# (which triggers pandas' SettingWithCopyWarning).
imars_passenger_slim = imars_passenger[['INCID_NO', 'NUM_OCC']].copy()

In [None]:
imars_passenger_slim.shape

In [None]:
imars_passenger_slim.head()

In [None]:
imars_passenger['Injury_Severity'].value_counts().sort_index()

In [None]:
# conservative estimate for total number of injuries (including possible injuries):
1826+1163+486

In [None]:
imars_passenger['Driver_Distraction'].value_counts().sort_index()

In [None]:
# total number of distracted motorists:
20+17+76+166+378

In [None]:
imars_passenger['Non_motorist_distraction'].value_counts().sort_index()

In [None]:
# total number of distracted non-motorists:
2+1+2+16

In [None]:
# set a 0/1 flag column for each injury severity level (new column name -> Injury_Severity label)
severity_flags = {
    'No Injury': '01. No injury',
    'Possible Injury': '02. Possible injury',
    'Non-incapacitating Injury': '03. Non-incapacitating injury',
    'Incapacitating Injury': '04. Incapacitating injury',
    'Fatality': '05. Fatal',
    'Unknown Injury': '99. Unknown',
}
for flag_column, severity_label in severity_flags.items():
    imars_passenger_slim[flag_column] = np.where(imars_passenger['Injury_Severity'] == severity_label, 1, 0)

# driver distraction categories that count as a distracted motorist
motorist_distractions = [
    '02. Talking on electronic communication device',
    '03. Texting on electronic communication device',
    '04. Other electronic device (navigation, DVD player, etc.)',
    '05. Other inside the vehicle (eating, personal hygiene, etc.)',
    '06. Outside the vehicle (includes unspecified external distractions)',
]
is_distracted_motorist = imars_passenger['Driver_Distraction'].isin(motorist_distractions)
imars_passenger_slim['Motorist Distraction'] = np.where(is_distracted_motorist, 1, 0)

# non-motorist distraction categories that count as a distracted non-motorist
non_motorist_distractions = [
    '02. Talking on electronic communications device',
    '03. Texting on electronic communications device',
    '04. Other electronic device (GPS, electronic music device, etc.)',
    '05. Other distraction (looking at scenery, eating, day-dreaming, etc.)',
]
is_distracted_non_motorist = imars_passenger['Non_motorist_distraction'].isin(non_motorist_distractions)
imars_passenger_slim['Non-motorist Distraction'] = np.where(is_distracted_non_motorist, 1, 0)

In [None]:
imars_passenger_slim.columns

In [None]:
# check new flag column against sum of distracted motorist attributes - looks good!
imars_passenger_slim['Motorist Distraction'].sum()

In [None]:
# check new flag column against sum of distracted non-motorist attributes - looks good!
imars_passenger_slim['Non-motorist Distraction'].sum()

In [None]:
# make sure all columns contain data as you would expect (i.e., 0 and 1)
#imars_passenger_slim.to_csv("./imars_passenger_slim_test.csv",index=False)

In [None]:
# create new column for number of fatalities to match CDS
imars_passenger_slim['Num_Fatalities'] = imars_passenger_slim['Fatality']
# check to make sure this matches the value count from above - looks good!
imars_passenger_slim['Num_Fatalities'].sum()

In [None]:
# create new column for number of injuries to match CDS:
# possible, non-incapacitating and incapacitating injuries all count as an injury
injury_flag_columns = ['Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury']
imars_passenger_slim['Num_Injuries'] = imars_passenger_slim[injury_flag_columns].sum(axis=1)
# check to make sure this matches the value count sum from above - looks good!
imars_passenger_slim['Num_Injuries'].sum()

In [None]:
imars_passenger_slim.head()

In [None]:
imars_passenger_slim.shape

In [None]:
# collapse multiple rows for each person involved in the crash into a single row for each crash
# occupancy, injuries, and fatalities should be summed as total numbers per crash ("sum" function)
# only one distraction flag should exist per crash to not double count data attribute; "max" sets
# the flag to 1 when ANY person in the crash was distracted ("first" would only look at whichever
# person happens to be listed first and miss distraction recorded on a later row)
passenger_aggregations = {
    'NUM_OCC': 'sum',
    'No Injury': 'sum',
    'Possible Injury': 'sum',
    'Non-incapacitating Injury': 'sum',
    'Incapacitating Injury': 'sum',
    'Fatality': 'sum',
    'Unknown Injury': 'sum',
    'Motorist Distraction': 'max',
    'Non-motorist Distraction': 'max',
    'Num_Fatalities': 'sum',
    'Num_Injuries': 'sum',
}
imars_passenger_slim_agg = imars_passenger_slim.groupby(by=['INCID_NO']).agg(passenger_aggregations).reset_index()

In [None]:
# result should be no more than 15302 rows (number of useable crash records)
imars_passenger_slim_agg.shape

## Crash Details Table

In [None]:
# rename column to match CDS column name
imars_crash_details['INCID_NO'] = imars_crash_details['IMARS_Record_No']

# Slim crash details table, starting with the crash id only.
# .copy() makes this an independent dataframe, so the flag columns added to it later are
# written to the slim table itself rather than to a slice of imars_crash_details
# (which triggers pandas' SettingWithCopyWarning).
imars_crash_details_slim = imars_crash_details[['INCID_NO']].copy()

In [None]:
# find relevant data attributes in the following columns for which to create flags

In [None]:
imars_crash_details['Injury_or_Fatal_Crash'].value_counts().sort_index()

In [None]:
imars_crash_details['First_Harmful_Event_Type'].value_counts().sort_index()

In [None]:
pd.options.display.max_rows = 10000000
imars_crash_details['First_Harmful_Event'].value_counts().sort_index()

In [None]:
# total roadway departure crashes (right and left):
797+259

In [None]:
# total VRU crashes (pedestrian, bike, and pedacycle):
71+103+1

In [None]:
# total "other accident class" crashes (i.e., cargo and maintenance equipment)
27+30

In [None]:
imars_crash_details['Environmental_Contributing_Circumstances'].value_counts().sort_index()

In [None]:
# set flags for each relevant data attribute
# each entry: (new 0/1 flag column, source column in imars_crash_details, source values that set the flag)
crash_detail_flag_specs = [
    ('Injury or Fatal', 'Injury_or_Fatal_Crash', ["01. Yes"]),
    ('Collision w Animal', 'First_Harmful_Event_Type', ["Collision with animals"]),
    ('Collision w Fixed Object', 'First_Harmful_Event_Type', ["Collision with fixed object"]),
    ('Non-Collision', 'First_Harmful_Event_Type', ["Non-collision"]),
    ('Other Accident Class', 'First_Harmful_Event',
     ['23. Struck by falling, shifting cargo or anything set in motion by MV',
      '24. Work zone/maintenance equipment']),
    ('Rollover', 'First_Harmful_Event', ["01. Overturn/rollover"]),
    ('Collision w Other Vehicle', 'First_Harmful_Event', ['21. Motor vehicle in transport']),
    ('Collision w Parked Vehicle', 'First_Harmful_Event', ["22. Parked motor vehicle"]),
    ('Collision w Train', 'First_Harmful_Event', ["20. Railway vehicle"]),
    ('Collision w Other Object', 'First_Harmful_Event', ["25. Other non-fixed object"]),
    ('Collision w Unknown', 'First_Harmful_Event_Type', ["Unknown"]),
    ('Roadway Departure', 'First_Harmful_Event',
     ['07. Ran off the road right', '08. Ran Off the road left']),
    ('Avoiding Animal', 'First_Harmful_Event', ["13. Avoiding an animal on road"]),
    ('Falling Object', 'First_Harmful_Event', ["14. Thrown or falling object"]),
    ('Pedestrian', 'First_Harmful_Event', ["17. Pedestrian"]),
    ('Bicycle', 'First_Harmful_Event', ["18. Bicycle"]),
    ('Pedacycle', 'First_Harmful_Event', ["19. Other pedacycle"]),
    ('VRU', 'First_Harmful_Event', ['17. Pedestrian', '18. Bicycle', '19. Other pedacycle']),
    ('HorseLlama', 'First_Harmful_Event', ["26. Horse/llama"]),
    ('Cow', 'First_Harmful_Event', ["27. Cow"]),
    ('Deer', 'First_Harmful_Event', ["28. Deer"]),
    ('Elk', 'First_Harmful_Event', ["29. Elk"]),
    ('Moose', 'First_Harmful_Event', ["30. Moose"]),
    ('Buffalo', 'First_Harmful_Event', ["31. Buffalo"]),
    ('Bear', 'First_Harmful_Event', ["32. Bear"]),
    ('Antelope', 'First_Harmful_Event', ["33. Antelope"]),
    ('SheepGoats', 'First_Harmful_Event', ["34. Sheep/goats"]),
    ('OtherWild', 'First_Harmful_Event', ["36. Other wild animal (crocodile, birds, coyote, etc.)"]),
    ('OtherDomestic', 'First_Harmful_Event', ['37. Other domestic (dog, cat, etc.)']),
    ('Rock in Roadway', 'First_Harmful_Event', ['60. Rock, boulder, rock slide']),
]
for flag_column, source_column, matching_values in crash_detail_flag_specs:
    flag_is_set = imars_crash_details[source_column].isin(matching_values)
    imars_crash_details_slim[flag_column] = np.where(flag_is_set, 1, 0)

# the environmental circumstances are matched by keyword rather than by exact value;
# records with no circumstance recorded count as "no match"
environmental_circumstances = imars_crash_details['Environmental_Contributing_Circumstances']
for flag_column, keyword in (('Animal in Roadway', "Animal"), ('Rockfall', "Rockfall")):
    keyword_found = environmental_circumstances.str.contains(keyword, na=False)
    imars_crash_details_slim[flag_column] = np.where(keyword_found, 1, 0)

In [None]:
pd.options.display.max_columns = 10000000
imars_crash_details_slim.head()

In [None]:
# check to make sure this matches the value count sum from above - looks good!
imars_crash_details_slim['Roadway Departure'].sum()

In [None]:
# check to make sure this matches the value count sum from above - looks good!
imars_crash_details_slim['VRU'].sum()

In [None]:
# check to make sure this matches the value count sum from above - looks good!
imars_crash_details_slim['Other Accident Class'].sum()

In [None]:
# check new flag column sum against value counts - looks good!
imars_crash_details_slim['Animal in Roadway'].sum()

In [None]:
# check new flag column sum against value counts - looks good!
imars_crash_details_slim['Collision w Animal'].sum()

In [None]:
imars_crash_details_slim.shape

In [None]:
imars_crash_details_slim.columns

In [None]:
# make sure all columns contain data as you would expect (i.e., 0 and 1)
#imars_crash_details_slim.to_csv("./imars_crash_details_slim_test.csv",index=False)

## Vehicle Table

In [None]:
imars_vehicle.columns

In [None]:
imars_vehicle.shape

In [None]:
# rename column to match CDS column name
imars_vehicle['INCID_NO'] = imars_vehicle['IMARS_Record_No']
# Slim vehicle table, starting with the crash id only.
# .copy() makes this an independent dataframe, so the flag columns added to it later are
# written to the slim table itself rather than to a slice of imars_vehicle
# (which triggers pandas' SettingWithCopyWarning).
imars_vehicle_slim = imars_vehicle[['INCID_NO']].copy()
imars_vehicle_slim.head()

In [None]:
imars_vehicle['Traffic_Control'].value_counts().sort_index()

In [None]:
# number of vehicle entries in which a crash occurred at a known site other than a crosswalk (or person)
12723+926+232+28+790+1246+253+135+5+527

In [None]:
# set flags for each relevant data attribute
# one 0/1 column per posted speed category (new column name -> Posted_Speed label)
posted_speed_flags = {
    '5_mph': '01. 5 mph',
    '10_mph': '02. 10 mph',
    '15_mph': '03. 15 mph',
    '20_mph': '04. 20 mph',
    '25_mph': '05. 25 mph',
    '30_mph': '06. 30 mph',
    '35_mph': '07. 35 mph',
    '40_mph': '08. 40 mph',
    '45_mph': '09. 45 mph',
    '50_mph': '10. 50 mph',
    '55_mph': '11. 55 mph',
    '60_mph': '12. 60 mph',
    '65_mph': '13. 65 mph',
    '70_mph': '14. 70 mph',
    '75_mph': '15. 75 mph',
    'no_posted_speed': '98. Not posted',
}
for flag_column, speed_label in posted_speed_flags.items():
    imars_vehicle_slim[flag_column] = np.where(imars_vehicle['Posted_Speed'] == speed_label, 1, 0)

# crosswalk flag: the traffic control at the crash site was a pedestrian crossing
at_crosswalk = imars_vehicle['Traffic_Control'] == '06. Pedestrian crossing'
imars_vehicle_slim['Crosswalk'] = np.where(at_crosswalk, 1, 0)

# traffic control values counted as a known site other than a crosswalk
non_crosswalk_controls = [
    '01. None',
    '02. Stop sign',
    '03. Yield sign',
    '04. Flashing traffic signal',
    '05. Traffic signal',
    '08. No passing zone',
    '09. Warning signs',
    '10. Temporary traffic control',
    '12. RR crossing device/signal',
    '13. Other',
]
outside_crosswalk = imars_vehicle['Traffic_Control'].isin(non_crosswalk_controls)
imars_vehicle_slim['Outside a Crosswalk'] = np.where(outside_crosswalk, 1, 0)


In [None]:
# check against value count sum from above - looks good!
imars_vehicle_slim['Outside a Crosswalk'].sum()

In [None]:
# check new flag column sum against value counts - looks good!
imars_vehicle_slim['Crosswalk'].sum()

In [None]:
imars_vehicle_slim.head()

In [None]:
imars_vehicle_slim.shape

In [None]:
imars_vehicle_slim.columns

In [None]:
# make sure all columns contain data as you would expect (i.e., 0 and 1)
#imars_vehicle_slim.to_csv("./imars_vehicle_slim_test.csv",index=False)

In [None]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# only one speed limit or crosswalk flag should exist per crash to not double count data attribute ("first" function)
# NOTE(review): because the flags are always 0 or 1 (never null), "first" keeps the values of
# whichever vehicle is listed first for the crash; a speed limit or crosswalk recorded only on a
# later vehicle row is not carried over - confirm this is the intended rule.
imars_vehicle_slim_agg = imars_vehicle_slim.groupby(by=['INCID_NO']).first().reset_index()

In [None]:
# result should be no more than 15302 rows (number of useable crash records)
imars_vehicle_slim_agg.shape

## Crash Table 

In [None]:
imars_crash_expanded.columns

In [None]:
imars_crash_expanded.head()

In [None]:
# parse the combined crash timestamp once, then derive the CDS-style date, time and year columns
# (all three derived columns are text, formatted with the patterns listed below)
crash_timestamps = pd.to_datetime(imars_crash_expanded['Crash_Date_Time'],
                                  format='%Y%m%d %H:%M:%S:%f')
imars_crash_expanded['Crash_Date_Time'] = crash_timestamps
derived_columns = (
    ('CRASH_DATE', '%Y-%m-%d'),
    ('CRASH_TIME', '%H%M'),
    ('CRASH_YEAR', '%Y'),
)
for column_name, pattern in derived_columns:
    imars_crash_expanded[column_name] = crash_timestamps.dt.strftime(pattern)
imars_crash_expanded.head()

In [None]:
# cleaned data, all original rows and columns
imars_crash_expanded.to_csv("./IMARS_crash_full.csv",index=False)

In [None]:
# rename column to match CDS column name
# The id must come from the crash table's OWN record number. Taking it from imars_crash_details
# aligns the two tables by row label only, and the row labels of imars_crash_expanded no longer
# correspond to those of imars_crash_details (rows were dropped, concatenated and re-indexed by
# the region merge), so crashes would receive the record number of an unrelated crash or no
# record number at all.
imars_crash_expanded['INCID_NO'] = imars_crash_expanded['IMARS_Record_No']
# parse out crucial basic crash info 
basic_crash_columns = ['INCID_NO','Latitude','Longitude','Park','RGN','CRASH_DATE','CRASH_TIME','CRASH_YEAR']
imars_crash_slim = imars_crash_expanded[basic_crash_columns]
# upper-case coordinate names to match the CDS column names
imars_crash_slim = imars_crash_slim.rename(columns={'Latitude':'LATITUDE','Longitude':'LONGITUDE'})

In [None]:
imars_crash_slim.shape

In [None]:
# cleaned data, basic crash info columns
imars_crash_slim.to_csv("./IMARS_crash_slim.csv",index=False)

## merge crash table with other IMARS tables

In [None]:
# Attach the basic crash info to every person row. The right join keeps every crash in
# imars_crash_slim, including crashes that have no person rows.
imars_passenger_merged = imars_passenger.merge(imars_crash_slim, how='right', on='INCID_NO')
# NOTE(review): drop_duplicates() returns a new dataframe and its result is discarded here, so
# this line currently has no effect. Before assigning it back, confirm that two people in the
# same crash can never have identical values in every column, otherwise real people would be removed.
imars_passenger_merged.drop_duplicates() 
imars_passenger_merged.head()

In [None]:
imars_passenger_merged.shape

In [None]:
# cleaned, non-aggregated dataset (one row per person involved, all original columns) merged to include basic crash info columns
imars_passenger_merged.to_csv("./IMARS_passenger_full.csv",index=False)

In [None]:
# Attach the basic crash info to the per-crash person summary. The right join keeps every crash
# in imars_crash_slim, including crashes that have no person rows.
imars_passenger_slim_merged = imars_passenger_slim_agg.merge(imars_crash_slim, how='right', on='INCID_NO')
# drop_duplicates() returns a new dataframe, so the result has to be assigned back to take effect
imars_passenger_slim_merged = imars_passenger_slim_merged.drop_duplicates()
# result should have no more than 12790 rows and exactly passenger_slim + crash_slim -1 columns (19)
imars_passenger_slim_merged.shape

In [None]:
imars_passenger_slim_merged.head()

In [None]:
# cleaned and aggregated distilled dataset (one row per crash, only columns of interest) merged to include basic crash info
imars_passenger_slim_merged.to_csv("./IMARS_passenger_slim.csv",index=False)

In [None]:
# count the crashes that found no match in the person table:
# their person-derived columns (here 'Fatality') are null after the right join
no_person_data = imars_passenger_slim_merged['Fatality'].isna()
imars_passenger_slim_merged_null = imars_passenger_slim_merged[no_person_data]
imars_passenger_slim_merged_null.shape

In [None]:
# 986 of 12790 crashes are missing passenger table entries (although the crash table was populated)

In [None]:
# Attach the basic crash info to every vehicle row. The right join keeps every crash in
# imars_crash_slim, including crashes that have no vehicle rows.
imars_vehicle_merged = imars_vehicle.merge(imars_crash_slim, how='right', on='INCID_NO')
# NOTE(review): drop_duplicates() returns a new dataframe and its result is discarded here, so
# this line currently has no effect. Before assigning it back, confirm that two vehicles in the
# same crash can never have identical values in every column, otherwise real vehicles would be removed.
imars_vehicle_merged.drop_duplicates() 
imars_vehicle_merged.head()

In [None]:
imars_vehicle_merged.shape

In [None]:
# cleaned, non-aggregated dataset (one row per vehicle involved, all original columns) merged to include basic crash info
imars_vehicle_merged.to_csv("./IMARS_vehicle_full.csv",index=False)

In [None]:
# Attach the basic crash info to the per-crash vehicle summary. The right join keeps every crash
# in imars_crash_slim, including crashes that have no vehicle rows.
imars_vehicle_slim_merged = imars_vehicle_slim_agg.merge(imars_crash_slim, how='right', on='INCID_NO')
# drop_duplicates() returns a new dataframe, so the result has to be assigned back to take effect
imars_vehicle_slim_merged = imars_vehicle_slim_merged.drop_duplicates()
# result should have no more than 12790 rows and exactly vehicle_slim + crash_slim -1 columns (26)
imars_vehicle_slim_merged.shape

In [None]:
imars_vehicle_slim_merged.head()

In [None]:
# cleaned and aggregated dataset (one row per crash, only columns of interest) merged to include basic crash info
imars_vehicle_slim_merged.to_csv("./IMARS_vehicle_slim.csv",index=False)

In [None]:
# count the crashes that found no match in the vehicle table:
# their vehicle-derived columns (here '35_mph') are null after the right join
no_vehicle_data = imars_vehicle_slim_merged['35_mph'].isna()
imars_vehicle_slim_merged_null = imars_vehicle_slim_merged[no_vehicle_data]
imars_vehicle_slim_merged_null.shape

In [None]:
# 986 of 12790 crashes are missing vehicle table entries (although the crash table was populated)

In [None]:
# Attach the basic crash info to the crash details. The right join keeps every crash in
# imars_crash_slim, including crashes that have no crash details row.
imars_crash_details_merged = imars_crash_details.merge(imars_crash_slim, how='right', on='INCID_NO')
# Keep one row per crash id. The filtered frame is assigned back: the previous drop_duplicates()
# call discarded its result and therefore removed nothing. Rows without an INCID_NO are all kept,
# because a missing id does not mean they describe the same crash.
repeated_incident = (imars_crash_details_merged['INCID_NO'].notna()
                     & imars_crash_details_merged.duplicated(subset=['INCID_NO']))
imars_crash_details_merged = imars_crash_details_merged.loc[~repeated_incident]
imars_crash_details_merged.head()

In [None]:
imars_crash_details_merged.shape

In [None]:
# cleaned data, all original rows and columns
imars_crash_details_merged.to_csv("./IMARS_crash_details_full.csv",index=False)

In [None]:
# Attach the basic crash info to the crash detail flags. The right join keeps every crash in
# imars_crash_slim, including crashes that have no crash details row.
imars_crash_details_slim_merged = imars_crash_details_slim.merge(imars_crash_slim, how='right', on='INCID_NO')
# Keep one row per crash id. The filtered frame is assigned back: the previous drop_duplicates()
# call discarded its result and therefore removed nothing. Rows without an INCID_NO are all kept,
# because a missing id does not mean they describe the same crash.
repeated_incident_slim = (imars_crash_details_slim_merged['INCID_NO'].notna()
                          & imars_crash_details_slim_merged.duplicated(subset=['INCID_NO']))
imars_crash_details_slim_merged = imars_crash_details_slim_merged.loc[~repeated_incident_slim]
# result should have no more than 12790 rows and exactly crash_details_slim + crash_slim -1 columns (40)
imars_crash_details_slim_merged.shape

In [None]:
imars_crash_details_slim_merged.head()

In [None]:
# cleaned data, all rows, but only columns of interest
imars_crash_details_slim_merged.to_csv("./IMARS_crash_details_slim.csv",index=False)

In [None]:
# count the crashes that found no match in the crash details table:
# their detail-derived columns (here 'Injury or Fatal') are null after the right join
no_details_data = imars_crash_details_slim_merged['Injury or Fatal'].isna()
imars_crash_details_slim_merged_null = imars_crash_details_slim_merged[no_details_data]
imars_crash_details_slim_merged_null.shape

In [None]:
# 986 of 12790 crashes are missing crash details table entries (although the crash table was populated)
imars_crash_details_slim_merged_null['RGN'].value_counts()

## merge all IMARS slim (agg) tables into clean dataset for combination with CDS

In [None]:
imars_passenger_slim_agg.shape, imars_vehicle_slim_agg.shape, imars_crash_details_slim.shape, imars_crash_slim.shape

In [None]:
# the number of rows in the final dataset should be:
12790-986

In [None]:
# the number of columns in the final dataset should be:
12+19+33+8-3+1

In [None]:
# Chain of right joins on the crash id: vehicle summary -> person summary -> crash detail flags
# -> basic crash info. Each right join keeps every row of the table on the right, so the final
# table has a row for every crash in imars_crash_slim; columns coming from a table with no
# matching record are left null.
imars_slim_passenger_and_vehicle=imars_vehicle_slim_agg.merge(imars_passenger_slim_agg, how='right', on='INCID_NO')
imars_slim_passenger_vehicle_details=imars_slim_passenger_and_vehicle.merge(imars_crash_details_slim, how='right', on='INCID_NO')
imars_slim_all=imars_slim_passenger_vehicle_details.merge(imars_crash_slim, how="right", on='INCID_NO')
# drop_duplicates() returns a new dataframe, so the result has to be assigned back to take effect
imars_slim_all = imars_slim_all.drop_duplicates()
# add column for name of database
imars_slim_all['database']='IMARS'
# for Chris, to analyze data quality for all crashes with cleaned lat/long, park, and region georeferences, regardless of if all non-crash tables have records for each crash:
imars_slim_all.to_csv("./IMARS_slim_clean_forChris.csv",index=False)
imars_slim_all.shape

In [None]:
# keep only crashes that have records in every table, so the final data are fully populated.
# Testing a single vehicle column ('35_mph') is sufficient: the vehicle table is the left-most
# table in the chain of right joins that built imars_slim_all, so its columns are only populated
# for a crash that was also matched in the person and crash details tables.
imars_slim_alltablespopulated = imars_slim_all.dropna(subset=['35_mph'])
imars_slim_alltablespopulated.shape

In [None]:
imars_slim_alltablespopulated.head()

In [None]:
imars_slim_alltablespopulated.columns

In [None]:
imars_slim_alltablespopulated.to_csv("./IMARS_slim_clean.csv",index=False)