## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin, Meredith Raymer, and Sophie Kaye

**Date:** 6/23/22

**Purpose:** This notebook will clean pre-combined IMARS data to merge with CDS and STARS datasets. It will also pull out Yosemite data separately.

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point, LineString, Polygon

In [2]:
# Project root that every relative path in this notebook ("./IMARS", "./shapefiles", ...) is
# resolved against. The historical location remains the default, so existing runs are unaffected;
# set the NPS_SAFETY_DIR environment variable to run on another machine without editing the notebook.
myworkingdirectory = os.environ.get("NPS_SAFETY_DIR", r"C:\Users\Sophie.Kaye\Desktop\NPS Safety")
os.chdir(myworkingdirectory)

## Step 1: Concatenate Similar Datasets 

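**Note:** The source datasets were originally divided across three time periods, so they had to be combined before the tables could be joined. The notebook now reads a single pre-combined workbook (`IMARS 2012 - 2021.xlsx`); the per-period reads are kept below only as commented-out reference.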

In [3]:
# Inventory of the IMARS input folder, one entry per line, to confirm which files are available.
path = './IMARS'
files = os.listdir(path)

if files:
    print("\n".join(files))

Archive
IMARS 2012 - 2021.xlsx
IMARS Crash Module Data Dictionary_Redacted.docx
IMARS_noparks_nocoords_someinfo_parks_filled.csv


In [4]:
#imars_crash_1 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 2011-2015.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_2 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_3 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_4 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 1H2021.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_5 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 4-2018 to 12-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_6 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 2011-2015.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_7 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_8 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 2019.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_9 = pd.read_excel("./IMARS/USPP GOccIvPA Command Address 2020.xlsx", sheet_name = "Batch1-Result1")

#imars_passenger_1 = pd.read_excel("./IMARS/NPS MVCOccGPersonReport Command 2011 to 2015.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_2 = pd.read_excel("./IMARS/NPS MVCOccGPersonReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_3 = pd.read_excel("./IMARS/NPS MVCOccGPersonReport Command 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_4 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 4-2018 to 12-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_5 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 2011-2015.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_6 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_7 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 2019.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_8 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 2020.xlsx", sheet_name = "Batch1-Result1")
#imars_passenger_9 = pd.read_excel("./IMARS/USPP MVCOccGPersonReport Command 1H2021.xlsx", sheet_name = "Batch1-Result1")

#imars_vehicle_1 = pd.read_excel("./IMARS/NPS MVCOccVehicleReport Command 2011 to 2015.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_2 = pd.read_excel("./IMARS/NPS MVCOccVehicleReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_3 = pd.read_excel("./IMARS/NPS MVCOccVehicleReport Command 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_4 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 1H2021.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_5 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 4-2018 to 12-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_6 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 2011-2015.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_7 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_8 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 2019.xlsx", sheet_name = "Batch1-Result1")
#imars_vehicle_9 = pd.read_excel("./IMARS/USPP MVCOccVehicleReport Command 2020.xlsx", sheet_name = "Batch1-Result1")

#imars_crash_details_1 = pd.read_excel("./IMARS/NPS MVCOccReportCommand_Classification 2011-2015.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_2 = pd.read_excel("./IMARS/NPS MVCOccReportCommand_Classification 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_3 = pd.read_excel("./IMARS/NPS MVCOccReportCommand_Classification 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_4 = pd.read_excel("./IMARS/USPP MVCOccReportCommand_Classification 1H2021.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_5 = pd.read_excel("./IMARS/USPP MVCOccReportCommand_Classification 2011 to 2015.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_6 = pd.read_excel("./IMARS/USPP MVCOccReportCommand_Classification 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_7 = pd.read_excel("./IMARS/USPP MVCOccReportCommand_Classification 2019.xlsx", sheet_name = "Batch1-Result1")
#imars_crash_details_8 = pd.read_excel("./IMARS/USPP MVCOccReportCommand_Classification 2020.xlsx", sheet_name = "Batch1-Result1")

In [3]:
# The per-period exports that used to be concatenated here have been superseded by a single
# pre-combined workbook with one sheet per table.
# Passing a list to `sheet_name` parses the workbook once and returns a dict keyed by sheet name,
# instead of re-opening the same .xlsx file once per table.
imars_workbook = pd.read_excel(
    "./IMARS/IMARS 2012 - 2021.xlsx",
    sheet_name=["Location", "Person", "Vehicle", "Classification"],
)
imars_crash = imars_workbook["Location"]
imars_passenger = imars_workbook["Person"]
imars_vehicle = imars_workbook["Vehicle"]
imars_crash_details = imars_workbook["Classification"]

In [4]:
imars_crash.shape

(17828, 32)

In [5]:
imars_passenger.shape

(27932, 29)

In [6]:
imars_vehicle.shape

(23577, 51)

In [7]:
imars_crash_details.shape

(16304, 24)

In [8]:
# note that crash_details and crash don't have the same number of entries to begin with

In [9]:
#imars_crash.head()

In [10]:
#imars_passenger.Injury_Severity.value_counts()

In [11]:
imars_passenger.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Driver_Action',
       'Driver_Condition', 'Driver_Distraction', 'Suspect_Alcohol',
       'Alcohol_Test', 'Alcohol_Test_Result_1', 'Alcohol_Test_Result_2',
       'Suspect_Drugs', 'Drug_Test', 'Violations_Issued', 'Seat_Position',
       'Injury_Severity', 'Air_Bag_Deployed', 'Ejection',
       'Injury_Transported_By', 'Safety_Equipment_Used', 'Vehicle_number',
       'Injured_transported_by', 'Non_motorist_safety_equipment',
       'Non_motorist_action_circumstance_prior_to_crash',
       'Non_motorist_action_circumstance_at_time_of_crash',
       'Non_motorist_condition_at_time_of_crash', 'Non_motorist_distraction',
       'Non_motorist_location_at_time_of_crash', 'Pedestrian_Type',
       'Pedestrian_Type_Detail', 'Involvement'],
      dtype='object')

In [12]:
imars_crash.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address'],
      dtype='object')

In [13]:
imars_crash_details.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Number_of_Vehicles_Involved',
       'Injury_or_Fatal_Crash', 'Investigated_at_Scene', 'Hit_and_Run',
       'Non_Motor_Vehicl_Property_Damage', 'Amount_of_Property_Damage',
       'First_Harmful_Event_Type', 'First_Harmful_Event',
       'Location_of_First_Harmful_Event', 'Weather', 'Roadway_Condition',
       'Lighting', 'School_Bus_related', 'AS_Road_Circumstance',
       'Environmental_Contributing_Circumstances', 'Work_Zone_Related',
       'Work_Zone_Workers_Present', 'Work_Zone_Location',
       'Law_Enforcement_Present_at_Work_Zone', 'Relation_to_Junction',
       'Type_of_Intersection', 'Manner_of_Collision'],
      dtype='object')

In [14]:
imars_vehicle.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Vehicle_Number',
       'Number_of_Occupants', 'Vehicle_Towed', 'Insurance_verified',
       'Initial_Impact_Point', 'Most_Damaged_Area', 'Extent_of_Damage',
       'Direction_of_Travel_Prior_to_Crash', 'Posted_Speed',
       'First_Event_Type', 'First_Event', 'Second_Event_Type', 'Second_Event',
       'Third_Event_Type', 'Third_Event', 'Fourth_Event_Type', 'Fourth_Event',
       'Motor_Vehicle_Unit_Type', 'Vehicle_Owner', 'Vehicle_Type',
       'Non_Commercial_Trailer_Style', 'Emergency_Vehicle_Use',
       'Emergency_Equipment_Activated', 'Special_Function_of_MV_in_Transport',
       'Motor_Vehicle_Contributing_Circumstance',
       'Vehicle_Maneuver_Action_Prior_to_Crash', 'Road_Surface', 'Grade',
       'Roadway_Alignment', 'Total_Number_of_Lanes', 'Traffic_Control',
       'Traffic_Control_Working_Properly', 'Roadway_Description',
       'Commercial_Non_Commercial', 'Number_of_Axles', 'Gross_Vehicle_Weight',
       'Combination_GVW', 

# General Data Cleaning

In [15]:
# NOTE that there are crashes with randomly missing park, state, region, and/or roadway

# DISCUSS WITH TEAM: decide which data to trust (i.e., park, lat/long) and fill in missing location information programmatically (or manually) based on the trusted source
# POSSIBLE TASK FOR CHRIS?

### CHECK FOR MISSING RECORD NUMBERS

In [16]:
imars_crash.shape

(17828, 32)

In [17]:
imars_crash.dropna(subset=['IMARS_Record_No']).shape

(17827, 32)

In [18]:
# Keep only rows that carry a record number (the column used below for de-duplication and joins).
imars_crash = imars_crash[imars_crash['IMARS_Record_No'].notna()]

In [19]:
imars_passenger.shape

(27932, 29)

In [20]:
imars_passenger.dropna(subset=['IMARS_Record_No']).shape

(27931, 29)

In [21]:
# Keep only person rows that carry a record number.
imars_passenger = imars_passenger[imars_passenger['IMARS_Record_No'].notna()]

In [22]:
imars_vehicle.shape

(23577, 51)

In [23]:
imars_vehicle.dropna(subset=['IMARS_Record_No']).shape

(23576, 51)

In [24]:
# Keep only vehicle rows that carry a record number.
imars_vehicle = imars_vehicle[imars_vehicle['IMARS_Record_No'].notna()]

In [25]:
imars_crash_details.shape

(16304, 24)

In [26]:
imars_crash_details.dropna(subset=['IMARS_Record_No']).shape

(16303, 24)

In [27]:
# Keep only classification rows that carry a record number.
imars_crash_details = imars_crash_details[imars_crash_details['IMARS_Record_No'].notna()]

### REMOVE DUPLICATE RECORD NUMBERS

In [28]:
# One row per record number: the first occurrence is kept and later repeats are discarded.
# NOTE(review): if repeated rows differ in how complete their location fields are, the row that
# is kept may not be the most complete one - confirm this is acceptable.
repeated_crash_record = imars_crash.duplicated(subset=['IMARS_Record_No'])
imars_crash = imars_crash[~repeated_crash_record]
imars_crash.shape

(15130, 32)

In [29]:
# One classification row per record number: the first occurrence is kept.
repeated_details_record = imars_crash_details.duplicated(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details[~repeated_details_record]
imars_crash_details.shape

(15302, 24)

In [30]:
# note that crash and crash details still don't have the same number of entries (although they did with the older archived input data...)

### revert flipped lat/long

In [31]:
# A |latitude| above 70 is taken as the sign that the two coordinate fields were entered in each
# other's column; for those rows the values are swapped back.
need_revert = imars_crash['Latitude'].abs() > 70
swapped_coords = imars_crash.loc[need_revert, ['Longitude', 'Latitude']].to_numpy()
imars_crash.loc[need_revert, ['Latitude', 'Longitude']] = swapped_coords

### adjust to correct hemisphere

In [32]:
# Force every latitude to be positive and every longitude to be negative by flipping the sign of
# the values that are on the wrong side of zero.
# NOTE(review): this assumes all crashes lie in the northern and western hemispheres; any unit in
# the southern or eastern hemisphere (e.g. Pacific island units) would be moved - confirm none exist.
lat_is_negative = imars_crash['Latitude'] < 0
lon_is_positive = imars_crash['Longitude'] > 0
imars_crash.loc[lat_is_negative, 'Latitude'] = -imars_crash.loc[lat_is_negative, 'Latitude']
imars_crash.loc[lon_is_positive, 'Longitude'] = -imars_crash.loc[lon_is_positive, 'Longitude']

# Add Parks to Crash Details Dataset

**Note:** IMARS does not have a park unit recorded for every crash, so the missing ones will have to be added using the Latitude and Longitude fields in the imars_crash dataset.

In [33]:
imars_crash.shape

(15130, 32)

In [34]:
imars_crash.dropna(subset=['Park']).shape

(12205, 32)

In [35]:
#imars_crash['Region'].value_counts()

In [36]:
#imars_crash.dropna(subset=['Park'])['Region'].value_counts()

In [37]:
# Crashes whose park is already filled in.
imars_crash_parks = imars_crash[imars_crash['Park'].notna()]
imars_crash_parks.shape

(12205, 32)

In [38]:
# Size of the subset that has a park value.
imars_crash[imars_crash['Park'].notna()].shape

(12205, 32)

In [39]:
# 12205 of 15130 crash entries have the park already explicitly identified

In [40]:
# Size of the subset that has no park value.
imars_crash[imars_crash['Park'].isna()].shape

(2925, 32)

In [41]:
# 2925 of the 15130 crash entries have no park identification

In [42]:
# Crashes with no park value but with both coordinates present: candidates for the spatial join.
park_missing = imars_crash['Park'].isna()
both_coords_present = imars_crash[['Latitude', 'Longitude']].notna().all(axis=1)
imars_crash_coords = imars_crash[park_missing & both_coords_present]
imars_crash_coords.shape

(591, 32)

In [43]:
# of the 2925 crash entries without park identification, 591 have lat/long coordinates from which park can be assigned using the shapefile

In [44]:
# Crashes with a pre-identified park plus crashes that have coordinates for the spatial join.
# Derived from the frames rather than typed in, so the total stays correct if the input changes.
len(imars_crash_parks) + len(imars_crash_coords)

12796

In [45]:
# the sum of crashes with pre-identified parks and crashes that can be assigned a park results in a total of 12796 possible usable crash entries after the spatial join

In [46]:
# Narrow step by step: no park -> also no latitude -> also no longitude.
imars_crash_noparks = imars_crash[imars_crash['Park'].isna()]
imars_crash_noparks_nolat = imars_crash_noparks[imars_crash_noparks['Latitude'].isna()]
imars_crash_noparks_nocoords = imars_crash_noparks_nolat[imars_crash_noparks_nolat['Longitude'].isna()]
imars_crash_noparks_nocoords.shape

(2334, 32)

In [47]:
# 2334 of the 15130 crash entries have no park identification or lat/long coordinates

In [48]:
# Of the rows with neither park nor coordinates, keep those where at least one of the
# address/road-name columns is filled in.
address_columns = [
    'Linked_Common_Name',
    'Linked_Address',
    'NEAR_route_street_road_name',
    'At_Intersecting_route_street_road_name',
]
mask = imars_crash_noparks_nocoords[address_columns].notna()
imars_noparks_nocoords_someinfo = imars_crash_noparks_nocoords[mask.any(axis=1)]
imars_noparks_nocoords_someinfo.shape

(109, 32)

In [49]:
# of the 2334 crash entries without park identification or lat/long coordinates, 109 have other identifiable information (e.g., road name)
# Write those entries to a CSV in the working directory (row index omitted).
imars_noparks_nocoords_someinfo.to_csv(
    "./IMARS_noparks_nocoords_someinfo.csv",
    index=False,
)

In [50]:
# Build point geometries (x = Longitude, y = Latitude) and attach them to the crash rows.
crash_points = gpd.points_from_xy(
    x=imars_crash_coords['Longitude'],
    y=imars_crash_coords['Latitude'],
)
imars_crash_coords_geo = gpd.GeoDataFrame(imars_crash_coords, geometry=crash_points)

In [51]:
# Park boundary polygons used to assign a park to crashes that only have coordinates.
# The path is handed straight to geopandas, so no file handle is opened here and left unclosed.
filename = "./shapefiles/NPS_-_Land_Resources_Division_Boundary_and_Tract_Data_Service.geojson"
parks = gpd.read_file(filename)

In [52]:
# Declare the crash points to be in EPSG:4326; this labels the coordinates and does not reproject.
# allow_override=True mirrors plain attribute assignment, which replaces any CRS already set.
imars_crash_coords_geo = imars_crash_coords_geo.set_crs("EPSG:4326", allow_override=True)

In [53]:
# Reproject the park boundaries to EPSG:4326 (to_crs transforms coordinates; set_crs would only relabel them).
parks = parks.to_crs(epsg=4326)
# Grow every boundary by 0.01 CRS units so points just outside a boundary can still match.
# NOTE(review): in EPSG:4326 the unit is degrees, so the buffer width on the ground varies with
# latitude and geopandas warns about buffering in a geographic CRS - consider buffering in a
# projected CRS instead.
parks['geometry']=parks['geometry'].buffer(0.01)

# Left spatial join: every crash point is kept; park attributes are attached where the point
# intersects a buffered boundary and are NaN otherwise.
# NOTE(review): a point inside several overlapping buffers yields one row per match, so the
# result can contain repeated record numbers - confirm which match should win.
imars_crash_coords_geo_withparknames=gpd.sjoin(imars_crash_coords_geo,parks,how="left", predicate='intersects')
imars_crash_coords_geo_withparknames.head()


  parks['geometry']=parks['geometry'].buffer(0.01)


Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,CREATED_BY,METADATA,PARKNAME,CreationDate,Creator,EditDate,Editor,GlobalID,Shape__Area,Shape__Length
439,NP14039836,20140513 13:30:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
443,NP14049772,20140607 14:42:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
444,NP14060606,20140625 20:40:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
497,NP16023660,20160312 16:32:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Big Bend,2022-01-06T10:41:22+00:00,WASO,2022-01-06T10:41:22+00:00,WASO,32b17c0c-12d0-4ffb-82fd-fb570aa92382,4329894000.0,462145.8
508,NP16029926,20160326 20:00:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Blue Ridge Parkway,2022-01-06T10:41:13+00:00,WASO,2022-01-06T10:41:13+00:00,WASO,a8ef8bcf-aaa2-4623-a8f6-1d93fe31f945,642257900.0,2287707.0


In [54]:
# Collapse the join result to one row per record number, keeping the first match.
repeated_join_record = imars_crash_coords_geo_withparknames.duplicated(subset=['IMARS_Record_No'])
imars_crash_coords_geo_withparknames2 = imars_crash_coords_geo_withparknames[~repeated_join_record]
imars_crash_coords_geo_withparknames2.shape

(591, 53)

In [55]:
# all 591 crash entries were processed in the spatial join

In [56]:
# Back to a plain DataFrame without the geometry column, then fill Park from the joined UNIT_CODE.
imars_crash_withparknames = pd.DataFrame(
    imars_crash_coords_geo_withparknames2.drop(columns=['geometry'])
)
imars_crash_withparknames = imars_crash_withparknames.assign(
    Park=imars_crash_withparknames['UNIT_CODE']
)
imars_crash_withparknames.shape

(591, 52)

In [57]:
# Keep only the rows for which the spatial join produced a park code.
park_assigned = imars_crash_withparknames['Park'].notna()
imars_crash_withparknames_NoDupsorNulls = imars_crash_withparknames[park_assigned]
imars_crash_withparknames_NoDupsorNulls.shape

(505, 52)

In [58]:
# of the 591 crash entries without parks identified, 505 now have parks assigned from spatial join
# so we expect the total ("expanded") dataset to be:
# (derived from the frames rather than typed in, so it stays correct if the input data changes)
len(imars_crash_parks) + len(imars_crash_withparknames_NoDupsorNulls)

12710

In [116]:
# Stack crashes that already had a park on top of those that received one from the spatial join.
frames_with_park = [imars_crash_parks, imars_crash_withparknames_NoDupsorNulls]
imars_crash_expanded = pd.concat(frames_with_park)
imars_crash_expanded.shape

(12710, 52)

In [117]:
# Sanity check: every row of the combined dataset must have a park, so the shape should not change.
imars_crash_expanded = imars_crash_expanded[imars_crash_expanded['Park'].notna()]
imars_crash_expanded.shape

(12710, 52)

In [118]:
# Sanity check: record numbers must be unique in the combined dataset, so the shape should not change.
repeated_expanded_record = imars_crash_expanded.duplicated(subset=['IMARS_Record_No'])
imars_crash_expanded = imars_crash_expanded[~repeated_expanded_record]
imars_crash_expanded.shape

(12710, 52)

In [119]:
# Load the file Chris populated, in which park names were added by hand to crashes that had
# identifiable info (e.g., road name) but no park or coordinates.
imars_new_parknames = pd.read_csv(
    "./IMARS/IMARS_noparks_nocoords_someinfo_parks_filled.csv"
)
imars_new_parknames.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address'],
      dtype='object')

In [120]:
# Discard the entries for which no park name could be identified by hand.
park_filled_in = imars_new_parknames['Park'].notna()
imars_new_parknames = imars_new_parknames[park_filled_in]
imars_new_parknames.shape

(80, 32)

In [121]:
# 80 of 109 crashes were able to have park names manually added using road names and
# should be combined with crash data containing park names from original data and spatial assignment
# for a total of 12790 crashes in the final dataset

In [122]:
# Append the manually identified crashes to the combined dataset.
frames_to_stack = [imars_crash_expanded, imars_new_parknames]
imars_crash_expanded = pd.concat(frames_to_stack)
imars_crash_expanded.shape

(12790, 52)

In [123]:
# Sanity check: record numbers must still be unique in the final dataset (first occurrence kept).
repeated_final_record = imars_crash_expanded.duplicated(subset=['IMARS_Record_No'])
imars_crash_expanded = imars_crash_expanded[~repeated_final_record]
imars_crash_expanded.shape

(12790, 52)

## Add Region
The imars_crash table currently contains two columns with region information, although neither one is fully populated. This section creates and populates a new column that re-assigns a region name to every crash, based on park name, using a lookup table.

In [124]:
# Load the park lookup table and rename its key/region columns to the names used for the join
# (UNIT_CODE -> Park, REGION -> RGN).
park_info = (
    pd.read_csv("./crash database mapping/Park_Info_Table.csv")
    .rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})
)
park_info.columns

Index(['OBJECTID', 'Park', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'RGN', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [125]:
imars_crash_expanded.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address', 'index_right', 'OBJECTID',
       'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE', 'REGION',
       'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDate', 'Creator', 'EditDate', 'Editor', 'GlobalID',
       'Shape__Area', 'Shape__Length'],
      dtype='object')

In [126]:
# Add the RGN column from the lookup table to the IMARS crash table, joining on park code.
# The lookup is reduced to distinct (Park, RGN) pairs first, so repeated park rows in the lookup
# cannot multiply crash rows. Lookup rows without a park code are dropped; they cannot match
# because every crash row has a park at this point.
region_lookup = (
    park_info[['Park', 'RGN']]
    .dropna(subset=['Park'])
    .drop_duplicates()
)
# validate='many_to_one' raises pandas.errors.MergeError if a park code still maps to more than
# one region, instead of silently duplicating crashes. With a left join this guarantees the
# result has one additional column and no additional rows.
imars_crash_expanded = pd.merge(
    imars_crash_expanded,
    region_lookup,
    how='left',
    on='Park',
    validate='many_to_one',
)
imars_crash_expanded.shape

(12790, 53)

In [128]:
#no_region = imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()==True]
#no_region['Park'].value_counts()

In [129]:
imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()==True].shape

(0, 53)

In [135]:
imars_crash_expanded['RGN'].value_counts()
## NOTE: CRASH TABLE MANUALLY EDITED POST-HOC TO FILL IN REGION NAMES FOR 19 "Unknowns" BASED ON LAT/LONG

SER        3541
PWR        2656
IMR        2642
NCR        2274
NER        1206
MWR         372
AKR          80
Unknown      19
Name: RGN, dtype: int64

# Step 2: Filter for Necessary Fields, Group by IMARS_RECORD_NO

#### Creating New Columns for Injury Severity

- Requires passenger dataset

In [109]:
imars_passenger.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Driver_Action',
       'Driver_Condition', 'Driver_Distraction', 'Suspect_Alcohol',
       'Alcohol_Test', 'Alcohol_Test_Result_1', 'Alcohol_Test_Result_2',
       'Suspect_Drugs', 'Drug_Test', 'Violations_Issued', 'Seat_Position',
       'Injury_Severity', 'Air_Bag_Deployed', 'Ejection',
       'Injury_Transported_By', 'Safety_Equipment_Used', 'Vehicle_number',
       'Injured_transported_by', 'Non_motorist_safety_equipment',
       'Non_motorist_action_circumstance_prior_to_crash',
       'Non_motorist_action_circumstance_at_time_of_crash',
       'Non_motorist_condition_at_time_of_crash', 'Non_motorist_distraction',
       'Non_motorist_location_at_time_of_crash', 'Pedestrian_Type',
       'Pedestrian_Type_Detail', 'Involvement', 'NUM_OCC', 'INCID_NO'],
      dtype='object')

In [110]:
# Add a constant occupant count of 1 per person row and a copy of the record number under the
# INCID_NO name.
imars_passenger = imars_passenger.assign(
    NUM_OCC=1,
    INCID_NO=imars_passenger['IMARS_Record_No'],
)

In [111]:
imars_passenger.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Driver_Action,Driver_Condition,Driver_Distraction,Suspect_Alcohol,Alcohol_Test,Alcohol_Test_Result_1,Alcohol_Test_Result_2,Suspect_Drugs,...,Non_motorist_action_circumstance_prior_to_crash,Non_motorist_action_circumstance_at_time_of_crash,Non_motorist_condition_at_time_of_crash,Non_motorist_distraction,Non_motorist_location_at_time_of_crash,Pedestrian_Type,Pedestrian_Type_Detail,Involvement,NUM_OCC,INCID_NO
0,NP12000078,20120106 14:30:00:000,,,,,,,,,...,,,,,,,,,1,NP12000078
1,NP12000378,20120121 00:00:00:000,,,,,,,,,...,,,,,,,,,1,NP12000378
2,NP12000550,20120131 13:51:00:000,,,,,,,,,...,,,,,,,,,1,NP12000550
3,NP12000911,20120208 00:00:00:000,,,,,,,,,...,,,,,,,,,1,NP12000911
4,NP12000935,20120215 10:15:00:000,,,,,,,,,...,,,,,,,,,1,NP12000935


In [112]:
#imars_passenger_slim = imars_passenger[[
 #   'INCID_NO', 'NUM_OCC', 'Crash_Date_Time','Injury_Severity'
#]]

In [113]:
#imars_passenger_slim.head()

In [114]:
#imars_passenger_slim['No Injury']= np.where(imars_passenger_slim['Injury_Severity']=='01. No injury', 1,0)
#imars_passenger_slim['Possible Injury']= np.where(imars_passenger_slim['Injury_Severity']=='02. Possible injury', 1,0)
#imars_passenger_slim['Non-incapacitating Injury']= np.where(imars_passenger_slim['Injury_Severity']=='03. Non-incapacitating injury', 1,0)
#imars_passenger_slim['Incapacitating Injury']= np.where(imars_passenger_slim['Injury_Severity']=='04. Incapacitating injury', 1,0)
#imars_passenger_slim['Fatality']= np.where(imars_passenger_slim['Injury_Severity']=='05. Fatal', 1,0)
#imars_passenger_slim['Unknown Injury']= np.where(imars_passenger_slim['Injury_Severity']=='99. Unknown', 1,0)

In [115]:
#imars_passenger_slim.Fatality.sum()

In [116]:
#imars_passenger_slim_agg = imars_passenger_slim.groupby(by=['INCID_NO']).sum()
#imars_slim_agg = imars_slim_agg.drop(columns = ['ACCLASS','DINJ', 'PINJ'])
#imars_passenger_slim_agg = imars_passenger_slim_agg.reset_index()

In [117]:
#imars_passenger_slim_agg.shape

In [118]:
#imars_passenger_slim_agg.Fatality.sum()

#### Creating New Columns for Accident Class

In [119]:
# Copy of the record number under the INCID_NO name.
imars_crash_details = imars_crash_details.assign(
    INCID_NO=imars_crash_details['IMARS_Record_No']
)

#imars_crash_details_slim = imars_crash_details[[
 #   'INCID_NO', 'First_Harmful_Event_Type','First_Harmful_Event'
#]]

In [120]:
#imars_crash_details_slim.First_Harmful_Event_Type.value_counts()

In [121]:
#imars_crash_details_slim.loc[imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with person, MV or non-fixed object"].First_Harmful_Event.value_counts()

In [122]:
#pd.options.mode.chained_assignment = None  # default='warn'

#imars_crash_details_slim['Collision with Fixed Object']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with fixed object", 1,0)
#imars_crash_details_slim['Collision with Animal']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with animals", 1,0)
#imars_crash_details_slim['Non-Collision']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Non-collision", 1,0)
#imars_crash_details_slim['Other Accident Class']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Unknown",1,0)
#imars_crash_details_slim['Collision with Other Motor Vehicle']= np.where(imars_crash_details_slim['First_Harmful_Event']=='21. Motor vehicle in transport', 1,0)
#imars_crash_details_slim['Collision with Pedestrian']= np.where(imars_crash_details_slim['First_Harmful_Event']=="17. Pedestrian", 1,0)
#imars_crash_details_slim['Collision with Bicycle']= np.where(imars_crash_details_slim['First_Harmful_Event']=="18. Bicycle", 1,0)
#imars_crash_details_slim['Collision with Parked Motor Vehicle']= np.where(imars_crash_details_slim['First_Harmful_Event']=="22. Parked motor vehicle", 1,0)
#imars_crash_details_slim['Collision with Railway Train']= np.where(imars_crash_details_slim['First_Harmful_Event']=="20. Railway vehicle", 1,0)
#imars_crash_details_slim['Collision with Other Object']= np.where(imars_crash_details_slim['First_Harmful_Event']=="25. Other non-fixed object", 1,0)
#imars_crash_details_slim['Collision with Unknown']= np.where(imars_crash_details_slim['First_Harmful_Event'].isin(['23. Struck by falling, shifting cargo or anything set in motion by MV',
 #                                                                                             '24. Work zone/maintenance equipment']), 1,0)

In [123]:
#imars_crash_details_slim['Non-Collision'].value_counts()

In [124]:
#imars_crash_details_slim_nodups = imars_crash_details_slim.drop_duplicates()
#imars_passenger_slim_agg_nodups = imars_passenger_slim_agg.drop_duplicates()

In [125]:
# Slim crash table: key, coordinates, park and timestamp, with the key renamed to INCID_NO and
# one row per INCID_NO.
imars_crash_clean = (
    imars_crash_expanded
    .loc[:, ['IMARS_Record_No','Latitude', 'Longitude', 'Park','Crash_Date_Time']]
    .rename(columns={"IMARS_Record_No": "INCID_NO"})
    .drop_duplicates(subset="INCID_NO")
)

In [126]:
#imars_crash_details_slim_nodups.shape, imars_crash_clean.shape, imars_passenger_slim_agg_nodups.shape

In [127]:
#imars_slim_agg=imars_crash_clean.merge(imars_passenger_slim_agg_nodups, how='left', on='INCID_NO')
#imars_slim_agg=imars_slim_agg.merge(imars_crash_details_slim_nodups, how='left', on='INCID_NO')


In [128]:
#imars_slim_agg.shape, imars_crash_details_slim_nodups.shape, imars_crash_clean.shape, imars_passenger_slim_agg_nodups.shape

In [129]:
#imars_slim_agg.Fatality.sum()

In [130]:
#imars_slim_agg.head()

In [131]:
#imars_slim_agg.columns

In [132]:
#imars_slim_agg = imars_slim_agg[['INCID_NO', 'NUM_OCC','Park', 'Non-Collision',
 #      'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
 #      'Collision with Pedestrian', 'Collision with Bicycle',
 #      'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
 #      'Collision with Animal', 'Collision with Other Object',
 #      'Collision with Unknown', 'Other Accident Class', 'No Injury',
 #      'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
 #      'Fatality', 'Unknown Injury', 'Latitude', 'Longitude', 'Crash_Date_Time']]

In [133]:
#imars_slim_agg.head()

In [134]:
#imars_slim_agg.Fatality.sum()

In [135]:
#imars_slim_agg.to_csv("./crash_data_IMARS_clean.csv",index=False)

# Filter for Yosemite

In [131]:
imars_crash_expanded.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,METADATA,PARKNAME,CreationDate,Creator,EditDate,Editor,GlobalID,Shape__Area,Shape__Length,RGN
0,NP13031262,20130615 14:00:00:000,Dispatch address,,,,,,,,...,,,,,,,,,,MWR
1,NP13054509,20130803 10:56:00:000,Dispatch address,,SD,,,,PPI-ENTRANCE/EXIT,,...,,,,,,,,,,MWR
2,NP13071209,20130905 12:03:00:000,Dispatch address,,,,,,,,...,,,,,,,,,,NER
3,NP13082489,20131006 09:10:00:000,Dispatch address,,VA,,,,MP97,,...,,,,,,,,,,NER
4,NP13082534,20131006 11:42:00:000,Dispatch address,,VA,,,,MP102,,...,,,,,,,,,,NER


In [132]:
# Parse the raw timestamp text (e.g. "20130615 14:00:00:000") once, then derive the date, time
# and year text columns from the parsed values.
crash_timestamp = pd.to_datetime(
    imars_crash_expanded['Crash_Date_Time'],
    format='%Y%m%d %H:%M:%S:%f',
)
imars_crash_expanded['Crash_Date_Time'] = crash_timestamp
imars_crash_expanded['CRASH_DATE'] = crash_timestamp.dt.strftime('%m/%d/%Y')
imars_crash_expanded['CRASH_TIME'] = crash_timestamp.dt.strftime('%H%M')
imars_crash_expanded['CRASH_YEAR'] = crash_timestamp.dt.strftime('%Y')
imars_crash_expanded.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,Creator,EditDate,Editor,GlobalID,Shape__Area,Shape__Length,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,NP13031262,2013-06-15 14:00:00,Dispatch address,,,,,,,,...,,,,,,,MWR,06/15/2013,1400,2013
1,NP13054509,2013-08-03 10:56:00,Dispatch address,,SD,,,,PPI-ENTRANCE/EXIT,,...,,,,,,,MWR,08/03/2013,1056,2013
2,NP13071209,2013-09-05 12:03:00,Dispatch address,,,,,,,,...,,,,,,,NER,09/05/2013,1203,2013
3,NP13082489,2013-10-06 09:10:00,Dispatch address,,VA,,,,MP97,,...,,,,,,,NER,10/06/2013,910,2013
4,NP13082534,2013-10-06 11:42:00,Dispatch address,,VA,,,,MP102,,...,,,,,,,NER,10/06/2013,1142,2013


In [133]:
imars_crash_expanded.shape

(12790, 56)

In [134]:
# Write the expanded crash table to a CSV in the working directory (row index omitted).
## NOTE: THIS FILE WAS MANUALLY EDITED POST-HOC TO FILL IN 19 REGION NAMES BASED ON LAT/LONG FOR CRASHES WITH ERRONEOUS PARK NAMES
imars_crash_expanded.to_csv(
    "./IMARS_crash.csv",
    index=False,
)

In [140]:
#imars_crash_expanded_yose = imars_crash_expanded.loc[imars_crash_expanded['Park']=='YOSE']
#imars_crash_expanded_yose.shape

In [141]:
#imars_crash_expanded_yose.to_csv("./IMARS_YOSE_crash.csv",index=False)

In [142]:
# Attach crash coordinates, park and the derived date/time fields to each passenger row.
# The left join keeps every passenger record; records with no match in the crash
# table get NaN in the added columns.
# NOTE(review): if IMARS_Record_No is not unique in imars_crash_expanded, this join
# duplicates passenger rows -- confirm key uniqueness.
imars_passenger_merged = pd.merge(
    imars_passenger,
    imars_crash_expanded.loc[:, ['IMARS_Record_No', 'Latitude', 'Longitude', 'Park',
                                 'CRASH_DATE', 'CRASH_TIME', 'CRASH_YEAR']],
    on='IMARS_Record_No',
    how='left',
)
imars_passenger_merged.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Driver_Action,Driver_Condition,Driver_Distraction,Suspect_Alcohol,Alcohol_Test,Alcohol_Test_Result_1,Alcohol_Test_Result_2,Suspect_Drugs,...,Pedestrian_Type_Detail,Involvement,NUM_OCC,INCID_NO,Latitude,Longitude,Park,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,NP12000078,20120106 14:30:00:000,,,,,,,,,...,,,1,NP12000078,,,,,,
1,NP12000378,20120121 00:00:00:000,,,,,,,,,...,,,1,NP12000378,,,,,,
2,NP12000550,20120131 13:51:00:000,,,,,,,,,...,,,1,NP12000550,,,,,,
3,NP12000911,20120208 00:00:00:000,,,,,,,,,...,,,1,NP12000911,,,,,,
4,NP12000935,20120215 10:15:00:000,,,,,,,,,...,,,1,NP12000935,,,,,,


In [143]:
# Dimensions (rows, columns) of the passenger table after the left join with the crash table.
imars_passenger_merged.shape

(27931, 37)

In [144]:
# Write the merged passenger table to CSV in the working directory, without the DataFrame index.
imars_passenger_merged.to_csv("./IMARS_passenger.csv",index=False)

In [145]:
#imars_passenger_yose = imars_passenger_merged.loc[imars_passenger_merged['Park']=='YOSE']
#imars_passenger_yose.shape

In [146]:
#imars_passenger_yose.to_csv("./IMARS_YOSE_passenger.csv",index=False)

In [147]:
# Attach crash coordinates, park and the derived date/time fields to each vehicle row.
# The left join keeps every vehicle record; records with no match in the crash
# table get NaN in the added columns.
# NOTE(review): if IMARS_Record_No is not unique in imars_crash_expanded, this join
# duplicates vehicle rows -- confirm key uniqueness.
imars_vehicle_merged = pd.merge(
    imars_vehicle,
    imars_crash_expanded.loc[:, ['IMARS_Record_No', 'Latitude', 'Longitude', 'Park',
                                 'CRASH_DATE', 'CRASH_TIME', 'CRASH_YEAR']],
    on='IMARS_Record_No',
    how='left',
)
imars_vehicle_merged.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Vehicle_Number,Number_of_Occupants,Vehicle_Towed,Insurance_verified,Initial_Impact_Point,Most_Damaged_Area,Extent_of_Damage,Direction_of_Travel_Prior_to_Crash,...,Hazmat_3_Class,Hazmat_3_ID,Hazmat_4_Class,Hazmat_4_ID,Latitude,Longitude,Park,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,NP12000078,20120106 14:30:00:000,,,,,,,,,...,,,,,,,,,,
1,NP12000378,20120121 00:00:00:000,,,,,,,,,...,,,,,,,,,,
2,NP12000550,20120131 13:51:00:000,,,,,,,,,...,,,,,,,,,,
3,NP12000911,20120208 00:00:00:000,,,,,,,,,...,,,,,,,,,,
4,NP12000935,20120215 10:15:00:000,,,,,,,,,...,,,,,,,,,,


In [148]:
# Dimensions (rows, columns) of the vehicle table after the left join with the crash table.
imars_vehicle_merged.shape

(23576, 57)

In [149]:
# Write the merged vehicle table to CSV in the working directory, without the DataFrame index.
imars_vehicle_merged.to_csv("./IMARS_vehicle.csv",index=False)

In [150]:
#imars_vehicle_yose = imars_vehicle_merged.loc[imars_vehicle_merged['Park']=='YOSE']
#imars_vehicle_yose.shape

In [151]:
#imars_vehicle_yose.to_csv("./IMARS_YOSE_vehicle.csv",index=False)

In [152]:
# Attach crash coordinates, park and the derived date/time fields to each crash-details row.
# The left join keeps every crash-details record; records with no match in the
# crash table get NaN in the added columns.
# NOTE(review): if IMARS_Record_No is not unique in imars_crash_expanded, this join
# duplicates crash-details rows -- confirm key uniqueness.
imars_crash_details_merged = pd.merge(
    imars_crash_details,
    imars_crash_expanded.loc[:, ['IMARS_Record_No', 'Latitude', 'Longitude', 'Park',
                                 'CRASH_DATE', 'CRASH_TIME', 'CRASH_YEAR']],
    on='IMARS_Record_No',
    how='left',
)
imars_crash_details_merged.head()

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Number_of_Vehicles_Involved,Injury_or_Fatal_Crash,Investigated_at_Scene,Hit_and_Run,Non_Motor_Vehicl_Property_Damage,Amount_of_Property_Damage,First_Harmful_Event_Type,First_Harmful_Event,...,Relation_to_Junction,Type_of_Intersection,Manner_of_Collision,INCID_NO,Latitude,Longitude,Park,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,NP12000078,20120106 14:30:00:000,,,Yes,,,,,,...,,99. Unknown,99. Unknown,NP12000078,,,,,,
1,NP12000378,20120121 00:00:00:000,,,Yes,,,,,,...,,99. Unknown,99. Unknown,NP12000378,,,,,,
2,NP12000550,20120131 13:51:00:000,,,Yes,,,,,,...,,99. Unknown,99. Unknown,NP12000550,,,,,,
3,NP12000911,20120208 00:00:00:000,,,,,,,,,...,,,,NP12000911,,,,,,
4,NP12000935,20120215 10:15:00:000,,,Yes,,,,,,...,,,99. Unknown,NP12000935,,,,,,


In [153]:
# Dimensions (rows, columns) of the crash-details table after the left join with the crash table.
imars_crash_details_merged.shape

(15302, 31)

In [154]:
# Write the merged crash-details table to CSV in the working directory, without the DataFrame index.
imars_crash_details_merged.to_csv("./IMARS_crash_details.csv",index=False)

In [155]:
#imars_crash_details_yose = imars_crash_details_merged.loc[imars_crash_details_merged['Park']=='YOSE']
#imars_crash_details_yose.shape

In [156]:
#imars_crash_details_yose.to_csv("./IMARS_YOSE_crash_details.csv",index=False)