## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin and Meredith Raymer

**Date:** 11/12/21

**Purpose:** This notebook will combine IMARS datasets over three time periods and change data to merge with CDS and STARS datasets

In [9]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point, LineString, Polygon

In [10]:
myworkingdirectory = r"C:\Users\smitha.mahesh\Desktop"
os.chdir(myworkingdirectory)

## Step 1: Concatenate Similar Datasets 

**Note:** Datasets have been divided across three time periods, so these must be combined to start joining the tables together. 

In [11]:
path = r"C:./IMARS"

files = os.listdir(path)

for f in files:
    print(f)

NPS GOccIvPA Command Address 2011-2015.xlsx
NPS GOccIvPA Command Address 2016 to 4-2018.xlsx
NPS GOccIvPA Command Address 4-2018 to 6-2021.xlsx
NPS MVCOccGPersonReport Command 2011 to 2015.xlsx
NPS MVCOccGPersonReport Command 2016 to 4-2018.xlsx
NPS MVCOccGPersonReport Command 4-2018 to 6-2021.xlsx
NPS MVCOccReportCommand_Classification 2011-2015.xlsx
NPS MVCOccReportCommand_Classification 2016 to 4-2018.xlsx
NPS MVCOccReportCommand_Classification 4-2018 to 6-2021.xlsx
NPS MVCOccVehicleReport Command 2011 to 2015.xlsx
NPS MVCOccVehicleReport Command 2016 to 4-2018.xlsx
NPS MVCOccVehicleReport Command 4-2018 to 6-2021.xlsx
USPP GOccIvPA Command Address 1H2021.xlsx
USPP GOccIvPA Command Address 2011-2015.xlsx
USPP GOccIvPA Command Address 2016 to 4-2018.xlsx
USPP GOccIvPA Command Address 2019.xlsx
USPP GOccIvPA Command Address 2020.xlsx
USPP GOccIvPA Command Address 4-2018 to 12-2018.xlsx
USPP MVCOccGPersonReport Command 1H2021.xlsx
USPP MVCOccGPersonReport Command 2011-2015.xlsx
USPP MV

In [15]:
imars_crash_1 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 2011-2015.xlsx", sheet_name = "Batch1-Result1")


In [13]:
imars_crash_1 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 2011-2015.xlsx", sheet_name = "Batch1-Result1")
imars_crash_2 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
imars_crash_3 = pd.read_excel("./IMARS/NPS GOccIvPA Command Address 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")

imars_passenger_1 = pd.read_excel("./IMARS/NPS MVCOccGPersonReport Command 2011 to 2015.xlsx", sheet_name = "Batch1-Result1")
imars_passenger_2 = pd.read_excel("./NPS MVCOccGPersonReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
imars_passenger_3 = pd.read_excel("./NPS MVCOccGPersonReport Command 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")

imars_vehicle_1 = pd.read_excel("./NPS MVCOccVehicleReport Command 2011 to 2015.xlsx", sheet_name = "Batch1-Result1")
imars_vehicle_2 = pd.read_excel("./NPS MVCOccVehicleReport Command 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
imars_vehicle_3 = pd.read_excel("./NPS MVCOccVehicleReport Command 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")

imars_crash_details_1 = pd.read_excel("./NPS MVCOccReportCommand_Classification 2011-2015.xlsx", sheet_name = "Batch1-Result1")
imars_crash_details_2 = pd.read_excel("./NPS MVCOccReportCommand_Classification 2016 to 4-2018.xlsx", sheet_name = "Batch1-Result1")
imars_crash_details_3 = pd.read_excel("./NPS MVCOccReportCommand_Classification 4-2018 to 6-2021.xlsx", sheet_name = "Batch1-Result1")



FileNotFoundError: [Errno 2] No such file or directory: './NPS GOccIvPA Command Address 2011-2015.xlsx'

In [None]:
imars_crash = pd.concat([imars_crash_1, imars_crash_2, imars_crash_3])
imars_passenger = pd.concat([imars_passenger_1, imars_passenger_2, imars_passenger_3])
imars_vehicle = pd.concat([imars_vehicle_1, imars_vehicle_2, imars_vehicle_3])
imars_crash_details = pd.concat([imars_crash_details_1, imars_crash_details_2, imars_crash_details_3])


In [None]:
imars_crash.shape, imars_crash_1.shape, imars_crash_2.shape, imars_crash_3.shape

In [None]:
imars_passenger.shape, imars_passenger_1.shape, imars_passenger_2.shape, imars_passenger_3.shape

In [None]:
imars_vehicle.shape, imars_vehicle_1.shape, imars_vehicle_2.shape,imars_vehicle_3.shape

In [None]:
imars_crash_details.shape, imars_crash_details_1.shape, imars_crash_details_2.shape, imars_crash_details_3.shape

In [None]:
imars_crash.head()

In [None]:
imars_passenger.Injury_Severity.value_counts()

In [None]:
imars_passenger.columns

In [None]:
imars_crash.columns

In [None]:
imars_crash_details.columns

In [None]:
imars_vehicle.columns

## Add Parks to Crash Details Dataset

**Note:** IMARS does not have park units with each crash, so these will have to be added using the Latitude and Longitude fields in the imars_crash dataset. 

In [None]:
imars_crash.shape

In [None]:
imars_crash.dropna(subset=['Park']).shape

In [None]:
imars_crash['Region'].value_counts()

In [None]:
imars_crash.dropna(subset=['Park'])['Region'].value_counts()

In [None]:
imars_crash_parks = imars_crash.dropna(subset=['Park'])
imars_crash_parks.shape

In [None]:
imars_crash.loc[imars_crash['Park'].isnull()==False].shape

In [None]:
imars_crash.loc[imars_crash['Park'].isnull()==True].shape

In [None]:
imars_crash_coords = imars_crash.loc[imars_crash['Park'].isnull()==True].dropna(subset=['Latitude','Longitude'])
imars_crash_coords.shape

In [None]:
3261+5419

In [None]:
imars_crash_coords = imars_crash.loc[imars_crash['Park'].isnull()==True].dropna(subset=['Latitude','Longitude'])
imars_crash_coords_geo=gpd.GeoDataFrame(imars_crash_coords, geometry=gpd.points_from_xy(imars_crash_coords.Longitude, 
                                                                             imars_crash_coords.Latitude))

In [None]:
filename = "C:\Users\smitha.mahesh\Desktop\shapefiles/NPS_-_Land_Resources_Division_Boundary_and_Tract_Data_Service.geojson"
file = open(filename)
parks = gpd.read_file(file)

In [None]:
imars_crash_coords_geo.crs = "EPSG:4326"
parks.crs = "EPSG:4326"

In [None]:
parks['geometry']=parks['geometry'].buffer(0.2)

imars_crash_coords_geo_withparknames=gpd.sjoin(imars_crash_coords_geo,parks,how="left", op='intersects')
imars_crash_coords_geo_withparknames.head()

In [None]:
imars_crash_coords_geo_withparknames.shape

In [None]:
imars_crash_coords_geo_withparknames2  = imars_crash_coords_geo_withparknames.drop_duplicates(subset=['IMARS_Record_No'])

In [None]:
imars_crash_coords_geo_withparknames2.shape

In [None]:
imars_crash_coords_geo_withparknames2.REGION.value_counts()

In [None]:
imars_crash_withparknames = pd.DataFrame(imars_crash_coords_geo_withparknames2.drop(columns='geometry'))
imars_crash_withparknames['Park']= imars_crash_withparknames['UNIT_CODE']

In [None]:
imars_crash_withparknames.shape

In [None]:
imars_crash_withparknames2 = imars_crash_withparknames[list(imars_crash_parks.columns)]
imars_crash_withparknames2.shape

In [None]:
imars_crash_expanded = pd.concat([imars_crash_parks,imars_crash_withparknames2])
imars_crash_expanded  = imars_crash_expanded.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_expanded.shape

## Step 2: Filter for Necessary Fields, Group by IMARS_RECORD_NO


#### Creating New Columns for Injury Severity

- Requires passenger dataset

In [None]:
imars_passenger['NUM_OCC'] = 1
imars_passenger['INCID_NO'] = imars_passenger['IMARS_Record_No']
imars_passenger['Crash_Date_Time_report'] = imars_passenger['Crash_Date_Time_person']

imars_passenger_slim = imars_passenger[[
    'INCID_NO', 'NUM_OCC', 'Crash_Date_Time_report','Injury_Severity'
]]

In [None]:
imars_passenger_slim.head()

In [None]:
imars_passenger_slim['No Injury']= np.where(imars_passenger_slim['Injury_Severity']=='01. No injury', 1,0)
imars_passenger_slim['Possible Injury']= np.where(imars_passenger_slim['Injury_Severity']=='02. Possible injury', 1,0)
imars_passenger_slim['Non-incapacitating Injury']= np.where(imars_passenger_slim['Injury_Severity']=='03. Non-incapacitating injury', 1,0)
imars_passenger_slim['Incapacitating Injury']= np.where(imars_passenger_slim['Injury_Severity']=='04. Incapacitating injury', 1,0)
imars_passenger_slim['Fatality']= np.where(imars_passenger_slim['Injury_Severity']=='05. Fatal', 1,0)
imars_passenger_slim['Unknown Injury']= np.where(imars_passenger_slim['Injury_Severity']=='99. Unknown', 1,0)


In [None]:
imars_passenger_slim.Fatality.sum()

In [None]:
imars_passenger_slim_agg = imars_passenger_slim.groupby(by=['INCID_NO']).sum()
#imars_slim_agg = imars_slim_agg.drop(columns = ['ACCLASS','DINJ', 'PINJ'])
imars_passenger_slim_agg = imars_passenger_slim_agg.reset_index()


In [None]:
imars_passenger_slim_agg.shape

In [None]:
imars_passenger_slim_agg.Fatality.sum()

#### Creating New Columns for Accident Class

In [None]:
imars_crash_details['INCID_NO'] = imars_crash_details['IMARS_Record_No']

imars_crash_details_slim = imars_crash_details[[
    'INCID_NO', 'First_Harmful_Event_Type','First_Harmful_Event'
]]



In [None]:
imars_crash_details_slim.First_Harmful_Event_Type.value_counts()

In [None]:
imars_crash_details_slim.loc[imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with person, MV or non-fixed object"].First_Harmful_Event.value_counts()

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

imars_crash_details_slim['Collision with Fixed Object']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with fixed object", 1,0)
imars_crash_details_slim['Collision with Animal']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Collision with animals", 1,0)
imars_crash_details_slim['Non-Collision']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Non-collision", 1,0)
imars_crash_details_slim['Other Accident Class']= np.where(imars_crash_details_slim['First_Harmful_Event_Type']=="Unknown",1,0)


imars_crash_details_slim['Collision with Other Motor Vehicle']= np.where(imars_crash_details_slim['First_Harmful_Event']=='21. Motor vehicle in transport', 1,0)
imars_crash_details_slim['Collision with Pedestrian']= np.where(imars_crash_details_slim['First_Harmful_Event']=="17. Pedestrian", 1,0)
imars_crash_details_slim['Collision with Bicycle']= np.where(imars_crash_details_slim['First_Harmful_Event']=="18. Bicycle", 1,0)
imars_crash_details_slim['Collision with Parked Motor Vehicle']= np.where(imars_crash_details_slim['First_Harmful_Event']=="22. Parked motor vehicle", 1,0)
imars_crash_details_slim['Collision with Railway Train']= np.where(imars_crash_details_slim['First_Harmful_Event']=="20. Railway vehicle", 1,0)
imars_crash_details_slim['Collision with Other Object']= np.where(imars_crash_details_slim['First_Harmful_Event']=="25. Other non-fixed object", 1,0)
imars_crash_details_slim['Collision with Unknown']= np.where(imars_crash_details_slim['First_Harmful_Event'].isin(['23. Struck by falling, shifting cargo or anything set in motion by MV',
                                                                                              '24. Work zone/maintenance equipment']), 1,0)


In [None]:
imars_crash_details_slim['Non-Collision'].value_counts()

In [None]:
imars_crash_details_slim_nodups = imars_crash_details_slim.drop_duplicates()
imars_passenger_slim_agg_nodups = imars_passenger_slim_agg.drop_duplicates()

In [None]:
imars_crash_clean = imars_crash_expanded[['IMARS_Record_No','Latitude', 'Longitude', 'Park','Crash_Date_Time']]
imars_crash_clean = imars_crash_clean.rename(columns={"IMARS_Record_No": "INCID_NO"})

imars_crash_clean = imars_crash_clean.drop_duplicates("INCID_NO")

In [None]:
imars_crash_details_slim_nodups.shape, imars_crash_clean.shape, imars_passenger_slim_agg_nodups.shape

In [None]:
imars_slim_agg=imars_crash_clean.merge(imars_passenger_slim_agg_nodups, how='left', on='INCID_NO')
imars_slim_agg=imars_slim_agg.merge(imars_crash_details_slim_nodups, how='left', on='INCID_NO')


In [None]:
imars_slim_agg.shape, imars_crash_details_slim_nodups.shape, imars_crash_clean.shape, imars_passenger_slim_agg_nodups.shape

In [None]:
imars_slim_agg.Fatality.sum()

In [None]:
imars_slim_agg.head()

In [None]:
imars_slim_agg.columns

In [None]:
imars_slim_agg = imars_slim_agg[['INCID_NO', 'NUM_OCC','Park', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'Latitude', 'Longitude', 'Crash_Date_Time']]

In [None]:
imars_slim_agg.head()

In [None]:
imars_slim_agg.Fatality.sum()

In [None]:
imars_slim_agg.to_csv("./crash_data_IMARS_clean.csv",index=False)