In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point, LineString, Polygon
from datetime import datetime

In [2]:
myworkingdirectory = r"C:\Users\Sophie.Kaye\Desktop\NPS Crash Data"
os.chdir(myworkingdirectory)

In [3]:
imars_crash = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Location")
imars_passenger = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Person")
imars_vehicle = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Vehicle")
imars_crash_details = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Classification")

In [4]:
imars_passenger.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Driver_Action',
       'Driver_Condition', 'Driver_Distraction', 'Suspect_Alcohol',
       'Alcohol_Test', 'Alcohol_Test_Result_1', 'Alcohol_Test_Result_2',
       'Suspect_Drugs', 'Drug_Test', 'Violations_Issued', 'Seat_Position',
       'Injury_Severity', 'Air_Bag_Deployed', 'Ejection',
       'Injury_Transported_By', 'Safety_Equipment_Used', 'Vehicle_number',
       'Injured_transported_by', 'Non_motorist_safety_equipment',
       'Non_motorist_action_circumstance_prior_to_crash',
       'Non_motorist_action_circumstance_at_time_of_crash',
       'Non_motorist_condition_at_time_of_crash', 'Non_motorist_distraction',
       'Non_motorist_location_at_time_of_crash', 'Pedestrian_Type',
       'Pedestrian_Type_Detail', 'Involvement'],
      dtype='object')

In [5]:
imars_crash.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address'],
      dtype='object')

In [6]:
imars_crash_details.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Number_of_Vehicles_Involved',
       'Injury_or_Fatal_Crash', 'Investigated_at_Scene', 'Hit_and_Run',
       'Non_Motor_Vehicl_Property_Damage', 'Amount_of_Property_Damage',
       'First_Harmful_Event_Type', 'First_Harmful_Event',
       'Location_of_First_Harmful_Event', 'Weather', 'Roadway_Condition',
       'Lighting', 'School_Bus_related', 'AS_Road_Circumstance',
       'Environmental_Contributing_Circumstances', 'Work_Zone_Related',
       'Work_Zone_Workers_Present', 'Work_Zone_Location',
       'Law_Enforcement_Present_at_Work_Zone', 'Relation_to_Junction',
       'Type_of_Intersection', 'Manner_of_Collision'],
      dtype='object')

In [7]:
imars_vehicle.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Vehicle_Number',
       'Number_of_Occupants', 'Vehicle_Towed', 'Insurance_verified',
       'Initial_Impact_Point', 'Most_Damaged_Area', 'Extent_of_Damage',
       'Direction_of_Travel_Prior_to_Crash', 'Posted_Speed',
       'First_Event_Type', 'First_Event', 'Second_Event_Type', 'Second_Event',
       'Third_Event_Type', 'Third_Event', 'Fourth_Event_Type', 'Fourth_Event',
       'Motor_Vehicle_Unit_Type', 'Vehicle_Owner', 'Vehicle_Type',
       'Non_Commercial_Trailer_Style', 'Emergency_Vehicle_Use',
       'Emergency_Equipment_Activated', 'Special_Function_of_MV_in_Transport',
       'Motor_Vehicle_Contributing_Circumstance',
       'Vehicle_Maneuver_Action_Prior_to_Crash', 'Road_Surface', 'Grade',
       'Roadway_Alignment', 'Total_Number_of_Lanes', 'Traffic_Control',
       'Traffic_Control_Working_Properly', 'Roadway_Description',
       'Commercial_Non_Commercial', 'Number_of_Axles', 'Gross_Vehicle_Weight',
       'Combination_GVW', 

# General Data Cleaning
Note that there are crashes with randomly missing park, region, and/or roadway

### Remove crashes missing record numbers

In [8]:
imars_crash = imars_crash.dropna(subset=['IMARS_Record_No'])
imars_passenger = imars_passenger.dropna(subset=['IMARS_Record_No'])
imars_vehicle = imars_vehicle.dropna(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details.dropna(subset=['IMARS_Record_No'])

### Remove duplicate reports

In [9]:
imars_crash = imars_crash.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details.drop_duplicates(subset=['IMARS_Record_No'])
# Cannot remove duplicates from vehicle and passenger tables because they are unique records for each person/car involved in crash 

### Revert flipped lat/long

In [10]:
need_revert = imars_crash['Latitude'].abs() > 70
imars_crash.loc[need_revert, ['Latitude', 'Longitude']] = (imars_crash.loc[need_revert, ['Longitude', 'Latitude']].values)

### Adjust coordinate signs to correct hemisphere

In [11]:
imars_crash.loc[imars_crash.Latitude < 0,"Latitude"] = imars_crash['Latitude']*(-1)
imars_crash.loc[imars_crash.Longitude > 0,"Longitude"] = imars_crash['Longitude']*(-1)

# Add Parks to Crash Table

**Note:** IMARS does not have park units with each crash, so these will have to be added using the Latitude and Longitude fields in the imars_crash dataset. 

In [12]:
imars_crash_parks = imars_crash.dropna(subset=['Park'])
imars_crash_parks.shape[0]

12205

In [13]:
imars_crash.loc[imars_crash['Park'].isnull()].shape[0]
# 2925 crash entries have no park identification

2925

In [14]:
imars_crash_coords = imars_crash.loc[imars_crash['Park'].isnull()].dropna(subset=['Latitude','Longitude'])
imars_crash_coords.shape[0]
# of the 2925 crash entries without park identification, 591 have lat/long coordinates from which park can be assigned using the shapefile
# the sum of crashes with pre-identified parks and crashes that can be assigned a park using lat/long coordinates 

591

In [15]:
# results in the following total of possible usable crash entries after the spatial join:
12205+591

12796

In [16]:
imars_crash_noparks = imars_crash.loc[imars_crash['Park'].isnull()]
imars_crash_noparks_nocoords = imars_crash_noparks.loc[imars_crash_noparks['Latitude'].isnull()]
# 2334 of the crash entries have no park identification or lat/long coordinates
2925-591

2334

In [17]:
mask = imars_crash_noparks_nocoords.loc[:,['Linked_Common_Name','Linked_Address','NEAR_route_street_road_name',
                                           'At_Intersecting_route_street_road_name']].notnull()
imars_noparks_nocoords_someinfo = imars_crash_noparks_nocoords.loc[mask.any(axis=1)]
imars_noparks_nocoords_someinfo.shape[0]
# of the 2334 crash entries without park identification or lat/long coordinates, 109 have other identifiable information 
# (e.g., roadway name) which could be used to identify the park name

109

In [18]:
# print file containing only this subset of crashes for manual park identification
imars_noparks_nocoords_someinfo.to_csv("./IMARS_noparks_nocoords_someinfo.csv",index=False)

## Spatial join to assign Park names

In [19]:
imars_crash_coords_geo=gpd.GeoDataFrame(imars_crash_coords, geometry=gpd.points_from_xy(imars_crash_coords.Longitude, imars_crash_coords.Latitude))
filename = "./shapefiles/NPS_-_Land_Resources_Division_Boundary_and_Tract_Data_Service.geojson"
file = open(filename)
parks = gpd.read_file(file)

In [20]:
imars_crash_coords_geo.crs = "EPSG:4326"
parks = parks.to_crs(epsg=4326)
parks['geometry']=parks['geometry'].buffer(0.01)

imars_crash_coords_geo_withparknames=gpd.sjoin(imars_crash_coords_geo,parks,how="left", predicate='intersects')
imars_crash_coords_geo_withparknames.head()


  parks['geometry']=parks['geometry'].buffer(0.01)


Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,CREATED_BY,METADATA,PARKNAME,CreationDate,Creator,EditDate,Editor,GlobalID,Shape__Area,Shape__Length
439,NP14039836,20140513 13:30:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
443,NP14049772,20140607 14:42:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
444,NP14060606,20140625 20:40:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
497,NP16023660,20160312 16:32:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Big Bend,2022-01-06T10:41:22+00:00,WASO,2022-01-06T10:41:22+00:00,WASO,32b17c0c-12d0-4ffb-82fd-fb570aa92382,4329894000.0,462145.8
508,NP16029926,20160326 20:00:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Blue Ridge Parkway,2022-01-06T10:41:13+00:00,WASO,2022-01-06T10:41:13+00:00,WASO,a8ef8bcf-aaa2-4623-a8f6-1d93fe31f945,642257900.0,2287707.0


In [21]:
imars_crash_coords_geo_withparknames2 = imars_crash_coords_geo_withparknames.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_coords_geo_withparknames2.shape[0]
# all 591 crash entries were processed in the spatial join

591

In [22]:
imars_crash_withparknames = pd.DataFrame(imars_crash_coords_geo_withparknames2.drop(columns='geometry'))
imars_crash_withparknames['Park']= imars_crash_withparknames['UNIT_CODE']
imars_crash_withparknames_NoDupsorNulls = imars_crash_withparknames.dropna(subset=['Park'])
imars_crash_withparknames_NoDupsorNulls.shape[0]
# of the 591 crash entries without parks identified, 505 now have parks assigned from spatial join

505

In [23]:
# so we expect the total ("expanded") dataset to be:
12205+505

12710

In [24]:
imars_crash_expanded = pd.concat([imars_crash_parks,imars_crash_withparknames_NoDupsorNulls])
imars_crash_expanded.shape

(12710, 52)

In [25]:
# check to make sure all entries in combined dataset contain park assignments 
imars_crash_expanded = imars_crash_expanded.dropna(subset=['Park'])
imars_crash_expanded.shape[0]

12710

In [26]:
# check to make sure no duplicate crashes in combined dataset
imars_crash_expanded = imars_crash_expanded.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_expanded.shape[0]

12710

## Add in crashes with Park names manually identified via unmistakable roadway names

In [27]:
# open file Chris populated in which park names were manually added to crash data containing identifiable info (e.g., road name)
imars_new_parknames = pd.read_csv("./IMARS/IMARS_noparks_nocoords_someinfo_parks_filled.csv")

In [28]:
# remove any entries for which park names were not able to be identified
imars_new_parknames = imars_new_parknames.dropna(subset=['Park'])
imars_new_parknames.shape[0]

80

In [29]:
# 80 of 109 crashes were able to have park names manually added using road names and should be combined with crash data 
# containing park names from original data and spatial assignment for a total of crashes in the final dataset:
12710+80

12790

In [30]:
imars_crash_expanded = pd.concat([imars_crash_expanded,imars_new_parknames])
imars_crash_expanded.shape

(12790, 52)

In [31]:
# check to make sure there are no duplicates in final dataset
imars_crash_expanded = imars_crash_expanded.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_expanded.shape

(12790, 52)

## Create new Region column
imars_crash database currently contains two columns with region information, although neither one is fully populated. This section will create and populate a new column re-assigning region name to every crash based on park name using a lookup table

In [32]:
imars_crash_expanded.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address', 'index_right', 'OBJECTID',
       'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE', 'REGION',
       'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDate', 'Creator', 'EditDate', 'Editor', 'GlobalID',
       'Shape__Area', 'Shape__Length'],
      dtype='object')

In [33]:
# load lookup table
park_info = pd.read_csv("./Reference Data/Park_Info_Table.csv")
park_info = park_info.rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})
park_info.columns

Index(['OBJECTID', 'Park', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'RGN', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [34]:
# add RGN column from lookup table to IMARS crash database, joining the two datasets based on park name
imars_crash_expanded = pd.merge(imars_crash_expanded, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
imars_crash_expanded = imars_crash_expanded.drop_duplicates() 
imars_crash_expanded.shape

(12790, 53)

In [35]:
# check to see if any crashes were not assigned a region
imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()].shape[0]

0

In [36]:
imars_crash_expanded['RGN'].value_counts()

SER    3541
PWR    2656
IMR    2642
NCR    2293
NER    1206
MWR     372
AKR      80
Name: RGN, dtype: int64

In [37]:
# remove partially populated pre-existing columns
imars_crash_expanded = imars_crash_expanded.drop(columns = ['Region', 'REGION'])

In [38]:
imars_crash_expanded.shape

(12790, 51)

In [39]:
imars_crash_expanded.to_csv("IMARS_Crash_ParkandRegionAdded_Clean.csv", index= False)

# Filter for Necessary Fields, Group by IMARS_RECORD_NO

## Passenger Table

In [40]:
imars_passenger = imars_passenger.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_passenger_slim = imars_passenger[['INCID_NO']] # subset needed columns

In [41]:
imars_passenger['Injury_Severity'].value_counts()

01. No injury                    13599
02. Possible injury               1826
99. Unknown                       1470
03. Non-incapacitating injury     1163
04. Incapacitating injury          486
05. Fatal                          115
Name: Injury_Severity, dtype: int64

In [42]:
# set flags for each injury severity 
imars_passenger_slim['No Injury']= np.where(imars_passenger['Injury_Severity']=='01. No injury', 1,0)
imars_passenger_slim['Possible Injury']= np.where(imars_passenger['Injury_Severity']=='02. Possible injury', 1,0)
imars_passenger_slim['Non-incapacitating Injury']= np.where(imars_passenger['Injury_Severity']=='03. Non-incapacitating injury', 1,0)
imars_passenger_slim['Incapacitating Injury']= np.where(imars_passenger['Injury_Severity']=='04. Incapacitating injury', 1,0)
imars_passenger_slim['Fatality']= np.where(imars_passenger['Injury_Severity']=='05. Fatal', 1,0)
imars_passenger_slim['Fatality'].sum() # exactly as expected from injury severity value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['No Injury']= np.where(imars_passenger['Injury_Severity']=='01. No injury', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Possible Injury']= np.where(imars_passenger['Injury_Severity']=='02. Possible injury', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

115

In [43]:
imars_passenger['Pedestrian_Type'].value_counts()

Other         162
Bicyclist      66
Pedestrian     39
Name: Pedestrian_Type, dtype: int64

In [44]:
# determine the number of motorists and non-motorists involved
imars_passenger_slim['Num_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==True, 1,0)
imars_passenger_slim['Num_Non_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==False, 1,0)
# check new non-motorist logic, looks good!
imars_passenger_slim['Num_Non_Motorist'].sum(), (162+66+39) # combined number of Other, Bicyclists, and Pedestrians

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Non_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==False, 1,0)


(267, 267)

In [45]:
imars_passenger_slim['Num_Pedestrian'] = np.where(imars_passenger['Pedestrian_Type']=='Pedestrian', 1,0)
imars_passenger_slim['Num_Pedestrian'].sum() # exactly as expected based on Pedestrian Type value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Pedestrian'] = np.where(imars_passenger['Pedestrian_Type']=='Pedestrian', 1,0)


39

In [46]:
imars_passenger_slim['Num_Ped_Deaths'] = np.where(imars_passenger_slim['Num_Pedestrian']==1, imars_passenger_slim['Fatality'], 0)
# check logic, looks good!
test = imars_passenger.loc[imars_passenger['Pedestrian_Type']=='Pedestrian']
imars_passenger_slim['Num_Ped_Deaths'].sum(), test[test.Injury_Severity=='05. Fatal'].shape[0] # one fatality in subset of passenger table for pedestrians only

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Ped_Deaths'] = np.where(imars_passenger_slim['Num_Pedestrian']==1, imars_passenger_slim['Fatality'], 0)


(1, 1)

In [47]:
imars_passenger_slim.head()

Unnamed: 0,INCID_NO,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Num_Motorist,Num_Non_Motorist,Num_Pedestrian,Num_Ped_Deaths
0,NP12000078,0,0,0,0,0,1,0,0,0
1,NP12000378,0,0,0,0,0,1,0,0,0
2,NP12000550,0,0,0,0,0,1,0,0,0
3,NP12000911,0,0,0,0,0,1,0,0,0
4,NP12000935,0,0,0,0,0,1,0,0,0


In [48]:
imars_passenger_slim.shape

(27931, 10)

In [49]:
imars_passenger_slim.columns

Index(['INCID_NO', 'No Injury', 'Possible Injury', 'Non-incapacitating Injury',
       'Incapacitating Injury', 'Fatality', 'Num_Motorist', 'Num_Non_Motorist',
       'Num_Pedestrian', 'Num_Ped_Deaths'],
      dtype='object')

In [50]:
# collapse multiple rows for each person involved in the crash into a single row for each crash
# injuries/fatalities and number of people involved should be summed as total numbers per crash ("sum" function)
imars_passenger_slim_agg = imars_passenger_slim.groupby(by=['INCID_NO']).sum().reset_index()
imars_passenger_slim_agg.shape

(15302, 10)

In [51]:
imars_passenger_slim_agg['Num_Pedestrian'].value_counts()

0    15274
1       22
2        3
4        2
3        1
Name: Num_Pedestrian, dtype: int64

In [52]:
# set binary indicators for VRU and Pedestrian
imars_passenger_slim_agg['Pedestrian'] = np.where(imars_passenger_slim_agg['Num_Pedestrian'] > 0, 1, 0)
imars_passenger_slim_agg['VRU'] = np.where(imars_passenger_slim_agg['Num_Non_Motorist'] > 0, 1, 0)
# test logic, looks good!
imars_passenger_slim_agg['Pedestrian'].sum(), (22+3+2+1) # replacing number of pedestrians involved in each crash with 1 for all crashes involving pedestrians

(28, 28)

In [53]:
imars_passenger_slim_agg['Crash_Severity'] = 0 # initialize column with dummy data to replace with for loop
length = range(imars_passenger_slim_agg.shape[0]) # search through all rows in aggregated dataset (to examine all injuries from each crash as a whole)
for i in length: # determine most severe injury outcome
    if imars_passenger_slim_agg['Fatality'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Fatal'
    elif imars_passenger_slim_agg['Incapacitating Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Incap'
    elif imars_passenger_slim_agg['Non-incapacitating Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Non-Incap'
    elif imars_passenger_slim_agg['Possible Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Possible'
    else:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'No Inj'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'No Inj'


In [54]:
#set flags for each Crash Severity attribute
imars_passenger_slim_agg['CrashSeverity_Fatal'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Fatal', 1,0)
imars_passenger_slim_agg['CrashSeverity_Incap'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Incap', 1,0)
imars_passenger_slim_agg['CrashSeverity_NonIncap'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Non-Incap', 1,0)
imars_passenger_slim_agg['CrashSeverity_Possible'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Possible', 1,0)
imars_passenger_slim_agg['CrashSeverity_NoInj'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='No Inj', 1,0)

In [55]:
# test crash severity logic - looks good!
test=imars_passenger_slim_agg.loc[imars_passenger_slim_agg['Fatality']==0] # remove fatal crashes
imars_passenger_slim_agg['CrashSeverity_Incap'].sum(), test.loc[test['Incapacitating Injury']>0].shape[0] # number of crashes with incapacitating severity = number of crashes resulting in incapacitating injuries when fatal crashes are removed from dataset

(355, 355)

In [56]:
imars_passenger_slim_agg.head()

Unnamed: 0,INCID_NO,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Num_Motorist,Num_Non_Motorist,Num_Pedestrian,Num_Ped_Deaths,Pedestrian,VRU,Crash_Severity,CrashSeverity_Fatal,CrashSeverity_Incap,CrashSeverity_NonIncap,CrashSeverity_Possible,CrashSeverity_NoInj
0,NP12000078,0,0,0,0,0,1,0,0,0,0,0,No Inj,0,0,0,0,1
1,NP12000378,0,0,0,0,0,1,0,0,0,0,0,No Inj,0,0,0,0,1
2,NP12000550,0,0,0,0,0,1,0,0,0,0,0,No Inj,0,0,0,0,1
3,NP12000911,0,0,0,0,0,1,0,0,0,0,0,No Inj,0,0,0,0,1
4,NP12000935,0,0,0,0,0,1,0,0,0,0,0,No Inj,0,0,0,0,1


In [57]:
imars_passenger_slim_agg.shape

(15302, 18)

In [58]:
imars_passenger_slim_agg.columns

Index(['INCID_NO', 'No Injury', 'Possible Injury', 'Non-incapacitating Injury',
       'Incapacitating Injury', 'Fatality', 'Num_Motorist', 'Num_Non_Motorist',
       'Num_Pedestrian', 'Num_Ped_Deaths', 'Pedestrian', 'VRU',
       'Crash_Severity', 'CrashSeverity_Fatal', 'CrashSeverity_Incap',
       'CrashSeverity_NonIncap', 'CrashSeverity_Possible',
       'CrashSeverity_NoInj'],
      dtype='object')

In [59]:
imars_passenger_slim_agg.to_csv("./IMARS_passenger_slim_agg.csv",index=False)

## Vehicle Table

In [60]:
imars_vehicle['Posted_Speed'].value_counts()

09. 45 mph        3821
07. 35 mph        3390
05. 25 mph        2413
11. 55 mph        1827
10. 50 mph        1568
98. Not posted    1528
99. Unknown       1246
03. 15 mph        1022
08. 40 mph         614
06. 30 mph         470
02. 10 mph         440
01. 5 mph          281
04. 20 mph         226
13. 65 mph          40
12. 60 mph          31
14. 70 mph           7
15. 75 mph           2
Name: Posted_Speed, dtype: int64

In [61]:
imars_vehicle = imars_vehicle.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_vehicle_slim = imars_vehicle[['INCID_NO']] # subset needed columns
# set flags for speed limit
imars_vehicle_slim['5_mph']= np.where(imars_vehicle['Posted_Speed']=='01. 5 mph', 1,0)
imars_vehicle_slim['10_mph']= np.where(imars_vehicle['Posted_Speed']=='02. 10 mph', 1,0)
imars_vehicle_slim['15_mph']= np.where(imars_vehicle['Posted_Speed']=='03. 15 mph', 1,0)
imars_vehicle_slim['20_mph']= np.where(imars_vehicle['Posted_Speed']=='04. 20 mph', 1,0)
imars_vehicle_slim['25_mph']= np.where(imars_vehicle['Posted_Speed']=='05. 25 mph', 1,0)
imars_vehicle_slim['30_mph']= np.where(imars_vehicle['Posted_Speed']=='06. 30 mph', 1,0)
imars_vehicle_slim['35_mph']= np.where(imars_vehicle['Posted_Speed']=='07. 35 mph', 1,0)
imars_vehicle_slim['40_mph']= np.where(imars_vehicle['Posted_Speed']=='08. 40 mph', 1,0)
imars_vehicle_slim['45_mph']= np.where(imars_vehicle['Posted_Speed']=='09. 45 mph', 1,0)
imars_vehicle_slim['50_mph']= np.where(imars_vehicle['Posted_Speed']=='10. 50 mph', 1,0)
imars_vehicle_slim['55_mph']= np.where(imars_vehicle['Posted_Speed']=='11. 55 mph', 1,0)
imars_vehicle_slim['60_mph']= np.where(imars_vehicle['Posted_Speed']=='12. 60 mph', 1,0)
imars_vehicle_slim['65_mph']= np.where(imars_vehicle['Posted_Speed']=='13. 65 mph', 1,0)
imars_vehicle_slim['70_mph']= np.where(imars_vehicle['Posted_Speed']=='14. 70 mph', 1,0)
imars_vehicle_slim['75_mph']= np.where(imars_vehicle['Posted_Speed']=='15. 75 mph', 1,0)
imars_vehicle_slim['no_posted_speed']= np.where(imars_vehicle['Posted_Speed']=='98. Not posted', 1,0)
imars_vehicle_slim['25_mph'].sum() # exactly as expected based on Posted Speed value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['5_mph']= np.where(imars_vehicle['Posted_Speed']=='01. 5 mph', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['10_mph']= np.where(imars_vehicle['Posted_Speed']=='02. 10 mph', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['15_mph']=

2413

In [62]:
imars_vehicle_slim.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed'],
      dtype='object')

In [63]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# only one speed limit should exist per crash to not double count data attribute ("first" function)
imars_vehicle_slim_agg = imars_vehicle_slim.groupby(by=['INCID_NO']).first().reset_index()
imars_vehicle_slim_agg.shape

(15302, 17)

In [64]:
imars_vehicle['Vehicle_Type'].value_counts()

01. Passenger car (sedan, minivan, etc.)         8537
02. Sport utility vehicle                        4259
05. Pickup truck                                 2557
10. Motorcycle > 150cc                            873
06. Motor home                                    649
99. Unknown                                       641
18. Medium/heavy trucks (10,000 lbs. or more)     343
17. Other light trucks (10,000 lbs. or less)      281
03. Passenger van (9-15 passengers)               223
21. Other                                         207
08. Transit/shuttle bus                           197
04. Cargo van (10,000 lbs. or less)               162
19. Maintenance/construction vehicle              117
09. Motor coach                                   107
16. Snowmobile                                     43
15. ATV                                            33
07. School bus                                     31
11. Motorcycle < 150cc (trail bike)                21
12. Moped/minibike/scooter/s

In [65]:
imars_vehicle['NUM_VEH'] = 1 # one vehicle per entry
imars_vehicletypes = imars_vehicle[['INCID_NO','NUM_VEH']] # subset needed columns

In [66]:
# set flags for each vehicle type

#imars_vehicletypes['Car'] = np.where(imars_vehicle['Vehicle_Type']=='01. Passenger car (sedan, minivan, etc.)', 1,0)
#imars_vehicletypes['SUV'] = np.where(imars_vehicle['Vehicle_Type']=='02. Sport utility vehicle', 1,0)
#imars_vehicletypes['Van'] = np.where(imars_vehicle['Vehicle_Type'].str.contains(" van")==True, 1,0) # space necessary to prevent inclusion of "minivan"
#imars_vehicletypes['Truck'] = np.where(imars_vehicle['Vehicle_Type'].str.contains("truck")==True, 1,0)
#imars_vehicletypes['Bus'] = np.where(imars_vehicle['Vehicle_Type'].isin(['07. School bus','08. Transit/shuttle bus','09. Motor coach'])==True, 1,0)
imars_vehicletypes['Num_Motorcycle'] = np.where(imars_vehicle['Vehicle_Type'].str.contains("Motorcycle")==True, 1,0)
#imars_vehicletypes['RV'] = np.where(imars_vehicle['Vehicle_Type']=='06. Motor home', 1,0)
imars_vehicletypes['Num_Motorcycle'].sum(), (873+21) # number of motorcycles = sum of vehicle type counts above and below 150 cc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicletypes['Num_Motorcycle'] = np.where(imars_vehicle['Vehicle_Type'].str.contains("Motorcycle")==True, 1,0)


(894, 894)

In [67]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# vehicle types and number of people involved should be summed as total numbers per crash ("sum" function)
imars_vehicletypes_agg = imars_vehicletypes.groupby(by=['INCID_NO']).sum().reset_index()
imars_vehicletypes_agg.shape

(15302, 3)

In [68]:
imars_vehicletypes_agg['Num_Motorcycle'].value_counts()

0    14508
1      719
2       64
4        4
3        4
6        2
7        1
Name: Num_Motorcycle, dtype: int64

In [69]:
# set indicator for crashes involving motorcycles
imars_vehicletypes_agg['Motorcycle_Ind'] = np.where(imars_vehicletypes_agg['Num_Motorcycle'] > 0, 1,0)
imars_vehicletypes_agg['Motorcycle_Ind'].sum(), (719+64+4+4+2+1) #  looks good (replacing number of motorcyclists involved in each crash with 1 for all crashes involving motorcyclists)

(794, 794)

In [70]:
imars_vehicletypes_agg.columns

Index(['INCID_NO', 'NUM_VEH', 'Num_Motorcycle', 'Motorcycle_Ind'], dtype='object')

In [71]:
# join aggregated ('sum' and 'first') vehicle data into single table
imars_slim_vehicle_agg = imars_vehicle_slim_agg.merge(imars_vehicletypes_agg, how='right', on='INCID_NO')
imars_slim_vehicle_agg = imars_slim_vehicle_agg.drop_duplicates()
imars_slim_vehicle_agg.shape

(15302, 20)

In [72]:
imars_slim_vehicle_agg.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'NUM_VEH', 'Num_Motorcycle',
       'Motorcycle_Ind'],
      dtype='object')

In [73]:
imars_slim_vehicle_agg.to_csv("./IMARS_slim_vehicle_agg.csv",index=False)

## Crash Details Table

In [74]:
imars_crash_details = imars_crash_details.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_crash_details_slim = imars_crash_details[['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type']] # subset needed columns

In [75]:
pd.options.display.max_rows = 10000000
imars_crash_details['First_Harmful_Event'].value_counts()

21. Motor vehicle in transport                                           4829
22. Parked motor vehicle                                                 1422
52. Tree/shrub                                                           1233
28. Deer                                                                  922
07. Ran off the road right                                                797
99. Unknown                                                               411
25. Other non-fixed object                                                345
60. Rock, boulder, rock slide                                             341
66. Other fixed object                                                    331
46. Earth embankment/berm                                                 325
16. Other non-collision                                                   278
01. Overturn/rollover                                                     271
08. Ran Off the road left                                       

In [108]:
imars_crash_details['First_Harmful_Event_Type'].value_counts()

Collision with person, MV or non-fixed object    6869
Collision with fixed object                      3990
Non-collision                                    1897
Collision with animals                           1376
Unknown                                           413
Name: First_Harmful_Event_Type, dtype: int64

In [76]:
imars_crash_details['Environmental_Contributing_Circumstances'].value_counts()

01. None                                                                                    10032
02. Weather                                                                                  1270
06. Animal(s) in roadway                                                                      974
99. Unknown                                                                                   595
03. Physical obstruction(s) (trees, bushes, etc.)                                             239
07. Other                                                                                     229
05. Glare                                                                                      88
02. Weather; 03. Physical obstruction(s) (trees, bushes, etc.)                                 86
02. Weather; 07. Other                                                                         56
02. Weather; 06. Animal(s) in roadway                                                          50
04. Rockfall        

In [77]:
# set flags for animal related data attributes
imars_crash_details_slim['Collision w Animal']= np.where(imars_crash_details['First_Harmful_Event_Type']=="Collision with animals", 1,0)
imars_crash_details_slim['Avoiding Animal']= np.where(imars_crash_details['First_Harmful_Event']=="13. Avoiding an animal on road", 1,0)
imars_crash_details_slim['Animal in Roadway']= np.where(imars_crash_details['Environmental_Contributing_Circumstances'].str.contains("Animal")==True, 1,0)
# check logic - looks good!
imars_crash_details_slim['Animal in Roadway'].sum(), (974+50+17+11+9+7+3+3+2+1+1+1+1+1) # includes all possible animal-related entries in environmental value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Collision w Animal']= np.where(imars_crash_details['First_Harmful_Event_Type']=="Collision with animals", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Avoiding Animal']= np.where(imars_crash_details['First_Harmful_Event']=="13. Avoiding an animal on road", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

(1081, 1081)

In [78]:
imars_crash_details_slim.sum() # order if statements in for loop from most to least common to increase speed

  imars_crash_details_slim.sum() # order if statements in for loop from most to least common to increase speed


INCID_NO              NP12000078NP12000378NP12000550NP12000911NP1200...
Collision w Animal                                                 1376
Avoiding Animal                                                      56
Animal in Roadway                                                  1081
dtype: object

In [79]:
# set animal indicator
imars_crash_details_slim['Involving Animal'] = 0 # initialize column with dummy data to replace with for loop
length = range(imars_crash_details_slim.shape[0])
for i in length:
    if imars_crash_details_slim['Collision w Animal'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif imars_crash_details_slim['Animal in Roadway'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif imars_crash_details_slim['Avoiding Animal'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    else:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 0
imars_crash_details_slim['Involving Animal'].sum() # greater value than either Collision w Animal, Animal in Roadway, or Avoiding Animal so it must be accounting for them all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'].iloc[i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'].iloc[i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docum

1655

In [80]:
# set flags for lighting characteristics
imars_crash_details_slim['Daylight'] = np.where(imars_crash_details['Lighting']=="01. Daylight", 1,0)
imars_crash_details_slim['Dawn'] = np.where(imars_crash_details['Lighting']=="02. Dawn", 1,0)
imars_crash_details_slim['Dusk'] = np.where(imars_crash_details['Lighting']=="03. Dusk", 1,0)
imars_crash_details_slim['Dark_Lit'] = np.where(imars_crash_details['Lighting']=="04. Dark - lighted", 1,0)
imars_crash_details_slim['Dark_NotLit'] = np.where(imars_crash_details['Lighting']=="05. Dark - not lighted", 1,0)
imars_crash_details_slim['Dark_UnknownLit'] = np.where(imars_crash_details['Lighting']=="06. Dark - unknown lighting", 1,0)
imars_crash_details_slim.sum() # order if statements in for loop from most to least common to increase speed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Daylight'] = np.where(imars_crash_details['Lighting']=="01. Daylight", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Dawn'] = np.where(imars_crash_details['Lighting']=="02. Dawn", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_cra

INCID_NO              NP12000078NP12000378NP12000550NP12000911NP1200...
Collision w Animal                                                 1376
Avoiding Animal                                                      56
Animal in Roadway                                                  1081
Involving Animal                                                   1655
Daylight                                                          10188
Dawn                                                                335
Dusk                                                                528
Dark_Lit                                                            439
Dark_NotLit                                                        2327
Dark_UnknownLit                                                     101
dtype: object

In [81]:
# group adverse lighting attributes into bin for poor visibility
imars_crash_details_slim['Poor Lighting'] = 0 # initialize column with dummy data to replace with for loop
length = range(imars_crash_details_slim.shape[0])
for i in length:
    if imars_crash_details_slim['Dark_NotLit'].iloc[i] == 1:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif imars_crash_details_slim['Dusk'].iloc[i] == 1:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif imars_crash_details_slim['Dark_Lit'].iloc[i] == 1:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif imars_crash_details_slim['Dawn'].iloc[i] == 1:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif imars_crash_details_slim['Dark_UnknownLit'].iloc[i] == 1:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
    else:
        imars_crash_details_slim['Poor Lighting'].iloc[i] = 0
# check logic - looks good!
imars_crash_details_slim['Poor Lighting'].sum(), (335+528+439+2327+101)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Poor Lighting'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Poor Lighting'].iloc[i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Poor Lighting'].iloc[i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:

(3730, 3730)

In [82]:
imars_crash_details['Weather'].value_counts()

01. Clear                                             10967
02. Raining                                             847
10. Cloudy, overcast                                    776
99. Unknown                                             418
03. Snowing                                             256
02. Raining; 10. Cloudy, overcast                       195
02. Raining; 04. Fog                                    103
04. Fog                                                  87
03. Snowing; 10. Cloudy, overcast                        78
01. Clear; 10. Cloudy, overcast                          63
03. Snowing; 09. Blowing snow                            54
03. Snowing; 08. Sleet/hail/freezing rain                45
12. Other                                                37
06. Severe wind only                                     27
01. Clear; 12. Other                                     23
01. Clear; 99. Unknown                                   21
08. Sleet/hail/freezing rain            

In [83]:
imars_crash_details['Roadway_Condition'].value_counts()

01. Dry                                        10421
02. Wet                                         1591
99. Unknown                                      401
03. Snow; 05. Ice/frost                          370
08. Mud, dirt, gravel                            220
01. Dry; 08. Mud, dirt, gravel                   199
05. Ice/frost                                    175
03. Snow                                         162
10. Other                                         66
02. Wet; 05. Ice/frost                            60
03. Snow; 04. Slush                               58
02. Wet; 08. Mud, dirt, gravel                    52
07. Sand                                          41
02. Wet; 03. Snow                                 41
01. Dry; 07. Sand                                 38
02. Wet; 06. Water (standing, moving)             32
04. Slush; 05. Ice/frost                          25
02. Wet; 04. Slush                                23
01. Dry; 02. Wet                              

In [84]:
imars_crash_details['AS_Road_Circumstance'].value_counts()

01. None                                                                                                                                                10101
02. Road surface condition (wet, icy, snow, slush, etc.)                                                                                                 1503
99. Unknown                                                                                                                                               438
09. Shoulders (none, low, soft, high)                                                                                                                     309
11. Other                                                                                                                                                 253
07. Obstruction in roadway (i.e. vehicles, etc.)                                                                                                          200
04. Rut, holes, bumps                               

In [85]:
# set flags for adverse visibility conditions due to precip
imars_crash_details_slim['Bad Weather'] = np.where(imars_crash_details['Weather'].str.contains('02|03|04|05|07|08|09|11')==True, 1,0)
imars_crash_details_slim['Bad Road Condition'] = np.where(imars_crash_details['Roadway_Condition'].str.contains('02|03|04|05|06')==True, 1,0)
imars_crash_details_slim['Bad Road Circumstance'] = np.where(imars_crash_details['AS_Road_Circumstance'].str.contains('02')==True, 1,0)
imars_crash_details_slim['Bad Environmental Circumstance'] = np.where(imars_crash_details['Environmental_Contributing_Circumstances'].str.contains('02')==True, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Bad Weather'] = np.where(imars_crash_details['Weather'].str.contains('02|03|04|05|07|08|09|11')==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Bad Road Condition'] = np.where(imars_crash_details['Roadway_Condition'].str.contains('02|03|04|05|06')==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

In [86]:
# check to be sure all attributes are accounted for and order if statements in subsequent for loop from most to least common to increase speed

In [87]:
imars_crash_details_slim['Bad Weather'].sum(), (847+195+103+18+18+15+11+10+2+256+78+54+45+11+6+4+3+1+87+10+6+3+3+1+7+2+1+4+2+1+20+8+5+1+12+5+1+1+18+11+3+1)

(1890, 1890)

In [88]:
imars_crash_details_slim['Bad Road Condition'].sum(), (1591+60+52+41+32+23+19+17+3+2+2+370+162+58+6+3+2+1+1+25+18+1+1+175+5+4+4+1+1+4)

(2684, 2684)

In [89]:
imars_crash_details_slim['Bad Road Circumstance'].sum(), (1503+75+43+42+38+19+17+15+11+10+7+6+5+5+5+5+3+3+3+3+2+2+2+2+2+2+2+2+1+1+1+1+1+1+1+1)

(1842, 1842)

In [90]:
imars_crash_details_slim['Bad Environmental Circumstance'].sum(), (1270+86+56+50+13+9+5+3+3+1+1+1+1+1+1)

(1501, 1501)

In [91]:
# group adverse weather attributes into bin for poor visibility
imars_crash_details_slim['Adverse Weather'] = 0 # initialize column with dummy data to replace with for loop
length = range(imars_crash_details_slim.shape[0])
for i in length:
    if imars_crash_details_slim['Bad Road Condition'].iloc[i] == 1 :
        imars_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif imars_crash_details_slim['Bad Weather'].iloc[i] == 1 :
        imars_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif imars_crash_details_slim['Bad Road Circumstance'].iloc[i] == 1 :
        imars_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif imars_crash_details_slim['Bad Environmental Circumstance'].iloc[i] == 1 :
        imars_crash_details_slim['Adverse Weather'].iloc[i] = 1
    else:
        imars_crash_details_slim['Adverse Weather'].iloc[i] = 0
imars_crash_details_slim['Adverse Weather'].sum() # greater value than either Bad Road Condition, Bad Weather, Bad Road Circumstance, or Bad Environmental Circumstance so it must be accounting for them all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Adverse Weather'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Adverse Weather'].iloc[i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Adverse Weather'].iloc[i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the document

2932

In [92]:
imars_crash_details_slim.columns

Index(['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type',
       'Collision w Animal', 'Avoiding Animal', 'Animal in Roadway',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Bad Weather',
       'Bad Road Condition', 'Bad Road Circumstance',
       'Bad Environmental Circumstance', 'Adverse Weather'],
      dtype='object')

In [93]:
imars_crash_details_slim = imars_crash_details_slim.drop(columns={'Collision w Animal', 'Avoiding Animal', 'Animal in Roadway', 'Bad Weather', 'Bad Road Condition', 'Bad Road Circumstance', 'Bad Environmental Circumstance'}) # get rid of columns used to create indicators that will not be needed for analysis
imars_crash_details_slim.columns

Index(['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Adverse Weather'],
      dtype='object')

In [94]:
imars_crash_details_slim.shape

(15302, 12)

In [95]:
imars_crash_details_slim.to_csv("./IMARS_crash_details_slim.csv",index=False)

## Crash Table

In [96]:
# reformat crash date, time, and year; separate into individual columns
imars_crash_expanded['Crash_Date_Time']=pd.to_datetime(imars_crash_expanded['Crash_Date_Time'], format='%Y%m%d %H:%M:%S:%f')
imars_crash_expanded['DATE'] = imars_crash_expanded['Crash_Date_Time'].dt.strftime('%Y-%m-%d')
imars_crash_expanded['YEAR'] = imars_crash_expanded['Crash_Date_Time'].dt.strftime('%Y')
imars_crash_expanded['MONTH'] = imars_crash_expanded['Crash_Date_Time'].dt.strftime('%m')
imars_crash_expanded['DOW'] = imars_crash_expanded['Crash_Date_Time'].dt.strftime('%w')
imars_crash_expanded['HOUR'] = imars_crash_expanded['Crash_Date_Time'].dt.strftime('%H')
imars_crash_expanded.head() # looks good!

Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,Editor,GlobalID,Shape__Area,Shape__Length,RGN,DATE,YEAR,MONTH,DOW,HOUR
0,NP13031262,2013-06-15 14:00:00,Dispatch address,,,,,,,,...,,,,,MWR,2013-06-15,2013,6,6,14
1,NP13054509,2013-08-03 10:56:00,Dispatch address,,SD,,,,PPI-ENTRANCE/EXIT,,...,,,,,MWR,2013-08-03,2013,8,6,10
2,NP13071209,2013-09-05 12:03:00,Dispatch address,,,,,,,,...,,,,,NER,2013-09-05,2013,9,4,12
3,NP13082489,2013-10-06 09:10:00,Dispatch address,,VA,,,,MP97,,...,,,,,NER,2013-10-06,2013,10,0,9
4,NP13082534,2013-10-06 11:42:00,Dispatch address,,VA,,,,MP102,,...,,,,,NER,2013-10-06,2013,10,0,11


In [97]:
imars_crash_expanded.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'State_Zone',
       'Park', 'Site', 'Place', 'Point', 'Road_Type_Classification',
       'Linked_Address', 'index_right', 'OBJECTID', 'UNIT_CODE', 'GIS_Notes',
       'UNIT_NAME', 'DATE_EDIT', 'STATE', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY',
       'METADATA', 'PARKNAME', 'CreationDate', 'Creator', 'EditDate', 'Editor',
       'GlobalID', 'Shape__Area', 'Shape__Length', 'RGN', 'DATE', 'YEAR',
       'MONTH', 'DOW', 'HOUR'],
      dtype='object

In [98]:
imars_crash_expanded = imars_crash_expanded.rename(columns={'IMARS_Record_No':'INCID_NO','Latitude':'LATITUDE','Longitude':'LONGITUDE'}) # to match CDS
imars_crash_slim = imars_crash_expanded[['INCID_NO','LATITUDE','LONGITUDE','Park','RGN','YEAR','MONTH','DOW','HOUR']]
imars_crash_slim.shape

(12790, 9)

In [99]:
imars_crash_slim.to_csv("./IMARS_crash_slim.csv",index=False)

# Merge Slim Tables

In [100]:
imars_crash_slim.shape, imars_crash_details_slim.shape, imars_passenger_slim_agg.shape, imars_slim_vehicle_agg.shape

((12790, 9), (15302, 12), (15302, 18), (15302, 20))

In [101]:
imars_crash_details_slim_merged = imars_crash_details_slim.merge(imars_crash_slim, how='right', on='INCID_NO')
imars_crash_details_slim_merged.shape

(12790, 20)

In [102]:
imars_crash_details_slim_merged.columns

Index(['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Adverse Weather',
       'LATITUDE', 'LONGITUDE', 'Park', 'RGN', 'YEAR', 'MONTH', 'DOW', 'HOUR'],
      dtype='object')

In [103]:
imars_crash_details_and_passenger_slim = imars_passenger_slim_agg.merge(imars_crash_details_slim_merged, how='right', on='INCID_NO')
imars_crash_details_and_passenger_slim.shape

(12790, 37)

In [104]:
imars_crash_details_and_passenger_slim.columns

Index(['INCID_NO', 'No Injury', 'Possible Injury', 'Non-incapacitating Injury',
       'Incapacitating Injury', 'Fatality', 'Num_Motorist', 'Num_Non_Motorist',
       'Num_Pedestrian', 'Num_Ped_Deaths', 'Pedestrian', 'VRU',
       'Crash_Severity', 'CrashSeverity_Fatal', 'CrashSeverity_Incap',
       'CrashSeverity_NonIncap', 'CrashSeverity_Possible',
       'CrashSeverity_NoInj', 'First_Harmful_Event',
       'First_Harmful_Event_Type', 'Involving Animal', 'Daylight', 'Dawn',
       'Dusk', 'Dark_Lit', 'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting',
       'Adverse Weather', 'LATITUDE', 'LONGITUDE', 'Park', 'RGN', 'YEAR',
       'MONTH', 'DOW', 'HOUR'],
      dtype='object')

In [105]:
imars_slim_all = imars_slim_vehicle_agg.merge(imars_crash_details_and_passenger_slim, how='right', on='INCID_NO')
imars_slim_all['database'] = 'IMARS' # add column with database name
imars_slim_all.shape

(12790, 57)

In [106]:
imars_slim_all.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'NUM_VEH', 'Num_Motorcycle',
       'Motorcycle_Ind', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Num_Motorist', 'Num_Non_Motorist', 'Num_Pedestrian', 'Num_Ped_Deaths',
       'Pedestrian', 'VRU', 'Crash_Severity', 'CrashSeverity_Fatal',
       'CrashSeverity_Incap', 'CrashSeverity_NonIncap',
       'CrashSeverity_Possible', 'CrashSeverity_NoInj', 'First_Harmful_Event',
       'First_Harmful_Event_Type', 'Involving Animal', 'Daylight', 'Dawn',
       'Dusk', 'Dark_Lit', 'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting',
       'Adverse Weather', 'LATITUDE', 'LONGITUDE', 'Park', 'RGN', 'YEAR',
       'MONTH', 'DOW', 'HOUR', 'database'],
      dtype='object')

In [107]:
imars_slim_all.to_csv("./IMARS_slim_all_clean.csv",index=False)