In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import Point, LineString, Polygon

In [4]:
myworkingdirectory = r"C:\Users\Sophie.Kaye\Desktop\NPS Crash Data"
os.chdir(myworkingdirectory)

In [113]:
imars_crash = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Location")
imars_passenger = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Person")
imars_vehicle = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Vehicle")
imars_crash_details = pd.read_excel("./IMARS/IMARS 2012 - 2021.xlsx", sheet_name = "Classification")

In [6]:
imars_passenger.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Driver_Action',
       'Driver_Condition', 'Driver_Distraction', 'Suspect_Alcohol',
       'Alcohol_Test', 'Alcohol_Test_Result_1', 'Alcohol_Test_Result_2',
       'Suspect_Drugs', 'Drug_Test', 'Violations_Issued', 'Seat_Position',
       'Injury_Severity', 'Air_Bag_Deployed', 'Ejection',
       'Injury_Transported_By', 'Safety_Equipment_Used', 'Vehicle_number',
       'Injured_transported_by', 'Non_motorist_safety_equipment',
       'Non_motorist_action_circumstance_prior_to_crash',
       'Non_motorist_action_circumstance_at_time_of_crash',
       'Non_motorist_condition_at_time_of_crash', 'Non_motorist_distraction',
       'Non_motorist_location_at_time_of_crash', 'Pedestrian_Type',
       'Pedestrian_Type_Detail', 'Involvement'],
      dtype='object')

In [7]:
imars_crash.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address'],
      dtype='object')

In [8]:
imars_crash_details.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Number_of_Vehicles_Involved',
       'Injury_or_Fatal_Crash', 'Investigated_at_Scene', 'Hit_and_Run',
       'Non_Motor_Vehicl_Property_Damage', 'Amount_of_Property_Damage',
       'First_Harmful_Event_Type', 'First_Harmful_Event',
       'Location_of_First_Harmful_Event', 'Weather', 'Roadway_Condition',
       'Lighting', 'School_Bus_related', 'AS_Road_Circumstance',
       'Environmental_Contributing_Circumstances', 'Work_Zone_Related',
       'Work_Zone_Workers_Present', 'Work_Zone_Location',
       'Law_Enforcement_Present_at_Work_Zone', 'Relation_to_Junction',
       'Type_of_Intersection', 'Manner_of_Collision'],
      dtype='object')

In [9]:
imars_vehicle.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Vehicle_Number',
       'Number_of_Occupants', 'Vehicle_Towed', 'Insurance_verified',
       'Initial_Impact_Point', 'Most_Damaged_Area', 'Extent_of_Damage',
       'Direction_of_Travel_Prior_to_Crash', 'Posted_Speed',
       'First_Event_Type', 'First_Event', 'Second_Event_Type', 'Second_Event',
       'Third_Event_Type', 'Third_Event', 'Fourth_Event_Type', 'Fourth_Event',
       'Motor_Vehicle_Unit_Type', 'Vehicle_Owner', 'Vehicle_Type',
       'Non_Commercial_Trailer_Style', 'Emergency_Vehicle_Use',
       'Emergency_Equipment_Activated', 'Special_Function_of_MV_in_Transport',
       'Motor_Vehicle_Contributing_Circumstance',
       'Vehicle_Maneuver_Action_Prior_to_Crash', 'Road_Surface', 'Grade',
       'Roadway_Alignment', 'Total_Number_of_Lanes', 'Traffic_Control',
       'Traffic_Control_Working_Properly', 'Roadway_Description',
       'Commercial_Non_Commercial', 'Number_of_Axles', 'Gross_Vehicle_Weight',
       'Combination_GVW', 

# General Data Cleaning
Note that there are crashes with randomly missing park, region, and/or roadway

### REMOVE CRASHES WITH MISSING RECORD NUMBERS

In [114]:
imars_crash = imars_crash.dropna(subset=['IMARS_Record_No'])
imars_passenger = imars_passenger.dropna(subset=['IMARS_Record_No'])
imars_vehicle = imars_vehicle.dropna(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details.dropna(subset=['IMARS_Record_No'])

### REMOVE DUPLICATES
**Note:** Cannot remove duplicates from vehicle and passenger tables because they are unique records for each person/car involved in crash 

In [11]:
imars_crash = imars_crash.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_details = imars_crash_details.drop_duplicates(subset=['IMARS_Record_No'])

### REVERT FLIPPED LAT/LONG

In [12]:
need_revert = imars_crash['Latitude'].abs() > 70
imars_crash.loc[need_revert, ['Latitude', 'Longitude']] = (imars_crash.loc[need_revert, ['Longitude', 'Latitude']].values)

### ADJUST SIGNS TO CORRECT HEMISPHERE

In [13]:
imars_crash.loc[imars_crash.Latitude < 0,"Latitude"] = imars_crash['Latitude']*(-1)
imars_crash.loc[imars_crash.Longitude > 0,"Longitude"] = imars_crash['Longitude']*(-1)

# Add Parks to Crash Table

**Note:** IMARS does not have park units with each crash, so these will have to be added using the Latitude and Longitude fields in the imars_crash dataset. 

In [14]:
imars_crash_parks = imars_crash.dropna(subset=['Park'])
imars_crash_parks.shape[0]

12205

In [15]:
imars_crash.loc[imars_crash['Park'].isnull()].shape[0]
# 2925 crash entries have no park identification

2925

In [16]:
imars_crash_coords = imars_crash.loc[imars_crash['Park'].isnull()].dropna(subset=['Latitude','Longitude'])
imars_crash_coords.shape[0]

591

In [17]:
# of the 2925 crash entries without park identification, 591 have lat/long coordinates from which park can be assigned using the shapefile
# the sum of crashes with pre-identified parks and crashes that can be assigned a park using lat/long coordinates 
# results in the following total of possible usable crash entries after the spatial join:
12205+591

12796

In [18]:
imars_crash_noparks = imars_crash.loc[imars_crash['Park'].isnull()]
imars_crash_noparks_nocoords = imars_crash_noparks.loc[imars_crash_noparks['Latitude'].isnull()]
# 2334 of the crash entries have no park identification or lat/long coordinates
2925-591

2334

In [19]:
mask = imars_crash_noparks_nocoords.loc[:,['Linked_Common_Name','Linked_Address','NEAR_route_street_road_name',
                                           'At_Intersecting_route_street_road_name']].notnull()
imars_noparks_nocoords_someinfo = imars_crash_noparks_nocoords.loc[mask.any(axis=1)]
imars_noparks_nocoords_someinfo.shape[0]

109

In [20]:
# of the 2334 crash entries without park identification or lat/long coordinates, 109 have other identifiable information 
# (e.g., roadway name) which could be used to identify the park name
# print file containing only this subset of crashes for manual park identification
imars_noparks_nocoords_someinfo.to_csv("./IMARS_noparks_nocoords_someinfo.csv",index=False)

## spatial join to assign park names

In [21]:
imars_crash_coords_geo=gpd.GeoDataFrame(imars_crash_coords, geometry=gpd.points_from_xy(imars_crash_coords.Longitude, 
                                                                             imars_crash_coords.Latitude))
filename = "./shapefiles/NPS_-_Land_Resources_Division_Boundary_and_Tract_Data_Service.geojson"
file = open(filename)
parks = gpd.read_file(file)

In [22]:
imars_crash_coords_geo.crs = "EPSG:4326"
parks = parks.to_crs(epsg=4326)
parks['geometry']=parks['geometry'].buffer(0.01)

imars_crash_coords_geo_withparknames=gpd.sjoin(imars_crash_coords_geo,parks,how="left", predicate='intersects')
imars_crash_coords_geo_withparknames.head()


  parks['geometry']=parks['geometry'].buffer(0.01)


Unnamed: 0,IMARS_Record_No,Crash_Date_Time,Linked_Address_Classification,City_Town_Park_Location,State,County,Direction,Linked_Street_Number,Linked_Common_Name,Street_Type,...,CREATED_BY,METADATA,PARKNAME,CreationDate,Creator,EditDate,Editor,GlobalID,Shape__Area,Shape__Length
439,NP14039836,20140513 13:30:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
443,NP14049772,20140607 14:42:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
444,NP14060606,20140625 20:40:00:000,,,,,,,,,...,Lands,Preliminary data. Contact the Land Resources P...,Olympic,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,2020-01-09T22:16:03+00:00,SCarlton@nps.gov_nps,d568927b-56f4-4f49-a52a-b8ab9f7676a3,8203591000.0,1158605.0
497,NP16023660,20160312 16:32:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Big Bend,2022-01-06T10:41:22+00:00,WASO,2022-01-06T10:41:22+00:00,WASO,32b17c0c-12d0-4ffb-82fd-fb570aa92382,4329894000.0,462145.8
508,NP16029926,20160326 20:00:00:000,,,,,,,,,...,Lands,https://irma.nps.gov/DataStore/Reference/Profi...,Blue Ridge Parkway,2022-01-06T10:41:13+00:00,WASO,2022-01-06T10:41:13+00:00,WASO,a8ef8bcf-aaa2-4623-a8f6-1d93fe31f945,642257900.0,2287707.0


In [23]:
imars_crash_coords_geo_withparknames2 = imars_crash_coords_geo_withparknames.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_coords_geo_withparknames2.shape[0]
# all 591 crash entries were processed in the spatial join

591

In [24]:
imars_crash_withparknames = pd.DataFrame(imars_crash_coords_geo_withparknames2.drop(columns='geometry'))
imars_crash_withparknames['Park']= imars_crash_withparknames['UNIT_CODE']
imars_crash_withparknames_NoDupsorNulls = imars_crash_withparknames.dropna(subset=['Park'])
imars_crash_withparknames_NoDupsorNulls.shape[0]

505

In [25]:
# of the 591 crash entries without parks identified, 505 now have parks assigned from spatial join
# so we expect the total ("expanded") dataset to be:
12205+505

12710

In [26]:
imars_crash_expanded = pd.concat([imars_crash_parks,imars_crash_withparknames_NoDupsorNulls])
imars_crash_expanded.shape

(12710, 52)

In [27]:
# check to make sure all entries in combined dataset contain park assignments 
imars_crash_expanded = imars_crash_expanded.dropna(subset=['Park'])
imars_crash_expanded.shape[0]

12710

In [28]:
# check to make sure no duplicate crashes in combined dataset
imars_crash_expanded = imars_crash_expanded.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_expanded.shape[0]

12710

### Add in crashes with park names manually identified via unmistakable roadway names

In [29]:
# open file Chris populated in which park names were manually added to crash data containing identifiable info (e.g., road name)
imars_new_parknames = pd.read_csv("./IMARS/IMARS_noparks_nocoords_someinfo_parks_filled.csv")

In [30]:
# remove any entries for which park names were not able to be identified
imars_new_parknames = imars_new_parknames.dropna(subset=['Park'])
imars_new_parknames.shape[0]

80

In [31]:
# 80 of 109 crashes were able to have park names manually added using road names and should be combined with crash data 
# containing park names from original data and spatial assignment for a total of crashes in the final dataset:
12710+80

12790

In [32]:
imars_crash_expanded = pd.concat([imars_crash_expanded,imars_new_parknames])
imars_crash_expanded.shape

(12790, 52)

In [33]:
# check to make sure there are no duplicates in final dataset
imars_crash_expanded = imars_crash_expanded.drop_duplicates(subset=['IMARS_Record_No'])
imars_crash_expanded.shape

(12790, 52)

## Create new region column
imars_crash database currently contains two columns with region information, although neither one is fully populated. This section will create and populate a new column re-assigning region name to every crash based on park name using a lookup table

In [34]:
imars_crash_expanded.columns

Index(['IMARS_Record_No', 'Crash_Date_Time', 'Linked_Address_Classification',
       'City_Town_Park_Location', 'State', 'County', 'Direction',
       'Linked_Street_Number', 'Linked_Common_Name', 'Street_Type',
       'Direction.1', 'NEAR_Distance_to_MI', 'NEAR_Direction_To',
       'NEAR_Direction', 'NEAR_route_street_road_name', 'NEAR_Road_Type',
       'NEAR_Direction.1', 'AT_Intersection_route_street_road_DIRECTION',
       'At_Intersecting_route_street_road_name', 'AT_Road_Type',
       'AT_Direction', 'Mile_Marker', 'Latitude', 'Longitude', 'Region',
       'State_Zone', 'Park', 'Site', 'Place', 'Point',
       'Road_Type_Classification', 'Linked_Address', 'index_right', 'OBJECTID',
       'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE', 'REGION',
       'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDate', 'Creator', 'EditDate', 'Editor', 'GlobalID',
       'Shape__Area', 'Shape__Length'],
      dtype='object')

In [35]:
# load lookup table
park_info = pd.read_csv("./Reference Data/Park_Info_Table.csv")
park_info = park_info.rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})
park_info.columns

Index(['OBJECTID', 'Park', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'RGN', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [36]:
# add RGN column from lookup table to IMARS crash database, joining the two datasets based on park name
imars_crash_expanded = pd.merge(imars_crash_expanded, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
imars_crash_expanded = imars_crash_expanded.drop_duplicates() 
imars_crash_expanded.shape

(12790, 53)

In [37]:
# check to see if any crashes were not assigned a region
imars_crash_expanded.loc[imars_crash_expanded['RGN'].isnull()].shape[0]

0

In [38]:
imars_crash_expanded['RGN'].value_counts()

SER    3541
PWR    2656
IMR    2642
NCR    2293
NER    1206
MWR     372
AKR      80
Name: RGN, dtype: int64

In [39]:
# remove partially populated pre-existing columns
imars_crash_expanded = imars_crash_expanded.drop(columns = ['Region', 'REGION'])

In [40]:
imars_crash_expanded.shape

(12790, 51)

In [41]:
imars_crash_expanded.to_csv("IMARS_Crash_ParkandRegionAdded_Clean.csv", index= False)

# Filter for Necessary Fields, Group by IMARS_RECORD_NO

## Passenger Table

In [125]:
imars_passenger = imars_passenger.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_passenger['NUM_OCC'] = 1 # one person per passenger data entry
imars_passenger_slim = imars_passenger[['INCID_NO','NUM_OCC']] # subset needed columns
imars_passenger_slim.head()

Unnamed: 0,INCID_NO,NUM_OCC
0,NP12000078,1
1,NP12000378,1
2,NP12000550,1
3,NP12000911,1
4,NP12000935,1


In [126]:
# set flags for each injury severity 
imars_passenger_slim['No Injury']= np.where(imars_passenger['Injury_Severity']=='01. No injury', 1,0)
imars_passenger_slim['Possible Injury']= np.where(imars_passenger['Injury_Severity']=='02. Possible injury', 1,0)
imars_passenger_slim['Non-incapacitating Injury']= np.where(imars_passenger['Injury_Severity']=='03. Non-incapacitating injury', 1,0)
imars_passenger_slim['Incapacitating Injury']= np.where(imars_passenger['Injury_Severity']=='04. Incapacitating injury', 1,0)
imars_passenger_slim['Fatality']= np.where(imars_passenger['Injury_Severity']=='05. Fatal', 1,0)
imars_passenger_slim['Unknown Injury']= np.where(imars_passenger['Injury_Severity']=='99. Unknown', 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['No Injury']= np.where(imars_passenger['Injury_Severity']=='01. No injury', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Possible Injury']= np.where(imars_passenger['Injury_Severity']=='02. Possible injury', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [127]:
# determine the number of motorists and non-motorists involved
imars_passenger_slim['Num_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==True, 1,0)
imars_passenger_slim['Num_Non_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==False, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Num_Non_Motorist']= np.where(imars_passenger['Pedestrian_Type'].isnull()==False, 1,0)


In [128]:
# set indicator for VRU and pedestrian involvement
imars_passenger_slim['VRU'] = np.where(imars_passenger_slim['Num_Non_Motorist'] > 0, 1,0)
imars_passenger_slim['Pedestrian'] = np.where(imars_passenger['Pedestrian_Type']=='Pedestrian', 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['VRU'] = np.where(imars_passenger_slim['Num_Non_Motorist'] > 0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim['Pedestrian'] = np.where(imars_passenger['Pedestrian_Type']=='Pedestrian', 1,0)


In [129]:
imars_passenger_slim.head()

Unnamed: 0,INCID_NO,NUM_OCC,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury,Num_Motorist,Num_Non_Motorist,VRU,Pedestrian
0,NP12000078,1,0,0,0,0,0,0,1,0,0,0
1,NP12000378,1,0,0,0,0,0,0,1,0,0,0
2,NP12000550,1,0,0,0,0,0,0,1,0,0,0
3,NP12000911,1,0,0,0,0,0,0,1,0,0,0
4,NP12000935,1,0,0,0,0,0,0,1,0,0,0


In [130]:
imars_passenger_slim.shape

(27931, 12)

In [131]:
imars_passenger_slim.columns

Index(['INCID_NO', 'NUM_OCC', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Unknown Injury', 'Num_Motorist', 'Num_Non_Motorist', 'VRU',
       'Pedestrian'],
      dtype='object')

In [132]:
# collapse multiple rows for each person involved in the crash into a single row for each crash
# occupancy, injuries/fatalities, and number of people involved should be summed as total numbers per crash ("sum" function)
imars_passenger_slim_agg = imars_passenger_slim.groupby(by=['INCID_NO']).sum().reset_index()
imars_passenger_slim_agg.shape

(15302, 12)

In [133]:
# however, pedestrian is a binary indicator, so values should be restricted to 0 and 1 (even if more than one pedestrian was involved)
imars_passenger_slim_agg['Pedestrian'].value_counts()

0    15274
1       22
2        3
4        2
3        1
Name: Pedestrian, dtype: int64

In [134]:
# replace all values greater than 1 with 1
imars_passenger_slim_agg['Pedestrian'] = np.where(imars_passenger_slim_agg['Pedestrian'] > 1, 1, imars_passenger_slim_agg['Pedestrian'])
imars_passenger_slim_agg['Pedestrian'].value_counts()

0    15274
1       28
Name: Pedestrian, dtype: int64

In [138]:
imars_passenger_slim_agg['Crash_Severity'] = 0 # iniitalize column with dummy data to replace with for loop
length = range(imars_passenger_slim_agg.shape[0]) # search through all rows in aggregated dataset (to examine all injuries from each crash as a whole)
for i in length: # determine most severe injury outcome
    if imars_passenger_slim_agg['Fatality'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Fatal'
    elif imars_passenger_slim_agg['Incapacitating Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Incap'
    elif imars_passenger_slim_agg['Non-incapacitating Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Non-Incap'
    elif imars_passenger_slim_agg['Possible Injury'].iloc[i] > 0:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'Possible'
    else:
        imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'No Inj'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_passenger_slim_agg['Crash_Severity'].iloc[i] = 'No Inj'


In [141]:
#set flags for each Crash Severity attribute
imars_passenger_slim_agg['CrashSeverity_Fatal'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Fatal', 1,0)
imars_passenger_slim_agg['CrashSeverity_Incap'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Incap', 1,0)
imars_passenger_slim_agg['CrashSeverity_NonIncap'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Non-Incap', 1,0)
imars_passenger_slim_agg['CrashSeverity_Possible'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='Possible', 1,0)
imars_passenger_slim_agg['CrashSeverity_NoInj'] = np.where(imars_passenger_slim_agg['Crash_Severity']=='No Inj', 1,0)

In [142]:
imars_passenger_slim_agg.head()

Unnamed: 0,INCID_NO,NUM_OCC,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury,Num_Motorist,Num_Non_Motorist,VRU,Pedestrian,Crash_Severity,CrashSeverity_Fatal,CrashSeverity_Incap,CrashSeverity_NonIncap,CrashSeverity_Possible,CrashSeverity_NoInj
0,NP12000078,1,0,0,0,0,0,0,1,0,0,0,No Inj,0,0,0,0,1
1,NP12000378,1,0,0,0,0,0,0,1,0,0,0,No Inj,0,0,0,0,1
2,NP12000550,1,0,0,0,0,0,0,1,0,0,0,No Inj,0,0,0,0,1
3,NP12000911,1,0,0,0,0,0,0,1,0,0,0,No Inj,0,0,0,0,1
4,NP12000935,1,0,0,0,0,0,0,1,0,0,0,No Inj,0,0,0,0,1


In [143]:
imars_passenger_slim_agg.shape

(15302, 18)

In [144]:
imars_passenger_slim_agg.columns

Index(['INCID_NO', 'NUM_OCC', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Unknown Injury', 'Num_Motorist', 'Num_Non_Motorist', 'VRU',
       'Pedestrian', 'Crash_Severity', 'CrashSeverity_Fatal',
       'CrashSeverity_Incap', 'CrashSeverity_NonIncap',
       'CrashSeverity_Possible', 'CrashSeverity_NoInj'],
      dtype='object')

## Vehicle Table

In [115]:
imars_vehicle = imars_vehicle.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_vehicle_slim = imars_vehicle[['INCID_NO']] # subset needed columns
imars_vehicle_slim['5_mph']= np.where(imars_vehicle['Posted_Speed']=='01. 5 mph', 1,0)
imars_vehicle_slim['10_mph']= np.where(imars_vehicle['Posted_Speed']=='02. 10 mph', 1,0)
imars_vehicle_slim['15_mph']= np.where(imars_vehicle['Posted_Speed']=='03. 15 mph', 1,0)
imars_vehicle_slim['20_mph']= np.where(imars_vehicle['Posted_Speed']=='04. 20 mph', 1,0)
imars_vehicle_slim['25_mph']= np.where(imars_vehicle['Posted_Speed']=='05. 25 mph', 1,0)
imars_vehicle_slim['30_mph']= np.where(imars_vehicle['Posted_Speed']=='06. 30 mph', 1,0)
imars_vehicle_slim['35_mph']= np.where(imars_vehicle['Posted_Speed']=='07. 35 mph', 1,0)
imars_vehicle_slim['40_mph']= np.where(imars_vehicle['Posted_Speed']=='08. 40 mph', 1,0)
imars_vehicle_slim['45_mph']= np.where(imars_vehicle['Posted_Speed']=='09. 45 mph', 1,0)
imars_vehicle_slim['50_mph']= np.where(imars_vehicle['Posted_Speed']=='10. 50 mph', 1,0)
imars_vehicle_slim['55_mph']= np.where(imars_vehicle['Posted_Speed']=='11. 55 mph', 1,0)
imars_vehicle_slim['60_mph']= np.where(imars_vehicle['Posted_Speed']=='12. 60 mph', 1,0)
imars_vehicle_slim['65_mph']= np.where(imars_vehicle['Posted_Speed']=='13. 65 mph', 1,0)
imars_vehicle_slim['70_mph']= np.where(imars_vehicle['Posted_Speed']=='14. 70 mph', 1,0)
imars_vehicle_slim['75_mph']= np.where(imars_vehicle['Posted_Speed']=='15. 75 mph', 1,0)
imars_vehicle_slim['no_posted_speed']= np.where(imars_vehicle['Posted_Speed']=='98. Not posted', 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['5_mph']= np.where(imars_vehicle['Posted_Speed']=='01. 5 mph', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['10_mph']= np.where(imars_vehicle['Posted_Speed']=='02. 10 mph', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicle_slim['15_mph']=

In [116]:
imars_vehicle_slim.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed'],
      dtype='object')

In [117]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# only one speed limit should exist per crash to not double count data attribute ("first" function)
imars_vehicle_slim_agg = imars_vehicle_slim.groupby(by=['INCID_NO']).first().reset_index()
imars_vehicle_slim_agg.shape

(15302, 17)

In [118]:
imars_vehicle['Vehicle_Type'].value_counts()

01. Passenger car (sedan, minivan, etc.)         8537
02. Sport utility vehicle                        4259
05. Pickup truck                                 2557
10. Motorcycle > 150cc                            873
06. Motor home                                    649
99. Unknown                                       641
18. Medium/heavy trucks (10,000 lbs. or more)     343
17. Other light trucks (10,000 lbs. or less)      281
03. Passenger van (9-15 passengers)               223
21. Other                                         207
08. Transit/shuttle bus                           197
04. Cargo van (10,000 lbs. or less)               162
19. Maintenance/construction vehicle              117
09. Motor coach                                   107
16. Snowmobile                                     43
15. ATV                                            33
07. School bus                                     31
11. Motorcycle < 150cc (trail bike)                21
12. Moped/minibike/scooter/s

In [109]:
imars_vehicle['Vehicle_Maneuver_Action_Prior_to_Crash'].value_counts()
# what to do with this (if anything)???

01. Straight ahead                    8268
99. Unknown                           1638
11. Negotiating a curve               1463
12. Parked                            1423
02. Backing                           1176
13. Stopped in traffic                1008
06. Turning left                       740
10. Slowing                            643
05. Turning right                      446
16. Other                              301
03. Changing lanes                     286
09. Entering a traffic lane            286
04. Overtaking/passing                 267
17. Parking maneuver                   259
08. Leaving a traffic lane/parking     179
07. Make U-turn                        147
14. Driverless motor vehicle            23
15. Roadway maintenance                 16
Name: Vehicle_Maneuver_Action_Prior_to_Crash, dtype: int64

In [119]:
imars_vehicle['NUM_VEH'] = 1 # one vehicle per entry
imars_vehicletypes = imars_vehicle[['INCID_NO','NUM_VEH']] # subset needed columns

In [120]:
# set flags for each vehicle type
imars_vehicletypes['Car'] = np.where(imars_vehicle['Vehicle_Type']=='01. Passenger car (sedan, minivan, etc.)', 1,0)
imars_vehicletypes['SUV'] = np.where(imars_vehicle['Vehicle_Type']=='02. Sport utility vehicle', 1,0)
imars_vehicletypes['Van'] = np.where(imars_vehicle['Vehicle_Type'].str.contains(" van")==True, 1,0) # space necessary to prevent inclusion of "minivan"
imars_vehicletypes['Truck'] = np.where(imars_vehicle['Vehicle_Type'].str.contains("truck")==True, 1,0)
imars_vehicletypes['Bus'] = np.where(imars_vehicle['Vehicle_Type'].isin(['07. School bus','08. Transit/shuttle bus','09. Motor coach'])==True, 1,0)
imars_vehicletypes['Motorcycle'] = np.where(imars_vehicle['Vehicle_Type'].str.contains("Motorcycle")==True, 1,0)
imars_vehicletypes['RV'] = np.where(imars_vehicle['Vehicle_Type']=='06. Motor home', 1,0)

# does Wayne approves of these bins?
# does Wayne want v1 vs. v2 distinction for analysis?

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicletypes['Car'] = np.where(imars_vehicle['Vehicle_Type']=='01. Passenger car (sedan, minivan, etc.)', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_vehicletypes['SUV'] = np.where(imars_vehicle['Vehicle_Type']=='02. Sport utility vehicle', 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [121]:
imars_vehicletypes.columns

Index(['INCID_NO', 'NUM_VEH', 'Car', 'SUV', 'Van', 'Truck', 'Bus',
       'Motorcycle', 'RV'],
      dtype='object')

In [122]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# vehicle types and number of people involved should be summed as total numbers per crash ("sum" function)
imars_vehicletypes_agg = imars_vehicletypes.groupby(by=['INCID_NO']).sum().reset_index()
imars_vehicletypes_agg.shape

(15302, 9)

In [123]:
# join aggregated vehicle data into single table
imars_slim_vehicle = imars_vehicle_slim_agg.merge(imars_vehicletypes_agg, how='right', on='INCID_NO')
imars_slim_vehicle = imars_slim_vehicle.drop_duplicates()
imars_slim_vehicle.shape

(15302, 25)

In [124]:
imars_slim_vehicle.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'NUM_VEH', 'Car', 'SUV', 'Van',
       'Truck', 'Bus', 'Motorcycle', 'RV'],
      dtype='object')

## Crash Details Table

In [145]:
imars_crash_details = imars_crash_details.rename(columns={'IMARS_Record_No':'INCID_NO'}) # rename to match CDS
imars_crash_details_slim = imars_crash_details[['INCID_NO','First_Harmful_Event_Type', 'First_Harmful_Event','Location_of_First_Harmful_Event']] # subset needed columns

In [150]:
imars_crash_details['First_Harmful_Event_Type'].value_counts()
# do we want to use this or first harmful event in analysis?

Collision with person, MV or non-fixed object    7340
Collision with fixed object                      4238
Non-collision                                    1965
Collision with animals                           1486
Unknown                                           492
Name: First_Harmful_Event_Type, dtype: int64

In [147]:
imars_crash_details['Location_of_First_Harmful_Event'].value_counts()
# do we want to use this in analysis?
# if so, do we care about all of these attributes or can we bin them?

01. On roadway               8199
02. Shoulder                 1978
07. Parking lot              1897
09. Off roadway              1387
04. Roadside                  739
99. Unknown                   493
08. Pull-off parking area     315
13. Other                     239
03. Median                    106
12. Bridge                     59
11. Tunnel                     50
06. Separator                  24
10. Outside right-of-way       14
05. Gore                        4
Name: Location_of_First_Harmful_Event, dtype: int64

In [149]:
pd.options.display.max_rows = 10000000
imars_crash_details['First_Harmful_Event'].value_counts()
# do we want to use this or first harmful event type in analysis?
# if we want to use this, do we care about all of these attributes or can we bin them?

21. Motor vehicle in transport                                           5101
22. Parked motor vehicle                                                 1575
52. Tree/shrub                                                           1309
28. Deer                                                                  986
07. Ran off the road right                                                820
99. Unknown                                                               488
60. Rock, boulder, rock slide                                             365
25. Other non-fixed object                                                363
66. Other fixed object                                                    341
46. Earth embankment/berm                                                 337
16. Other non-collision                                                   292
01. Overturn/rollover                                                     285
45. Ditch                                                       

In [151]:
# set flags for animal related data attributes
imars_crash_details_slim['Collision w Animal']= np.where(imars_crash_details['First_Harmful_Event_Type']=="Collision with animals", 1,0)
imars_crash_details_slim['Avoiding Animal']= np.where(imars_crash_details['First_Harmful_Event']=="13. Avoiding an animal on road", 1,0)
imars_crash_details_slim['Animal in Roadway']= np.where(imars_crash_details['Environmental_Contributing_Circumstances'].str.contains("Animal")==True, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Collision w Animal']= np.where(imars_crash_details['First_Harmful_Event_Type']=="Collision with animals", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Avoiding Animal']= np.where(imars_crash_details['First_Harmful_Event']=="13. Avoiding an animal on road", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [153]:
# set animal indicator
imars_crash_details_slim['Involving Animal'] = 0 # iniitalize column with dummy data to replace with for loop
length = range(imars_crash_details_slim.shape[0]) # search through all rows in aggregated dataset
for i in length:
    if imars_crash_details_slim['Collision w Animal'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif imars_crash_details_slim['Avoiding Animal'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif imars_crash_details_slim['Animal in Roadway'].iloc[i] == 1:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 1
    else:
        imars_crash_details_slim['Involving Animal'].iloc[i] = 0
imars_crash_details_slim['Involving Animal'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'] = 0 # iniitalize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'].iloc[i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imars_crash_details_slim['Involving Animal'].iloc[i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docum

0    14529
1     1774
Name: Involving Animal, dtype: int64

In [154]:
# rerun all code
# continue with weather and lighting indicators

16303