In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
myworkingdirectory = r"C:\Users\Sophie.Kaye\Desktop\NPS Crash Data"
os.chdir(myworkingdirectory)

In [3]:
cds_crash = pd.read_excel('./CDS/Legacy_CDS_v2.xlsx') # updated with more lat/longs from Bryce
cds_crash.shape

(204690, 57)

In [4]:
cds_crash.columns

Index(['OBJECTID_1', 'OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA',
       'STATE_CODE', 'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME',
       'NODE_DIST_FT', 'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT',
       'WEATHER', 'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL',
       'OBJ_STRUCK', 'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3',
       'CON_FACT4', 'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS',
       'INJURED', 'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED',
       'CRASH_YEAR', 'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN',
       'USPP_NPS_VEH_INV', 'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER',
       'DATA_SRC', 'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE',
       'FILE_NAME', 'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE',
       'SPTL_LOC'],
      dtype='object')

In [5]:
cds_crash.loc[cds_crash['PARK_ALPHA'].isnull()==True].shape[0]
# 3 crashes without park names identified (to be dropped during data cleaning)

3

In [6]:
pd.options.display.max_rows = 10000000
cds_crash.PARK_ALPHA.value_counts().sort_index()
# note 778 crashes with park name "ZZZZ", which is CDS code for unknown

ABLI        7
ACAD     1419
AGFO        1
ALPO       20
AMIS       94
ANJO        4
ANTI      122
APCO        1
APIS        1
ARCH      306
ARPO        1
ASIS      155
BADL      213
BAND       70
BAWA    28179
BIBE      914
BICA       30
BICY       50
BISC        4
BISO      270
BITH       55
BLCA       42
BLRI     7853
BLUE        2
BOST       58
BOWA       12
BRCA      577
BUFF      110
CABR       16
CACH        6
CACL        1
CACO      416
CAGR        6
CAHA      370
CALO       52
CANA       44
CANY      130
CARE       34
CARL        7
CASA        1
CATO      120
CAVE      119
CAVO        7
CEBR        6
CHAT      226
CHCH      780
CHCU        4
CHIC      402
CHIR       18
CHIS        2
CHOH      420
CIRO        1
CLBA       12
CODA       41
COLM      395
COLO     1661
CONG       10
CORO       10
CRLA      504
CUGA     1019
CUIS        2
CURE       84
CUVA      247
DENA      225
DEPO        1
DETO       17
DEVA      837
DEWA     5394
DINO       53
EDIS        1
EISE       19
ELIS  

In [7]:
# for any of the 778 crashes with unknown park names, can park names be assigned geospatially using lat/long like in IMARS?
cds_unknown_park = cds_crash[cds_crash['PARK_ALPHA']=="ZZZZ"]
cds_unknown_park[cds_unknown_park['LATITUDE'].isnull()==False].shape[0] # nope!

0

In [8]:
cds_vehicle = pd.read_excel('./CDS/ALL_UNIT.xlsx')

In [9]:
cds_vehicle.loc[cds_vehicle['INCID_NO'].isnull()==True].shape[0]
# no vehicle reports with missing record numbers

0

In [10]:
cds_passenger = pd.read_excel('./CDS/ALL_PASSENGER.xlsx')

In [11]:
cds_passenger.loc[cds_passenger['INCID_NO'].isnull()==True].shape[0]
# no person reports with missing record numbers

0

# Data Cleaning

## Remove crashes missing record numbers

In [12]:
cds_crash = cds_crash.dropna(subset=['INCID_NO']) 
# not needed for passenger or vehicle tables as shown in tests above

## Drop duplicates

In [13]:
cds_crash = cds_crash.drop_duplicates(subset=['INCID_NO']) 
# Cannot remove duplicates from vehicle and passenger tables because they are unique records for each person/car involved in crash

## Revert flipped lat/long

In [14]:
need_revert = cds_crash['LATITUDE'].abs() > 70
cds_crash.loc[need_revert, ['LATITUDE', 'LONGITUDE']] = (cds_crash.loc[need_revert, ['LONGITUDE', 'LATITUDE']].values)

## Adjust signs to correct hemisphere

In [15]:
cds_crash.loc[cds_crash.LATITUDE < 0,"LATITUDE"] = cds_crash['LATITUDE']*(-1)
cds_crash.loc[cds_crash.LONGITUDE > 0,"LONGITUDE"] = cds_crash['LONGITUDE']*(-1)

## Create new region column

In [16]:
park_info = pd.read_csv("./Reference Data/Park_Info_Table.csv")

In [17]:
park_info.columns

Index(['OBJECTID', 'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'REGION', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [18]:
park_info = park_info.rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})
cds_crash = cds_crash.rename(columns={'PARK_ALPHA' : 'Park'})

In [19]:
# add RGN column from lookup table to CDS crash database, joining the two datasets based on park name
cds_crash = pd.merge(cds_crash, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
cds_crash = cds_crash.drop_duplicates(subset=['INCID_NO']) 
cds_crash.shape

(204687, 58)

In [20]:
cds_crash.loc[cds_crash['RGN'].isnull()==True].shape[0]
# all crashes were assigned a region

0

In [21]:
cds_crash.RGN.value_counts().sort_index()

AKR          237
IMR        33076
MWR         2435
NCR        97838
NER        18642
PWR        24419
SER        27262
Unknown      778
Name: RGN, dtype: int64

In [22]:
# drop rows without region
cds_crash = cds_crash[cds_crash["RGN"].str.contains("Unknown")==False]
cds_crash.RGN.value_counts().sort_index()

AKR      237
IMR    33076
MWR     2435
NCR    97838
NER    18642
PWR    24419
SER    27262
Name: RGN, dtype: int64

In [23]:
cds_crash.to_csv("CDS_CrashTable_RegionAdded_Clean.csv", index=False)

# Filter for Necessary Fields, Group by INCID_NO

## Crash Table

In [24]:
cds_crash['CRASH_TIME'].value_counts().sort_index()
# order if statements in for loop from most to least common to increase speed

0.0        572
1.0        205
2.0         41
3.0         39
4.0         35
5.0        121
6.0         38
7.0         40
8.0         64
9.0         47
10.0       127
11.0        43
12.0        33
13.0        54
14.0        50
15.0       202
16.0        37
17.0        22
18.0        29
19.0        33
20.0        94
21.0        35
22.0        47
23.0        45
24.0        23
25.0        47
26.0        43
27.0        34
28.0        35
29.0        30
30.0       379
31.0        50
32.0        33
33.0        37
34.0        27
35.0        54
36.0        36
37.0        40
38.0        30
39.0        23
40.0        58
41.0        29
42.0        25
43.0        23
44.0        26
45.0       132
46.0        36
47.0        30
48.0        28
49.0        33
50.0        64
51.0        21
52.0        18
53.0        26
54.0        31
55.0        68
56.0        30
57.0        35
58.0        35
59.0        30
65.0         1
68.0         1
80.0         1
85.0         2
93.0         1
99.0         6
100.0     

In [25]:
cds_crash['CRASH_DATE'].head()

0    2007-04-25 00:00:00.0000000
1    2007-08-04 00:00:00.0000000
2    2009-11-17 00:00:00.0000000
3    2012-10-09 00:00:00.0000000
4    2014-06-10 00:00:00.0000000
Name: CRASH_DATE, dtype: object

In [26]:
cds_crash['CRASH_TIME'].head()

0     750.0
1    1755.0
2    1709.0
3    1100.0
4    1635.0
Name: CRASH_TIME, dtype: float64

In [27]:
# reformat crash date and time; separate into individual columns
cds_crash = cds_crash.rename(columns={'CRASH_YEAR':'YEAR'})
cds_crash['CRASH_DATE'] = pd.to_datetime(cds_crash['CRASH_DATE'], format='%Y%m%d %H:%M:%S.%f')
cds_crash['MONTH'] = cds_crash['CRASH_DATE'].dt.strftime('%m')
cds_crash['DOW'] = cds_crash['CRASH_DATE'].dt.strftime('%w')
cds_crash['HOUR'] = 0 # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length:
    if cds_crash['CRASH_TIME'].iloc[i] > 999 and cds_crash['CRASH_TIME'].iloc[i] < 2400:
        cds_crash['HOUR'].iloc[i] = int(str(cds_crash['CRASH_TIME'].iloc[i])[:2]) # parse first two digits from afternoon data in 24-hour timestamp
    elif cds_crash['CRASH_TIME'].iloc[i] > 99 and cds_crash['CRASH_TIME'].iloc[i] < 1000:
        cds_crash['HOUR'].iloc[i] = int(str(cds_crash['CRASH_TIME'].iloc[i])[:1]) # parse first digit from morning data in 24-hour timestamp
    else:
        cds_crash['HOUR'].iloc[i] = '0'
cds_crash_slim = cds_crash[['INCID_NO','LATITUDE','LONGITUDE','Park','RGN','YEAR','MONTH','DOW','HOUR']]
cds_crash_slim.head() # looks good!

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,INCID_NO,LATITUDE,LONGITUDE,Park,RGN,YEAR,MONTH,DOW,HOUR
0,ABLI070425075000,,,ABLI,SER,2007.0,4,3,7
1,ABLI070804175500,,,ABLI,SER,2007.0,8,6,17
2,ABLI091117170900,,,ABLI,SER,2009.0,11,2,17
3,ABLI121009110000,,,ABLI,SER,2012.0,10,2,11
4,ABLI140610163500,38.91205,-76.93412,ABLI,SER,2014.0,6,2,16


In [41]:
cds_crash_slim['HOUR'].value_counts() # to be sure that range (0-23) matches that of IMARS

15    15954
16    15490
17    14573
14    14425
13    13172
18    12425
12    12318
11    11369
10    10381
8     10132
9      9859
19     9253
7      8337
20     6979
21     6272
22     5548
6      5399
23     4742
0      3907
1      3050
2      2993
5      2889
3      2505
4      1937
Name: HOUR, dtype: int64

In [29]:
cds_crash_slim.shape

(203909, 9)

In [30]:
cds_crash_slim.to_csv('./CDS_crash_slim.csv',index=False)

## Crash Details Table

In [70]:
cds_crash_details_slim = cds_crash[['INCID_NO']]

In [71]:
cds_crash['CRASH_CLASS'].value_counts()

Collision with Other Motor Vehicle     101521
Collision with Fixed Object             47715
Collision with Animal                   18927
Non-collision                           14132
Collision with Parked Motor Vehicle      8133
Collision with Other Object              5383
Unknown                                  2849
Collision with Pedalcycle (Bicycle)      1493
Collision with Pedestrian                1428
Blank On Form                             822
Non-Collision(Overturn)                    34
Collision with Railway Train               32
Name: CRASH_CLASS, dtype: int64

In [72]:
cds_crash['OBJ_STRUCK'].value_counts()

Not Applicable        142603
Tree/Shrub             16695
Other Fixed Object      7979
Guardrail/Barrier       6782
Rock/Stone Wall         5702
Unknown                 4595
Ditch                   3124
Pole                    2605
Blank On Form           2598
Sign                    2513
Backslope               1968
Boulder                 1844
Bridge Structure        1413
Barricade               1075
Drainage Structure       563
Culvert End Wall         436
Embankment                19
Name: OBJ_STRUCK, dtype: int64

In [73]:
# change strings to match IMARS (in order of from most to least common to increase speed)
cds_crash_details_slim['First_Harmful_Event'] = "0" # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length:
    if cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Other Motor Vehicle': # 101,521
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '21. Motor vehicle in transport' 
    elif cds_crash['CRASH_CLASS'].iloc[i] in (['Collision with Fixed Object', 'Collision with Other Object']): # 53,098
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '66. Other fixed object'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Animal': # 18,927
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '36. Other wild animal (crocodile, birds, coyote, etc.)'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Tree/Shrub': # 16,695
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '52. Tree/shrub'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Non-collision': # 14,132
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '16. Other non-collision'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Parked Motor Vehicle': # 8,133
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '22. Parked motor vehicle'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Other Fixed Object': # 7,979
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '66. Other fixed object'
    elif cds_crash['OBJ_STRUCK'].iloc[i] in (['Unknown', 'Blank On Form']): #7,193
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Rock/Stone Wall': # 5,702
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '50. Wall'
    elif cds_crash['CRASH_CLASS'].iloc[i] in (["Unknown", "Blank On Form"]): # 3,671
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Ditch': # 3,124
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '45. Ditch'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Pole': # 2,605
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '53. Utility pole/light support'
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Boulder': # 1,844
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '60. Rock, boulder, rock slide'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Pedalcycle (Bicycle)': # 1,493
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '18. Bicycle'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Pedestrian': # 1,428
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '17. Pedestrian' 
    elif cds_crash['OBJ_STRUCK'].iloc[i] in (['Culvert End Wall','Drainage Structure']): # 999
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '43. End of drainage pipe/structure/culvert'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Non-Collision(Overturn)': # 34
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '01. Overturn/rollover'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Railway Train': # 32
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '20. Railway vehicle' 
    elif cds_crash['OBJ_STRUCK'].iloc[i]=='Embankment': # 19
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '46. Earth embankment/berm'  
    else:
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'

cds_crash_details_slim['First_Harmful_Event'].value_counts() # see email sent to Wayne on 6/28 at 4:14PM regarding ambiguity of whether CRASH_CLASS or OBJ_STRUCK prevails

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['First_Harmful_Event'] = "0" # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


21. Motor vehicle in transport                            101521
66. Other fixed object                                     53194
36. Other wild animal (crocodile, birds, coyote, etc.)     18927
16. Other non-collision                                    13870
22. Parked motor vehicle                                    8117
99. Unknown                                                 4943
18. Bicycle                                                 1479
17. Pedestrian                                              1369
52. Tree/shrub                                               345
50. Wall                                                      48
01. Overturn/rollover                                         29
20. Railway vehicle                                           28
53. Utility pole/light support                                14
60. Rock, boulder, rock slide                                 11
45. Ditch                                                     10
43. End of drainage pipe/

In [94]:
# change strings to match IMARS (in order of from most to least common to increase speed)
cds_crash_details_slim['First_Harmful_Event'] = "0" # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length:
    if cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Fixed Object': # 47,715
        if cds_crash['OBJ_STRUCK'].iloc[i]=='Tree/Shrub': # 16,695
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '52. Tree/shrub'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Other Fixed Object': # 7,979
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '66. Other fixed object'
        elif cds_crash['OBJ_STRUCK'].iloc[i] in (['Unknown', 'Blank On Form']): #7,193
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Rock/Stone Wall': # 5,702
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '50. Wall'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Ditch': # 3,124
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '45. Ditch'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Pole': # 2,605
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '53. Utility pole/light support'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Boulder': # 1,844
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '60. Rock, boulder, rock slide'
        elif cds_crash['OBJ_STRUCK'].iloc[i] in (['Culvert End Wall','Drainage Structure']): # 999
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '43. End of drainage pipe/structure/culvert'
        elif cds_crash['OBJ_STRUCK'].iloc[i]=='Embankment': # 19
            cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '46. Earth embankment/berm'      
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Other Motor Vehicle': # 101,521
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '21. Motor vehicle in transport' 
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Other Object': # 5,383
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '66. Other fixed object'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Animal': # 18,927
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '36. Other wild animal (crocodile, birds, coyote, etc.)'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Non-collision': # 14,132
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '16. Other non-collision'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Parked Motor Vehicle': # 8,133
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '22. Parked motor vehicle'
    elif cds_crash['CRASH_CLASS'].iloc[i] in (["Unknown", "Blank On Form"]): # 3,671
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Pedalcycle (Bicycle)': # 1,493
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '18. Bicycle'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Pedestrian': # 1,428
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '17. Pedestrian' 
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Non-Collision(Overturn)': # 34
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '01. Overturn/rollover'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Railway Train': # 32
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '20. Railway vehicle'  
    else:
        cds_crash_details_slim['First_Harmful_Event'].iloc[i] = '99. Unknown'

cds_crash_details_slim['First_Harmful_Event'].value_counts()

21. Motor vehicle in transport                            101521
36. Other wild animal (crocodile, birds, coyote, etc.)     18927
52. Tree/shrub                                             15497
16. Other non-collision                                    14132
0                                                          12951
66. Other fixed object                                     12121
22. Parked motor vehicle                                    8133
99. Unknown                                                 5129
50. Wall                                                    5024
45. Ditch                                                   2538
53. Utility pole/light support                              2383
60. Rock, boulder, rock slide                               1707
18. Bicycle                                                 1493
17. Pedestrian                                              1428
43. End of drainage pipe/structure/culvert                   845
01. Overturn/rollover    

In [96]:
# check sums of OBJ STRUCK codes only for crashes where CRASH_CLASS==2 looks good!
fixed_obj = cds_crash.loc[cds_crash['CRASH_CLASS']=='Collision with Fixed Object']
fixed_obj['OBJ_STRUCK'].value_counts()

Tree/Shrub            15497
Other Fixed Object     6738
Guardrail/Barrier      5984
Rock/Stone Wall        5024
Ditch                  2538
Pole                   2383
Sign                   2314
Backslope              1787
Boulder                1707
Bridge Structure       1188
Barricade              1010
Not Applicable          667
Drainage Structure      461
Culvert End Wall        384
Embankment               14
Blank On Form            10
Unknown                   8
Name: OBJ_STRUCK, dtype: int64

In [103]:
fixed_obj.loc[fixed_obj['OBJ_STRUCK'].isnull()==True].shape[0]

1

In [74]:
cds_crash.loc[cds_crash['CRASH_CLASS'].isnull()==True].shape[0]

1440

In [100]:
# "66. Other fixed object" attribute combines OBJ STRUCK and CRASH CLASS - looks good! 
6738+5383

12121

In [99]:
# "99. Unknown" attribute combines OBJ STRUCK, CRASH CLASS, and nulls - looks good!
18+3671+1440

5129

In [104]:
# "43. End of drainage pipe/structure/culvert" attribute combines two OBJ STRUCK attributes - looks good!
384+461

845

In [107]:
# "0" in First_Harmful_Event sum correspond to sum of OBJ STRUCK codes in fixed_obj that are not mapped to IMARS First_Harmful_Event in schema
47715-15497-6738-5024-2538-2383-1707-461-384-14-10-8

12951

# TO DO: update First Harmful Event mappings to reflect Wayne's new "CDS to" spreadsheet tab

In [75]:
# change strings to match IMARS (in order of from most to least common to increase speed)
cds_crash_details_slim['First_Harmful_Event_Type'] = "0" # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length:
    if cds_crash['CRASH_CLASS'].iloc[i] in (['Collision with Other Motor Vehicle','Collision with Pedestrian','Collision with Pedalcycle (Bicycle)','Collision with Parked Motor Vehicle','Collision with Railway Train']):
        cds_crash_details_slim['First_Harmful_Event_Type'].iloc[i] = 'Collision with person, MV or non-fixed object'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Fixed Object':
        cds_crash_details_slim['First_Harmful_Event_Type'].iloc[i] = 'Collision with fixed object'
    elif cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Animal':
        cds_crash_details_slim['First_Harmful_Event_Type'].iloc[i] = 'Collision with animals'
    elif cds_crash['CRASH_CLASS'].iloc[i] in (['Non-collision','Non-Collision(Overturn)']):
        cds_crash_details_slim['First_Harmful_Event_Type'].iloc[i] = 'Non-collision'   
    else:
        cds_crash_details_slim['First_Harmful_Event_Type'].iloc[i] = 'Unknown'

cds_crash_details_slim['First_Harmful_Event_Type'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['First_Harmful_Event_Type'] = "0" # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Collision with person, MV or non-fixed object    112607
Collision with fixed object                       47715
Collision with animals                            18927
Non-collision                                     14166
Unknown                                           10494
Name: First_Harmful_Event_Type, dtype: int64

In [76]:
# check First Harmful Event Type sums- looks good!
(101521 + 1428 + 1493 + 8133 + 32), (14132 + 34), (5383 + 2849 + 822 + 1440)

(112607, 14166, 10494)

In [77]:
cds_crash['CON_FACT1'].value_counts()

Driver: Failed to Give Full Time and Attention                  40234
Unknown                                                         21055
Environment: Animal                                             16691
Driver: Other                                                   13400
Driver: Too Fast for Conditions                                 12277
Driver: Followed Too Closely                                     9605
Driver: Improper Backing                                         8165
Driver: Failed to Yield Right of Way                             6449
Environment: Rain, Snow                                          6237
Driver: Disregarded Traffic Signs, Signals, or Road Markings     5558
Driver: Under Influence of Alcohol                               5322
Driver: Exceeded Posted Speed Limits                             4100
Driver: Improper Lane Change                                     3484
Driver: Fell Asleep, Fainted, Etc.                               3198
Driver: Made Imprope

In [78]:
# set animal indicator
cds_crash_details_slim['Involving Animal'] = 0 # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length:
    if cds_crash['CRASH_CLASS'].iloc[i]=='Collision with Animal':
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif cds_crash['CON_FACT1'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif cds_crash['CON_FACT2'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif cds_crash['CON_FACT3'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1 
    elif cds_crash['CON_FACT4'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif cds_crash['CON_FACT5'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    elif cds_crash['CON_FACT6'].iloc[i]=='Environment: Animal' :
        cds_crash_details_slim['Involving Animal'].iloc[i] = 1
    else:
        cds_crash_details_slim['Involving Animal'].iloc[i] = 0
# check logic - looks good!
cds_crash_details_slim['Involving Animal'].sum() # (greater value than either CRASH_CLASS or CON_FACT1, so it must be accounting for both) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Involving Animal'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


22291

In [79]:
cds_crash['LIGHT'].value_counts()

Daylight              54932
Dark - Not Lighted    14015
Dark - Lighted         6429
Dusk                   2757
Dawn                   1764
Unknown                1521
Blank On Form           590
Name: LIGHT, dtype: int64

In [80]:
cds_crash_details_slim['Daylight'] = np.where(cds_crash['LIGHT']=="Daylight", 1,0)
cds_crash_details_slim['Dawn'] = np.where(cds_crash['LIGHT']=="Dawn", 1,0)
cds_crash_details_slim['Dusk'] = np.where(cds_crash['LIGHT']=="Dusk", 1,0)
cds_crash_details_slim['Dark_Lit'] = np.where(cds_crash['LIGHT']=="Dark - Lighted", 1,0)
cds_crash_details_slim['Dark_NotLit'] = np.where(cds_crash['LIGHT']=="Dark - Not Lighted", 1,0)
cds_crash_details_slim['Dark_UnknownLit'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Daylight'] = np.where(cds_crash['LIGHT']=="Daylight", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Dawn'] = np.where(cds_crash['LIGHT']=="Dawn", 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Dusk'] = np.where(cds_c

In [81]:
# group adverse lighting attributes into bin for poor visibility

cds_crash_details_slim['Poor Lighting'] = 0 # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length: # in order of most to least common to speed up processing
    if cds_crash_details_slim['Dark_NotLit'].iloc[i] == 1:
        cds_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif cds_crash_details_slim['Dark_Lit'].iloc[i] == 1:
        cds_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif cds_crash_details_slim['Dusk'].iloc[i] == 1:
        cds_crash_details_slim['Poor Lighting'].iloc[i] = 1
    elif cds_crash_details_slim['Dawn'].iloc[i] == 1:
        cds_crash_details_slim['Poor Lighting'].iloc[i] = 1
    else:
        cds_crash_details_slim['Poor Lighting'].iloc[i] = 0
# check logic - looks good!
cds_crash_details_slim['Poor Lighting'].sum(), (14015+6429+2757+1764)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Poor Lighting'] = 0 # initialize column with dummy data to replace with for loop


(24965, 24965)

In [82]:
cds_crash['WEATHER'].value_counts()

Clear                             133196
Cloudy                             31059
Rain                               22786
Snow                                5753
Unknown                             4599
Fog, Smog, Smoke                    1897
Sleet, Hail, Freezing Rain          1421
Other                                923
Blank On Form                        571
Severe Crosswinds                    366
Blowing Sand, Soil, Dirt, Etc.       192
Name: WEATHER, dtype: int64

In [83]:
cds_crash['SURF_COND'].value_counts()

Dry              146048
Wet               33773
Icy or Slushy      8639
Unknown            5077
Snowy              5028
Other              1786
Debris             1102
Muddy               666
Blank On Form       627
Name: SURF_COND, dtype: int64

In [84]:
# set flags for adverse visibility conditions due to precip
cds_crash_details_slim['Bad Weather'] = np.where(cds_crash['WEATHER'].isin(['Rain','Snow','Fog, Smog, Smoke','Sleet, Hail, Freezing Rain','Blowing Sand, Soil, Dirt, Etc.'])==True, 1,0)
cds_crash_details_slim['Bad Road Condition'] = np.where(cds_crash['SURF_COND'].isin(['Wet','Icy or Slushy','Snowy'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 1'] = np.where(cds_crash['CON_FACT1'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 2'] = np.where(cds_crash['CON_FACT2'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 3'] = np.where(cds_crash['CON_FACT3'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 4'] = np.where(cds_crash['CON_FACT4'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 5'] = np.where(cds_crash['CON_FACT5'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)
cds_crash_details_slim['Bad Circumstance 6'] = np.where(cds_crash['CON_FACT6'].isin(['Road: Wet','Road: Icy or Slushy','Environment: Smog, Smoke','Environment: Sleet, Hail, Freezing Rain','Environment: Blowing Sand, Soil, Dirt','Environment: Rain, Snow'])==True, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Bad Weather'] = np.where(cds_crash['WEATHER'].isin(['Rain','Snow','Fog, Smog, Smoke','Sleet, Hail, Freezing Rain','Blowing Sand, Soil, Dirt, Etc.'])==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Bad Road Condition'] = np.where(cds_crash['SURF_COND'].isin(['Wet','Icy or Slushy','Snowy'])==True, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documenta

In [85]:
# bad weather sum check - looks good!
(22786+5753+1897+1421+192), cds_crash_details_slim['Bad Weather'].sum()

(32049, 32049)

In [86]:
# bad road condition sum check - looks good!
(33773+8639+5028), cds_crash_details_slim['Bad Road Condition'].sum()

(47440, 47440)

In [87]:
# bad circumstance sum check - looks good!
(6237+2201+1804+106+136+803), cds_crash_details_slim['Bad Circumstance 1'].sum()

(11287, 11287)

In [88]:
# check value to order from greatest to least to speed up following for loop
cds_crash_details_slim['Bad Circumstance 1'].sum(), cds_crash_details_slim['Bad Circumstance 2'].sum(), cds_crash_details_slim['Bad Circumstance 3'].sum(), cds_crash_details_slim['Bad Circumstance 4'].sum(), cds_crash_details_slim['Bad Circumstance 5'].sum(), cds_crash_details_slim['Bad Circumstance 6'].sum()

(11287, 17168, 8592, 2647, 978, 267)

In [89]:
# group adverse weather attributes into bin for poor visibility
cds_crash_details_slim['Adverse Weather'] = 0 # initialize column with dummy data to replace with for loop
length = range(cds_crash.shape[0])
for i in length: # in order of most to least common to speed up processing
    if cds_crash_details_slim['Bad Road Condition'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Weather'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 2'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 1'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 3'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 4'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 5'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    elif cds_crash_details_slim['Bad Circumstance 6'].iloc[i] == 1:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 1
    else:
        cds_crash_details_slim['Adverse Weather'].iloc[i] = 0
# check logic - looks good!
cds_crash_details_slim['Adverse Weather'].sum() # (greater value than either Bad Road Condition, Bad Weather, or Bad Circumstance 1, so it must be accounting for all) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Adverse Weather'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


49828

In [90]:
cds_crash_details_slim.columns

Index(['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Bad Weather',
       'Bad Road Condition', 'Bad Circumstance 1', 'Bad Circumstance 2',
       'Bad Circumstance 3', 'Bad Circumstance 4', 'Bad Circumstance 5',
       'Bad Circumstance 6', 'Adverse Weather'],
      dtype='object')

In [91]:
cds_crash_details_slim = cds_crash_details_slim.drop(columns=['Bad Weather','Bad Road Condition','Bad Circumstance 1','Bad Circumstance 2', 'Bad Circumstance 3','Bad Circumstance 4','Bad Circumstance 5','Bad Circumstance 6'])
cds_crash_details_slim.columns

Index(['INCID_NO', 'First_Harmful_Event', 'First_Harmful_Event_Type',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Adverse Weather'],
      dtype='object')

In [56]:
cds_crash_details_slim.shape

(203909, 11)

In [57]:
cds_crash_details_slim.to_csv('./CDS_crash_details_slim.csv',index=False)

## Passenger Table

In [58]:
# determine number of driver injuries for each crash
cds_vehicle_inj = cds_vehicle[['INCID_NO','PED_TYPE']] # subset needed columns (PED_TYPE for Num_Ped_Deaths later calc)
cds_vehicle_inj['No Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 0.0, 1, 0)
cds_vehicle_inj['Possible Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 1.0, 1, 0)
cds_vehicle_inj['Non-incapacitating Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 2.0, 1, 0)
cds_vehicle_inj['Incapacitating Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 3.0, 1, 0)
cds_vehicle_inj['Fatality'] = np.where(cds_vehicle['DRIVER_INJ'] == 4.0, 1, 0)
# collapse multiple rows for each person involved in the crash into a single row for each crash
#injuries/fatalities and number of people involved should be summed as total numbers per crash ("sum" function)
cds_vehicle_inj_agg = cds_vehicle_inj.groupby(by=['INCID_NO']).sum().reset_index()
cds_vehicle_inj_agg.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_inj['No Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 0.0, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_inj['Possible Injury'] = np.where(cds_vehicle['DRIVER_INJ'] == 1.0, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_inj['Non-incapacitating Inj

(194715, 7)

In [59]:
# determine number of passenger injuries for each crash
cds_passenger_inj = cds_passenger[['INCID_NO']] # subset needed columns
cds_passenger_inj['No Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 0.0, 1, 0)
cds_passenger_inj['Possible Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 1.0, 1, 0)
cds_passenger_inj['Non-incapacitating Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 2.0, 1, 0)
cds_passenger_inj['Incapacitating Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 3.0, 1, 0)
cds_passenger_inj['Fatality Mod'] = np.where(cds_passenger['PASS_INJ'] == 4.0, 1, 0)
# collapse multiple rows for each person involved in the crash into a single row for each crash
#injuries/fatalities and number of people involved should be summed as total numbers per crash ("sum" function)
cds_passenger_inj_agg = cds_passenger_inj.groupby(by=['INCID_NO']).sum().reset_index()
cds_passenger_inj_agg.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_inj['No Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 0.0, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_inj['Possible Injury Mod'] = np.where(cds_passenger['PASS_INJ'] == 1.0, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_inj['Non-inca

(60136, 6)

In [60]:
# join driver and passenger injuries for each crash
merged_injury_agg = cds_vehicle_inj_agg.merge(cds_passenger_inj_agg, how='left', on='INCID_NO') # left join to retain records in larger vehicle_inj_agg table
merged_injury_agg.shape

(194715, 12)

In [61]:
merged_injury_agg.head()

Unnamed: 0,INCID_NO,PED_TYPE,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,No Injury Mod,Possible Injury Mod,Non-incapacitating Injury Mod,Incapacitating Injury Mod,Fatality Mod
0,ABLI070425075000,0.0,1,0,0,0,0,,,,,
1,ABLI070804175500,0.0,1,0,0,0,0,,,,,
2,ABLI091117170900,2.0,2,0,0,0,0,,,,,
3,ABLI121009110000,0.0,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
4,ABLI140610163500,0.0,0,0,0,0,0,,,,,


In [62]:
merged_injury_agg = merged_injury_agg.fillna(0) # required to make subsequent column addition step work (cannot handle NaN)
merged_injury_agg.head()

Unnamed: 0,INCID_NO,PED_TYPE,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,No Injury Mod,Possible Injury Mod,Non-incapacitating Injury Mod,Incapacitating Injury Mod,Fatality Mod
0,ABLI070425075000,0.0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,ABLI070804175500,0.0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,ABLI091117170900,2.0,2,0,0,0,0,0.0,0.0,0.0,0.0,0.0
3,ABLI121009110000,0.0,1,0,0,0,0,1.0,0.0,0.0,0.0,0.0
4,ABLI140610163500,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [63]:
# combine driver and passenger injuries to single value for each crash
cds_combined_injury_agg = merged_injury_agg[['INCID_NO']] # subset needed columns
cds_combined_injury_agg['No Injury'] = merged_injury_agg['No Injury'] + merged_injury_agg['No Injury Mod']
cds_combined_injury_agg['Possible Injury'] = merged_injury_agg['Possible Injury'] + merged_injury_agg['Possible Injury Mod']
cds_combined_injury_agg['Non-incapacitating Injury'] = merged_injury_agg['Non-incapacitating Injury'] + merged_injury_agg['Non-incapacitating Injury Mod']
cds_combined_injury_agg['Incapacitating Injury'] = merged_injury_agg['Incapacitating Injury'] + merged_injury_agg['Incapacitating Injury Mod']
cds_combined_injury_agg['Fatality'] = merged_injury_agg['Fatality'] + merged_injury_agg['Fatality Mod']
# sum of driver and passenger total injuries looks good!!
cds_combined_injury_agg['Fatality'].sum(), cds_vehicle_inj_agg['Fatality'].sum()+cds_passenger_inj_agg['Fatality Mod'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_combined_injury_agg['No Injury'] = merged_injury_agg['No Injury'] + merged_injury_agg['No Injury Mod']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_combined_injury_agg['Possible Injury'] = merged_injury_agg['Possible Injury'] + merged_injury_agg['Possible Injury Mod']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

(1232.0, 1232)

In [64]:
cds_combined_injury_agg['Crash_Severity'] = 0 # initialize column with dummy data to replace with for loop
length = range(cds_combined_injury_agg.shape[0]) # search through all rows in aggregated dataset (to examine all injuries from each crash as a whole)
for i in length: # determine most severe injury outcome
    if cds_combined_injury_agg['Fatality'].iloc[i] > 0:
        cds_combined_injury_agg['Crash_Severity'].iloc[i] = 'Fatal'
    elif cds_combined_injury_agg['Incapacitating Injury'].iloc[i] > 0:
        cds_combined_injury_agg['Crash_Severity'].iloc[i] = 'Incap'
    elif cds_combined_injury_agg['Non-incapacitating Injury'].iloc[i] > 0:
        cds_combined_injury_agg['Crash_Severity'].iloc[i] = 'Non-Incap'
    elif cds_combined_injury_agg['Possible Injury'].iloc[i] > 0:
        cds_combined_injury_agg['Crash_Severity'].iloc[i] = 'Possible'
    else:
        cds_combined_injury_agg['Crash_Severity'].iloc[i] = 'No Inj'

#set flags for each Crash Severity attribute
cds_combined_injury_agg['CrashSeverity_Fatal'] = np.where(cds_combined_injury_agg['Crash_Severity']=='Fatal', 1,0)
cds_combined_injury_agg['CrashSeverity_Incap'] = np.where(cds_combined_injury_agg['Crash_Severity']=='Incap', 1,0)
cds_combined_injury_agg['CrashSeverity_NonIncap'] = np.where(cds_combined_injury_agg['Crash_Severity']=='Non-Incap', 1,0)
cds_combined_injury_agg['CrashSeverity_Possible'] = np.where(cds_combined_injury_agg['Crash_Severity']=='Possible', 1,0)
cds_combined_injury_agg['CrashSeverity_NoInj'] = np.where(cds_combined_injury_agg['Crash_Severity']=='No Inj', 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_combined_injury_agg['Crash_Severity'] = 0 # initialize column with dummy data to replace with for loop
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [65]:
# test crash severity logic - looks good!
test=cds_combined_injury_agg.loc[cds_combined_injury_agg['Fatality']==0] # remove fatal crashes
cds_combined_injury_agg['CrashSeverity_Incap'].sum(), test.loc[test['Incapacitating Injury']>0].shape[0] # number of crashes with incapacitating severity = number of crashes resulting in incapacitating injuries when fatal crashes are removed from dataset

(6348, 6348)

In [66]:
cds_vehicle['PED_TYPE'].value_counts().sort_index()

0.0     301573
1.0       1045
2.0       1039
3.0          6
4.0          6
5.0         44
6.0         40
88.0       122
98.0        31
99.0      1308
Name: PED_TYPE, dtype: int64

In [67]:
cds_vehicle.columns

Index(['OBJECTID', 'INCID_NO', 'UNIT_NO', 'VEH_YEAR', 'MAKE_MOD', 'MODEL',
       'NUM_OCC', 'REG_STATE', 'REG_YEAR', 'PLATE_NUM', 'DIR_TRAVEL',
       'SPEED_LIMIT', 'BODY_TYPE', 'VEH_MANVR', 'VEH_DAMAGE', 'DAM_LOCATION',
       'LIC_NUM', 'LIC_STATE', 'PED', 'BRTH_DATE', 'DRIVER_SEX', 'DRIVER_BELT',
       'DRIVER_EJECT', 'DRIVER_INJ', 'DRIVER_VIOLTN', 'VIOL_CHG1', 'VIOL_CHG2',
       'PED_TYPE', 'PED_LOC', 'PED_ACTN', 'REPAIR'],
      dtype='object')

In [68]:
# determine number of people involved
cds_passenger_slim = cds_vehicle[['INCID_NO']] # subset needed columns
cds_passenger_slim['Num_Motorist'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==False, cds_vehicle['NUM_OCC'], 0)
cds_passenger_slim['Num_Non_Motorist'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==True, 1, 0)
cds_passenger_slim['Num_Pedestrian'] = np.where(cds_vehicle['PED_TYPE']==1.0, 1,0)
# check new non-motorist logic, looks good!
cds_passenger_slim['Num_Non_Motorist'].sum(), (1045+1039+6+6) # combined number of Pedestrians, Pedalcyclists, Riders of Animals, and Riders of Animal-drawn Vehicle

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_slim['Num_Motorist'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==False, cds_vehicle['NUM_OCC'], 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_slim['Num_Non_Motorist'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==True, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexi

(2096, 2096)

In [69]:
# check new motorist logic - looks good!
test = cds_vehicle.loc[cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==True]
cds_passenger_slim['Num_Motorist'].sum(), cds_vehicle['NUM_OCC'].sum()-test['NUM_OCC'].sum() # number of motorists = all people - number of non-motorists 

(538053.0, 538053.0)

In [70]:
cds_passenger_slim['Num_Pedestrian'].sum() # exactly as expected from 'PED_TYPE' value counts

1045

In [71]:
cds_passenger_slim['Num_Ped_Deaths'] = np.where(cds_vehicle_inj['PED_TYPE']==1.0, cds_vehicle_inj['Fatality'], 0)
# test logic, looks good!
test = cds_vehicle_inj.loc[cds_vehicle_inj['PED_TYPE']==1.0]
cds_passenger_slim['Num_Ped_Deaths'].sum(), test['Fatality'].sum() # number of pedestrian deaths = "driver" fatalities in subset of vehicle reports for pedestrians

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_passenger_slim['Num_Ped_Deaths'] = np.where(cds_vehicle_inj['PED_TYPE']==1.0, cds_vehicle_inj['Fatality'], 0)


(45, 45)

In [72]:
# collapse multiple rows for each person involved in the crash into a single row for each crash
# number of people involved should be summed as total numbers per crash ("sum" function)
cds_passenger_slim_agg = cds_passenger_slim.groupby(by=['INCID_NO']).sum().reset_index()
cds_passenger_slim_agg.shape

(194715, 5)

In [73]:
cds_passenger_slim_agg['Num_Pedestrian'].value_counts()

0    193728
1       937
2        45
3         3
4         1
5         1
Name: Num_Pedestrian, dtype: int64

In [74]:
# set binary indicators for VRU and Pedestrian
cds_passenger_slim_agg['Pedestrian'] = np.where(cds_passenger_slim_agg['Num_Pedestrian'] > 0, 1, 0)
cds_passenger_slim_agg['VRU'] = np.where(cds_passenger_slim_agg['Num_Non_Motorist'] > 0, 1, 0)
# test logic - looks good!
cds_passenger_slim_agg['Pedestrian'].sum(), (937+45+3+1+1) # replacing number of pedestrians involved in each crash with 1 for all crashes involving pedestrians

(987, 987)

In [75]:
cds_passenger_slim_agg.shape, cds_combined_injury_agg.shape

((194715, 7), (194715, 12))

In [76]:
cds_passenger_slim_agg.columns

Index(['INCID_NO', 'Num_Motorist', 'Num_Non_Motorist', 'Num_Pedestrian',
       'Num_Ped_Deaths', 'Pedestrian', 'VRU'],
      dtype='object')

In [77]:
cds_combined_injury_agg.columns

Index(['INCID_NO', 'No Injury', 'Possible Injury', 'Non-incapacitating Injury',
       'Incapacitating Injury', 'Fatality', 'Crash_Severity',
       'CrashSeverity_Fatal', 'CrashSeverity_Incap', 'CrashSeverity_NonIncap',
       'CrashSeverity_Possible', 'CrashSeverity_NoInj'],
      dtype='object')

In [78]:
# combine injuries with number of people involved
cds_passenger_slim_agg = cds_passenger_slim_agg.merge(cds_combined_injury_agg, how='left', on='INCID_NO')
cds_passenger_slim_agg.shape

(194715, 18)

In [79]:
cds_passenger_slim_agg.columns

Index(['INCID_NO', 'Num_Motorist', 'Num_Non_Motorist', 'Num_Pedestrian',
       'Num_Ped_Deaths', 'Pedestrian', 'VRU', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Crash_Severity', 'CrashSeverity_Fatal', 'CrashSeverity_Incap',
       'CrashSeverity_NonIncap', 'CrashSeverity_Possible',
       'CrashSeverity_NoInj'],
      dtype='object')

In [80]:
# reorder columns to match IMARS
cds_passenger_slim_agg = cds_passenger_slim_agg[['INCID_NO','No Injury','Possible Injury','Non-incapacitating Injury','Incapacitating Injury','Fatality','Num_Motorist','Num_Non_Motorist','Num_Pedestrian','Num_Ped_Deaths','Pedestrian','VRU','Crash_Severity','CrashSeverity_Fatal','CrashSeverity_Incap','CrashSeverity_NonIncap','CrashSeverity_Possible','CrashSeverity_NoInj']]

In [81]:
cds_passenger_slim_agg.to_csv("./CDS_passenger_slim_agg.csv",index=False)

## Vehicle Table

In [82]:
cds_vehicle['SPEED_LIMIT'].value_counts()

25.0    65332
45.0    47545
35.0    47061
55.0    28737
15.0    26493
99.0    17894
50.0    16816
40.0    15184
30.0    14313
5.0      9385
10.0     4002
20.0     3218
60.0      132
65.0      117
75.0       19
70.0        4
Name: SPEED_LIMIT, dtype: int64

In [83]:
cds_vehicle_slim = cds_vehicle[['INCID_NO']] # subset for needed columns
cds_vehicle_slim['5_mph']= np.where(cds_vehicle['SPEED_LIMIT']==5.0, 1,0)
cds_vehicle_slim['10_mph']= np.where(cds_vehicle['SPEED_LIMIT']==10.0, 1,0)
cds_vehicle_slim['15_mph']= np.where(cds_vehicle['SPEED_LIMIT']==15.0, 1,0)
cds_vehicle_slim['20_mph']= np.where(cds_vehicle['SPEED_LIMIT']==20.0, 1,0)
cds_vehicle_slim['25_mph']= np.where(cds_vehicle['SPEED_LIMIT']==25.0, 1,0)
cds_vehicle_slim['30_mph']= np.where(cds_vehicle['SPEED_LIMIT']==30.0, 1,0)
cds_vehicle_slim['35_mph']= np.where(cds_vehicle['SPEED_LIMIT']==35.0, 1,0)
cds_vehicle_slim['40_mph']= np.where(cds_vehicle['SPEED_LIMIT']==40.0, 1,0)
cds_vehicle_slim['45_mph']= np.where(cds_vehicle['SPEED_LIMIT']==45.0, 1,0)
cds_vehicle_slim['50_mph']= np.where(cds_vehicle['SPEED_LIMIT']==50.0, 1,0)
cds_vehicle_slim['55_mph']= np.where(cds_vehicle['SPEED_LIMIT']==55.0, 1,0)
cds_vehicle_slim['60_mph']= np.where(cds_vehicle['SPEED_LIMIT']==60.0, 1,0)
cds_vehicle_slim['65_mph']= np.where(cds_vehicle['SPEED_LIMIT']==65.0, 1,0)
cds_vehicle_slim['70_mph']= np.where(cds_vehicle['SPEED_LIMIT']==70.0, 1,0)
cds_vehicle_slim['75_mph']= np.where(cds_vehicle['SPEED_LIMIT']==75.0, 1,0)
cds_vehicle_slim['no_posted_speed']= 0
cds_vehicle_slim['25_mph'].sum() # exactly as expected from speed limit value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['5_mph']= np.where(cds_vehicle['SPEED_LIMIT']==5.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['10_mph']= np.where(cds_vehicle['SPEED_LIMIT']==10.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['15_mph']= np.where(cds_vehicle['SPEED

65332

In [84]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# only one speed limit should exist per crash to not double count data attribute ("first" function)
cds_vehicle_slim_agg = cds_vehicle_slim.groupby(by=['INCID_NO']).first().reset_index()
cds_vehicle_slim_agg.shape

(194715, 17)

In [85]:
cds_vehicletypes = cds_vehicle[['INCID_NO']] # subset needed columns

In [86]:
cds_vehicletypes['NUM_VEH'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==False, 1, 0) # number of entries in vehicle table, excluding VRUs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicletypes['NUM_VEH'] = np.where(cds_vehicle['PED_TYPE'].isin([1.0, 2.0, 3.0, 4.0])==False, 1, 0) # number of entries in vehicle table, excluding VRUs


In [87]:
cds_vehicle['BODY_TYPE'].value_counts()

1.0     199904
6.0      31569
3.0      19692
7.0       9761
0.0       7729
4.0       7525
12.0      6409
88.0      5551
15.0      3482
10.0      2732
8.0       2181
99.0      1911
2.0       1721
9.0       1309
11.0       973
13.0       943
16.0       872
5.0        339
17.0       215
14.0       199
30.0        69
18.0        50
50.0        36
98.0        19
Name: BODY_TYPE, dtype: int64

In [88]:
# set flags for each vehicle type

#cds_vehicletypes['Car'] = np.where(cds_vehicle['BODY_TYPE']==.isin([1.0, 2.0, 16.0])==True, 1,0)
#cds_vehicletypes['SUV'] = np.where(cds_vehicle['BODY_TYPE']==15.0, 1,0)
#cds_vehicletypes['Van'] = np.where(cds_vehicle['BODY_TYPE']==3.0, 1,0)
#cds_vehicletypes['Truck'] = np.where(cds_vehicle['BODY_TYPE'].isin([6.0, 7.0, 8.0, 17.0])==True, 1,0)
#cds_vehicletypes['Bus'] = np.where(cds_vehicle['BODY_TYPE'].isin([10.0, 11.0])==True, 1,0)
cds_vehicletypes['Num_Motorcycle'] = np.where(cds_vehicle['BODY_TYPE']==4.0, 1,0)
#cds_vehicletypes['RV'] = np.where(cds_vehicle['BODY_TYPE']==12.0, 1,0)
cds_vehicletypes['Num_Motorcycle'].sum() # exactly as expected from body type value counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicletypes['Num_Motorcycle'] = np.where(cds_vehicle['BODY_TYPE']==4.0, 1,0)


7525

In [89]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# vehicle types and number of people involved should be summed as total numbers per crash ("sum" function)
cds_vehicletypes_agg = cds_vehicletypes.groupby(by=['INCID_NO']).sum().reset_index()
cds_vehicletypes_agg.shape

(194715, 3)

In [90]:
cds_vehicletypes_agg['Num_Motorcycle'].value_counts()

0    187574
1      6784
2       334
3        19
4         4
Name: Num_Motorcycle, dtype: int64

In [91]:
# set indicator for crashes involving motorcycles
cds_vehicletypes_agg['Motorcycle_Ind'] = np.where(cds_vehicletypes_agg['Num_Motorcycle'] > 0, 1,0)
cds_vehicletypes_agg['Motorcycle_Ind'].sum(), (6784+334+19+4) # looks good (replacing number of motorcyclists involved in each crash with 1 for all crashes involving motorcyclists)

(7141, 7141)

In [92]:
cds_vehicletypes_agg.columns

Index(['INCID_NO', 'NUM_VEH', 'Num_Motorcycle', 'Motorcycle_Ind'], dtype='object')

In [93]:
# join aggregated ('sum' and 'first') vehicle data into single table
cds_slim_vehicle_agg = cds_vehicle_slim_agg.merge(cds_vehicletypes_agg, how='right', on='INCID_NO')
cds_slim_vehicle_agg = cds_slim_vehicle_agg.drop_duplicates()
cds_slim_vehicle_agg.shape

(194715, 20)

In [94]:
cds_slim_vehicle_agg.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'NUM_VEH', 'Num_Motorcycle',
       'Motorcycle_Ind'],
      dtype='object')

In [95]:
cds_slim_vehicle_agg.to_csv("./CDS_slim_vehicle_agg.csv",index=False)

# Merge Slim Tables

In [96]:
cds_crash_slim.shape, cds_crash_details_slim.shape, cds_passenger_slim_agg.shape, cds_slim_vehicle_agg.shape

((203909, 9), (203909, 11), (194715, 18), (194715, 20))

In [97]:
cds_crash_details_slim_merged = cds_crash_details_slim.merge(cds_crash_slim, how='right', on='INCID_NO')
cds_crash_details_slim_merged.shape

(203909, 19)

In [98]:
cds_crash_details_slim_merged.columns

Index(['INCID_NO', 'First_Harmful_Event', 'Involving Animal', 'Daylight',
       'Dawn', 'Dusk', 'Dark_Lit', 'Dark_NotLit', 'Dark_UnknownLit',
       'Poor Lighting', 'Adverse Weather', 'LATITUDE', 'LONGITUDE', 'Park',
       'RGN', 'YEAR', 'MONTH', 'DOW', 'HOUR'],
      dtype='object')

In [99]:
cds_crash_details_and_passenger_slim = cds_passenger_slim_agg.merge(cds_crash_details_slim_merged, how='left', on='INCID_NO') # left join to drop crashes w/o passenger reports
cds_crash_details_and_passenger_slim.shape

(194715, 36)

In [100]:
cds_crash_details_and_passenger_slim.columns

Index(['INCID_NO', 'No Injury', 'Possible Injury', 'Non-incapacitating Injury',
       'Incapacitating Injury', 'Fatality', 'Num_Motorist', 'Num_Non_Motorist',
       'Num_Pedestrian', 'Num_Ped_Deaths', 'Pedestrian', 'VRU',
       'Crash_Severity', 'CrashSeverity_Fatal', 'CrashSeverity_Incap',
       'CrashSeverity_NonIncap', 'CrashSeverity_Possible',
       'CrashSeverity_NoInj', 'First_Harmful_Event', 'Involving Animal',
       'Daylight', 'Dawn', 'Dusk', 'Dark_Lit', 'Dark_NotLit',
       'Dark_UnknownLit', 'Poor Lighting', 'Adverse Weather', 'LATITUDE',
       'LONGITUDE', 'Park', 'RGN', 'YEAR', 'MONTH', 'DOW', 'HOUR'],
      dtype='object')

In [101]:
cds_slim_all = cds_slim_vehicle_agg.merge(cds_crash_details_and_passenger_slim, how='right', on='INCID_NO')
cds_slim_all['database'] = 'CDS' # add column with database name
cds_slim_all.shape

(194715, 56)

In [102]:
cds_slim_all.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'NUM_VEH', 'Num_Motorcycle',
       'Motorcycle_Ind', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Num_Motorist', 'Num_Non_Motorist', 'Num_Pedestrian', 'Num_Ped_Deaths',
       'Pedestrian', 'VRU', 'Crash_Severity', 'CrashSeverity_Fatal',
       'CrashSeverity_Incap', 'CrashSeverity_NonIncap',
       'CrashSeverity_Possible', 'CrashSeverity_NoInj', 'First_Harmful_Event',
       'Involving Animal', 'Daylight', 'Dawn', 'Dusk', 'Dark_Lit',
       'Dark_NotLit', 'Dark_UnknownLit', 'Poor Lighting', 'Adverse Weather',
       'LATITUDE', 'LONGITUDE', 'Park', 'RGN', 'YEAR', 'MONTH', 'DOW', 'HOUR',
       'database'],
      dtype='object')

In [103]:
cds_slim_all.to_csv("./CDS_slim_all_clean.csv",index=False)