## Crash Data Wrangling Jupyter Notebook

**Author:** Sophie Kaye 

**Date:** 7/26/2022 

**Purpose:** This notebook filters the new CDS/STARS combined data tables for data attributes to be used in analysis. These smaller tables are then combined to create a dataframe that is joinable with the reciprocal IMARS dataset

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
myworkingdirectory = r"C:\Users\Sophie.Kaye\Desktop\NPS Safety"
os.chdir(myworkingdirectory)

In [3]:
cds_vehicle = pd.read_excel('./CDS/New CDS Excel Files/ALL_UNIT.xlsx')
cds_vehicle.shape

(311057, 31)

In [4]:
cds_vehicle.columns

Index(['OBJECTID', 'INCID_NO', 'UNIT_NO', 'VEH_YEAR', 'MAKE_MOD', 'MODEL',
       'NUM_OCC', 'REG_STATE', 'REG_YEAR', 'PLATE_NUM', 'DIR_TRAVEL',
       'SPEED_LIMIT', 'BODY_TYPE', 'VEH_MANVR', 'VEH_DAMAGE', 'DAM_LOCATION',
       'LIC_NUM', 'LIC_STATE', 'PED', 'BRTH_DATE', 'DRIVER_SEX', 'DRIVER_BELT',
       'DRIVER_EJECT', 'DRIVER_INJ', 'DRIVER_VIOLTN', 'VIOL_CHG1', 'VIOL_CHG2',
       'PED_TYPE', 'PED_LOC', 'PED_ACTN', 'REPAIR'],
      dtype='object')

In [5]:
cds_passenger = pd.read_excel('./CDS/New CDS Excel Files/ALL_PASSENGER.xlsx')
cds_passenger.shape

(114151, 10)

In [6]:
cds_passenger.columns

Index(['OBJECTID', 'INCID_NO', 'UNIT_NO', 'PASS_SEQ', 'PASS_SEX', 'PASS_BELT',
       'PASS_EJECT', 'PASS_SEAT', 'PASS_INJ', 'PASS_AGE'],
      dtype='object')

In [7]:
cds_crash = pd.read_excel('./CDS/New CDS Excel Files/ALL_CRASH.xlsx')
cds_crash.shape

(204687, 56)

In [8]:
cds_crash.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA', 'STATE_CODE',
       'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME', 'NODE_DIST_FT',
       'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT', 'WEATHER',
       'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL', 'OBJ_STRUCK',
       'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4',
       'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS', 'INJURED',
       'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED', 'CRASH_YEAR',
       'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN', 'USPP_NPS_VEH_INV',
       'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER', 'DATA_SRC',
       'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE', 'FILE_NAME',
       'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE', 'SPTL_LOC'],
      dtype='object')

In [9]:
pd.options.display.max_rows = 10000000
cds_crash.head()

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,RTE_NAME,NODE_DIST_FT,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,2,ABLI070425075000,5540070001,ABLI,KY,2007-04-25,750,0.0,KNOB CREEK PARKING,0.0,...,,,,,,NaT,,,,0
1,3,ABLI070804175500,5540070013,ABLI,KY,2007-08-04,1755,0.0,,0.0,...,,,,,,NaT,,,,0
2,4,ABLI091117170900,N08113,ABLI,NY,2009-11-17,1709,,NEW YORK AVE,,...,,,,,,2014-02-07,,,,0
3,5,ABLI121009110000,12474,ABLI,KY,2012-10-09,1100,101.0,PRIVATE DRIVEWAY OFF EAST BEACH ROAD (875),,...,,,,,,2015-03-16,,,,0
4,6,ABLI140610163500,14054379,ABLI,KY,2014-06-10,1635,,DC 295,,...,38.91205,-76.93412,0.0,,,2014-06-18,,,,0


In [10]:
cds_crash.dropna(subset=['INCID_NO']).shape
# no duplicate record numbers

(204687, 56)

## Create new region column

In [11]:
park_info = pd.read_csv("./crash database mapping/Park_Info_Table.csv")

In [12]:
park_info.columns

Index(['OBJECTID', 'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'REGION', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [13]:
park_info = park_info.rename(columns={'UNIT_CODE':'Park','REGION':'RGN'})
cds_crash = cds_crash.rename(columns={'PARK_ALPHA' : 'Park'})

In [14]:
# add RGN column from lookup table to CDS crash database, joining the two datasets based on park name
cds_crash = pd.merge(cds_crash, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
cds_crash = cds_crash.drop_duplicates() 
cds_crash.shape

(204687, 57)

In [15]:
#no_region = cds_crash.loc[cds_crash['RGN'].isnull()==True]
#no_region['Park'].value_counts()

In [16]:
cds_crash.loc[cds_crash['RGN'].isnull()==True].shape

(0, 57)

In [17]:
cds_crash.RGN.value_counts().sort_index()

AKR          237
IMR        33076
MWR         2435
NCR        97838
NER        18642
PWR        24419
SER        27262
Unknown      778
Name: RGN, dtype: int64

In [18]:
cds_crash.to_csv("CDS_CrashTable_RegionAdded.csv", index=False)

## Filter for Necessary Fields, Group by INCID_NO

### Crash table

In [19]:
# drop rows without region
cds_crash = cds_crash[cds_crash["RGN"].str.contains("Unknown")==False]
cds_crash.RGN.value_counts().sort_index()

AKR      237
IMR    33076
MWR     2435
NCR    97838
NER    18642
PWR    24419
SER    27262
Name: RGN, dtype: int64

In [20]:
cds_crash_slim = cds_crash[['INCID_NO','LATITUDE','LONGITUDE','Park','RGN','CRASH_DATE','CRASH_TIME','CRASH_YEAR']]

In [21]:
cds_crash_slim.shape

(203909, 8)

In [22]:
cds_crash_slim.head()

Unnamed: 0,INCID_NO,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,ABLI070425075000,,,ABLI,SER,2007-04-25,750,2007.0
1,ABLI070804175500,,,ABLI,SER,2007-08-04,1755,2007.0
2,ABLI091117170900,,,ABLI,SER,2009-11-17,1709,2009.0
3,ABLI121009110000,,,ABLI,SER,2012-10-09,1100,2012.0
4,ABLI140610163500,38.91205,-76.93412,ABLI,SER,2014-06-10,1635,2014.0


In [23]:
cds_crash_slim.to_csv("./CDS_crash_slim.csv",index=False)

### Crash details

In [24]:
cds_crash['CATEGORY'].value_counts()

PD ONLY    166966
INJURY      35282
FATAL        1081
Unknown         7
Name: CATEGORY, dtype: int64

In [25]:
# total number of crashes resulting in an injury or fatality:
35282+1081

36363

In [27]:
cds_crash_details_slim = cds_crash[['INCID_NO']]
cds_crash_details_slim['Injury or Fatal']= np.where(cds_crash['CATEGORY'].isin(['INJURY','FATAL']), 1,0)
cds_crash_details_slim['Injury or Fatal'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Injury or Fatal']= np.where(cds_crash['CATEGORY'].isin(['INJURY','FATAL']), 1,0)


36363

In [28]:
cds_crash['CRASH_CLASS'].value_counts().sort_index()

0.0      14132
1.0     101521
2.0      47715
3.0       1428
4.0       1493
5.0       8133
6.0         32
7.0      18927
10.0        34
88.0      5383
98.0       822
99.0      2849
Name: CRASH_CLASS, dtype: int64

In [29]:
# total number of crashes involving a VRU:
1428+1493

2921

In [30]:
# total number of collisions with "unknown":
822+2849

3671

In [31]:
cds_crash['OBJ_STRUCK'].value_counts().sort_index()

0.0     142603
1.0       6782
2.0       5702
3.0       2605
4.0      16695
5.0        436
6.0        563
7.0       1413
8.0       2513
9.0       1075
10.0      1844
11.0      3124
12.0      1968
13.0        19
88.0      7979
98.0      2598
99.0      4595
Name: OBJ_STRUCK, dtype: int64

In [32]:
# collapse First through sixth Contributing Factor columns into a single column to string search for relevant data attributes
cds_con_fact = cds_crash[['CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4',
       'CON_FACT5', 'CON_FACT6']]
cds_crash_details_slim['Con_Fact'] = cds_con_fact.apply(
   lambda x: ','.join(x.dropna().astype(str)),
  axis=1
)
cds_crash_details_slim['Con_Fact']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Con_Fact'] = cds_con_fact.apply(


0                             A88
1                             A88
2                         A13,D06
3                         A16,E88
4         A04,A05,A06,A09,A15,A16
5         U99,U99,U99,U99,U99,U99
6         U99,U99,U99,U99,U99,U99
7         E02,U99,U99,U99,U99,U99
8         A11,U99,U99,U99,U99,U99
9         A88,A16,U99,U99,U99,U99
10        A16,U99,U99,U99,U99,U99
11        A88,U99,U99,U99,U99,U99
12        A06,U99,U99,U99,U99,U99
13        A16,C01,A14,U99,U99,U99
14        A11,U99,U99,U99,U99,U99
15        A16,U99,U99,U99,U99,U99
16        A15,U99,U99,U99,U99,U99
17        A16,U99,U99,U99,U99,U99
18        E03,U99,U99,U99,U99,U99
19        A16,U99,U99,U99,U99,U99
20        A88,U99,U99,U99,U99,U99
21        A16,U99,U99,U99,U99,U99
22        A16,U99,U99,U99,U99,U99
23        A11,U99,U99,U99,U99,U99
24        A16,U99,U99,U99,U99,U99
25        A14,U99,U99,U99,U99,U99
26        A11,U99,U99,U99,U99,U99
27        A88,U99,U99,U99,U99,U99
28        A16,A12,A08,A06,A03,U99
29        A05,

In [33]:
# set flags for relevant data attributes
# include zero columns for IMARS data with no CDS equivalent
cds_crash_details_slim['Collision w Animal']= np.where(cds_crash['CRASH_CLASS']==7.0, 1,0)
cds_crash_details_slim['Collision w Fixed Object']= np.where(cds_crash['CRASH_CLASS']==2.0, 1,0)
cds_crash_details_slim['Non-Collision']= np.where(cds_crash['CRASH_CLASS']==0.0, 1,0)
cds_crash_details_slim['Other Accident Class']= 0
cds_crash_details_slim['Rollover']= np.where(cds_crash['CRASH_CLASS']==10.0, 1,0)
cds_crash_details_slim['Collision w Other Vehicle']= np.where(cds_crash['CRASH_CLASS']==1.0, 1,0)
cds_crash_details_slim['Collision w Parked Vehicle']= np.where(cds_crash['CRASH_CLASS']==5.0, 1,0)
cds_crash_details_slim['Collision w Train']= np.where(cds_crash['CRASH_CLASS']==6.0, 1,0)
cds_crash_details_slim['Collision w Other Object']= np.where(cds_crash['CRASH_CLASS']==88.0, 1,0)
cds_crash_details_slim['Collision w Unknown']= np.where(cds_crash['CRASH_CLASS'].isin([98.0,99.0]), 1,0)
cds_crash_details_slim['Roadway Departure'] = 0
cds_crash_details_slim['Avoiding Animal'] = 0
cds_crash_details_slim['Falling Object'] = 0
cds_crash_details_slim['Pedestrian']= np.where(cds_crash['CRASH_CLASS']==3.0, 1,0)
cds_crash_details_slim['Bicycle']= np.where(cds_crash['CRASH_CLASS']==4.0, 1,0)
cds_crash_details_slim['Pedacycle'] = 0
cds_crash_details_slim['VRU']= np.where(cds_crash['CRASH_CLASS'].isin([3.0,4.0]), 1,0)
cds_crash_details_slim['HorseLlama'] = 0
cds_crash_details_slim['Cow'] = 0
cds_crash_details_slim['Deer'] = 0
cds_crash_details_slim['Elk'] = 0
cds_crash_details_slim['Moose'] = 0
cds_crash_details_slim['Buffalo'] = 0
cds_crash_details_slim['Bear'] = 0
cds_crash_details_slim['Antelope'] = 0
cds_crash_details_slim['SheepGoats'] = 0
cds_crash_details_slim['OtherWild'] = 0
cds_crash_details_slim['OtherDomestic'] = 0
cds_crash_details_slim['Rock in Roadway']= np.where(cds_crash['OBJ_STRUCK']==10.0, 1,0)
cds_crash_details_slim['Animal in Roadway']= np.where(cds_crash_details_slim['Con_Fact'].str.contains('B06')==True,1,0)
cds_crash_details_slim['Rockfall'] = 0
cds_crash_details_slim['Motorist Distraction']=np.where(cds_crash_details_slim['Con_Fact'].str.contains('A16')==True,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Collision w Animal']= np.where(cds_crash['CRASH_CLASS']==7.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Collision w Fixed Object']= np.where(cds_crash['CRASH_CLASS']==2.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Rockfall'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_crash_details_slim['Motorist Distraction']=np.where(cds_crash_details_slim['Con_Fact'].str.contains('A16')==True,1,0)


In [34]:
# check new flag column against value count sum from above - looks good!
cds_crash_details_slim['VRU'].sum()

2921

In [35]:
# check new flag column sum against value counts - looks good!
cds_crash_details_slim['Non-Collision'].sum()

14132

In [36]:
# check new flag column against value count sum from above - looks good!
cds_crash_details_slim['Collision w Unknown'].sum()

3671

In [37]:
# check new flag column sum against value counts - looks good!
cds_crash_details_slim.Pedestrian.sum()

1428

In [38]:
# check new flag column sum against value counts - looks good!
cds_crash_details_slim['Rock in Roadway'].sum()

1844

In [39]:
# check new flag column sum against value counts - looks good 
# Note that this total represents a maximum of one instance of 'B06' reported per crash, 
# although it is possible for 'B06' to be repeated as more than one of six contributing factors for the same crash.
# Therefore, the following total is slightly less than that from wildlife data exploration in which every instance was counted
cds_crash_details_slim['Animal in Roadway'].sum()

19808

In [40]:
# check new flag column sum against value counts - looks good 
# Note that this total represents a maximum of one instance of 'A16' reported per crash, 
# although it is possible for 'A16' to be repeated as more than one of six contributing factors for the same crash.
# Therefore, the following total is slightly less than that from the sum above in which every instance was counted
cds_crash_details_slim['Motorist Distraction'].sum()

55644

In [41]:
cds_crash_details_slim.head()

Unnamed: 0,INCID_NO,Injury or Fatal,Con_Fact,Collision w Animal,Collision w Fixed Object,Non-Collision,Other Accident Class,Rollover,Collision w Other Vehicle,Collision w Parked Vehicle,...,Buffalo,Bear,Antelope,SheepGoats,OtherWild,OtherDomestic,Rock in Roadway,Animal in Roadway,Rockfall,Motorist Distraction
0,ABLI070425075000,0,A88,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ABLI070804175500,0,A88,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ABLI091117170900,0,"A13,D06",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ABLI121009110000,0,"A16,E88",0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,ABLI140610163500,0,"A04,A05,A06,A09,A15,A16",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [42]:
# remove intermediate columns used to set flags in crash details and passenger tables
cds_crash_details_slim = cds_crash_details_slim.drop(columns = ['Con_Fact'])
cds_crash_details_slim.head()

Unnamed: 0,INCID_NO,Injury or Fatal,Collision w Animal,Collision w Fixed Object,Non-Collision,Other Accident Class,Rollover,Collision w Other Vehicle,Collision w Parked Vehicle,Collision w Train,...,Buffalo,Bear,Antelope,SheepGoats,OtherWild,OtherDomestic,Rock in Roadway,Animal in Roadway,Rockfall,Motorist Distraction
0,ABLI070425075000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ABLI070804175500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ABLI091117170900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ABLI121009110000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,ABLI140610163500,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [43]:
cds_crash_details_slim.to_csv("./CDS_crash_details_slim.csv",index=False)

### Passenger table

In [44]:
cds_passenger['PASS_INJ'].value_counts()

0.0     95323
1.0      7776
2.0      5967
3.0      2165
98.0     1124
99.0      825
4.0       298
Name: PASS_INJ, dtype: int64

In [45]:
cds_vehicle['DRIVER_INJ'].value_counts()

0.0     257854
1.0      17472
2.0      14419
3.0       5816
99.0      5419
4.0        934
98.0       122
Name: DRIVER_INJ, dtype: int64

In [46]:
cds_passenger['PASS_INJ'].value_counts()

0.0     95323
1.0      7776
2.0      5967
3.0      2165
98.0     1124
99.0      825
4.0       298
Name: PASS_INJ, dtype: int64

In [47]:
cds_crash['PED_INJ'].value_counts()

 0.0    201961
 1.0       501
 2.0        16
 3.0         4
-1.0         1
Name: PED_INJ, dtype: int64

In [48]:
cds_crash['BIKE_INJ'].value_counts()

0.0    201917
1.0       579
2.0         7
3.0         1
Name: BIKE_INJ, dtype: int64

In [61]:
# collect injury severity data from all tables
cds_crash_inj = cds_crash[['INCID_NO','PED_INJ','BIKE_INJ']]
cds_vehicle_inj = cds_vehicle[['INCID_NO','DRIVER_INJ']]
cds_passenger_inj = cds_passenger[['INCID_NO','PASS_INJ']]
cds_crash_inj.shape, cds_vehicle_inj.shape, cds_passenger_inj.shape

((203909, 3), (311057, 2), (114151, 2))

In [62]:
# merge injury severity data from all tables into single dataframe
cds_vehicle_and_crash_inj = cds_vehicle_inj.merge(cds_crash_inj, how='left', on='INCID_NO')
cds_all_inj = cds_vehicle_and_crash_inj.merge(cds_passenger_inj, how='left', on='INCID_NO')
cds_all_inj.shape 

(411722, 5)

In [63]:
cds_all_inj.head()

Unnamed: 0,INCID_NO,DRIVER_INJ,PED_INJ,BIKE_INJ,PASS_INJ
0,ABLI070425075000,0.0,0.0,0.0,
1,ABLI070804175500,0.0,0.0,0.0,
2,ABLI091117170900,0.0,0.0,0.0,
3,ABLI091117170900,0.0,0.0,0.0,
4,ABLI121009110000,0.0,0.0,0.0,0.0


In [127]:
# count instances of each data attribute per row
cds_inj_sev_counts = cds_all_inj[cds_all_inj.columns[1:5]].apply(pd.Series.value_counts, axis=1).fillna(0)
cds_inj_sev_counts.head()

Unnamed: 0,-1.0,0.0,1.0,2.0,3.0,4.0,98.0,99.0
0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
cds_inj_sev_counts.shape

(411722, 8)

In [129]:
# add incident number to counts
cds_inj_sev_counts['INCID_NO']=cds_all_inj[['INCID_NO']]
# combine "Blank on Form" and "Unknown" data attributes into a single attribute
cds_inj_sev_counts['Unknown Injury']=cds_inj_sev_counts[98.0]+cds_inj_sev_counts[99.0]
cds_inj_sev_counts.head()

Unnamed: 0,-1.0,0.0,1.0,2.0,3.0,4.0,98.0,99.0,INCID_NO,Unknown Injury
0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,ABLI070425075000,0.0
1,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,ABLI070804175500,0.0
2,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,ABLI091117170900,0.0
3,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,ABLI091117170900,0.0
4,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,ABLI121009110000,0.0


In [130]:
# drop no longer needed columns and rename useful columns
cds_inj_sev = cds_inj_sev_counts[['INCID_NO']]
cds_inj_sev['NUM_OCC']=1
cds_inj_sev['No Injury']= cds_inj_sev_counts[0.0]
cds_inj_sev['Possible Injury']= cds_inj_sev_counts[1.0]
cds_inj_sev['Non-incapacitating Injury']= cds_inj_sev_counts[2.0]
cds_inj_sev['Incapacitating Injury']= cds_inj_sev_counts[3.0]
cds_inj_sev['Fatality']= cds_inj_sev_counts[4.0]
cds_inj_sev['Unknown Injury']= cds_inj_sev_counts['Unknown Injury']
cds_inj_sev.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_sev['NUM_OCC']=1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_sev['No Injury']= cds_inj_sev_counts[0.0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_sev['Possible Injury']= cds_inj_sev_counts[1.0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

Unnamed: 0,INCID_NO,NUM_OCC,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury
0,ABLI070425075000,1,3.0,0.0,0.0,0.0,0.0,0.0
1,ABLI070804175500,1,3.0,0.0,0.0,0.0,0.0,0.0
2,ABLI091117170900,1,3.0,0.0,0.0,0.0,0.0,0.0
3,ABLI091117170900,1,3.0,0.0,0.0,0.0,0.0,0.0
4,ABLI121009110000,1,4.0,0.0,0.0,0.0,0.0,0.0


In [131]:
cds_inj_sev.shape

(411722, 8)

In [132]:
# collapse multiple rows for each person involved in the crash into a single row for each crash
# injuries and fatalities should be summed as total numbers per crash ("sum" function)
cds_inj_sev_agg = cds_inj_sev.groupby(by=['INCID_NO']).sum().reset_index()
cds_inj_sev_agg.shape

(194715, 8)

In [133]:
cds_crash['FATALS'].value_counts()

 0.0    202330
 1.0       936
 2.0        98
 3.0        12
 4.0         5
 5.0         4
-2.0         1
Name: FATALS, dtype: int64

In [134]:
cds_crash['INJURED'].value_counts()

 0.0     167921
 1.0      25018
 2.0       7055
 3.0       1971
 4.0        856
 5.0        318
 6.0        148
 7.0         45
 8.0         24
 9.0         12
 10.0         7
 12.0         3
 13.0         1
 15.0         1
-1.0          1
 17.0         1
 11.0         1
 16.0         1
 14.0         1
Name: INJURED, dtype: int64

In [135]:
# add columns to match IMARS
cds_inj_and_fatal_counts = cds_crash[['INCID_NO']]
cds_inj_and_fatal_counts['Non-motorist Distraction']=0
cds_inj_and_fatal_counts['Num_Fatalities'] = cds_crash[['FATALS']]
cds_inj_and_fatal_counts['Num_Injuries'] = cds_crash[['INJURED']]
cds_inj_and_fatal_counts.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_and_fatal_counts['Non-motorist Distraction']=0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_and_fatal_counts['Num_Fatalities'] = cds_crash[['FATALS']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_inj_and_fatal_counts['Num_Injuries'] = cds_crash[['INJURED']]


(203909, 4)

In [136]:
# merge fatality/injury counts with pre-aggregated injury severity into single passenger_slim table
cds_passenger_slim = cds_inj_sev_agg.merge(cds_inj_and_fatal_counts, how='right', on='INCID_NO')
cds_passenger_slim.head()

Unnamed: 0,INCID_NO,NUM_OCC,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury,Non-motorist Distraction,Num_Fatalities,Num_Injuries
0,ABLI070425075000,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,ABLI070804175500,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2,ABLI091117170900,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
3,ABLI121009110000,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,ABLI140610163500,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [137]:
# result should be no more than 203909 rows (number of useable crash records)
cds_passenger_slim.shape

(203909, 11)

In [138]:
# make sure all columns contain data as you would expect
cds_passenger_slim.to_csv("./CDS_passenger_slim.csv",index=False)

### Vehicle table

In [92]:
cds_vehicle_slim = cds_vehicle[['INCID_NO']]

In [93]:
cds_vehicle.SPEED_LIMIT.value_counts().sort_index()

5.0      9385
10.0     4002
15.0    26493
20.0     3218
25.0    65332
30.0    14313
35.0    47061
40.0    15184
45.0    47545
50.0    16816
55.0    28737
60.0      132
65.0      117
70.0        4
75.0       19
99.0    17894
Name: SPEED_LIMIT, dtype: int64

In [94]:
cds_vehicle.PED_LOC.value_counts().sort_index()

0.0     189071
1.0        447
2.0       1195
3.0         79
4.0        199
88.0      1170
98.0      1862
99.0    111750
Name: PED_LOC, dtype: int64

In [95]:
# number of vehicle entries in which a crash occurred at a known site other than a crosswalk 
# (i.e., "in roadway (not in crosswalk), "on trail/bikeway (off roadway)", "other off roadway", "other"):
1195+79+199+1170

2643

In [96]:
# set flags for each relevant data attribute
# add zero column for IMARS data with no CDS equivalent
cds_vehicle_slim['5_mph']= np.where(cds_vehicle['SPEED_LIMIT']==5.0, 1,0)
cds_vehicle_slim['10_mph']= np.where(cds_vehicle['SPEED_LIMIT']==10.0, 1,0)
cds_vehicle_slim['15_mph']= np.where(cds_vehicle['SPEED_LIMIT']==15.0, 1,0)
cds_vehicle_slim['20_mph']= np.where(cds_vehicle['SPEED_LIMIT']==20.0, 1,0)
cds_vehicle_slim['25_mph']= np.where(cds_vehicle['SPEED_LIMIT']==25.0, 1,0)
cds_vehicle_slim['30_mph']= np.where(cds_vehicle['SPEED_LIMIT']==30.0, 1,0)
cds_vehicle_slim['35_mph']= np.where(cds_vehicle['SPEED_LIMIT']==35.0, 1,0)
cds_vehicle_slim['40_mph']= np.where(cds_vehicle['SPEED_LIMIT']==40.0, 1,0)
cds_vehicle_slim['45_mph']= np.where(cds_vehicle['SPEED_LIMIT']==45.0, 1,0)
cds_vehicle_slim['50_mph']= np.where(cds_vehicle['SPEED_LIMIT']==50.0, 1,0)
cds_vehicle_slim['55_mph']= np.where(cds_vehicle['SPEED_LIMIT']==55.0, 1,0)
cds_vehicle_slim['60_mph']= np.where(cds_vehicle['SPEED_LIMIT']==60.0, 1,0)
cds_vehicle_slim['65_mph']= np.where(cds_vehicle['SPEED_LIMIT']==65.0, 1,0)
cds_vehicle_slim['70_mph']= np.where(cds_vehicle['SPEED_LIMIT']==70.0, 1,0)
cds_vehicle_slim['75_mph']= np.where(cds_vehicle['SPEED_LIMIT']==75.0, 1,0)
cds_vehicle_slim['no_posted_speed']= 0
cds_vehicle_slim['Crosswalk']= np.where(cds_vehicle['PED_LOC']==1.0, 1,0)
cds_vehicle_slim['Outside a Crosswalk']= np.where(cds_vehicle['PED_LOC'].isin([2.0,3.0,4.0,88.0]), 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['5_mph']= np.where(cds_vehicle['SPEED_LIMIT']==5.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['10_mph']= np.where(cds_vehicle['SPEED_LIMIT']==10.0, 1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_vehicle_slim['15_mph']= np.where(cds_vehicle['SPEED

In [97]:
cds_vehicle_slim['Outside a Crosswalk'].sum()

2643

In [101]:
# check new flag column sum against value counts - looks good!
cds_vehicle_slim['5_mph'].sum()

9385

In [102]:
# check new flag column sum against value counts - looks good!
cds_vehicle_slim['Crosswalk'].sum()

447

In [98]:
cds_vehicle_slim.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'Crosswalk',
       'Outside a Crosswalk'],
      dtype='object')

In [99]:
cds_vehicle_slim.head()

Unnamed: 0,INCID_NO,5_mph,10_mph,15_mph,20_mph,25_mph,30_mph,35_mph,40_mph,45_mph,50_mph,55_mph,60_mph,65_mph,70_mph,75_mph,no_posted_speed,Crosswalk,Outside a Crosswalk
0,ABLI070425075000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,ABLI070804175500,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ABLI091117170900,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ABLI091117170900,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,ABLI121009110000,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [100]:
cds_vehicle_slim.shape

(311057, 19)

In [103]:
# make sure all columns contain data as you would expect (i.e., 0 and 1)
cds_vehicle_slim.to_csv("./CDS_vehicle_slim_test.csv",index=False)

In [104]:
# collapse multiple rows for each vehicle involved in the crash into a single row for each crash
# only one speed limit or crosswalk flag should exist per crash to not double count data attribute ("first" function)
cds_vehicle_slim_agg = cds_vehicle_slim.groupby(by=['INCID_NO']).first().reset_index()

In [105]:
# result should be no more than 203909 rows (number of useable crash records)
cds_vehicle_slim_agg.shape

(194715, 19)

## merge crash table with other CDS tables 

In [139]:
cds_passenger_merged = cds_passenger.merge(cds_crash_slim, how='right', on='INCID_NO')
cds_passenger_merged.drop_duplicates() 
cds_passenger_merged.head()

Unnamed: 0,OBJECTID,INCID_NO,UNIT_NO,PASS_SEQ,PASS_SEX,PASS_BELT,PASS_EJECT,PASS_SEAT,PASS_INJ,PASS_AGE,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,,ABLI070425075000,,,,,,,,,,,ABLI,SER,2007-04-25,750,2007.0
1,,ABLI070804175500,,,,,,,,,,,ABLI,SER,2007-08-04,1755,2007.0
2,,ABLI091117170900,,,,,,,,,,,ABLI,SER,2009-11-17,1709,2009.0
3,5.0,ABLI121009110000,1.0,1.0,1.0,1.0,0.0,3.0,0.0,52.0,,,ABLI,SER,2012-10-09,1100,2012.0
4,,ABLI140610163500,,,,,,,,,38.91205,-76.93412,ABLI,SER,2014-06-10,1635,2014.0


In [140]:
cds_passenger_merged.shape

(257759, 17)

In [141]:
# non-aggregated dataset (one row per person involved, all original columns) merged to include basic crash info columns
cds_passenger_merged.to_csv("./CDS_passenger_full.csv",index=False)

In [142]:
cds_passenger_slim_merged = cds_passenger_slim.merge(cds_crash_slim, how='right', on='INCID_NO')
cds_passenger_slim_merged.drop_duplicates() 
# result should have no more than 209303 rows and exactly passenger_slim + crash_slim -1 columns (18)
cds_passenger_slim_merged.shape

(203909, 18)

In [143]:
cds_passenger_slim_merged.head()

Unnamed: 0,INCID_NO,NUM_OCC,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury,Non-motorist Distraction,Num_Fatalities,Num_Injuries,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,ABLI070425075000,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,,,ABLI,SER,2007-04-25,750,2007.0
1,ABLI070804175500,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,,,ABLI,SER,2007-08-04,1755,2007.0
2,ABLI091117170900,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,,,ABLI,SER,2009-11-17,1709,2009.0
3,ABLI121009110000,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,,,ABLI,SER,2012-10-09,1100,2012.0
4,ABLI140610163500,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,38.91205,-76.93412,ABLI,SER,2014-06-10,1635,2014.0


In [144]:
# aggregated dataset (one row per crash, only columns of interest) merged to include basic crash info
cds_passenger_slim_merged.to_csv("./CDS_passenger_slim_agg.csv",index=False)

In [145]:
cds_vehicle_merged = cds_vehicle.merge(cds_crash_slim, how='right', on='INCID_NO')
cds_vehicle_merged.drop_duplicates() 
cds_vehicle_merged.head()

Unnamed: 0,OBJECTID,INCID_NO,UNIT_NO,VEH_YEAR,MAKE_MOD,MODEL,NUM_OCC,REG_STATE,REG_YEAR,PLATE_NUM,...,PED_LOC,PED_ACTN,REPAIR,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,4.0,ABLI070425075000,1.0,,UNKNOWN,,1.0,,,,...,0.0,0.0,0.0,,,ABLI,SER,2007-04-25,750,2007.0
1,5.0,ABLI070804175500,1.0,0.0,,,0.0,,,,...,0.0,0.0,0.0,,,ABLI,SER,2007-08-04,1755,2007.0
2,6.0,ABLI091117170900,1.0,2004.0,JEEP,LIBERTY,1.0,,,,...,0.0,0.0,,,,ABLI,SER,2009-11-17,1709,2009.0
3,7.0,ABLI091117170900,2.0,2009.0,GIANT,DEFY3,1.0,,,,...,2.0,2.0,,,,ABLI,SER,2009-11-17,1709,2009.0
4,8.0,ABLI121009110000,1.0,2010.0,CHEVROLET,TAHOE,2.0,,,,...,0.0,0.0,2300.0,,,ABLI,SER,2012-10-09,1100,2012.0


In [146]:
cds_vehicle_merged.shape

(320023, 38)

In [147]:
# non-aggregated dataset (one row per vehicle involved, all original columns) merged to include basic crash info
cds_vehicle_merged.to_csv("./CDS_vehicle_full.csv",index=False)

In [148]:
cds_vehicle_slim_merged = cds_vehicle_slim_agg.merge(cds_crash_slim, how='right', on='INCID_NO')
cds_vehicle_slim_merged.drop_duplicates() 
# result should have no more than 209303 rows and exactly vehicle_slim + crash_slim -1 columns (26)
cds_vehicle_slim_merged.shape

(203909, 26)

In [149]:
cds_vehicle_slim_merged.head()

Unnamed: 0,INCID_NO,5_mph,10_mph,15_mph,20_mph,25_mph,30_mph,35_mph,40_mph,45_mph,...,no_posted_speed,Crosswalk,Outside a Crosswalk,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR
0,ABLI070425075000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,ABLI,SER,2007-04-25,750,2007.0
1,ABLI070804175500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,ABLI,SER,2007-08-04,1755,2007.0
2,ABLI091117170900,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,ABLI,SER,2009-11-17,1709,2009.0
3,ABLI121009110000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,ABLI,SER,2012-10-09,1100,2012.0
4,ABLI140610163500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,38.91205,-76.93412,ABLI,SER,2014-06-10,1635,2014.0


In [150]:
# cleaned and aggregated dataset (one row per crash, only columns of interest) merged to include basic crash info
cds_vehicle_slim_merged.to_csv("./CDS_vehicle_slim.csv",index=False)

## merge all CDS slim tables into clean dataset for combination with IMARS

In [151]:
cds_passenger_slim.shape, cds_vehicle_slim_agg.shape, cds_crash_details_slim.shape, cds_crash_slim.shape

((203909, 11), (194715, 19), (203909, 34), (203909, 8))

In [152]:
#the number of columns in the final dataset should be:
11+19+34+8-3+1

70

In [153]:
cds_slim_passenger_and_vehicle=cds_vehicle_slim_agg.merge(cds_passenger_slim, how='right', on='INCID_NO')
cds_slim_passenger_vehicle_details=cds_slim_passenger_and_vehicle.merge(cds_crash_details_slim, how='right', on='INCID_NO')
cds_slim_all=cds_slim_passenger_vehicle_details.merge(cds_crash_slim, how="right", on='INCID_NO')
cds_slim_all.drop_duplicates() 
# add column for name of database
cds_slim_all['database']='CDS'
cds_slim_all.shape

(203909, 70)

In [154]:
cds_slim_all.head()

Unnamed: 0,INCID_NO,5_mph,10_mph,15_mph,20_mph,25_mph,30_mph,35_mph,40_mph,45_mph,...,Rockfall,Motorist Distraction,LATITUDE,LONGITUDE,Park,RGN,CRASH_DATE,CRASH_TIME,CRASH_YEAR,database
0,ABLI070425075000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,,,ABLI,SER,2007-04-25,750,2007.0,CDS
1,ABLI070804175500,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,,,ABLI,SER,2007-08-04,1755,2007.0,CDS
2,ABLI091117170900,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,,,ABLI,SER,2009-11-17,1709,2009.0,CDS
3,ABLI121009110000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,1,,,ABLI,SER,2012-10-09,1100,2012.0,CDS
4,ABLI140610163500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,38.91205,-76.93412,ABLI,SER,2014-06-10,1635,2014.0,CDS


In [155]:
cds_slim_all.columns

Index(['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'Crosswalk',
       'Outside a Crosswalk', 'NUM_OCC', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Unknown Injury', 'Non-motorist Distraction', 'Num_Fatalities',
       'Num_Injuries', 'Injury or Fatal', 'Collision w Animal',
       'Collision w Fixed Object', 'Non-Collision', 'Other Accident Class',
       'Rollover', 'Collision w Other Vehicle', 'Collision w Parked Vehicle',
       'Collision w Train', 'Collision w Other Object', 'Collision w Unknown',
       'Roadway Departure', 'Avoiding Animal', 'Falling Object', 'Pedestrian',
       'Bicycle', 'Pedacycle', 'VRU', 'HorseLlama', 'Cow', 'Deer', 'Elk',
       'Moose', 'Buffalo', 'Bear', 'Antelope', 'SheepGoats', 'OtherWild',
       'OtherDomestic', 'Rock in Roadway', 'A

In [156]:
# reorder columns to match IMARS
cds_slim_all = cds_slim_all[['INCID_NO', '5_mph', '10_mph', '15_mph', '20_mph', '25_mph', '30_mph',
       '35_mph', '40_mph', '45_mph', '50_mph', '55_mph', '60_mph', '65_mph',
       '70_mph', '75_mph', 'no_posted_speed', 'Crosswalk',
       'Outside a Crosswalk', 'NUM_OCC', 'No Injury', 'Possible Injury',
       'Non-incapacitating Injury', 'Incapacitating Injury', 'Fatality',
       'Unknown Injury', 'Motorist Distraction', 'Non-motorist Distraction',
       'Num_Fatalities', 'Num_Injuries', 'Injury or Fatal',
       'Collision w Animal', 'Collision w Fixed Object', 'Non-Collision',
       'Other Accident Class', 'Rollover', 'Collision w Other Vehicle',
       'Collision w Parked Vehicle', 'Collision w Train',
       'Collision w Other Object', 'Collision w Unknown', 'Roadway Departure',
       'Avoiding Animal', 'Falling Object', 'Pedestrian', 'Bicycle',
       'Pedacycle', 'VRU', 'HorseLlama', 'Cow', 'Deer', 'Elk', 'Moose',
       'Buffalo', 'Bear', 'Antelope', 'SheepGoats', 'OtherWild',
       'OtherDomestic', 'Rock in Roadway', 'Animal in Roadway', 'Rockfall',
       'LATITUDE', 'LONGITUDE', 'Park', 'RGN', 'CRASH_DATE', 'CRASH_TIME',
       'CRASH_YEAR', 'database']]

In [157]:
cds_slim_all.to_csv('CDS_slim_clean.csv',index=False)