## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin

**Date:** 11/22/21

**Purpose:** This notebook cleans the CDS crash and passenger data and produces a dataframe (saved as `crash_data_CDS_clean.csv`) that can be joined with other data sources.

In [1]:
import pandas as pd
import numpy as np
import os



In [2]:
# Project root: the ./data inputs are read relative to it and the cleaned CSV
# is written into it. Set the TSP_WORKDIR environment variable to run the
# notebook on another machine; when it is unset the original path is used.
myworkingdirectory = os.environ.get("TSP_WORKDIR", r"C:\Users\eric.englin\Desktop\TSP")
os.chdir(myworkingdirectory)

In [3]:
# Crash-level table (CSV) and passenger-level table (Excel).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of
# parsed numbers and strings.
# NOTE(review): the saved output of this cell shows a warning that is presumably
# pandas' DtypeWarning for mixed-type columns - confirm it is gone after a re-run.
cds_df = pd.read_csv('./data/CDS All Crashes Table for FLH 6-28-21.csv',
                     low_memory=False)
cds_df_passengers = pd.read_excel('./data/CDS all_passengers_20211109.xlsx')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Preview the first five rows of the crash table.
cds_df.head(5)

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,Region,# Crash,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,6,ABLI140610163500,14054379,ABLI,SER,1,KY,"Tuesday, June 10, 2014",1635.0,,...,38.91205,-76.93412,0.0,,,00:00.0,,,,0
1,5,ABLI121009110000,12474,ABLI,SER,1,KY,"Tuesday, October 9, 2012",1100.0,101.0,...,,,,,,00:00.0,,,,0
2,4,ABLI091117170900,N08113,ABLI,SER,1,NY,"Tuesday, November 17, 2009",1709.0,,...,,,,,,00:00.0,,,,0
3,3,ABLI070804175500,5540070013,ABLI,SER,1,KY,"Saturday, August 4, 2007",1755.0,0.0,...,,,,,,,,,,0
4,2,ABLI070425075000,5540070001,ABLI,SER,1,KY,"Wednesday, April 25, 2007",750.0,0.0,...,,,,,,,,,,0


In [5]:
# Preview the first five rows of the passenger table.
cds_df_passengers.head(5)

Unnamed: 0,OBJECTID,INCID_NO,UNIT_NO,PASS_SEQ,PASS_SEX,PASS_BELT,PASS_EJECT,PASS_SEAT,PASS_INJ,PASS_AGE
0,5,ABLI121009110000,1,1,1.0,1.0,0.0,3.0,0.0,52.0
1,6,ACAD000417131000,1,1,2.0,1.0,0.0,3.0,0.0,36.0
2,7,ACAD000531143000,1,1,2.0,1.0,0.0,3.0,0.0,46.0
3,8,ACAD000531143000,1,2,2.0,1.0,0.0,4.0,0.0,25.0
4,9,ACAD000531143000,1,3,1.0,2.0,0.0,6.0,0.0,13.0


In [6]:
# Frequency of each CRASH_CLASS code (missing values are not counted).
cds_df['CRASH_CLASS'].value_counts()

1.0     45460
2.0     18223
7.0      6603
0.0      4824
5.0      2422
88.0     1985
98.0      822
99.0      801
4.0       703
3.0       604
10.0       23
6.0        15
Name: CRASH_CLASS, dtype: int64

In [7]:
# Indicator column label -> CRASH_CLASS code it represents.
# Dict order is the order in which the columns are appended to cds_df.
crash_class_labels = {
    'Non-Collision': 0,
    'Collision with Other Motor Vehicle': 1,
    'Collision with Fixed Object': 2,
    'Collision with Pedestrian': 3,
    'Collision with Bicycle': 4,
    'Collision with Parked Motor Vehicle': 5,
    'Collision with Railway Train': 6,
    'Collision with Animal': 7,
    'Collision with Other Object': 88,
    'Collision with Unknown': 99,
}

# One 0/1 column per listed code.
for flag_column, class_code in crash_class_labels.items():
    cds_df[flag_column] = np.where(cds_df['CRASH_CLASS'] == class_code, 1, 0)

# Every row whose code is not listed above is flagged as "other". Rows with a
# missing CRASH_CLASS are flagged here as well, because isin() is False for NaN.
is_listed_class = cds_df['CRASH_CLASS'].isin(list(crash_class_labels.values()))
cds_df['Other Accident Class'] = np.where(~is_listed_class, 1, 0)


In [8]:
# Indicator column label -> PASS_INJ code it represents.
# Dict order is the order in which the columns are appended.
injury_labels = {
    'No Injury': 0,
    'Possible Injury': 1,
    'Non-incapacitating Injury': 2,
    'Incapacitating Injury': 3,
    'Fatality': 4,
}

# One 0/1 column per injury code.
for injury_column, injury_code in injury_labels.items():
    cds_df_passengers[injury_column] = np.where(
        cds_df_passengers['PASS_INJ'] == injury_code, 1, 0)

# Codes 98 and 99 are both counted as an unknown injury status.
reported_unknown = cds_df_passengers['PASS_INJ'].isin([98, 99])
cds_df_passengers['Unknown Injury'] = np.where(reported_unknown, 1, 0)

# Each passenger row counts as one occupant, so summing NUM_OCC per incident
# gives the number of passenger records for that incident.
cds_df_passengers['NUM_OCC'] = 1

In [9]:
# (rows, columns) of the passenger table, then of the crash table.
(cds_df_passengers.shape, cds_df.shape)

((49359, 17), (83926, 69))

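#### Data Quality Issues
- The crash-level fatality total (`FATALS` in the crash table) does not match the number of passenger records flagged as a fatality; the two totals are compared in the next cell.
- Only a subset of crashes has matching passenger records, so the passenger-derived columns are missing for unmatched crashes after the left join below.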

In [10]:
# Compare the crash-table fatality total with the count of passenger rows
# flagged as a fatality.
crash_table_fatals = cds_df['FATALS'].sum()
passenger_table_fatals = cds_df_passengers['Fatality'].sum()
crash_table_fatals, passenger_table_fatals

(712.0, 103)

In [11]:
# Keep only the join key and the 0/1 indicator columns that will be summed
# per incident.
passenger_keep_columns = [
    'INCID_NO',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
    'NUM_OCC',
]
cds_df_passengers_slim = cds_df_passengers[passenger_keep_columns]

In [12]:
# Collapse the passenger rows to one row per INCID_NO by summing the indicator
# columns, then turn INCID_NO back into a regular column for the merge.
cds_df_passengers_agg = (
    cds_df_passengers_slim
    .groupby(by=['INCID_NO'])
    .sum()
    .reset_index()
)

In [13]:
# Shapes of the aggregated passenger table, the raw passenger table and the
# crash table, in that order.
tuple(frame.shape for frame in (cds_df_passengers_agg, cds_df_passengers, cds_df))

((26734, 8), (49359, 17), (83926, 69))

In [14]:
# Left join keeps every crash row. indicator=True adds a `_merge` column that
# records whether a crash found a matching passenger aggregate.
cds_df_join = pd.merge(
    cds_df,
    cds_df_passengers_agg,
    how='left',
    on='INCID_NO',
    indicator=True,
)

In [15]:
# How many crash rows matched a passenger aggregate ("both") and how many did
# not ("left_only").
cds_df_join['_merge'].value_counts()

left_only     57192
both          26734
right_only        0
Name: _merge, dtype: int64

In [16]:
# Share of crash rows with at least one matching passenger record.
# Computed from the merge indicator rather than from numbers copied out of
# earlier outputs, so it stays correct when the input files change.
(cds_df_join['_merge'] == 'both').mean()

0.31854252555822987

In [17]:
# Rename the case number and state columns (old name -> new name).
renamed_columns = {
    "CASE_NUM": "CASENUM",
    "STATE_CODE": "STATE",
}
cds_df_join = cds_df_join.rename(columns=renamed_columns)


In [18]:
# Keep the original text form in CRASH_DATE2, then parse CRASH_DATE into
# datetimes. The format matches strings such as "Tuesday, June 10, 2014".
crash_date_text = cds_df_join['CRASH_DATE']
cds_df_join['CRASH_DATE2'] = crash_date_text
cds_df_join['CRASH_DATE'] = pd.to_datetime(crash_date_text,
                                           format='%A, %B %d, %Y')

In [20]:
# Final column set, built from named groups; the concatenation order is the
# column order of the exported table.
key_columns = ['INCID_NO', 'CASENUM', 'NUM_OCC']
crash_class_columns = [
    'Non-Collision',
    'Collision with Other Motor Vehicle',
    'Collision with Fixed Object',
    'Collision with Pedestrian',
    'Collision with Bicycle',
    'Collision with Parked Motor Vehicle',
    'Collision with Railway Train',
    'Collision with Animal',
    'Collision with Other Object',
    'Collision with Unknown',
    'Other Accident Class',
]
injury_columns = [
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
]
crash_detail_columns = [
    'PARK_ALPHA',
    'STATE',
    'CRASH_DATE',
    'CRASH_TIME',
    'LATITUDE',
    'LONGITUDE',
    'FATALS',
    'INJURED',
]
output_columns = (key_columns + crash_class_columns
                  + injury_columns + crash_detail_columns)
cds_df_join = cds_df_join[output_columns]

In [21]:
# The row counts should be equal: the left join and the column selection must
# not add or drop crash rows.
(cds_df.shape, cds_df_join.shape)

((83926, 69), (83926, 28))

In [22]:
# Write the cleaned table to the working directory, without the index column.
cds_df_join.to_csv(path_or_buf="crash_data_CDS_clean.csv", index=False)