## Crash Data Wrangling Jupyter Notebook

**Author:** Smitha Mahesh 

**Date:** 6/27/2022 

**Purpose:** This notebook uses the new CDS files as input and produces a DataFrame that can be joined with other data sources.

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Folder that holds the CDS Excel exports. Set the CDS_DATA_DIR environment
# variable to run the notebook on another machine; when it is not set, the
# original location is used, so existing behaviour is unchanged.
myworkingdirectory = os.environ.get(
    "CDS_DATA_DIR", r"C:\Users\smitha.mahesh\Desktop\New CDS Excel Files"
)
# Relative paths in this notebook resolve against this folder from here on.
os.chdir(myworkingdirectory)

In [None]:
# Read the crash-level workbook first, then the passenger-level workbook,
# both from the current working directory.
cds_df, cds_df_passengers = (
    pd.read_excel(workbook_path)
    for workbook_path in ('./ALL_CRASH.xlsx', './ALL_PASSENGER.xlsx')
)

In [None]:
# Preview the first five rows of the crash table.
cds_df.head(n=5)

In [None]:
# Preview the first five rows of the passenger table.
cds_df_passengers.head(n=5)

In [10]:
# Frequency of each CRASH_CLASS code (missing values are excluded by default).
cds_df['CRASH_CLASS'].value_counts()

1.0     102007
2.0      47867
7.0      18945
0.0      14174
5.0       8176
88.0      5396
99.0      2858
4.0       1498
3.0       1438
98.0       822
10.0        34
6.0         32
Name: CRASH_CLASS, dtype: int64

In [11]:
# One-hot encode CRASH_CLASS: each listed code gets its own 0/1 indicator column.
crash_class_labels = {
    0: 'Non-Collision',
    1: 'Collision with Other Motor Vehicle',
    2: 'Collision with Fixed Object',
    3: 'Collision with Pedestrian',
    4: 'Collision with Bicycle',
    5: 'Collision with Parked Motor Vehicle',
    6: 'Collision with Railway Train',
    7: 'Collision with Animal',
    88: 'Collision with Other Object',
    99: 'Collision with Unknown',
}
crash_class = cds_df['CRASH_CLASS']
for class_code, indicator_name in crash_class_labels.items():
    cds_df[indicator_name] = np.where(crash_class.eq(class_code), 1, 0)

# Every row whose code is not listed above is flagged here.
# NOTE(review): rows with a missing CRASH_CLASS also land in this bucket,
# because a missing value is never a member of the code list — confirm intended.
cds_df['Other Accident Class'] = np.where(
    ~crash_class.isin(list(crash_class_labels)), 1, 0
)


In [12]:
# One-hot encode PASS_INJ: each listed code gets its own 0/1 indicator column.
injury_code_labels = {
    0: 'No Injury',
    1: 'Possible Injury',
    2: 'Non-incapacitating Injury',
    3: 'Incapacitating Injury',
    4: 'Fatality',
}
passenger_injury = cds_df_passengers['PASS_INJ']
for injury_code, indicator_name in injury_code_labels.items():
    cds_df_passengers[indicator_name] = np.where(
        passenger_injury.eq(injury_code), 1, 0
    )

# Codes 98 and 99 are pooled into a single "unknown" indicator.
cds_df_passengers['Unknown Injury'] = np.where(
    passenger_injury.isin([98, 99]), 1, 0
)
# Constant 1 per passenger row, so summing it per incident counts the rows.
cds_df_passengers['NUM_OCC'] = 1

In [13]:
# (rows, columns) of the passenger table, then of the crash table.
tuple(frame.shape for frame in (cds_df_passengers, cds_df))

((114151, 17), (204687, 67))

#### Data Quality Issues
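- Fatality totals disagree between the two tables: `FATALS` in the crash table sums to 1,206, while only 298 passenger rows are coded as a fatality (see the next cell). This may simply reflect who is recorded in the passenger table — to be confirmed.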

In [14]:
# Compare the crash-level fatality total with the passenger-level fatality count.
cds_df.FATALS.sum(), cds_df_passengers.Fatality.sum()

(1206.0, 298)

In [15]:
# Keep only the incident key and the numeric indicator columns, so the
# per-incident aggregation has nothing else to sum.
passenger_keep_columns = [
    'INCID_NO',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
    'NUM_OCC',
]
cds_df_passengers_slim = cds_df_passengers.loc[:, passenger_keep_columns]

In [16]:
# Collapse passenger rows to one row per incident by summing the indicator
# columns, then turn INCID_NO back into an ordinary column for merging.
cds_df_passengers_agg = (
    cds_df_passengers_slim
    .groupby('INCID_NO')
    .sum()
    .reset_index()
)

In [17]:
# (rows, columns) of the aggregated passenger, raw passenger and crash tables.
tuple(
    frame.shape
    for frame in (cds_df_passengers_agg, cds_df_passengers, cds_df)
)

((60136, 8), (114151, 17), (204687, 67))

In [18]:
# Left join keeps every crash; the `_merge` indicator column records whether
# a crash found matching passenger totals ("both") or not ("left_only").
cds_df_join = pd.merge(
    left=cds_df,
    right=cds_df_passengers_agg,
    how='left',
    on='INCID_NO',
    indicator=True,
)

In [19]:
# Count how many crashes did and did not match a passenger aggregate.
cds_df_join['_merge'].value_counts()

left_only     144551
both           60136
right_only         0
Name: _merge, dtype: int64

In [20]:
# NOTE(review): hard-coded ratio. Neither 26734 nor 83926 matches the merge
# counts recorded in this notebook (60136 matched out of 204687 crashes) —
# confirm what this was meant to measure, or derive it from the data.
26734/83926

0.31854252555822987

In [21]:
# Rename the case-number and state columns to the shorter names used in the
# final column selection.
column_renames = {
    "CASE_NUM": "CASENUM",
    "STATE_CODE": "STATE",
}
cds_df_join = cds_df_join.rename(columns=column_renames)


In [22]:
# Keep the unparsed values in CRASH_DATE2, then replace CRASH_DATE with parsed
# timestamps. The format matches text such as "Monday, June 27, 2022".
raw_crash_dates = cds_df_join['CRASH_DATE']
cds_df_join['CRASH_DATE2'] = raw_crash_dates
cds_df_join['CRASH_DATE'] = pd.to_datetime(
    raw_crash_dates,
    format='%A, %B %d, %Y',
)

In [23]:
# Final output layout: keys, crash-class indicators, injury indicators, then
# location/time/severity fields. Order is preserved exactly.
output_columns = (
    ['INCID_NO', 'CASENUM', 'NUM_OCC']
    + [
        'Non-Collision',
        'Collision with Other Motor Vehicle',
        'Collision with Fixed Object',
        'Collision with Pedestrian',
        'Collision with Bicycle',
        'Collision with Parked Motor Vehicle',
        'Collision with Railway Train',
        'Collision with Animal',
        'Collision with Other Object',
        'Collision with Unknown',
        'Other Accident Class',
    ]
    + [
        'No Injury',
        'Possible Injury',
        'Non-incapacitating Injury',
        'Incapacitating Injury',
        'Fatality',
        'Unknown Injury',
    ]
    + [
        'PARK_ALPHA',
        'STATE',
        'CRASH_DATE',
        'CRASH_TIME',
        'LATITUDE',
        'LONGITUDE',
        'FATALS',
        'INJURED',
    ]
)
cds_df_join = cds_df_join.loc[:, output_columns]

In [24]:
# Row counts should agree: the left join must not add or drop crashes.
tuple(frame.shape for frame in (cds_df, cds_df_join))

((204687, 67), (204687, 28))

In [25]:
# Write the cleaned, joinable crash table to the working directory without
# the DataFrame index.
cds_df_join.to_csv(path_or_buf="crash_data_CDS_clean.csv", index=False)