## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin

**Date:** 11/3/21

**Purpose:** This notebook will combine STARS data with CDS data

In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
# Directory that holds the STARS Excel exports. Every relative path used in this
# notebook (the "./STARS_all_*.xlsx" reads and the CSV written at the end)
# resolves against it.
# Set the STARS_DATA_DIR environment variable to run on another machine; when it
# is unset or empty the original location is used, so existing runs are unchanged.
myworkingdirectory = (
    os.environ.get("STARS_DATA_DIR")
    or r"C:\Users\smitha.mahesh\Desktop\STARS"
)
os.chdir(myworkingdirectory)

In [None]:
# Load the unit, passenger and accident workbooks (in that order) from the
# current working directory.
stars_unit, stars_passenger, stars_accident = (
    pd.read_excel(workbook)
    for workbook in (
        "./STARS_all_Unit.xlsx",
        "./STARS_all_Passenger.xlsx",
        "./STARS_all_Accident.xlsx",
    )
)



In [None]:
# Show (rows, columns) for the unit, passenger and accident tables, in that order.
tuple(table.shape for table in (stars_unit, stars_passenger, stars_accident))

In [None]:
# List the column names available in the unit table.
stars_unit.columns

In [None]:
# List the column names available in the passenger table.
stars_passenger.columns

In [None]:
# List the column names available in the accident table.
stars_accident.columns

## Summary of Injury Codes

- 00 No Injury
- 01 Possible Injury
- 02 Non-incapacitating Injury
- 03 Incapacitating Injury
- 04 Fatal
- 99 Unknown

## Relevant Injury Columns

- all_Accident: FATALS, INJURED, PED_FAT, PED_INJ
- all_Passenger: PINJ (assuming passenger injury/fatality)
- all_Unit: DINJ (assuming driver injury/fatality)

In [None]:
# Frequency of each PINJ code in the raw passenger table.
stars_passenger['PINJ'].value_counts()

In [None]:
# Frequency of each DINJ code in the raw unit table.
stars_unit['DINJ'].value_counts()

## Fatality Count by Table

- all_Passenger: 195
- all_Unit: 610

### Create New Columns
- Passenger/Driver Injury Severity
- Crash Class

In [None]:
# Keep only the unit columns carried forward: the incident key plus NUM_OCC,
# PED and the DINJ injury code.
# .copy() makes this an independent frame, so the indicator columns added to it
# later are plain column assignments rather than writes to a subset of stars_unit
# (the pattern pandas reports with SettingWithCopyWarning).
stars_unit_slim = stars_unit[['INCID_NO', 'NUM_OCC', 'PED', 'DINJ']].copy()


In [None]:
# Keep only the accident columns carried forward: the incident key, the
# park/state/date/time attributes and the ACCLASS accident-class code.
# .copy() makes this an independent frame, so the indicator columns added to it
# later are plain column assignments rather than writes to a subset of
# stars_accident (the pattern pandas reports with SettingWithCopyWarning).
stars_accident_slim = stars_accident[
    ['INCID_NO', 'PARKCODE', 'STATE', 'ACCDATE', 'TIMEACC', 'ACCLASS']
].copy()


In [None]:
# Keep only the incident key and the PINJ injury code from the passenger table.
# .copy() makes this an independent frame, so the indicator columns added to it
# later are plain column assignments rather than writes to a subset of
# stars_passenger (the pattern pandas reports with SettingWithCopyWarning).
stars_passenger_slim = stars_passenger[['INCID_NO', 'PINJ']].copy()


In [None]:
# Show (rows, columns) for the slim accident, unit and passenger frames, in that order.
tuple(
    slim.shape
    for slim in (stars_accident_slim, stars_unit_slim, stars_passenger_slim)
)

In [None]:
# Silence pandas' chained-assignment warning for the column writes below.
pd.options.mode.chained_assignment = None  # default='warn'

# Injury code -> label, in the order the indicator columns are created.
injury_labels_by_code = {
    0: 'No Injury',
    1: 'Possible Injury',
    2: 'Non-incapacitating Injury',
    3: 'Incapacitating Injury',
    4: 'Fatality',
    99: 'Unknown Injury',
}

# One 0/1 indicator column per injury code: passenger rows are flagged from PINJ,
# unit rows from DINJ. Passenger columns are added first, then driver columns.
for frame, code_column, role in (
    (stars_passenger_slim, 'PINJ', 'Passenger'),
    (stars_unit_slim, 'DINJ', 'Driver'),
):
    for injury_code, injury_label in injury_labels_by_code.items():
        frame[injury_label + ' - ' + role] = np.where(
            frame[code_column] == injury_code, 1, 0
        )

In [None]:
# ACCLASS code -> indicator column name, in the order the columns are created.
accident_class_labels = {
    0: 'Non-Collision',
    1: 'Collision with Other Motor Vehicle',
    2: 'Collision with Fixed Object',
    3: 'Collision with Pedestrian',
    4: 'Collision with Bicycle',
    5: 'Collision with Parked Motor Vehicle',
    6: 'Collision with Railway Train',
    7: 'Collision with Animal',
    88: 'Collision with Other Object',
    99: 'Collision with Unknown',
}

# One 0/1 indicator column per recognised accident class.
for class_code, class_label in accident_class_labels.items():
    stars_accident_slim[class_label] = np.where(
        stars_accident_slim['ACCLASS'] == class_code, 1, 0
    )

# Anything that is not one of the codes above (missing values included) is
# flagged as "Other Accident Class".
stars_accident_slim['Other Accident Class'] = np.where(
    ~stars_accident_slim['ACCLASS'].isin(list(accident_class_labels)), 1, 0
)


In [None]:
# Frequency of each PINJ code in the slim passenger frame.
stars_passenger_slim['PINJ'].value_counts()

### Aggregate Datasets

In [None]:
# Collapse the passenger rows to one row per incident by summing every column,
# then restore INCID_NO as an ordinary column.
stars_passenger_slim_agg = (
    stars_passenger_slim
    .groupby(by=['INCID_NO'])
    .sum()
    .reset_index()
)
# Display the total passenger fatalities after aggregation.
stars_passenger_slim_agg['Fatality - Passenger'].sum()

In [None]:
# Collapse the unit rows to one row per incident by summing every column,
# then restore INCID_NO as an ordinary column.
stars_unit_slim_agg = (
    stars_unit_slim
    .groupby(by=['INCID_NO'])
    .sum()
    .reset_index()
)
# Display the total driver fatalities after aggregation.
stars_unit_slim_agg['Fatality - Driver'].sum()

In [None]:
# Count how many accident rows have the Non-Collision flag set to 0 versus 1.
stars_accident_slim['Non-Collision'].value_counts()

In [None]:
# Collapse the accident table to one row per incident, summing the class flags.
# numeric_only=True limits the sum to numeric columns. The slim accident frame
# also carries PARKCODE, STATE, ACCDATE and TIMEACC, which look like text/date
# fields (ACCDATE is parsed as a date later in this notebook). Without the flag,
# pandas >= 2.0 tries to sum those as well (concatenating strings and raising
# TypeError on datetimes), whereas older releases dropped them silently.
stars_accident_slim_agg = (
    stars_accident_slim
    .groupby(by=['INCID_NO'])
    .sum(numeric_only=True)
    .reset_index()
)
# Display the number of non-collision records after aggregation.
stars_accident_slim_agg['Non-Collision'].sum()

In [None]:
# Drop fully duplicated rows from each aggregated frame (unit, passenger, accident).
(
    stars_unit_slim_agg_nodups,
    stars_passenger_slim_agg_nodups,
    stars_accident_slim_agg_nodups,
) = (
    aggregated.drop_duplicates()
    for aggregated in (
        stars_unit_slim_agg,
        stars_passenger_slim_agg,
        stars_accident_slim_agg,
    )
)

In [None]:
# Show (rows, columns) for the de-duplicated accident, unit and passenger frames.
tuple(
    deduped.shape
    for deduped in (
        stars_accident_slim_agg_nodups,
        stars_unit_slim_agg_nodups,
        stars_passenger_slim_agg_nodups,
    )
)

In [None]:
# Show (passenger fatalities, driver fatalities) after de-duplication.
(
    stars_passenger_slim_agg_nodups['Fatality - Passenger'].sum(),
    stars_unit_slim_agg_nodups['Fatality - Driver'].sum(),
)

In [None]:
# Start from one row per accident and left-join the per-incident unit totals,
# then the per-incident passenger totals. The first join also records its match
# status in a `_merge` column (indicator=True).
stars_combined_df = (
    stars_accident_slim_agg_nodups
    .merge(stars_unit_slim_agg_nodups, on="INCID_NO", how='left', indicator=True)
    .merge(stars_passenger_slim_agg_nodups, on=["INCID_NO"], how='left')
)

In [None]:
# Show (passenger fatalities, driver fatalities) in the combined frame.
tuple(
    stars_combined_df[fatality_column].sum()
    for fatality_column in ('Fatality - Passenger', 'Fatality - Driver')
)

### Clean Columns for Final DataFrame

In [None]:
# Incidents with no matching unit or passenger rows come out of the left joins
# with NaN in the per-severity counts; treat those as zero.
# The filled columns are assigned back instead of calling fillna(inplace=True)
# on a column selection. That form is chained assignment: pandas 2.x warns about
# it, and under Copy-on-Write it no longer updates the parent frame at all,
# which would leave the NaNs in place.
severity_count_columns = [
    'No Injury - Passenger',
    'Possible Injury - Passenger',
    'Non-incapacitating Injury - Passenger',
    'Incapacitating Injury - Passenger',
    'Fatality - Passenger',
    'Unknown Injury - Passenger',
    'No Injury - Driver',
    'Possible Injury - Driver',
    'Non-incapacitating Injury - Driver',
    'Incapacitating Injury - Driver',
    'Fatality - Driver',
    'Unknown Injury - Driver',
]
stars_combined_df[severity_count_columns] = (
    stars_combined_df[severity_count_columns].fillna(0)
)

In [None]:
# Total count per severity level = passenger count + driver count.
for severity in (
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
):
    stars_combined_df[severity] = (
        stars_combined_df[severity + ' - Passenger']
        + stars_combined_df[severity + ' - Driver']
    )



In [None]:
# Display the total of the combined Fatality column (passenger + driver).
stars_combined_df['Fatality'].sum()

In [None]:
# Columns kept for the final table: incident key, NUM_OCC, the accident-class
# flags and the combined per-severity totals.
crash_level_columns = [
    'INCID_NO',
    'NUM_OCC',
    'Non-Collision',
    'Collision with Other Motor Vehicle',
    'Collision with Fixed Object',
    'Collision with Pedestrian',
    'Collision with Bicycle',
    'Collision with Parked Motor Vehicle',
    'Collision with Railway Train',
    'Collision with Animal',
    'Collision with Other Object',
    'Collision with Unknown',
    'Other Accident Class',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
]
stars_final_df = stars_combined_df[crash_level_columns]

In [None]:
# Re-attach the descriptive accident attributes from the raw accident table.
# NOTE(review): if INCID_NO is not unique in stars_accident, this left join
# repeats the per-incident rows (and their counts) — confirm the key is unique.
accident_attributes = stars_accident[
    ['INCID_NO', 'PARKCODE', 'STATE', 'ACCDATE', 'TIMEACC', 'ACCLASS']
]
stars_final_df = stars_final_df.merge(
    accident_attributes, how='left', on='INCID_NO'
)


In [None]:
# Add a CASENUM column set to 0 on every row.
# NOTE(review): presumably a placeholder so the output lines up with another
# dataset's schema — confirm.
stars_final_df['CASENUM']=0

In [None]:
# Display (rows, columns) of the final table.
stars_final_df.shape

In [None]:
# Display the total of the Fatality column in the final table.
stars_final_df['Fatality'].sum()

In [None]:
# Rename the STARS date/time/park columns to the output names.
output_column_names = {
    "ACCDATE": "CRASH_DATE",
    "TIMEACC": "CRASH_TIME",
    'PARKCODE': 'PARK_ALPHA',
}
stars_final_df = stars_final_df.rename(columns=output_column_names)

# Add LATITUDE and LONGITUDE columns, both set to 0 on every row.
for coordinate_column in ('LATITUDE', 'LONGITUDE'):
    stars_final_df[coordinate_column] = 0


In [None]:
# Convert CRASH_DATE to pandas datetimes, interpreting values as YYYY-MM-DD.
parsed_crash_dates = pd.to_datetime(
    stars_final_df['CRASH_DATE'], format="%Y-%m-%d"
)
stars_final_df['CRASH_DATE'] = parsed_crash_dates


In [None]:
# List the column names of the final table before reordering.
stars_final_df.columns

In [None]:
# Final column order for the exported file. Columns not listed here (for
# example ACCLASS) are dropped.
final_column_order = [
    'INCID_NO',
    'CASENUM',
    'NUM_OCC',
    'Non-Collision',
    'Collision with Other Motor Vehicle',
    'Collision with Fixed Object',
    'Collision with Pedestrian',
    'Collision with Bicycle',
    'Collision with Parked Motor Vehicle',
    'Collision with Railway Train',
    'Collision with Animal',
    'Collision with Other Object',
    'Collision with Unknown',
    'Other Accident Class',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
    'PARK_ALPHA',
    'STATE',
    'CRASH_DATE',
    'CRASH_TIME',
    'LATITUDE',
    'LONGITUDE',
]
stars_final_df = stars_final_df[final_column_order]

In [None]:
# Write the final table to CSV without the index column. The relative path
# resolves against the working directory set at the top of the notebook.
stars_final_df.to_csv("crash_data_STARS_clean.csv", index=False)