## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin

**Date:** 11/3/21

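**Purpose:** This notebook cleans the STARS crash data (accident, unit, and passenger tables) and aggregates it to one row per incident so that it can be combined with CDS data. The result is written to `crash_data_STARS_clean.csv`.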

In [1]:
import pandas as pd
import numpy as np
import os



In [2]:
# Project root that contains the ./data folder read below.
# Defaults to the original author's local path; set the TSP_WORKDIR
# environment variable to run the notebook on another machine without
# editing this cell.
myworkingdirectory = os.environ.get("TSP_WORKDIR", r"C:\Users\eric.englin\Desktop\TSP")
os.chdir(myworkingdirectory)

In [3]:
# Load the three STARS tables (unit, passenger, accident) from the ./data
# folder, in that order. Paths are relative to the working directory set above.
stars_unit, stars_passenger, stars_accident = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "./data/STARS_all_Unit.xlsx",
        "./data/STARS_all_Passenger.xlsx",
        "./data/STARS_all_Accident.xlsx",
    )
)



In [4]:
# (rows, columns) of each raw table, as a quick check that the load worked.
stars_unit.shape, stars_passenger.shape, stars_accident.shape

((189023, 33), (64792, 13), (120762, 48))

In [137]:
# Columns available in the unit table.
stars_unit.columns

Index(['INCID_NO', 'UNITU', 'CASENUM', 'CASEUNIT', 'PRKCODEU', 'YEAR',
       'MAKEMOD', 'MODEL', 'NUM_OCC', 'REGSTATE', 'REGYEAR', 'DIR_TRAV',
       'SPEED_LMT', 'BOD_TYPE', 'VEH_MANVR', 'VEH_DAMG', 'DAM_LOCTN',
       'LICSTATE', 'PED', 'BRTH_DATE', 'DSEX', 'DBELT', 'DEJCT', 'DINJ',
       'DVIOLTN', 'VIOLCHG1', 'VIOLCHG2', 'PED_TYPE', 'PED_LOC', 'PED_ACTN',
       'REPAIR', 'ACC_YEAR', 'TOWED'],
      dtype='object')

In [138]:
# Columns available in the passenger table.
stars_passenger.columns

Index(['INCID_NO', 'CASENUM', 'UNITNUM', 'PASS_SEQ', 'CASEUNIT', 'PSEX',
       'PBELT', 'PEJCT', 'PSEAT', 'PINJ', 'PARKCODE', 'PASSAGE', 'ACC_YEAR'],
      dtype='object')

In [139]:
# Columns available in the accident table.
stars_accident.columns

Index(['INCID_NO', 'incid', 'CASENUM', 'PARKCODE', 'STATE', 'ACCDATE',
       'TIMEACC', 'ROUTENUM', 'ROADNAME', 'NODISTFT', 'NODISMI', 'NODEDIR',
       'NODENUM', 'DSCUSE', 'LIGHT', 'WEATHER', 'ACCLOCTN', 'SURFCOND',
       'ACCLASS', 'VEHCOLL', 'OBJSTRUK', 'ROADCHAR', 'CONFACT1', 'CONFACT2',
       'CONFACT3', 'CONFACT4', 'CONFACT5', 'CONFACT6', 'HIT_RUN', 'CATEGORY',
       'FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT', 'BIK_INJ', 'PED',
       'ACC_YEAR', 'parknum', 'comments', 'TIFF_FILE', 'zipfile', 'folder',
       'tiffnumb', 'roadname_updated', 'top60ish', 'StudyNode',
       'AdjacentNode'],
      dtype='object')

## Summary of Injury Codes

- 00 No Injury
- 01 Possible Injury
- 02 Non-incapacitating Injury
- 03 Incapacitating Injury
- 04 Fatal
- 99 Unknown

## Relevant Injury Columns

- all_Accident: FATALS, INJURED, PED_FAT, PED_INJ
- all_Passenger: PINJ (assumed to be the passenger injury/fatality code)
- all_Unit: DINJ (assumed to be the driver injury/fatality code)

In [5]:
# Frequency of each passenger injury code (value_counts omits missing values by default).
stars_passenger.PINJ.value_counts()

0.0     54974
1.0      3890
2.0      3579
3.0      1345
99.0      791
4.0       195
Name: PINJ, dtype: int64

In [6]:
# Frequency of each driver injury code (value_counts omits missing values by default).
stars_unit.DINJ.value_counts()

0.0     161803
1.0       9993
2.0       8922
99.0      4072
3.0       3620
4.0        610
Name: DINJ, dtype: int64

## Fatality Count by Table

- all_Passenger: 195 (rows with PINJ = 4)
- all_Unit: 610 (rows with DINJ = 4)

### Create New Columns
- Passenger/Driver Injury Severity
- Crash Class

In [84]:
# Keep only the unit-level columns used downstream.
# .copy() gives an independent frame, so the indicator columns assigned to it
# later are not flagged by pandas as chained assignment on a slice of stars_unit.
stars_unit_slim = stars_unit[['INCID_NO', 'NUM_OCC', 'PED', 'DINJ']].copy()


In [85]:
# Keep only the accident-level columns used downstream.
# .copy() gives an independent frame, so the indicator columns assigned to it
# later are not flagged by pandas as chained assignment on a slice of stars_accident.
stars_accident_slim = stars_accident[['INCID_NO', 'PARKCODE', 'STATE', 'ACCDATE',
                                       'TIMEACC', 'ACCLASS']].copy()


In [86]:
# Keep only the incident key and the passenger injury code.
# .copy() gives an independent frame, so the indicator columns assigned to it
# later are not flagged by pandas as chained assignment on a slice of stars_passenger.
stars_passenger_slim = stars_passenger[['INCID_NO', 'PINJ']].copy()


In [87]:
# (rows, columns) of the slimmed tables.
# NOTE(review): the saved output reports 7 columns for stars_accident_slim, but
# the selection in this notebook keeps 6; the output looks like it came from an
# earlier version of that cell. Re-run to refresh.
stars_accident_slim.shape, stars_unit_slim.shape, stars_passenger_slim.shape

((120762, 7), (189023, 4), (64792, 2))

In [88]:
# Turn off pandas' chained-assignment warning for the column assignments
# below (the pandas default for this option is 'warn').
pd.options.mode.chained_assignment = None

# Injury code -> column label, following the "Summary of Injury Codes" list.
injury_labels = {
    0: 'No Injury',
    1: 'Possible Injury',
    2: 'Non-incapacitating Injury',
    3: 'Incapacitating Injury',
    4: 'Fatality',
    99: 'Unknown Injury',
}

# One 0/1 indicator column per injury code. A row whose code is missing or
# not listed gets 0 in every indicator.
for injury_code, injury_label in injury_labels.items():
    stars_passenger_slim[injury_label + ' - Passenger'] = np.where(
        stars_passenger_slim['PINJ'] == injury_code, 1, 0
    )

for injury_code, injury_label in injury_labels.items():
    stars_unit_slim[injury_label + ' - Driver'] = np.where(
        stars_unit_slim['DINJ'] == injury_code, 1, 0
    )

In [89]:
# ACCLASS code -> indicator column name.
accident_class_labels = {
    0: 'Non-Collision',
    1: 'Collision with Other Motor Vehicle',
    2: 'Collision with Fixed Object',
    3: 'Collision with Pedestrian',
    4: 'Collision with Bicycle',
    5: 'Collision with Parked Motor Vehicle',
    6: 'Collision with Railway Train',
    7: 'Collision with Animal',
    88: 'Collision with Other Object',
    99: 'Collision with Unknown',
}

# One 0/1 indicator column per listed accident class.
for class_code, class_label in accident_class_labels.items():
    stars_accident_slim[class_label] = np.where(
        stars_accident_slim['ACCLASS'] == class_code, 1, 0
    )

# Catch-all: 1 when ACCLASS is missing or is not one of the codes above.
has_listed_class = stars_accident_slim['ACCLASS'].isin(list(accident_class_labels))
stars_accident_slim['Other Accident Class'] = np.where(has_listed_class, 0, 1)


In [90]:
# Passenger injury code frequencies in the slim table, for comparison with the raw table.
stars_passenger_slim.PINJ.value_counts()

0.0     54974
1.0      3890
2.0      3579
3.0      1345
99.0      791
4.0       195
Name: PINJ, dtype: int64

### Aggregate Datasets

In [91]:
# Collapse the passenger table to one row per incident by summing every
# column within each INCID_NO, then restore INCID_NO as a regular column.
stars_passenger_slim_agg = (
    stars_passenger_slim
    .groupby(by=['INCID_NO'])
    .sum()
    .reset_index()
)
# Total passenger fatalities after aggregation.
stars_passenger_slim_agg['Fatality - Passenger'].sum()

195

In [92]:
# Collapse the unit table to one row per incident by summing every column
# within each INCID_NO, then restore INCID_NO as a regular column.
stars_unit_slim_agg = (
    stars_unit_slim
    .groupby(by=['INCID_NO'])
    .sum()
    .reset_index()
)
# Total driver fatalities after aggregation.
stars_unit_slim_agg['Fatality - Driver'].sum()

610

In [93]:
# Number of accident rows flagged Non-Collision (1) versus everything else (0).
stars_accident_slim['Non-Collision'].value_counts()

0    111410
1      9352
Name: Non-Collision, dtype: int64

In [94]:
# Collapse the accident table to one row per incident by summing the numeric
# columns within each INCID_NO, then restore INCID_NO as a regular column.
#
# numeric_only=True is required because stars_accident_slim also carries
# non-numeric columns selected earlier (e.g. PARKCODE, STATE, ACCDATE). Older
# pandas dropped such columns from sum() silently; pandas >= 2.0 no longer
# does, and would concatenate strings or raise instead. Passing the flag
# keeps the numeric-only behaviour on every version.
stars_accident_slim_agg = (
    stars_accident_slim
    .groupby(by=['INCID_NO'])
    .sum(numeric_only=True)
    .reset_index()
)
# Total Non-Collision incidents after aggregation.
stars_accident_slim_agg['Non-Collision'].sum()

9352

In [95]:
# Remove fully duplicated rows from each aggregated table.
(
    stars_unit_slim_agg_nodups,
    stars_passenger_slim_agg_nodups,
    stars_accident_slim_agg_nodups,
) = (
    aggregated.drop_duplicates()
    for aggregated in (
        stars_unit_slim_agg,
        stars_passenger_slim_agg,
        stars_accident_slim_agg,
    )
)

In [96]:
# (rows, columns) of each de-duplicated, per-incident table.
stars_accident_slim_agg_nodups.shape, stars_unit_slim_agg_nodups.shape, stars_passenger_slim_agg_nodups.shape

((120762, 14), (119240, 9), (33402, 8))

In [100]:
# Passenger and driver fatality totals after de-duplication.
stars_passenger_slim_agg_nodups['Fatality - Passenger'].sum(), stars_unit_slim_agg_nodups['Fatality - Driver'].sum()

(195, 610)

In [109]:
# Start from the accident table and left-join the unit and passenger
# aggregates on INCID_NO, so every accident row is kept. indicator=True on
# the first join adds a `_merge` column recording whether a unit row matched.
stars_combined_df = (
    stars_accident_slim_agg_nodups
    .merge(stars_unit_slim_agg_nodups, on="INCID_NO", how='left', indicator=True)
    .merge(stars_passenger_slim_agg_nodups, on=["INCID_NO"], how='left')
)

In [120]:
# Passenger and driver fatality totals in the merged table (sum() skips missing values by default).
stars_combined_df['Fatality - Passenger'].sum(), stars_combined_df['Fatality - Driver'].sum()

(195.0, 610.0)

###  Clean Columns for final dataframe

In [129]:
# The left merges leave NaN in the injury-count columns for incidents that
# have no matching unit or passenger rows. Replace those with 0 so the
# passenger + driver totals computed next are not NaN.
#
# The result is assigned back instead of calling
# `df[col].fillna(0, inplace=True)`: that form modifies a temporary Series,
# which pandas warns about and which does not update the DataFrame at all
# once Copy-on-Write is enabled.
injury_count_columns = [
    'No Injury - Passenger',
    'Possible Injury - Passenger',
    'Non-incapacitating Injury - Passenger',
    'Incapacitating Injury - Passenger',
    'Fatality - Passenger',
    'Unknown Injury - Passenger',
    'No Injury - Driver',
    'Possible Injury - Driver',
    'Non-incapacitating Injury - Driver',
    'Incapacitating Injury - Driver',
    'Fatality - Driver',
    'Unknown Injury - Driver',
]
stars_combined_df[injury_count_columns] = stars_combined_df[injury_count_columns].fillna(0)

In [130]:
# For each severity level, add the passenger count and the driver count to
# get a per-incident total stored under the bare severity name.
for severity in (
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
):
    passenger_count = stars_combined_df[severity + ' - Passenger']
    driver_count = stars_combined_df[severity + ' - Driver']
    stars_combined_df[severity] = passenger_count + driver_count



In [132]:
# Total fatalities across all incidents (passenger + driver).
stars_combined_df['Fatality'].sum()

805.0

In [145]:
# Keep the incident key, occupant count, accident-class indicators and the
# combined injury totals; the per-role injury columns and `_merge` are dropped.
final_columns = [
    'INCID_NO',
    'NUM_OCC',
    'Non-Collision',
    'Collision with Other Motor Vehicle',
    'Collision with Fixed Object',
    'Collision with Pedestrian',
    'Collision with Bicycle',
    'Collision with Parked Motor Vehicle',
    'Collision with Railway Train',
    'Collision with Animal',
    'Collision with Other Object',
    'Collision with Unknown',
    'Other Accident Class',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
]
stars_final_df = stars_combined_df[final_columns]

In [146]:
# Re-attach the descriptive accident attributes (park, state, date, time,
# class) from the raw accident table, matched on INCID_NO.
accident_attributes = stars_accident[
    ['INCID_NO', 'PARKCODE', 'STATE', 'ACCDATE', 'TIMEACC', 'ACCLASS']
]
stars_final_df = stars_final_df.merge(accident_attributes, how='left', on='INCID_NO')


In [147]:
# Add a CASENUM column filled with the constant 0.
# NOTE(review): the raw accident table has its own CASENUM column that is not
# carried through; presumably 0 is a placeholder for the output schema. Confirm.
stars_final_df['CASENUM']=0

In [148]:
# (rows, columns) of the final table; the row count should equal the number of accident records.
stars_final_df.shape

(120762, 25)

In [149]:
# Total fatalities in the final table, to compare with the total computed on the merged table.
stars_final_df['Fatality'].sum()

805.0

In [150]:
# Rename the date, time and park columns, then append LATITUDE and LONGITUDE
# columns filled with the constant 0.
column_renames = {
    "ACCDATE": "CRASH_DATE",
    "TIMEACC": "CRASH_TIME",
    'PARKCODE': 'PARK_ALPHA',
}
stars_final_df = (
    stars_final_df
    .rename(columns=column_renames)
    .assign(LATITUDE=0, LONGITUDE=0)
)


In [151]:
# Convert CRASH_DATE to pandas datetimes, parsing with the year-month-day format.
stars_final_df = stars_final_df.assign(
    CRASH_DATE=lambda frame: pd.to_datetime(frame['CRASH_DATE'], format="%Y-%m-%d")
)


In [152]:
# Columns of the final table after the renames and added columns.
stars_final_df.columns

Index(['INCID_NO', 'NUM_OCC', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'PARK_ALPHA', 'STATE', 'CRASH_DATE',
       'CRASH_TIME', 'ACCLASS', 'CASENUM', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [155]:
# Put the columns in their export order: identifiers first, then the
# accident-class indicators, injury totals, and location/date fields.
# ACCLASS is not listed, so it is dropped here.
output_columns = [
    'INCID_NO',
    'CASENUM',
    'NUM_OCC',
    'Non-Collision',
    'Collision with Other Motor Vehicle',
    'Collision with Fixed Object',
    'Collision with Pedestrian',
    'Collision with Bicycle',
    'Collision with Parked Motor Vehicle',
    'Collision with Railway Train',
    'Collision with Animal',
    'Collision with Other Object',
    'Collision with Unknown',
    'Other Accident Class',
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
    'PARK_ALPHA',
    'STATE',
    'CRASH_DATE',
    'CRASH_TIME',
    'LATITUDE',
    'LONGITUDE',
]
stars_final_df = stars_final_df[output_columns]

In [156]:
# Write the cleaned STARS table to CSV without the index; the relative path
# resolves against the working directory set with os.chdir earlier.
stars_final_df.to_csv("crash_data_STARS_clean.csv", index=False)