## Crash Data Wrangling Jupyter Notebook

**Author:** Eric Englin

**Date:** 11/3/21

**Purpose:** This notebook will combine STARS data with CDS data

In [3]:
import pyodbc
import pandas as pd
import numpy as np
import os

In [5]:
myworkingdirectory = r"C:\Users\eric.englin\Desktop\TSP"
os.chdir(myworkingdirectory)

In [6]:
stars_unit = pd.read_excel("./data/STARS_all_Unit.xlsx")
stars_unit.head()

Unnamed: 0,INCID_NO,UNITU,CASENUM,CASEUNIT,PRKCODEU,YEAR,MAKEMOD,MODEL,NUM_OCC,REGSTATE,...,DINJ,DVIOLTN,VIOLCHG1,VIOLCHG2,PED_TYPE,PED_LOC,PED_ACTN,REPAIR,ACC_YEAR,TOWED
0,ABLI9000000001,1,5540000001,,ABLI,89.0,CHEVROLET CHEYENNE,,0.0,US,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1990.0,
1,ABLI9000000001,2,5540000001,,ABLI,0.0,UNKNOWN,,0.0,,...,99.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,1990.0,
2,ABLI9100000001,1,5540910001,554091.0,ABLI,90.0,CHEVROLET 1/2 TON PU,,2.0,US,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400.0,1991.0,
3,ACAD0000000003,1,1700000003,,ACAD,99.0,PLYMOUTH BREEZE SDN,,1.0,US,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600.0,2000.0,
4,ACAD0000000014,1,1700000014,,ACAD,0.0,PONTIAC GRAN AM,,2.0,NY,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5000.0,2000.0,


In [7]:
stars_passenger = pd.read_excel("./data/STARS_all_Passenger.xlsx")
stars_passenger.head()

Unnamed: 0,INCID_NO,CASENUM,UNITNUM,PASS_SEQ,CASEUNIT,PSEX,PBELT,PEJCT,PSEAT,PINJ,PARKCODE,PASSAGE,ACC_YEAR
0,CUGA9900000236,5230990236,1.0,1.0,,1.0,1.0,0.0,3.0,0.0,CUGA,21.0,
1,CUGA9900000329,5230990329,1.0,1.0,,1.0,1.0,0.0,3.0,1.0,CUGA,17.0,
2,CUGA9900000331,5230990331,1.0,1.0,,1.0,1.0,0.0,3.0,0.0,CUGA,17.0,
3,PRWI0100000018,3700010018,1.0,1.0,,1.0,1.0,0.0,3.0,0.0,PRWI,25.0,
4,PRWI0200000167,3700020167,1.0,1.0,,2.0,0.0,0.0,3.0,2.0,PRWI,20.0,


In [8]:
stars_accident = pd.read_excel("./data/STARS_all_Accident.xlsx")
stars_accident.head()

Unnamed: 0,INCID_NO,incid,CASENUM,PARKCODE,STATE,ACCDATE,TIMEACC,ROUTENUM,ROADNAME,NODISTFT,...,parknum,comments,TIFF_FILE,zipfile,folder,tiffnumb,roadname_updated,top60ish,StudyNode,AdjacentNode
0,ABLI9000000001,,5540000001,ABLI,KY,1990-08-05,1115.0,0.0,VISITOR CENTER RD,70.0,...,,,,,,,,,,
1,ABLI9100000001,,5540910001,ABLI,KY,1991-08-02,1400.0,400.0,MEMORIAL BUILDING SE,100.0,...,,,,,,,,,,
2,ACAD0000000003,top,1700000003,ACAD,ME,2000-01-27,800.0,233.0,DERMOT HOUSE DRIVEWAY,0.0,...,,,,,,,,T60VM,,
3,ACAD0000000014,top,1700000014,ACAD,ME,2000-04-17,1310.0,300.0,PARK LOOP RD,130.0,...,,,,,,,,T60VM,,
4,ACAD0000000026,top,1700000026,ACAD,ME,2000-03-26,1840.0,301.0,SCHOODIC OUTBOUND,150.0,...,,,,,,,,T60VM,,


In [9]:
stars_unit.shape, stars_passenger.shape, stars_accident.shape

((189023, 33), (64792, 13), (120762, 48))

In [10]:
64792+189023

253815

## Summary of Injury Codes

- 00 No Injury
- 01 Possible Injury
- 02 Non-incapacitating Injury
- 03 Incapacitating Injury
- 04 Fatal
- 99 Unknown

## Relevant Injury Columns

- all_Accident: FATALS, INJURED, PED_FAT, PED_INJ
- all_Passenger: PINJ (assuming passenger injury/fatality)
- all_Units: DINJ (assuming driver injury/fatality)

In [11]:
stars_accident.FATALS.value_counts()

0.0    119730
1.0       656
2.0        67
3.0        10
4.0         5
5.0         2
Name: FATALS, dtype: int64

In [12]:
fatalities = 656*1+67*2+10*3+5*4+5*2

print("Table all_Accident has ", fatalities, " fatalities")

Table all_Accident has  850  fatalities


In [13]:
stars_passenger.PINJ.value_counts()

0.0     54974
1.0      3890
2.0      3579
3.0      1345
99.0      791
4.0       195
Name: PINJ, dtype: int64

In [14]:
stars_unit.DINJ.value_counts()

0.0     161803
1.0       9993
2.0       8922
99.0      4072
3.0       3620
4.0        610
Name: DINJ, dtype: int64

## Fatality Count by Table

- all_Accident: 850
- all_Passenger: 195
- all_Unit: 610

In [15]:
stars_unit.columns

Index(['INCID_NO', 'UNITU', 'CASENUM', 'CASEUNIT', 'PRKCODEU', 'YEAR',
       'MAKEMOD', 'MODEL', 'NUM_OCC', 'REGSTATE', 'REGYEAR', 'DIR_TRAV',
       'SPEED_LMT', 'BOD_TYPE', 'VEH_MANVR', 'VEH_DAMG', 'DAM_LOCTN',
       'LICSTATE', 'PED', 'BRTH_DATE', 'DSEX', 'DBELT', 'DEJCT', 'DINJ',
       'DVIOLTN', 'VIOLCHG1', 'VIOLCHG2', 'PED_TYPE', 'PED_LOC', 'PED_ACTN',
       'REPAIR', 'ACC_YEAR', 'TOWED'],
      dtype='object')

In [16]:
stars_unit_slim = stars_unit[['INCID_NO', 'CASENUM','NUM_OCC','PED','DINJ',]]

In [17]:
stars_unit_slim.head()

Unnamed: 0,INCID_NO,CASENUM,NUM_OCC,PED,DINJ
0,ABLI9000000001,5540000001,0.0,N,0.0
1,ABLI9000000001,5540000001,0.0,N,99.0
2,ABLI9100000001,5540910001,2.0,N,0.0
3,ACAD0000000003,1700000003,1.0,N,0.0
4,ACAD0000000014,1700000014,2.0,N,0.0


In [18]:
stars_accident.columns

Index(['INCID_NO', 'incid', 'CASENUM', 'PARKCODE', 'STATE', 'ACCDATE',
       'TIMEACC', 'ROUTENUM', 'ROADNAME', 'NODISTFT', 'NODISMI', 'NODEDIR',
       'NODENUM', 'DSCUSE', 'LIGHT', 'WEATHER', 'ACCLOCTN', 'SURFCOND',
       'ACCLASS', 'VEHCOLL', 'OBJSTRUK', 'ROADCHAR', 'CONFACT1', 'CONFACT2',
       'CONFACT3', 'CONFACT4', 'CONFACT5', 'CONFACT6', 'HIT_RUN', 'CATEGORY',
       'FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT', 'BIK_INJ', 'PED',
       'ACC_YEAR', 'parknum', 'comments', 'TIFF_FILE', 'zipfile', 'folder',
       'tiffnumb', 'roadname_updated', 'top60ish', 'StudyNode',
       'AdjacentNode'],
      dtype='object')

In [19]:
stars_accident_slim = stars_accident[['INCID_NO','CASENUM',
                                      'ACCLASS']]

In [20]:
stars_accident_slim.head()

Unnamed: 0,INCID_NO,CASENUM,ACCLASS
0,ABLI9000000001,5540000001,5.0
1,ABLI9100000001,5540910001,2.0
2,ACAD0000000003,1700000003,2.0
3,ACAD0000000014,1700000014,2.0
4,ACAD0000000026,1700000026,2.0


In [21]:
stars_passenger_slim = stars_passenger[['INCID_NO','CASENUM','PINJ']]

In [22]:
stars_passenger_slim.head()

Unnamed: 0,INCID_NO,CASENUM,PINJ
0,CUGA9900000236,5230990236,0.0
1,CUGA9900000329,5230990329,1.0
2,CUGA9900000331,5230990331,0.0
3,PRWI0100000018,3700010018,0.0
4,PRWI0200000167,3700020167,2.0


In [23]:
len(set(stars_accident_slim['INCID_NO'])), len(set(stars_unit_slim['INCID_NO'])), len(set(stars_passenger_slim['INCID_NO']))

(120762, 119240, 33402)

In [24]:
stars_combined_df = stars_accident_slim.merge(stars_unit_slim, on=["INCID_NO", 'CASENUM'], how = 'left')
stars_combined_df = stars_combined_df.merge(stars_passenger_slim, on=["INCID_NO", 'CASENUM'], how = 'left')

In [25]:
stars_combined_df.head(25)

Unnamed: 0,INCID_NO,CASENUM,ACCLASS,NUM_OCC,PED,DINJ,PINJ
0,ABLI9000000001,5540000001,5.0,0.0,N,0.0,
1,ABLI9000000001,5540000001,5.0,0.0,N,99.0,
2,ABLI9100000001,5540910001,2.0,2.0,N,0.0,0.0
3,ACAD0000000003,1700000003,2.0,1.0,N,0.0,
4,ACAD0000000014,1700000014,2.0,2.0,N,0.0,0.0
5,ACAD0000000026,1700000026,2.0,1.0,N,0.0,
6,ACAD0000000027,1700000027,1.0,2.0,N,0.0,
7,ACAD0000000034,1700000034,2.0,1.0,N,0.0,
8,ACAD0000000035,1700000035,2.0,1.0,N,0.0,
9,ACAD0000000077,1700000077,5.0,0.0,N,0.0,


00 No Injury
01 Possible Injury
02 Non-incapacitating Injury
03 Incapacitating Injury
04 Fatal

In [26]:
stars_combined_df['No Injury - Passenger']= np.where(stars_combined_df['PINJ']==0, 1,0)
stars_combined_df['Possible Injury - Passenger']= np.where(stars_combined_df['PINJ']==1, 1,0)
stars_combined_df['Non-incapacitating Injury - Passenger']= np.where(stars_combined_df['PINJ']==2, 1,0)
stars_combined_df['Incapacitating Injury - Passenger']= np.where(stars_combined_df['PINJ']==3, 1,0)
stars_combined_df['Fatality - Passenger']= np.where(stars_combined_df['PINJ']==4, 1,0)
stars_combined_df['Unknown Injury - Passenger']= np.where(stars_combined_df['PINJ']==99, 1,0)

stars_combined_df['No Injury - Driver']= np.where(stars_combined_df['DINJ']==0, 1,0)
stars_combined_df['Possible Injury - Driver']= np.where(stars_combined_df['DINJ']==1, 1,0)
stars_combined_df['Non-incapacitating Injury - Driver']= np.where(stars_combined_df['DINJ']==2, 1,0)
stars_combined_df['Incapacitating Injury - Driver']= np.where(stars_combined_df['DINJ']==3, 1,0)
stars_combined_df['Fatality - Driver']= np.where(stars_combined_df['DINJ']==4, 1,0)
stars_combined_df['Unknown Injury - Driver']= np.where(stars_combined_df['DINJ']==99, 1,0)

In [27]:
stars_combined_df['PINJ'].value_counts()

0.0     91070
1.0      6589
2.0      5107
3.0      1924
99.0     1298
4.0       266
Name: PINJ, dtype: int64

## Accident Class Categories

- 00 : Non-collision
- 01 : Collision with other motor vehicle
- 02 : Collision with fixed object
- 03 : Collision with pedestrian
- 04 : collision with bicycle
- 05 : collision with parked motor vehicle
- 06 : Collision with railway train
- 07 : collision with animal
- 88 : collision with other object
- 99 : collision with unknown

In [28]:
stars_combined_df['Non-Collision']= np.where(stars_combined_df['ACCLASS']==0, 1,0)
stars_combined_df['Collision with Other Motor Vehicle']= np.where(stars_combined_df['ACCLASS']==1, 1,0)
stars_combined_df['Collision with Fixed Object']= np.where(stars_combined_df['ACCLASS']==2, 1,0)
stars_combined_df['Collision with Pedestrian']= np.where(stars_combined_df['ACCLASS']==3, 1,0)
stars_combined_df['Collision with Bicycle']= np.where(stars_combined_df['ACCLASS']==4, 1,0)
stars_combined_df['Collision with Parked Motor Vehicle']= np.where(stars_combined_df['ACCLASS']==5, 1,0)
stars_combined_df['Collision with Railway Train']= np.where(stars_combined_df['ACCLASS']==6, 1,0)
stars_combined_df['Collision with Animal']= np.where(stars_combined_df['ACCLASS']==7, 1,0)
stars_combined_df['Collision with Other Object']= np.where(stars_combined_df['ACCLASS']==88, 1,0)
stars_combined_df['Collision with Unknown']= np.where(stars_combined_df['ACCLASS']==99, 1,0)
stars_combined_df['Other Accident Class']= np.where(stars_combined_df['ACCLASS'].isin([0,1,2,3,4
                                                                                       ,5,6,7,88,99])==False,
                                                                                        1,0)


In [29]:
stars_combined_df['Non-Collision'].value_counts()

0    231513
1     12221
Name: Non-Collision, dtype: int64

In [30]:
stars_combined_df['ACCLASS'].value_counts()

1.0     156223
2.0      36222
7.0      15253
5.0      12719
0.0      12221
88.0      4606
99.0      2704
3.0       1928
4.0       1737
6.0         28
15.0        24
10.0        23
20.0         9
8.0          8
9.0          5
58.0         2
91.0         2
14.0         2
11.0         1
12.0         1
16.0         1
40.0         1
28.0         1
Name: ACCLASS, dtype: int64

In [31]:
stars_combined_df.head()

Unnamed: 0,INCID_NO,CASENUM,ACCLASS,NUM_OCC,PED,DINJ,PINJ,No Injury - Passenger,Possible Injury - Passenger,Non-incapacitating Injury - Passenger,...,Collision with Other Motor Vehicle,Collision with Fixed Object,Collision with Pedestrian,Collision with Bicycle,Collision with Parked Motor Vehicle,Collision with Railway Train,Collision with Animal,Collision with Other Object,Collision with Unknown,Other Accident Class
0,ABLI9000000001,5540000001,5.0,0.0,N,0.0,,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,ABLI9000000001,5540000001,5.0,0.0,N,99.0,,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,ABLI9100000001,5540910001,2.0,2.0,N,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,ACAD0000000003,1700000003,2.0,1.0,N,0.0,,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,ACAD0000000014,1700000014,2.0,2.0,N,0.0,0.0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [32]:
stars_combined_df.columns

Index(['INCID_NO', 'CASENUM', 'ACCLASS', 'NUM_OCC', 'PED', 'DINJ', 'PINJ',
       'No Injury - Passenger', 'Possible Injury - Passenger',
       'Non-incapacitating Injury - Passenger',
       'Incapacitating Injury - Passenger', 'Fatality - Passenger',
       'Unknown Injury - Passenger', 'No Injury - Driver',
       'Possible Injury - Driver', 'Non-incapacitating Injury - Driver',
       'Incapacitating Injury - Driver', 'Fatality - Driver',
       'Unknown Injury - Driver', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class'],
      dtype='object')

In [45]:
stars_accident.head()

Unnamed: 0,INCID_NO,incid,CASENUM,PARKCODE,STATE,ACCDATE,TIMEACC,ROUTENUM,ROADNAME,NODISTFT,...,parknum,comments,TIFF_FILE,zipfile,folder,tiffnumb,roadname_updated,top60ish,StudyNode,AdjacentNode
0,ABLI9000000001,,5540000001,ABLI,KY,1990-08-05,1115.0,0.0,VISITOR CENTER RD,70.0,...,,,,,,,,,,
1,ABLI9100000001,,5540910001,ABLI,KY,1991-08-02,1400.0,400.0,MEMORIAL BUILDING SE,100.0,...,,,,,,,,,,
2,ACAD0000000003,top,1700000003,ACAD,ME,2000-01-27,800.0,233.0,DERMOT HOUSE DRIVEWAY,0.0,...,,,,,,,,T60VM,,
3,ACAD0000000014,top,1700000014,ACAD,ME,2000-04-17,1310.0,300.0,PARK LOOP RD,130.0,...,,,,,,,,T60VM,,
4,ACAD0000000026,top,1700000026,ACAD,ME,2000-03-26,1840.0,301.0,SCHOODIC OUTBOUND,150.0,...,,,,,,,,T60VM,,


In [44]:
stars_accident.columns

Index(['INCID_NO', 'incid', 'CASENUM', 'PARKCODE', 'STATE', 'ACCDATE',
       'TIMEACC', 'ROUTENUM', 'ROADNAME', 'NODISTFT', 'NODISMI', 'NODEDIR',
       'NODENUM', 'DSCUSE', 'LIGHT', 'WEATHER', 'ACCLOCTN', 'SURFCOND',
       'ACCLASS', 'VEHCOLL', 'OBJSTRUK', 'ROADCHAR', 'CONFACT1', 'CONFACT2',
       'CONFACT3', 'CONFACT4', 'CONFACT5', 'CONFACT6', 'HIT_RUN', 'CATEGORY',
       'FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT', 'BIK_INJ', 'PED',
       'ACC_YEAR', 'parknum', 'comments', 'TIFF_FILE', 'zipfile', 'folder',
       'tiffnumb', 'roadname_updated', 'top60ish', 'StudyNode',
       'AdjacentNode'],
      dtype='object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## Group by INCID_NO and CASENUM 
## Clean Columns for final dataframe

In [34]:
stars_df_agg = stars_combined_df.groupby(by=['INCID_NO','CASENUM']).sum()
stars_df_agg = stars_df_agg.drop(columns = ['ACCLASS','DINJ', 'PINJ'])
stars_df_agg = stars_df_agg.reset_index()

In [35]:
stars_df_agg['No Injury']= stars_combined_df['No Injury - Passenger'] + stars_combined_df['No Injury - Driver']
stars_df_agg['Possible Injury']= stars_combined_df['Possible Injury - Passenger']+stars_combined_df['Possible Injury - Driver']
stars_df_agg['Non-incapacitating Injury']= stars_combined_df['Non-incapacitating Injury - Passenger']+stars_combined_df['Non-incapacitating Injury - Driver']
stars_df_agg['Incapacitating Injury']= stars_combined_df['Incapacitating Injury - Passenger']+stars_combined_df['Incapacitating Injury - Driver']
stars_df_agg['Fatality']= stars_combined_df['Fatality - Passenger']+stars_combined_df['Fatality - Driver']
stars_df_agg['Unknown Injury']= stars_combined_df['Unknown Injury - Passenger']+stars_combined_df['Unknown Injury - Driver']



In [36]:
stars_df_agg = stars_df_agg[['INCID_NO', 'CASENUM', 'NUM_OCC', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury']]

In [41]:
stars_combined_df.shape, stars_df_agg.shape

((243734, 30), (118559, 20))

In [42]:
stars_df_agg.head()

Unnamed: 0,INCID_NO,CASENUM,NUM_OCC,Non-Collision,Collision with Other Motor Vehicle,Collision with Fixed Object,Collision with Pedestrian,Collision with Bicycle,Collision with Parked Motor Vehicle,Collision with Railway Train,Collision with Animal,Collision with Other Object,Collision with Unknown,Other Accident Class,No Injury,Possible Injury,Non-incapacitating Injury,Incapacitating Injury,Fatality,Unknown Injury
0,ABLI9000000001,5540000001,0.0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0
1,ABLI9100000001,5540910001,2.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,ACAD0000000003,1700000003,1.0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0
3,ACAD0000000014,1700000014,2.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,ACAD0000000026,1700000026,1.0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0


In [97]:
stars_accident_tojoin = stars_accident[['INCID_NO','CASENUM', 'PARKCODE', 'STATE', 'ACCDATE',
       'TIMEACC','FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT', 'BIK_INJ', 'PED']]


stars_accident_tojoin = stars_accident_tojoin.rename(columns={"ACCDATE": "CRASH_DATE", 
                                      "TIMEACC": "CRASH_TIME",
                                      'PARKCODE':'PARK_ALPHA'})
stars_accident_tojoin['LATITUDE']=0
stars_accident_tojoin['LONGITUDE']=0


In [98]:
crash_df_final = stars_df_agg.merge(stars_accident_tojoin, on = ['INCID_NO','CASENUM'])

In [99]:
crash_df_final.columns

Index(['INCID_NO', 'CASENUM', 'NUM_OCC', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'PARK_ALPHA', 'STATE', 'CRASH_DATE',
       'CRASH_TIME', 'FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT',
       'BIK_INJ', 'PED', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

## CDS Data Cleaning

In [82]:
cds_df = pd.read_csv('./data/CDS All Crashes Table for FLH 6-28-21.csv')

In [83]:
cds_df.head()

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,Region,# Crash,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,6,ABLI140610163500,14054379,ABLI,SER,1,KY,"Tuesday, June 10, 2014",1635.0,,...,38.91205,-76.93412,0.0,,,00:00.0,,,,0
1,5,ABLI121009110000,12474,ABLI,SER,1,KY,"Tuesday, October 9, 2012",1100.0,101.0,...,,,,,,00:00.0,,,,0
2,4,ABLI091117170900,N08113,ABLI,SER,1,NY,"Tuesday, November 17, 2009",1709.0,,...,,,,,,00:00.0,,,,0
3,3,ABLI070804175500,5540070013,ABLI,SER,1,KY,"Saturday, August 4, 2007",1755.0,0.0,...,,,,,,,,,,0
4,2,ABLI070425075000,5540070001,ABLI,SER,1,KY,"Wednesday, April 25, 2007",750.0,0.0,...,,,,,,,,,,0


In [84]:
cds_df.CRASH_CLASS.value_counts()

1.0     45460
2.0     18223
7.0      6603
0.0      4824
5.0      2422
88.0     1985
98.0      822
99.0      801
4.0       703
3.0       604
10.0       23
6.0        15
Name: CRASH_CLASS, dtype: int64

In [85]:
cds_df['Non-Collision']= np.where(cds_df['CRASH_CLASS']==0, 1,0)
cds_df['Collision with Other Motor Vehicle']= np.where(cds_df['CRASH_CLASS']==1, 1,0)
cds_df['Collision with Fixed Object']= np.where(cds_df['CRASH_CLASS']==2, 1,0)
cds_df['Collision with Pedestrian']= np.where(cds_df['CRASH_CLASS']==3, 1,0)
cds_df['Collision with Bicycle']= np.where(cds_df['CRASH_CLASS']==4, 1,0)
cds_df['Collision with Parked Motor Vehicle']= np.where(cds_df['CRASH_CLASS']==5, 1,0)
cds_df['Collision with Railway Train']= np.where(cds_df['CRASH_CLASS']==6, 1,0)
cds_df['Collision with Animal']= np.where(cds_df['CRASH_CLASS']==7, 1,0)
cds_df['Collision with Other Object']= np.where(cds_df['CRASH_CLASS']==88, 1,0)
cds_df['Collision with Unknown']= np.where(cds_df['CRASH_CLASS']==99, 1,0)
cds_df['Other Accident Class']= np.where(cds_df['CRASH_CLASS'].isin([0,1,2,3,4,5,6,7,88,99])==False,
                                                                                        1,0)


In [86]:
cds_df.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA', 'Region', '# Crash',
       'STATE_CODE', 'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME',
       'NODE_DIST_FT', 'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT',
       'WEATHER', 'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL',
       'OBJ_STRUCK', 'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3',
       'CON_FACT4', 'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS',
       'INJURED', 'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED',
       'CRASH_YEAR', 'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN',
       'USPP_NPS_VEH_INV', 'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER',
       'DATA_SRC', 'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE',
       'FILE_NAME', 'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE',
       'SPTL_LOC', 'Non-Collision', 'Collision with Other Motor Vehicle',
       'Collision with Fixed Object', 'Collision with Pedestrian',
       'Collision with Bicycle', 'Collision w

In [102]:
cds_df['No Injury']= 0
cds_df['Possible Injury']= 0
cds_df['Non-incapacitating Injury']= 0
cds_df['Incapacitating Injury']= 0
cds_df['Fatality']= 0
cds_df['Unknown Injury']= 0
cds_df['BIK_FAT'] = 0
cds_df['BIK_INJ']=0
cds_df['NUM_OCC']=0


cds_df = cds_df.rename(columns={"STATECODE": "STATE", 
                                      "CASE_NUM": "CASENUM"})


In [103]:
cds_df = cds_df[['INCID_NO', 'CASENUM', 'NUM_OCC', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'PARK_ALPHA', 'STATE_CODE', 'CRASH_DATE',
       'CRASH_TIME', 'FATALS', 'INJURED', 'PED_FAT', 'PED_INJ', 'BIK_FAT',
       'BIK_INJ', 'PED', 'LATITUDE', 'LONGITUDE']]

In [105]:
cds_df.shape, crash_df_final.shape

((83926, 33), (118559, 33))

## Merge CDS Data with STARS Data

In [106]:
crash_df_final_final = pd.concat([crash_df_final, cds_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [112]:
#testing if dataframes concatenated correctly
len(cds_df)+len(crash_df_final)==len(crash_df_final_final)

True

In [113]:
crash_df_final_final.to_csv("crash_data_STARS_CDS.csv",index=False)