## Crash Data Wrangling Jupyter Notebook

**Author:** Smitha Mahesh 

**Date:** 6/27/2022 

**Purpose:** This notebook uses the new CDS crash files as input and produces a dataframe, with the park region code (`RGN`) added, that is joinable with other data sources.

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# Project root that every relative path in this notebook is resolved against.
# Set the NPS_SAFETY_DIR environment variable to run on another machine;
# when it is unset, the original author's local folder is used as before.
myworkingdirectory = os.environ.get(
    "NPS_SAFETY_DIR", r"C:\Users\Sophie.Kaye\Desktop\NPS Safety"
)
os.chdir(myworkingdirectory)

In [33]:
# Read the combined CDS crash workbook (path is relative to the working directory),
# then display its (rows, columns) shape.
cds_df = pd.read_excel(
    io='./CDS/New CDS Excel Files/ALL_CRASH.xlsx',
)
cds_df.shape

(204687, 56)

In [34]:
# List the column names of the crash table to see which fields are available.
cds_df.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA', 'STATE_CODE',
       'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME', 'NODE_DIST_FT',
       'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT', 'WEATHER',
       'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL', 'OBJ_STRUCK',
       'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4',
       'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS', 'INJURED',
       'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED', 'CRASH_YEAR',
       'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN', 'USPP_NPS_VEH_INV',
       'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER', 'DATA_SRC',
       'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE', 'FILE_NAME',
       'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE', 'SPTL_LOC'],
      dtype='object')

In [35]:
# Preview the first few crash records.
cds_df.head()

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,RTE_NAME,NODE_DIST_FT,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,2,ABLI070425075000,5540070001,ABLI,KY,2007-04-25,750,0.0,KNOB CREEK PARKING,0.0,...,,,,,,NaT,,,,0
1,3,ABLI070804175500,5540070013,ABLI,KY,2007-08-04,1755,0.0,,0.0,...,,,,,,NaT,,,,0
2,4,ABLI091117170900,N08113,ABLI,NY,2009-11-17,1709,,NEW YORK AVE,,...,,,,,,2014-02-07,,,,0
3,5,ABLI121009110000,12474,ABLI,KY,2012-10-09,1100,101.0,PRIVATE DRIVEWAY OFF EAST BEACH ROAD (875),,...,,,,,,2015-03-16,,,,0
4,6,ABLI140610163500,14054379,ABLI,KY,2014-06-10,1635,,DC 295,,...,38.91205,-76.93412,0.0,,,2014-06-18,,,,0


In [36]:
# Sanity-check the incident identifier column.
# dropna() only shows whether any INCID_NO is missing; it does not detect
# duplicates, so repeated identifiers are counted explicitly here.
incid_missing = cds_df['INCID_NO'].isna().sum()
incid_repeated = cds_df['INCID_NO'].dropna().duplicated().sum()
print(f"INCID_NO missing: {incid_missing}, repeated: {incid_repeated}")
# Shape after dropping rows without an INCID_NO (same as cds_df.shape when none are missing).
cds_df.dropna(subset=['INCID_NO']).shape

(204687, 56)

In [37]:
# Read the park lookup table (path is relative to the working directory).
park_info = pd.read_csv(
    filepath_or_buffer="./crash database mapping/Park_Info_Table.csv",
)

In [38]:
# List the column names of the park lookup table.
park_info.columns

Index(['OBJECTID', 'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'REGION', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [39]:
# Give both tables the same key column name ('Park') so they can be merged on it,
# and shorten the lookup table's REGION column to 'RGN'.
cds_df = cds_df.rename({'PARK_ALPHA': 'Park'}, axis='columns')
park_info = park_info.rename({'UNIT_CODE': 'Park', 'REGION': 'RGN'}, axis='columns')

In [40]:
# add RGN column from lookup table to CDS crash database, joining the two datasets based on park name
cds_df = pd.merge(cds_df, park_info[['RGN','Park']], how='left', on='Park')
# resulting dataframe after join should have one additional column and no additional rows
cds_df = cds_df.drop_duplicates() 
cds_df.shape

(204687, 57)

In [43]:
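# Optional diagnostic (uncomment to run): count, per park, the crash rows that have no RGN value after the join.
#no_region = cds_df.loc[cds_df['RGN'].isnull()==True]
#no_region['Park'].value_counts()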

In [44]:
# Shape of the subset of crash rows whose RGN is null after the join
# (zero rows means every record has a region value).
cds_df[cds_df['RGN'].isna()].shape

(0, 57)

In [45]:
# Write the region-augmented crash table to the current working directory,
# omitting the dataframe index from the file.
cds_df.to_csv(
    path_or_buf="CDS_CrashTable_RegionAdded.csv",
    index=False,
)