In [44]:
import pandas as pd
import numpy as np
import os

In [45]:
# Folder that holds the CDS workbooks and reference tables; later cells read files relative to it.
# Set the NPS_CRASH_DATA_DIR environment variable to run the notebook on another machine;
# when it is not set, the original local folder below is used.
myworkingdirectory = os.environ.get("NPS_CRASH_DATA_DIR", r"C:\Users\Sophie.Kaye\Desktop\NPS Crash Data")
os.chdir(myworkingdirectory)

In [46]:
# Read the crash workbook into a DataFrame and display its (rows, columns) shape.
crash_workbook = './CDS/ALL_CRASH.xlsx'
cds_crash = pd.read_excel(crash_workbook)
cds_crash.shape

(204687, 56)

In [47]:
# Display the column labels of the crash table.
cds_crash.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA', 'STATE_CODE',
       'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME', 'NODE_DIST_FT',
       'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT', 'WEATHER',
       'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL', 'OBJ_STRUCK',
       'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4',
       'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS', 'INJURED',
       'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED', 'CRASH_YEAR',
       'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN', 'USPP_NPS_VEH_INV',
       'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER', 'DATA_SRC',
       'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE', 'FILE_NAME',
       'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE', 'SPTL_LOC'],
      dtype='object')

In [48]:
# Count crash rows whose INCID_NO is missing.
int(cds_crash['INCID_NO'].isna().sum())
# no crashes with missing record numbers

0

In [49]:
# Count crash rows whose PARK_ALPHA is missing.
int(cds_crash['PARK_ALPHA'].isna().sum())
# no crashes without park names identified

0

In [50]:
# Raise the row display limit so the per-park tally below is not truncated.
pd.set_option('display.max_rows', 10000000)
# Number of crash rows per park code, ordered by park code.
cds_crash['PARK_ALPHA'].value_counts().sort_index()
# note 778 crashes with park name "ZZZZ", which is CDS code for unknown

ABLI        7
ACAD     1419
AGFO        1
ALPO       20
AMIS       94
ANJO        4
ANTI      122
APCO        1
APIS        1
ARCH      306
ARPO        1
ASIS      155
BADL      213
BAND       70
BAWA    28179
BIBE      914
BICA       30
BICY       50
BISC        4
BISO      270
BITH       55
BLCA       42
BLRI     7853
BLUE        2
BOST       58
BOWA       12
BRCA      577
BUFF      110
CABR       16
CACH        6
CACL        1
CACO      416
CAGR        6
CAHA      370
CALO       52
CANA       44
CANY      130
CARE       34
CARL        7
CASA        1
CATO      120
CAVE      119
CAVO        7
CEBR        6
CHAT      226
CHCH      780
CHCU        4
CHIC      402
CHIR       18
CHIS        2
CHOH      420
CIRO        1
CLBA       12
CODA       41
COLM      395
COLO     1661
CONG       10
CORO       10
CRLA      504
CUGA     1019
CUIS        2
CURE       84
CUVA      247
DENA      225
DEPO        1
DETO       17
DEVA      837
DEWA     5394
DINO       53
EDIS        1
EISE       19
ELIS  

In [51]:
# for any of the 778 crashes with unknown park names, can park names be assigned geospatially using lat/long like in IMARS?
# Select the rows coded "ZZZZ", then count how many of them have a non-missing LATITUDE.
is_unknown_park = cds_crash['PARK_ALPHA'].eq("ZZZZ")
cds_unknown_park = cds_crash[is_unknown_park]
int(cds_unknown_park['LATITUDE'].notna().sum()) # nope!

0

In [52]:
# Read the vehicle (unit) workbook into a DataFrame and display its (rows, columns) shape.
cds_vehicle = pd.read_excel('./CDS/ALL_UNIT.xlsx')
cds_vehicle.shape

(311057, 31)

In [65]:
# Count vehicle rows whose INCID_NO is missing.
int(cds_vehicle['INCID_NO'].isna().sum())
# no vehicle reports with missing record numbers

0

In [53]:
# Read the passenger workbook into a DataFrame and display its (rows, columns) shape.
cds_passenger = pd.read_excel('./CDS/ALL_PASSENGER.xlsx')
cds_passenger.shape

(114151, 10)

In [67]:
# Count passenger rows whose INCID_NO is missing.
int(cds_passenger['INCID_NO'].isna().sum())
# no person reports with missing record numbers

0

# DATA CLEANING

## DROP DUPLICATES
**Note:** Duplicates are not removed from the vehicle and passenger tables, because each row in those tables is a distinct record for one vehicle or person involved in a crash.

In [68]:
# Keep the first row for each INCID_NO so every record number appears once in the crash table,
# then display the remaining row count.
cds_crash = cds_crash.drop_duplicates(subset=['INCID_NO'])
cds_crash.shape[0]

203909

## REVERT FLIPPED LAT/LONG

In [55]:
# Rows whose |LATITUDE| exceeds 70 are treated as having latitude and longitude stored in each other's column.
# NOTE(review): the cutoff of 70 is taken as given; confirm it separates swapped from valid coordinates for every park in the data.
need_revert = cds_crash['LATITUDE'].abs().gt(70)
# .values drops the column labels, so the two columns are assigned by position (swapped) instead of being re-aligned by name.
swapped_pairs = cds_crash.loc[need_revert, ['LONGITUDE', 'LATITUDE']].values
cds_crash.loc[need_revert, ['LATITUDE', 'LONGITUDE']] = swapped_pairs

## ADJUST SIGNS TO CORRECT HEMISPHERE

In [56]:
# Negate every negative latitude and every positive longitude, so all non-missing points end up with LATITUDE >= 0 and LONGITUDE <= 0.
# NOTE(review): this forces every point into the northern/western hemisphere; confirm no park in the data legitimately lies south of the equator or at an eastern longitude.
negative_lat = cds_crash['LATITUDE'].lt(0)
cds_crash.loc[negative_lat, 'LATITUDE'] = -cds_crash.loc[negative_lat, 'LATITUDE']
positive_lon = cds_crash['LONGITUDE'].gt(0)
cds_crash.loc[positive_lon, 'LONGITUDE'] = -cds_crash.loc[positive_lon, 'LONGITUDE']

## CREATE NEW REGION COLUMN

In [57]:
# Read the park reference table used to look up each park's region.
park_info_csv = "./Reference Data/Park_Info_Table.csv"
park_info = pd.read_csv(park_info_csv)

In [58]:
# Display the column labels of the park reference table.
park_info.columns

Index(['OBJECTID', 'UNIT_CODE', 'GIS_Notes', 'UNIT_NAME', 'DATE_EDIT', 'STATE',
       'REGION', 'GNIS_ID', 'UNIT_TYPE', 'CREATED_BY', 'METADATA', 'PARKNAME',
       'CreationDa', 'Creator', 'EditDate', 'Editor', 'Shape__Are',
       'Shape__Len', 'Unnamed: 18'],
      dtype='object')

In [59]:
# Give both tables a shared 'Park' key column (and expose the lookup's region as 'RGN') ahead of the join.
lookup_renames = {'UNIT_CODE': 'Park', 'REGION': 'RGN'}
crash_renames = {'PARK_ALPHA': 'Park'}
park_info = park_info.rename(columns=lookup_renames)
cds_crash = cds_crash.rename(columns=crash_renames)

In [60]:
# add RGN column from lookup table to CDS crash database, joining the two datasets based on park name
# Reduce the lookup to one row per park code (first occurrence wins) BEFORE joining, so the left join
# cannot multiply crash rows; validate='many_to_one' makes pandas raise if the lookup key is still not unique.
park_regions = park_info[['RGN', 'Park']].drop_duplicates(subset=['Park'])
cds_crash = (
    cds_crash
    .drop_duplicates(subset=['INCID_NO'])  # one row per record number, as before
    .merge(park_regions, how='left', on='Park', validate='many_to_one')
)
# resulting dataframe after join should have one additional column and no additional rows
cds_crash.shape

(204687, 57)

In [61]:
# Count crash rows whose RGN is missing after the join.
int(cds_crash['RGN'].isna().sum())
# all crashes were assigned a region

0

In [62]:
# Number of crash rows per region label, ordered by label.
cds_crash['RGN'].value_counts().sort_index()

AKR          237
IMR        33076
MWR         2435
NCR        97838
NER        18642
PWR        24419
SER        27262
Unknown      778
Name: RGN, dtype: int64

In [63]:
# drop rows without region
# Keep only rows whose RGN text does not contain "Unknown"; rows with a missing RGN also fail the test and are dropped.
has_known_region = cds_crash["RGN"].str.contains("Unknown").eq(False)
cds_crash = cds_crash[has_known_region]
cds_crash['RGN'].value_counts().sort_index()

AKR      237
IMR    33076
MWR     2435
NCR    97838
NER    18642
PWR    24419
SER    27262
Name: RGN, dtype: int64

In [64]:
# Write the cleaned crash table (with RGN added) to CSV in the working directory, omitting the index column.
clean_crash_csv = "CDS_CrashTable_RegionAdded_Clean.csv"
cds_crash.to_csv(clean_crash_csv, index=False)