# Crash Data VRU and Work Zone Exploration Jupyter Notebook
**Author:** Sophie Kaye

**Date:** 7/12/2022

**Purpose:** This notebook will assess the quality of existing data pertaining to crashes involving Vulnerable Road Users (pedestrians and cyclists) and Work Zones

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Folder holding the CDS Excel exports. Set the CDS_DATA_DIR environment
# variable to run this notebook on another machine; when it is not set, the
# original author's local folder is used.
myworkingdirectory = os.environ.get(
    'CDS_DATA_DIR',
    r'C:\Users\Sophie.Kaye\Desktop\NPS Safety\CDS\New CDS Excel Files',
)
# The read_excel calls in this notebook use './' relative paths, so the data
# folder has to be the current working directory.
os.chdir(myworkingdirectory)

In [4]:
# Load the crash records workbook; './' resolves against the working directory set by os.chdir
cds_df = pd.read_excel('./ALL_CRASH.xlsx')

In [16]:
# List the column names available in the crash records table
cds_df.columns

Index(['OBJECTID', 'INCID_NO', 'CASE_NUM', 'PARK_ALPHA', 'STATE_CODE',
       'CRASH_DATE', 'CRASH_TIME', 'RTE_NO', 'RTE_NAME', 'NODE_DIST_FT',
       'NODE_DIST_MI', 'NODE_DIR', 'NODE_NUM', 'LIGHT', 'WEATHER',
       'CRASH_LOCATION', 'SURF_COND', 'CRASH_CLASS', 'VEH_COLL', 'OBJ_STRUCK',
       'ROAD_CHAR', 'CON_FACT1', 'CON_FACT2', 'CON_FACT3', 'CON_FACT4',
       'CON_FACT5', 'CON_FACT6', 'HIT_RUN', 'CATEGORY', 'FATALS', 'INJURED',
       'PED_FAT', 'PED_INJ', 'BIKE_FAT', 'BIKE_INJ', 'PED', 'CRASH_YEAR',
       'COMMENTS', 'ZIPFILE', 'LOCATION', 'PHOTOS_TAKEN', 'USPP_NPS_VEH_INV',
       'PARK_PTY_DEST', 'LOCKED_UPDATE', 'LOCKED_BY_USER', 'DATA_SRC',
       'LATITUDE', 'LONGITUDE', 'MILEPOST', 'IMPORT_DATE', 'FILE_NAME',
       'SAVE_DATE', 'ROUTE_IDENT', 'RIP_CYCLE', 'MP_NODE', 'SPTL_LOC'],
      dtype='object')

In [4]:
# Record count per CRASH_CLASS code, ordered by code.
# Code 3.0 = Collision with Pedestrian; code 4.0 = Collision with Cyclist.
(
    cds_df['CRASH_CLASS']
    .value_counts()
    .sort_index()
)

0.0      14174
1.0     102007
2.0      47867
3.0       1438
4.0       1498
5.0       8176
6.0         32
7.0      18945
10.0        34
88.0      5396
98.0       822
99.0      2858
Name: CRASH_CLASS, dtype: int64

In [6]:
# (rows, columns) of the crash records that have no CRASH_CLASS value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df.loc[cds_df['CRASH_CLASS'].isnull()].shape

(1440, 56)

In [7]:
# Overall size of the crash records table as (rows, columns)
cds_df.shape

(204687, 56)

In [8]:
# Percentage of crash records with a missing "crash class" entry.
# Computed from the data instead of hand-copied counts, and scaled to percent
# before rounding so the result has no float artifacts (e.g. 0.7000000000000001).
missing_crash_class = int(cds_df['CRASH_CLASS'].isnull().sum())
round(100 * missing_crash_class / len(cds_df), 1)

0.7000000000000001

In [9]:
# Load the unit (vehicle) records workbook; './' resolves against the current working directory
cds_df_unit = pd.read_excel('./ALL_UNIT.xlsx')

In [10]:
# List the column names available in the unit (vehicle) records table
cds_df_unit.columns

Index(['OBJECTID', 'INCID_NO', 'UNIT_NO', 'VEH_YEAR', 'MAKE_MOD', 'MODEL',
       'NUM_OCC', 'REG_STATE', 'REG_YEAR', 'PLATE_NUM', 'DIR_TRAVEL',
       'SPEED_LIMIT', 'BODY_TYPE', 'VEH_MANVR', 'VEH_DAMAGE', 'DAM_LOCATION',
       'LIC_NUM', 'LIC_STATE', 'PED', 'BRTH_DATE', 'DRIVER_SEX', 'DRIVER_BELT',
       'DRIVER_EJECT', 'DRIVER_INJ', 'DRIVER_VIOLTN', 'VIOL_CHG1', 'VIOL_CHG2',
       'PED_TYPE', 'PED_LOC', 'PED_ACTN', 'REPAIR'],
      dtype='object')

In [12]:
# Record count per PED_TYPE code, ordered by code
(
    cds_df_unit['PED_TYPE']
    .value_counts()
    .sort_index()
)

0.0     301573
1.0       1045
2.0       1039
3.0          6
4.0          6
5.0         44
6.0         40
88.0       122
98.0        31
99.0      1308
Name: PED_TYPE, dtype: int64

In [13]:
# (rows, columns) of the unit records that have no PED_TYPE value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df_unit.loc[cds_df_unit['PED_TYPE'].isnull()].shape

(5843, 31)

In [14]:
# Overall size of the unit (vehicle) records table as (rows, columns)
cds_df_unit.shape

(311057, 31)

In [15]:
# Percentage of vehicle records with a missing "pedestrian type" entry.
# Computed from the data instead of hand-copied counts, and scaled to percent
# before rounding so the result has no float artifacts.
missing_ped_type = int(cds_df_unit['PED_TYPE'].isnull().sum())
round(100 * missing_ped_type / len(cds_df_unit), 1)

1.9

In [17]:
# Record count per CON_FACT1 code, ordered by code
(
    cds_df['CON_FACT1']
    .value_counts()
    .sort_index()
)

A01     1019
A02     5358
A03     6476
A04     5589
A05     4118
A06    12306
A07     2893
A08     1185
A09     9620
A10     3499
A11     8182
A12      997
A13       81
A14     1413
A15     3200
A16    40316
A17      693
A18      178
A88    13487
B01      137
B02      806
B03      106
B04      397
B05     6258
B06    16703
B07      977
B08     1816
B88     1450
C01     1075
C02      606
C03       10
C04       65
C05       34
C06      185
C07      204
C08      198
C88     1048
D01       11
D02       26
D03      234
D04      116
D05      184
D06       38
D07       20
D88      128
E01     1810
E02     2204
E03     1106
E04      508
E05     1123
E06       86
E07      278
E88      621
U99    21055
a16        2
e88        1
Name: CON_FACT1, dtype: int64

In [18]:
# Record count per CON_FACT2 code, ordered by code
(
    cds_df['CON_FACT2']
    .value_counts()
    .sort_index()
)

A01       88
A02      680
A03      644
A04     1341
A05     1582
A06     3092
A07      887
A08      645
A09     1914
A10      777
A11     1169
A12      427
A13       67
A14      500
A15      562
A16    11034
A17      513
A18      167
A88     2355
B01      140
B02      450
B03       57
B04      118
B05     6221
B06     2581
B07     1350
B08     4082
B88     1373
C01      565
C02      477
C03        4
C04       78
C05      117
C06       58
C07       98
C08      200
C88      910
D01        2
D02       18
D03       57
D04       74
D05      107
D06       40
D07       34
D88      121
E01     5979
E02     4357
E03      835
E04      678
E05     1199
E06       59
E07     1167
E88     1141
U99    80563
Name: CON_FACT2, dtype: int64

In [19]:
# Record count per CON_FACT3 code, ordered by code
(
    cds_df['CON_FACT3']
    .value_counts()
    .sort_index()
)

A01        69
A02       334
A03       219
A04       423
A05       637
A06      1742
A07       338
A08       330
A09       454
A10       306
A11       249
A12       179
A13        28
A14       106
A15       168
A16      2693
A17       228
A18        57
A88       680
B01        53
B02       185
B03        29
B04        62
B05      2448
B06       441
B07       455
B08      2264
B88       379
C01       226
C02       471
C03         5
C04        33
C05        57
C06        36
C07        41
C08        99
C88       729
D01         2
D02         9
D03        24
D04         7
D05        34
D06        15
D07        24
D88        63
E01      4356
E02      1544
E03       314
E04       253
E05       449
E06        43
E07       302
E88       336
U99    104746
Name: CON_FACT3, dtype: int64

In [20]:
# Record count per CON_FACT4 code, ordered by code
(
    cds_df['CON_FACT4']
    .value_counts()
    .sort_index()
)

A01        31
A02       219
A03       219
A04       203
A05       302
A06       811
A07       180
A08       194
A09       341
A10       181
A11       236
A12        83
A13        15
A14        99
A15        31
A16      1391
A17        95
A18        27
A88       474
B01        26
B02        55
B03        11
B04        26
B05       819
B06        97
B07       151
B08       710
B88       124
C01        58
C02        84
C03         1
C04         8
C05        22
C06        10
C07         7
C08        24
C88       101
D01         3
D02         9
D03        18
D04        25
D05        14
D06         9
D07        11
D88       275
E01      1297
E02       447
E03        90
E04        79
E05       184
E06        11
E07       123
E88       208
U99    114381
Name: CON_FACT4, dtype: int64

In [21]:
# Record count per CON_FACT5 code, ordered by code
(
    cds_df['CON_FACT5']
    .value_counts()
    .sort_index()
)

A01        17
A02        90
A03        44
A04       105
A05       128
A06       267
A07        37
A08        42
A09        43
A10        27
A11        34
A12        33
A13         9
A14         9
A15        11
A16       418
A17        36
A18        15
A88        99
B01         7
B02        12
B03         4
B04         7
B05       197
B06        15
B07        67
B08       175
B88        63
C01        19
C02        26
C04         5
C05         5
C06         3
C07         1
C08         6
C88        34
D03         5
D04         7
D05         9
D06         3
D88        11
E01       554
E02       210
E03        42
E04        41
E05        82
E06         5
E07        70
E88       272
U99    118594
Name: CON_FACT5, dtype: int64

In [22]:
# Record count per CON_FACT6 code, ordered by code
(
    cds_df['CON_FACT6']
    .value_counts()
    .sort_index()
)

A01        20
A02        52
A03        22
A04        44
A05        39
A06        95
A07         7
A08        16
A09         5
A10         6
A11         7
A12         6
A14         4
A15         1
A16        91
A17        12
A18         3
A88        44
B01         1
B02         4
B03         1
B05        37
B06         3
B07        13
B08        29
B88         3
C01         5
C02         4
C04         2
C05         1
C07         2
C08         3
C88         6
D02         1
D03         4
D04         1
D88         4
E01       168
E02        58
E03        10
E04        19
E05        24
E06         4
E07        29
E88        56
U99    120172
Name: CON_FACT6, dtype: int64

In [30]:
# Note that all Contributing Factor codes beginning with 'D' refer to Vulnerable Road Users (VRUs), i.e. pedestrians and cyclists

In [23]:
# Number of crashes in which "Pedestrian/Cyclist: Under Influence of Drugs" (D01)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D01').any(axis=1).sum())

18

In [24]:
# Number of crashes in which "Pedestrian/Cyclist: Under Influence of Alcohol" (D02)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D02').any(axis=1).sum())

63

In [25]:
# Number of crashes in which "Pedestrian/Cyclist: Failed to Yield Right of Way" (D03)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D03').any(axis=1).sum())

342

In [26]:
# Number of crashes in which "Pedestrian/Cyclist: Disregarded Traffic Control" (D04)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D04').any(axis=1).sum())

230

In [27]:
# Number of crashes in which "Pedestrian/Cyclist: Illegally in Roadway" (D05)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D05').any(axis=1).sum())

348

In [28]:
# Number of crashes in which "Pedestrian/Cyclist: Bicycle Violation" (D06)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D06').any(axis=1).sum())

105

In [29]:
# Number of crashes in which "Pedestrian/Cyclist: Clothing Not Visible" (D07)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D07').any(axis=1).sum())

89

In [31]:
# Number of crashes in which "Pedestrian/Cyclist: Other" (D88)
# was listed as at least one of the six contributing factors.
# Computed row-wise from the data rather than by hand-summing the per-column
# counts, so a crash that lists the code in more than one CON_FACT column is
# counted only once and the figure cannot drift from the data.
con_fact_cols = [f'CON_FACT{i}' for i in range(1, 7)]
int(cds_df[con_fact_cols].eq('D88').any(axis=1).sum())

602

In [32]:
# Successively narrow the crash records to those with no value in CON_FACT1,
# then also none in CON_FACT2, and so on through CON_FACT6. The final frame holds
# the records with no contributing factor entered in any of the six columns.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
no_ConFact1 = cds_df.loc[cds_df['CON_FACT1'].isnull()]
no_ConFact1_2 = no_ConFact1.loc[no_ConFact1['CON_FACT2'].isnull()]
no_ConFact1_3 = no_ConFact1_2.loc[no_ConFact1_2['CON_FACT3'].isnull()]
no_ConFact1_4 = no_ConFact1_3.loc[no_ConFact1_3['CON_FACT4'].isnull()]
no_ConFact1_5 = no_ConFact1_4.loc[no_ConFact1_4['CON_FACT5'].isnull()]
no_ConFact_any = no_ConFact1_5.loc[no_ConFact1_5['CON_FACT6'].isnull()]
# (rows, columns) of the records with no contributing factor at all
no_ConFact_any.shape

(21445, 56)

In [33]:
# Percentage of crash records with no contributing factor entries whatsoever.
# Uses the no_ConFact_any frame built in the previous cell instead of hand-copied
# counts, and scales to percent before rounding to avoid float artifacts.
round(100 * len(no_ConFact_any) / len(cds_df), 1)

10.5

In [34]:
# Record count per PED_ACTN code, ordered by code
(
    cds_df_unit['PED_ACTN']
    .value_counts()
    .sort_index()
)

0.0     189034
1.0        757
2.0        676
3.0        115
4.0         54
5.0        136
6.0         13
88.0       542
98.0      1887
99.0    112547
Name: PED_ACTN, dtype: int64

In [35]:
# (rows, columns) of the unit records that have no PED_ACTN value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df_unit.loc[cds_df_unit['PED_ACTN'].isnull()].shape

(5296, 31)

In [36]:
# Percentage of vehicle records with no pedestrian action entry.
# Computed from the data instead of hand-copied counts, and scaled to percent
# before rounding so the result has no float artifacts (e.g. 1.7000000000000002).
missing_ped_actn = int(cds_df_unit['PED_ACTN'].isnull().sum())
round(100 * missing_ped_actn / len(cds_df_unit), 1)

1.7000000000000002

In [37]:
# Record count per PED_LOC code, ordered by code
(
    cds_df_unit['PED_LOC']
    .value_counts()
    .sort_index()
)

0.0     189071
1.0        447
2.0       1195
3.0         79
4.0        199
88.0      1170
98.0      1862
99.0    111750
Name: PED_LOC, dtype: int64

In [38]:
# (rows, columns) of the unit records that have no PED_LOC value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df_unit.loc[cds_df_unit['PED_LOC'].isnull()].shape

(5284, 31)

In [39]:
# Percentage of vehicle records with no pedestrian location entry.
# Computed from the data instead of hand-copied counts, and scaled to percent
# before rounding so the result has no float artifacts (e.g. 1.7000000000000002).
missing_ped_loc = int(cds_df_unit['PED_LOC'].isnull().sum())
round(100 * missing_ped_loc / len(cds_df_unit), 1)

1.7000000000000002

In [5]:
# Record count per CRASH_LOCATION code, ordered by code
(
    cds_df['CRASH_LOCATION']
    .value_counts()
    .sort_index()
)

11.0    27565
12.0     8433
13.0     9705
14.0     3841
15.0     4512
16.0    93236
17.0     5041
21.0    15483
22.0     1477
23.0    15800
24.0     1198
25.0    10919
26.0      242
27.0       13
28.0      483
29.0     1229
30.0       31
98.0      618
99.0     3662
Name: CRASH_LOCATION, dtype: int64

In [7]:
# (rows, columns) of the crash records that have no CRASH_LOCATION value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df.loc[cds_df['CRASH_LOCATION'].isnull()].shape

(1199, 56)

In [8]:
# Percentage of crash records with no crash location entry.
# Computed from the data instead of hand-copied counts, and scaled to percent
# before rounding so the result has no float artifacts.
missing_crash_location = int(cds_df['CRASH_LOCATION'].isnull().sum())
round(100 * missing_crash_location / len(cds_df), 1)

0.6

In [5]:
# Record count per PARK_ALPHA code, ordered alphabetically by code
(
    cds_df['PARK_ALPHA']
    .value_counts()
    .sort_index()
)

ABLI        7
ACAD     1419
AGFO        1
ALPO       20
AMIS       94
        ...  
WUPA        8
YELL    10158
YOSE     7756
ZION     1478
ZZZZ      778
Name: PARK_ALPHA, Length: 243, dtype: int64

In [6]:
# (rows, columns) of the crash records that have no PARK_ALPHA value.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
cds_df.loc[cds_df['PARK_ALPHA'].isnull()].shape

(0, 56)