## Crash Data Speed and Wildlife Exploration Jupyter Notebook

**Author:** Smitha Mahesh 

**Date:** 6/28/2022 

**Purpose:** This notebook uses the new CDS files as input, counts the too-fast-for-conditions, speeding, and wildlife-related crashes, and measures how much of the data is missing for each of these variables of interest. 

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the CDS Excel exports read by the cells below.
# The CDS_DATA_DIR environment variable, when set, overrides the original
# author-specific default so the notebook can run on another machine.
myworkingdirectory = os.environ.get(
    "CDS_DATA_DIR",
    r"C:\Users\smitha.mahesh\Desktop\New CDS Excel Files",
)
# All relative paths used for reading and writing resolve against this directory.
os.chdir(myworkingdirectory)

In [3]:
# Read the crash-level and passenger-level workbooks from the working directory.
cds_df, cds_df_passengers = (
    pd.read_excel(workbook)
    for workbook in ('./ALL_CRASH.xlsx', './ALL_PASSENGER.xlsx')
)

In [10]:
# Preview the first rows of the crash table.
cds_df.head()

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,RTE_NAME,NODE_DIST_FT,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,2,ABLI070425075000,5540070001,ABLI,KY,2007-04-25,750,0.0,KNOB CREEK PARKING,0.0,...,,,,,,NaT,,,,0
1,3,ABLI070804175500,5540070013,ABLI,KY,2007-08-04,1755,0.0,,0.0,...,,,,,,NaT,,,,0
2,4,ABLI091117170900,N08113,ABLI,NY,2009-11-17,1709,,NEW YORK AVE,,...,,,,,,2014-02-07,,,,0
3,5,ABLI121009110000,12474,ABLI,KY,2012-10-09,1100,101.0,PRIVATE DRIVEWAY OFF EAST BEACH ROAD (875),,...,,,,,,2015-03-16,,,,0
4,6,ABLI140610163500,14054379,ABLI,KY,2014-06-10,1635,,DC 295,,...,38.91205,-76.93412,0.0,,,2014-06-18,,,,0


In [11]:
# Preview the first rows of the passenger table.
cds_df_passengers.head()

Unnamed: 0,OBJECTID,INCID_NO,UNIT_NO,PASS_SEQ,PASS_SEX,PASS_BELT,PASS_EJECT,PASS_SEAT,PASS_INJ,PASS_AGE
0,5,ABLI121009110000,1,1,1.0,1.0,0.0,3.0,0.0,52.0
1,193356,ABLI9100000001,1,1,1.0,1.0,0.0,3.0,0.0,64.0
2,198783,ACAD0000000014,1,1,2.0,1.0,0.0,3.0,0.0,36.0
3,199384,ACAD0000000083,1,1,2.0,1.0,0.0,3.0,0.0,46.0
4,199385,ACAD0000000083,1,2,2.0,1.0,0.0,4.0,0.0,25.0


In [14]:
# Frequency of each CRASH_CLASS code, most common first.
cds_df['CRASH_CLASS'].value_counts()

1.0     102007
2.0      47867
7.0      18945
0.0      14174
5.0       8176
88.0      5396
99.0      2858
4.0       1498
3.0       1438
98.0       822
10.0        34
6.0         32
Name: CRASH_CLASS, dtype: int64

In [15]:
# One 0/1 indicator column per CRASH_CLASS code, created in the order listed.
crash_class_labels = {
    0: 'Non-Collision',
    1: 'Collision with Other Motor Vehicle',
    2: 'Collision with Fixed Object',
    3: 'Collision with Pedestrian',
    4: 'Collision with Bicycle',
    5: 'Collision with Parked Motor Vehicle',
    6: 'Collision with Railway Train',
    7: 'Collision with Animal',
    88: 'Collision with Other Object',
    99: 'Collision with Unknown',
}
for class_code, label in crash_class_labels.items():
    cds_df[label] = np.where(cds_df['CRASH_CLASS'] == class_code, 1, 0)

# Any row whose CRASH_CLASS is not one of the codes above is flagged here;
# isin() is False for missing values, so rows with no CRASH_CLASS are flagged too.
cds_df['Other Accident Class'] = np.where(
    ~cds_df['CRASH_CLASS'].isin(list(crash_class_labels)), 1, 0
)


In [16]:
# One 0/1 indicator column per PASS_INJ code, created in the order listed.
injury_labels = {
    0: 'No Injury',
    1: 'Possible Injury',
    2: 'Non-incapacitating Injury',
    3: 'Incapacitating Injury',
    4: 'Fatality',
}
for injury_code, label in injury_labels.items():
    cds_df_passengers[label] = np.where(
        cds_df_passengers['PASS_INJ'] == injury_code, 1, 0
    )

# Codes 98 and 99 share a single indicator.
cds_df_passengers['Unknown Injury'] = np.where(
    cds_df_passengers['PASS_INJ'].isin([98, 99]), 1, 0
)
# Each passenger row counts as one occupant when summed per incident.
cds_df_passengers['NUM_OCC'] = 1

In [17]:
# (rows, columns) of the passenger and crash tables after the indicator columns were added.
cds_df_passengers.shape, cds_df.shape

((114151, 17), (204687, 67))

#### Data Quality Issues
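- The crash-level `FATALS` total does not match the number of passenger records coded as fatalities (1,206 vs. 298 in the comparison below).
- Only 60,136 of the 204,687 crash records have matching passenger records (see the merge indicator counts further down).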

In [18]:
# Compare the crash table's FATALS total with the number of passenger rows flagged 'Fatality'.
cds_df['FATALS'].sum(), cds_df_passengers['Fatality'].sum()

(1206.0, 298)

In [19]:
# Keep only the join key plus the per-passenger indicator columns.
passenger_indicator_cols = [
    'No Injury',
    'Possible Injury',
    'Non-incapacitating Injury',
    'Incapacitating Injury',
    'Fatality',
    'Unknown Injury',
    'NUM_OCC',
]
cds_df_passengers_slim = cds_df_passengers[['INCID_NO'] + passenger_indicator_cols]

In [20]:
# Collapse to one row per incident by summing the passenger indicators;
# as_index=False keeps INCID_NO as an ordinary column on a fresh integer index.
cds_df_passengers_agg = cds_df_passengers_slim.groupby(
    'INCID_NO', as_index=False
).sum()

In [21]:
# (rows, columns) of the per-incident aggregate, the passenger table, and the crash table.
cds_df_passengers_agg.shape, cds_df_passengers.shape, cds_df.shape

((60136, 8), (114151, 17), (204687, 67))

In [22]:
# Attach the per-incident passenger totals to every crash record.
# how='left' keeps crashes that have no passenger rows, and indicator=True adds
# a _merge column recording whether each crash found a match.
# validate='many_to_one' makes pandas raise MergeError if INCID_NO is ever
# duplicated on the passenger side, which would silently multiply crash rows.
cds_df_join = cds_df.merge(
    cds_df_passengers_agg,
    on='INCID_NO',
    how='left',
    indicator=True,
    validate='many_to_one',
)

In [23]:
# How many crash records did / did not find passenger data in the join.
cds_df_join['_merge'].value_counts()

left_only     144551
both           60136
right_only         0
Name: _merge, dtype: int64

In [24]:
# NOTE(review): hard-coded ratio. Neither 26734 nor 83926 appears in the shape or
# _merge outputs shown in this notebook (60136 matched of 204687 crash rows) --
# confirm where these figures come from before relying on the result.
26734/83926

0.31854252555822987

In [25]:
# Shorten two column names for the export: CASE_NUM -> CASENUM, STATE_CODE -> STATE.
cds_df_join = cds_df_join.rename(
    columns=dict(CASE_NUM="CASENUM", STATE_CODE="STATE")
)


In [26]:
# Keep a copy of the CRASH_DATE values as loaded, before the conversion below.
# CRASH_DATE2 is not among the columns kept by the later column-selection cell.
cds_df_join['CRASH_DATE2'] = cds_df_join['CRASH_DATE']
# Convert CRASH_DATE using an explicit "weekday, month day, year" format string.
# NOTE(review): the head() preview shows CRASH_DATE values such as 2007-04-25,
# which do not look like this format -- confirm the column's dtype and format.
cds_df_join['CRASH_DATE'] = pd.to_datetime( cds_df_join['CRASH_DATE'], 
                                            format = '%A, %B %d, %Y')

In [27]:
# Final column set for the cleaned export, listed in output order.
export_columns = (
    # Identifiers and occupant count
    ['INCID_NO', 'CASENUM', 'NUM_OCC']
    # Crash-class indicators
    + [
        'Non-Collision',
        'Collision with Other Motor Vehicle',
        'Collision with Fixed Object',
        'Collision with Pedestrian',
        'Collision with Bicycle',
        'Collision with Parked Motor Vehicle',
        'Collision with Railway Train',
        'Collision with Animal',
        'Collision with Other Object',
        'Collision with Unknown',
        'Other Accident Class',
    ]
    # Per-incident passenger injury totals
    + [
        'No Injury',
        'Possible Injury',
        'Non-incapacitating Injury',
        'Incapacitating Injury',
        'Fatality',
        'Unknown Injury',
    ]
    # Location, time and crash-level casualty fields
    + [
        'PARK_ALPHA',
        'STATE',
        'CRASH_DATE',
        'CRASH_TIME',
        'LATITUDE',
        'LONGITUDE',
        'FATALS',
        'INJURED',
    ]
)
cds_df_join = cds_df_join[export_columns]

In [28]:
# Check that the joined table has the same number of rows as the crash table.
cds_df.shape, cds_df_join.shape

((204687, 67), (204687, 28))

In [29]:
# Write the cleaned table to the current working directory (set by os.chdir above);
# index=False leaves the row index out of the file.
cds_df_join.to_csv("crash_data_CDS_clean.csv", index=False)


In [1]:
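# Speed Data Investigation
# NOTE(review): execution counts restart here and cds_df shows 56 columns below (67 above),
# so this section appears to have been run in a separate kernel session -- it needs the
# imports and the ALL_CRASH.xlsx load from the top of the notebook.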

In [5]:
# Contributing Factors code A05 refers to Exceeded Posted Speed Limits; code A06 refers to
# Too Fast for Conditions. See the Code Dictionary via the Microsoft Access Database.
# value_counts() leaves out nulls by default; sort_index() orders the result by code.
# NOTE(review): the output lists lowercase variants ('a16', 'e88') that are tallied
# separately from 'A16' / 'E88'.
cds_df['CON_FACT1'].value_counts().sort_index()

A01     1019
A02     5358
A03     6476
A04     5589
A05     4118
A06    12306
A07     2893
A08     1185
A09     9620
A10     3499
A11     8182
A12      997
A13       81
A14     1413
A15     3200
A16    40316
A17      693
A18      178
A88    13487
B01      137
B02      806
B03      106
B04      397
B05     6258
B06    16703
B07      977
B08     1816
B88     1450
C01     1075
C02      606
C03       10
C04       65
C05       34
C06      185
C07      204
C08      198
C88     1048
D01       11
D02       26
D03      234
D04      116
D05      184
D06       38
D07       20
D88      128
E01     1810
E02     2204
E03     1106
E04      508
E05     1123
E06       86
E07      278
E88      621
U99    21055
a16        2
e88        1
Name: CON_FACT1, dtype: int64

In [7]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [6]:
# Shape of the subset of crash rows where CON_FACT1 is missing (first element = row count).
cds_df[cds_df['CON_FACT1'].isna()].shape

(22451, 56)

In [8]:
# Percentage of crash records with no CON_FACT1 value
# (22451 null rows out of 204687, taken from the two cells above).
# Scale to a percentage first and round last, consistent with the other
# missing-percentage cells, so the result is a clean one-decimal figure.
round(100 * 22451 / 204687, 1)

11.0

In [9]:
# Frequency of each CON_FACT2 code, ordered by code (value_counts() leaves out nulls by default).
cds_df['CON_FACT2'].value_counts().sort_index()

A01       88
A02      680
A03      644
A04     1341
A05     1582
A06     3092
A07      887
A08      645
A09     1914
A10      777
A11     1169
A12      427
A13       67
A14      500
A15      562
A16    11034
A17      513
A18      167
A88     2355
B01      140
B02      450
B03       57
B04      118
B05     6221
B06     2581
B07     1350
B08     4082
B88     1373
C01      565
C02      477
C03        4
C04       78
C05      117
C06       58
C07       98
C08      200
C88      910
D01        2
D02       18
D03       57
D04       74
D05      107
D06       40
D07       34
D88      121
E01     5979
E02     4357
E03      835
E04      678
E05     1199
E06       59
E07     1167
E88     1141
U99    80563
Name: CON_FACT2, dtype: int64

In [10]:
# Shape of the subset of crash rows where CON_FACT2 is missing (first element = row count).
cds_df[cds_df['CON_FACT2'].isna()].shape

(60933, 56)

In [43]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [11]:
# Percentage of crash records with no CON_FACT2 value
# (60933 null rows out of 204687, taken from the cells above).
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 yields float artifacts such as 29.799999999999997.
round(100 * 60933 / 204687, 1)

29.799999999999997

In [12]:
# Frequency of each CON_FACT3 code, ordered by code (value_counts() leaves out nulls by default).
cds_df['CON_FACT3'].value_counts().sort_index()

A01        69
A02       334
A03       219
A04       423
A05       637
A06      1742
A07       338
A08       330
A09       454
A10       306
A11       249
A12       179
A13        28
A14       106
A15       168
A16      2693
A17       228
A18        57
A88       680
B01        53
B02       185
B03        29
B04        62
B05      2448
B06       441
B07       455
B08      2264
B88       379
C01       226
C02       471
C03         5
C04        33
C05        57
C06        36
C07        41
C08        99
C88       729
D01         2
D02         9
D03        24
D04         7
D05        34
D06        15
D07        24
D88        63
E01      4356
E02      1544
E03       314
E04       253
E05       449
E06        43
E07       302
E88       336
U99    104746
Name: CON_FACT3, dtype: int64

In [15]:
# Shape of the subset of crash rows where CON_FACT3 is missing (first element = row count).
cds_df[cds_df['CON_FACT3'].isna()].shape

(74913, 56)

In [16]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [17]:
# Percentage of crash records with no CON_FACT3 value
# (74913 null rows out of 204687, taken from the cells above).
# Scale to a percentage first and round last, consistent with the other
# missing-percentage cells, so the result is a clean one-decimal figure.
round(100 * 74913 / 204687, 1)

36.6

In [18]:
# Frequency of each CON_FACT4 code, ordered by code (value_counts() leaves out nulls by default).
cds_df['CON_FACT4'].value_counts().sort_index()

A01        31
A02       219
A03       219
A04       203
A05       302
A06       811
A07       180
A08       194
A09       341
A10       181
A11       236
A12        83
A13        15
A14        99
A15        31
A16      1391
A17        95
A18        27
A88       474
B01        26
B02        55
B03        11
B04        26
B05       819
B06        97
B07       151
B08       710
B88       124
C01        58
C02        84
C03         1
C04         8
C05        22
C06        10
C07         7
C08        24
C88       101
D01         3
D02         9
D03        18
D04        25
D05        14
D06         9
D07        11
D88       275
E01      1297
E02       447
E03        90
E04        79
E05       184
E06        11
E07       123
E88       208
U99    114381
Name: CON_FACT4, dtype: int64

In [19]:
# Shape of the subset of crash rows where CON_FACT4 is missing (first element = row count).
cds_df[cds_df['CON_FACT4'].isna()].shape

(80037, 56)

In [20]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [21]:
# Percentage of crash records with no CON_FACT4 value
# (80037 null rows out of 204687, taken from the cells above).
# Scale to a percentage first and round last, consistent with the other
# missing-percentage cells, so the result is a clean one-decimal figure.
round(100 * 80037 / 204687, 1)

39.1

In [22]:
# Frequency of each CON_FACT5 code, ordered by code (value_counts() leaves out nulls by default).
cds_df['CON_FACT5'].value_counts().sort_index()

A01        17
A02        90
A03        44
A04       105
A05       128
A06       267
A07        37
A08        42
A09        43
A10        27
A11        34
A12        33
A13         9
A14         9
A15        11
A16       418
A17        36
A18        15
A88        99
B01         7
B02        12
B03         4
B04         7
B05       197
B06        15
B07        67
B08       175
B88        63
C01        19
C02        26
C04         5
C05         5
C06         3
C07         1
C08         6
C88        34
D03         5
D04         7
D05         9
D06         3
D88        11
E01       554
E02       210
E03        42
E04        41
E05        82
E06         5
E07        70
E88       272
U99    118594
Name: CON_FACT5, dtype: int64

In [23]:
# Shape of the subset of crash rows where CON_FACT5 is missing (first element = row count).
cds_df[cds_df['CON_FACT5'].isna()].shape

(82672, 56)

In [24]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [25]:
# Percentage of crash records with no CON_FACT5 value
# (82672 null rows out of 204687, taken from the cells above).
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 yields float artifacts such as 40.400000000000006.
round(100 * 82672 / 204687, 1)

40.400000000000006

In [26]:
# Frequency of each CON_FACT6 code, ordered by code (value_counts() leaves out nulls by default).
cds_df['CON_FACT6'].value_counts().sort_index()

A01        20
A02        52
A03        22
A04        44
A05        39
A06        95
A07         7
A08        16
A09         5
A10         6
A11         7
A12         6
A14         4
A15         1
A16        91
A17        12
A18         3
A88        44
B01         1
B02         4
B03         1
B05        37
B06         3
B07        13
B08        29
B88         3
C01         5
C02         4
C04         2
C05         1
C07         2
C08         3
C88         6
D02         1
D03         4
D04         1
D88         4
E01       168
E02        58
E03        10
E04        19
E05        24
E06         4
E07        29
E88        56
U99    120172
Name: CON_FACT6, dtype: int64

In [27]:
# Shape of the subset of crash rows where CON_FACT6 is missing (first element = row count).
cds_df[cds_df['CON_FACT6'].isna()].shape

(83549, 56)

In [28]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape

(204687, 56)

In [29]:
# Percentage of crash records with no CON_FACT6 value
# (83549 null rows out of 204687, taken from the cells above).
# Scale to a percentage first and round last, consistent with the other
# missing-percentage cells, so the result is a clean one-decimal figure.
round(100 * 83549 / 204687, 1)

40.8

In [None]:
# Load the ALL_UNIT workbook from the current working directory.
cds_df_unit = pd.read_excel('./ALL_UNIT.xlsx')

In [6]:
# Frequency of each SPEED_LIMIT value, ordered by value (nulls are left out by value_counts()).
cds_df_unit['SPEED_LIMIT'].value_counts().sort_index()

5.0      9385
10.0     4002
15.0    26493
20.0     3218
25.0    65332
30.0    14313
35.0    47061
40.0    15184
45.0    47545
50.0    16816
55.0    28737
60.0      132
65.0      117
70.0        4
75.0       19
99.0    17894
Name: SPEED_LIMIT, dtype: int64

In [7]:
# (rows, columns) of the unit table; the row count is the denominator of the percentage below.
cds_df_unit.shape

(311057, 31)

In [9]:
# Shape of the subset of unit rows where SPEED_LIMIT is missing (first element = row count).
cds_df_unit[cds_df_unit['SPEED_LIMIT'].isna()].shape

(14805, 31)

In [10]:
# Percentage of vehicle (unit) records with a missing speed limit
# (14805 null rows out of 311057, taken from the cells above).
# Scale to a percentage first and round last, consistent with the other
# missing-percentage cells, so the result is a clean one-decimal figure.
round(100 * 14805 / 311057, 1)

4.8

In [11]:
# Violation Charge code 05 refers to Exceeding Posted Speed Limit; code 06 refers to
# Too Fast for Conditions. See the Code Dictionary via the Microsoft Access Database.
# The codes display as floats here (5.0 and 6.0 in the output).
# value_counts() leaves out nulls by default; sort_index() orders the result by code.
cds_df_unit['VIOL_CHG1'].value_counts().sort_index()

0.0     224650
1.0        655
2.0       4278
3.0       5025
4.0       2599
5.0        977
6.0       4259
7.0       1635
8.0        753
9.0       8226
10.0      3137
11.0      2356
12.0       510
13.0        52
14.0       423
15.0       713
16.0     15809
17.0       789
18.0         4
88.0     26051
98.0      1638
99.0      1331
Name: VIOL_CHG1, dtype: int64

In [62]:
# Shape of the subset of unit rows where VIOL_CHG1 is missing (first element = row count).
cds_df_unit[cds_df_unit['VIOL_CHG1'].isna()].shape

(5187, 31)

In [15]:
# Percentage of unit records with no VIOL_CHG1 value
# (5187 null rows out of 311057, taken from the cells above).
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 yields float artifacts such as 1.7000000000000002.
round(100 * 5187 / 311057, 1)

1.7000000000000002

In [12]:
# Violation Charge code 05 refers to Exceeding Posted Speed Limit; code 06 refers to
# Too Fast for Conditions. See the Code Dictionary via the Microsoft Access Database.
# The codes display as floats here (5.0 and 6.0 in the output).
# value_counts() leaves out nulls by default; sort_index() orders the result by code.
cds_df_unit['VIOL_CHG2'].value_counts().sort_index()

0.0     285121
1.0        153
2.0        580
3.0        413
4.0        695
5.0        463
6.0        850
7.0        260
8.0        281
9.0        757
10.0       423
11.0       225
12.0       107
13.0        14
14.0        73
15.0       169
16.0      2974
17.0       618
18.0         3
88.0      8381
98.0      1832
99.0      1313
Name: VIOL_CHG2, dtype: int64

In [63]:
# Shape of the subset of unit rows where VIOL_CHG2 is missing (first element = row count).
cds_df_unit[cds_df_unit['VIOL_CHG2'].isna()].shape

(5352, 31)

In [61]:
# (rows, columns) of the unit table; the row count is the denominator of the percentage below.
cds_df_unit.shape

(311057, 31)

In [16]:
# Percentage of unit records with no VIOL_CHG2 value
# (5352 null rows out of 311057, taken from the cells above).
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 yields float artifacts such as 1.7000000000000002.
round(100 * 5352 / 311057, 1)

1.7000000000000002

In [66]:
# Crash Class 07 (7.0 in the output) refers to Collision with Animal.
# See the Code Dictionary via the Microsoft Access Database.
# value_counts() orders the result by frequency and leaves out nulls by default.
cds_df['CRASH_CLASS'].value_counts()

1.0     102007
2.0      47867
7.0      18945
0.0      14174
5.0       8176
88.0      5396
99.0      2858
4.0       1498
3.0       1438
98.0       822
10.0        34
6.0         32
Name: CRASH_CLASS, dtype: int64

In [67]:
# (rows, columns) of the crash table; the row count is the denominator of the percentage below.
cds_df.shape 

(204687, 56)

In [68]:
# Shape of the subset of crash rows where CRASH_CLASS is missing (first element = row count).
cds_df[cds_df['CRASH_CLASS'].isna()].shape

(1440, 56)

In [69]:
# Percentage of crash records (cds_df is loaded from ALL_CRASH.xlsx) with a missing
# CRASH_CLASS: 1440 null rows out of 204687, taken from the cells above.
# Scale to a percentage first and round last: rounding the fraction and then
# multiplying by 100 yields float artifacts such as 0.7000000000000001.
round(100 * 1440 / 204687, 1)

0.7000000000000001