## Crash Data Speed and Wildlife Exploration Jupyter Notebook

**Author:** Smitha Mahesh 

**Date:** 6/28/2022 

**Purpose:** This notebook use the new CDS files as the input and identifies the count present for too fast, speeding, and wildlife-related crashes, as well as figuring out what count of the data is missing for these variables of interest. 

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
myworkingdirectory = r"C:\Users\smitha.mahesh\Desktop\New CDS Excel Files"
os.chdir(myworkingdirectory)

In [3]:
cds_df = pd.read_excel('./ALL_CRASH.xlsx')
cds_df_passengers = pd.read_excel('./ALL_PASSENGER.xlsx')

In [10]:
cds_df.head()

Unnamed: 0,OBJECTID,INCID_NO,CASE_NUM,PARK_ALPHA,STATE_CODE,CRASH_DATE,CRASH_TIME,RTE_NO,RTE_NAME,NODE_DIST_FT,...,LATITUDE,LONGITUDE,MILEPOST,IMPORT_DATE,FILE_NAME,SAVE_DATE,ROUTE_IDENT,RIP_CYCLE,MP_NODE,SPTL_LOC
0,2,ABLI070425075000,5540070001,ABLI,KY,2007-04-25,750,0.0,KNOB CREEK PARKING,0.0,...,,,,,,NaT,,,,0
1,3,ABLI070804175500,5540070013,ABLI,KY,2007-08-04,1755,0.0,,0.0,...,,,,,,NaT,,,,0
2,4,ABLI091117170900,N08113,ABLI,NY,2009-11-17,1709,,NEW YORK AVE,,...,,,,,,2014-02-07,,,,0
3,5,ABLI121009110000,12474,ABLI,KY,2012-10-09,1100,101.0,PRIVATE DRIVEWAY OFF EAST BEACH ROAD (875),,...,,,,,,2015-03-16,,,,0
4,6,ABLI140610163500,14054379,ABLI,KY,2014-06-10,1635,,DC 295,,...,38.91205,-76.93412,0.0,,,2014-06-18,,,,0


In [11]:
cds_df_passengers.head()

Unnamed: 0,OBJECTID,INCID_NO,UNIT_NO,PASS_SEQ,PASS_SEX,PASS_BELT,PASS_EJECT,PASS_SEAT,PASS_INJ,PASS_AGE
0,5,ABLI121009110000,1,1,1.0,1.0,0.0,3.0,0.0,52.0
1,193356,ABLI9100000001,1,1,1.0,1.0,0.0,3.0,0.0,64.0
2,198783,ACAD0000000014,1,1,2.0,1.0,0.0,3.0,0.0,36.0
3,199384,ACAD0000000083,1,1,2.0,1.0,0.0,3.0,0.0,46.0
4,199385,ACAD0000000083,1,2,2.0,1.0,0.0,4.0,0.0,25.0


In [14]:
cds_df.CRASH_CLASS.value_counts()

1.0     102007
2.0      47867
7.0      18945
0.0      14174
5.0       8176
88.0      5396
99.0      2858
4.0       1498
3.0       1438
98.0       822
10.0        34
6.0         32
Name: CRASH_CLASS, dtype: int64

In [15]:
cds_df['Non-Collision']= np.where(cds_df['CRASH_CLASS']==0, 1,0)
cds_df['Collision with Other Motor Vehicle']= np.where(cds_df['CRASH_CLASS']==1, 1,0)
cds_df['Collision with Fixed Object']= np.where(cds_df['CRASH_CLASS']==2, 1,0)
cds_df['Collision with Pedestrian']= np.where(cds_df['CRASH_CLASS']==3, 1,0)
cds_df['Collision with Bicycle']= np.where(cds_df['CRASH_CLASS']==4, 1,0)
cds_df['Collision with Parked Motor Vehicle']= np.where(cds_df['CRASH_CLASS']==5, 1,0)
cds_df['Collision with Railway Train']= np.where(cds_df['CRASH_CLASS']==6, 1,0)
cds_df['Collision with Animal']= np.where(cds_df['CRASH_CLASS']==7, 1,0)
cds_df['Collision with Other Object']= np.where(cds_df['CRASH_CLASS']==88, 1,0)
cds_df['Collision with Unknown']= np.where(cds_df['CRASH_CLASS']==99, 1,0)
cds_df['Other Accident Class']= np.where(cds_df['CRASH_CLASS'].isin([0,1,2,3,4,5,6,7,88,99])==False,
                                                                                        1,0)


In [16]:
cds_df_passengers['No Injury']= np.where(cds_df_passengers['PASS_INJ']==0, 1,0)
cds_df_passengers['Possible Injury']= np.where(cds_df_passengers['PASS_INJ']==1, 1,0)
cds_df_passengers['Non-incapacitating Injury']= np.where(cds_df_passengers['PASS_INJ']==2, 1,0)
cds_df_passengers['Incapacitating Injury']= np.where(cds_df_passengers['PASS_INJ']==3, 1,0)
cds_df_passengers['Fatality']= np.where(cds_df_passengers['PASS_INJ']==4, 1,0)
cds_df_passengers['Unknown Injury']= np.where(cds_df_passengers['PASS_INJ'].isin([98,99]), 1,0)
cds_df_passengers['NUM_OCC']=1

In [17]:
cds_df_passengers.shape, cds_df.shape

((114151, 17), (204687, 67))

#### Data Quality Issues
- 

In [18]:
cds_df['FATALS'].sum(), cds_df_passengers['Fatality'].sum()

(1206.0, 298)

In [19]:
cds_df_passengers_slim = cds_df_passengers[['INCID_NO', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'NUM_OCC']]

In [20]:
cds_df_passengers_agg = cds_df_passengers_slim.groupby(by=['INCID_NO']).sum()
cds_df_passengers_agg = cds_df_passengers_agg.reset_index()

In [21]:
cds_df_passengers_agg.shape, cds_df_passengers.shape, cds_df.shape

((60136, 8), (114151, 17), (204687, 67))

In [22]:
cds_df_join = cds_df.merge(cds_df_passengers_agg, on = 'INCID_NO',how='left',indicator = True)

In [23]:
cds_df_join._merge.value_counts()

left_only     144551
both           60136
right_only         0
Name: _merge, dtype: int64

In [24]:
26734/83926

0.31854252555822987

In [25]:
cds_df_join = cds_df_join.rename(columns={"CASE_NUM": "CASENUM", 
                                      "STATE_CODE": "STATE"})


In [26]:
cds_df_join['CRASH_DATE2'] = cds_df_join['CRASH_DATE']
cds_df_join['CRASH_DATE'] = pd.to_datetime( cds_df_join['CRASH_DATE'], 
                                            format = '%A, %B %d, %Y')

In [27]:
cds_df_join = cds_df_join[['INCID_NO', 'CASENUM', 'NUM_OCC', 'Non-Collision',
       'Collision with Other Motor Vehicle', 'Collision with Fixed Object',
       'Collision with Pedestrian', 'Collision with Bicycle',
       'Collision with Parked Motor Vehicle', 'Collision with Railway Train',
       'Collision with Animal', 'Collision with Other Object',
       'Collision with Unknown', 'Other Accident Class', 'No Injury',
       'Possible Injury', 'Non-incapacitating Injury', 'Incapacitating Injury',
       'Fatality', 'Unknown Injury', 'PARK_ALPHA', 'STATE', 'CRASH_DATE',
       'CRASH_TIME', 'LATITUDE', 'LONGITUDE', 'FATALS', 'INJURED']]

In [28]:
cds_df.shape, cds_df_join.shape

((204687, 67), (204687, 28))

In [29]:
cds_df_join.to_csv("crash_data_CDS_clean.csv", index=False)


In [1]:
# Speed Data Investigation

In [12]:
exceeding_speed1= cds_df.loc[cds_df['CON_FACT1'].str.contains("A05")==True]
exceeding_speed1['CON_FACT1']

29        A05
30        A05
85        A05
86        A05
137       A05
         ... 
204482    A05
204484    A05
204493    A05
204567    A05
204636    A05
Name: CON_FACT1, Length: 4118, dtype: object

In [13]:
exceeding_speed1.shape

(4118, 56)

In [None]:
#4118 crashes were contributing factor 1 included 'exceeding posted speeding limits'

In [14]:
cds_df.loc[cds_df['CON_FACT1'].isnull()==True].shape

(22451, 56)

In [47]:
exceeding_speed2= cds_df.loc[cds_df['CON_FACT2'].str.contains("A05")==True]
exceeding_speed2['CON_FACT2']
exceeding_speed2.shape

(1582, 56)

In [28]:
#1582 crashes were contributing factor 2 included 'exceeding posted speeding limits'

In [29]:
cds_df.loc[cds_df['CON_FACT2'].isnull()==True].shape

(60933, 56)

In [30]:
exceeding_speed3= cds_df.loc[cds_df['CON_FACT3'].str.contains("A05")==True]
exceeding_speed3['CON_FACT3']
exceeding_speed3.shape

(637, 56)

In [31]:
#637 crashes were contributing factor 3 included 'exceeding posted speeding limits'

In [32]:
cds_df.loc[cds_df['CON_FACT3'].isnull()==True].shape

(74913, 56)

In [33]:
exceeding_speed4= cds_df.loc[cds_df['CON_FACT4'].str.contains("A05")==True]
exceeding_speed4['CON_FACT4']
exceeding_speed4.shape

(302, 56)

In [None]:
#302 crashes were contributing factor 4 included 'exceeding posted speeding limits'

In [34]:
cds_df.loc[cds_df['CON_FACT4'].isnull()==True].shape

(80037, 56)

In [35]:
exceeding_speed5= cds_df.loc[cds_df['CON_FACT5'].str.contains("A05")==True]
exceeding_speed5['CON_FACT5']
exceeding_speed5.shape

(128, 56)

In [None]:
#128 crashes were contributing factor 5 included 'exceeding posted speeding limits'

In [36]:
cds_df.loc[cds_df['CON_FACT5'].isnull()==True].shape

(82672, 56)

In [37]:
exceeding_speed6= cds_df.loc[cds_df['CON_FACT6'].str.contains("A05")==True]
exceeding_speed6['CON_FACT6']
exceeding_speed6.shape

(39, 56)

In [None]:
#39 crashes were contributing factor 6 included 'exceeding posted speeding limits'

In [38]:
cds_df.loc[cds_df['CON_FACT6'].isnull()==True].shape

(83549, 56)

In [43]:
cds_df.shape

(204687, 56)

In [44]:
#the percentage of crash records with missing "Contributing Factors" entries are among CON_FACT1 through CONFACT6 
round((22451+60933+74913+80037+8267+8354)/204687,3)*100
#This is bigger than 100% because the contributing factor of 'exceeding posted speeding limit' can be recorded in any CON_FACT 1 through 6

124.6

In [55]:
too_fast1= cds_df.loc[cds_df['CON_FACT1'].str.contains("A06")==True]
too_fast1['CON_FACT1']
too_fast1.shape

(12306, 56)

In [None]:
#12306 crashes were contributing factor 1 included 'too fast for conditions'

In [50]:
too_fast2= cds_df.loc[cds_df['CON_FACT2'].str.contains("A06")==True]
too_fast2['CON_FACT2']
too_fast2.shape

(3092, 56)

In [None]:
#3092 crashes were contributing factor 2 included 'too fast for conditions'

In [51]:
too_fast3= cds_df.loc[cds_df['CON_FACT3'].str.contains("A06")==True]
too_fast3['CON_FACT3']
too_fast3.shape

(1742, 56)

In [None]:
#1742 crashes were contributing factor 3 included 'too fast for conditions'

In [52]:
too_fast4= cds_df.loc[cds_df['CON_FACT4'].str.contains("A06")==True]
too_fast4['CON_FACT4']
too_fast4.shape

(811, 56)

In [None]:
#811 crashes were contributing factor 4 included 'too fast for conditions'

In [53]:
too_fast5= cds_df.loc[cds_df['CON_FACT5'].str.contains("A06")==True]
too_fast5['CON_FACT5']
too_fast5.shape

(267, 56)

In [None]:
#267 crashes were contributing factor 5 included 'too fast for conditions'

In [54]:
too_fast6= cds_df.loc[cds_df['CON_FACT6'].str.contains("A06")==True]
too_fast6['CON_FACT6']
too_fast6.shape

(95, 56)

In [None]:
#95 crashes were contributing factor 6 included 'too fast for conditions'

In [56]:
#the percentage of crash records with missing "Contributing Factors" entries are among CON_FACT1 through CONFACT6 
round((22451+60933+74913+80037+8267+8354)/204687,3)*100
#This is bigger than 100% because the contributing factor of 'exceeding posted speeding limit' can be recorded in any CON_FACT 1 through 6

124.6

In [58]:
cds_df_unit = pd.read_excel('./ALL_UNIT.xlsx')
cds_df_unit['VIOL_CHG1'].value_counts()

0.0     224650
88.0     26051
16.0     15809
9.0       8226
3.0       5025
2.0       4278
6.0       4259
10.0      3137
4.0       2599
11.0      2356
98.0      1638
7.0       1635
99.0      1331
5.0        977
17.0       789
8.0        753
15.0       713
1.0        655
12.0       510
14.0       423
13.0        52
18.0         4
Name: VIOL_CHG1, dtype: int64

In [62]:
cds_df_unit.loc[cds_df_unit['VIOL_CHG1'].isnull()==True].shape

(5187, 31)

In [60]:
cds_df_unit['VIOL_CHG2'].value_counts()

0.0     285121
88.0      8381
16.0      2974
98.0      1832
99.0      1313
6.0        850
9.0        757
4.0        695
17.0       618
2.0        580
5.0        463
10.0       423
3.0        413
8.0        281
7.0        260
11.0       225
15.0       169
1.0        153
12.0       107
14.0        73
13.0        14
18.0         3
Name: VIOL_CHG2, dtype: int64

In [63]:
cds_df_unit.loc[cds_df_unit['VIOL_CHG2'].isnull()==True].shape

(5352, 31)

In [61]:
cds_df_unit.shape

(311057, 31)

In [65]:
#the percentage of unit records with missing "Violation Charges" entries are among 
round((5187+5352)/311057,3)*100


3.4000000000000004

In [66]:
cds_df['CRASH_CLASS'].value_counts()

1.0     102007
2.0      47867
7.0      18945
0.0      14174
5.0       8176
88.0      5396
99.0      2858
4.0       1498
3.0       1438
98.0       822
10.0        34
6.0         32
Name: CRASH_CLASS, dtype: int64

In [67]:
cds_df.shape 

(204687, 56)

In [68]:
cds_df.loc[cds_df['CRASH_CLASS'].isnull()==True].shape

(1440, 56)

In [69]:
#the percentage of unit records with missing "CRASH CLASS" 
round((1440)/204687,3)*100

0.7000000000000001

In [14]:
wildlife_crashes= cds_df.loc[cds_df['CRASH_CLASS'].float.contains(7.0)==True]
wildlife_crashes['CRASH_CLASS']
wildlife_crashes.shape

AttributeError: 'Series' object has no attribute 'float'