In [1]:
# Import the dependencies.
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
import numpy as np
import re
import tensorflowjs as tfjs

# Import the CRSS connection string.
from config import crss_conn_string

# CONNECT TO THE DATABASE

In [2]:
# Create CSSR engine.
cssr_engine = create_engine(crss_conn_string);

In [3]:
# Connect to server.
cssr_conn = cssr_engine.connect();

In [4]:
# Create accidents, vehicles, people data frame.
avp_df = pd.read_sql("SELECT * FROM public.accident_vehicle_person", cssr_conn)

In [5]:
# Show first 10 rows.
pd.set_option("display.max_columns", None)
avp_df.head()

Unnamed: 0,casenum,urbancity,ve_total,ve_forms,permvit,num_inj,month,year,day_week,hour,alcohol,max_sev,wrk_zone,lgt_cond,weather,veh_no,numoccs,m_harmname,makename,tow_vehname,trav_speed,deformedname,towedname,speedrelname,vtrafwayname,vspd_lim,bdytyp_imname,mod_year,p_crash1name,per_no,rest_usename,rest_misname,helm_usename,helm_misname,drinkingname,alc_resname,drugsname,hospitalname,locationname,sex_imname,injsev_imname,peralch_imname,seat_imname,age_im
0,201901176716,2,2,2,2,1,1,2019,4,19,2,3,0,2,1,2,1,Motor Vehicle In-Transport,Peterbilt,No Trailing Units,998,Minor Damage,Not Reported,No,"Two-Way, Not Divided",50,Single-unit straight truck or Cab-Chassis (GVW...,2014,Backing Up (other than for Parking Position),1,None Used/Not Applicable,None Used/Not Applicable,Not Applicable,None Used/Not Applicable,No (Alcohol Not Involved),Test Not Given,No (drugs not involved),Not Transported,Occupant of a Motor Vehicle,Male,No Apparent Injury (O),No (Alcohol Not Involved),"Front Seat, Left Side",59
1,201901182038,1,1,1,1,1,1,2019,4,15,9,1,0,1,1,1,99,Pedalcyclist,Unknown Make,No Trailing Units,998,Not Reported,Not Towed,No,Not Reported,98,Unknown body type,9999,Turning Right,1,Other,No Indication of Mis-Use,Not Applicable,None Used/Not Applicable,Reported as Unknown,Reported as Unknown if Tested,Reported as Unknown,Not Transported,Occupant of a Motor Vehicle,Reported as Unknown,Unknown/Not Reported,No (Alcohol Not Involved),"Front Seat, Left Side",998
2,201901248479,2,2,2,2,2,1,2019,5,7,2,1,0,1,1,2,1,Motor Vehicle In-Transport,Nissan/Datsun,No Trailing Units,0,Minor Damage,Not Towed,No,"Two-Way, Divided, Unprotected Median",35,Station Wagon (excluding van and truck based),2015,Going Straight,1,Reported as Unknown,None Used/Not Applicable,Not Applicable,None Used/Not Applicable,No (Alcohol Not Involved),Test Not Given,No (drugs not involved),Not Transported,Occupant of a Motor Vehicle,Female,Possible Injury (C),No (Alcohol Not Involved),"Front Seat, Left Side",39
3,201901248497,2,2,2,2,0,1,2019,6,15,9,0,0,1,1,1,99,Motor Vehicle In-Transport,Unknown Make,No Trailing Units,999,Reported as Unknown,Not Towed,Reported as Unknown,Not Reported,35,"Utility Vehicle, Unknown body type",9999,Going Straight,1,Reported as Unknown,None Used/Not Applicable,Not Applicable,None Used/Not Applicable,Reported as Unknown,Test Not Given,Reported as Unknown,Not Transported,Occupant of a Motor Vehicle,Reported as Unknown,Unknown/Not Reported,No (Alcohol Not Involved),"Front Seat, Left Side",999
4,201901250686,1,2,2,4,4,1,2019,2,19,2,3,0,3,10,1,4,Fire/Explosion,Chevrolet,No Trailing Units,80,Disabling Damage,Towed Due to Disabling Damage,"Yes, Too Fast for Conditions","Two-Way, Divided, Positive Median Barrier",70,"4-door sedan, hardtop",2008,Going Straight,1,Shoulder Belt Only Used,"Yes, Indication of Mis-Use",Not Applicable,None Used/Not Applicable,No (Alcohol Not Involved),Test Not Given,Yes (drugs involved),EMS Unknown Mode,Occupant of a Motor Vehicle,Male,Suspected Minor Injury (B),No (Alcohol Not Involved),"Front Seat, Left Side",17


In [6]:
# Write to CSV
avp_df.to_csv('all_gas_no_brakes.csv')

# PREPROCCESSING

In [7]:
# Describe Data
avp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256440 entries, 0 to 256439
Data columns (total 44 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   casenum         256440 non-null  object
 1   urbancity       256440 non-null  int64 
 2   ve_total        256440 non-null  int64 
 3   ve_forms        256440 non-null  int64 
 4   permvit         256440 non-null  int64 
 5   num_inj         256440 non-null  int64 
 6   month           256440 non-null  int64 
 7   year            256440 non-null  int64 
 8   day_week        256440 non-null  int64 
 9   hour            256440 non-null  int64 
 10  alcohol         256440 non-null  int64 
 11  max_sev         256440 non-null  int64 
 12  wrk_zone        256440 non-null  int64 
 13  lgt_cond        256440 non-null  int64 
 14  weather         256440 non-null  int64 
 15  veh_no          256440 non-null  int64 
 16  numoccs         256440 non-null  int64 
 17  m_harmname      256440 non-nu

In [8]:
# Recode Urban City : 0 = urban, 1 = city
avp_df['urbancity'] = avp_df['urbancity'].replace(1,0)
avp_df['urbancity'] = avp_df['urbancity'].replace(2,1)
avp_df['urbancity'].unique()

array([1, 0], dtype=int64)

In [9]:
# Recode Month : 1 = Jan, 12 = Dec
avp_df['month']= avp_df['month'].replace([1,2,3,4,5,6,7,8,9,10,11,12],['Jan','Feb','Mar','Apr','May',
                                                                             'Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
avp_df['month'].unique()

array(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Sep', 'Jun', 'Jul', 'Aug',
       'Nov', 'Oct', 'Dec'], dtype=object)

In [10]:
# Recode Day of the week 1= Sunday, 7 = Saturday
avp_df['day_week']= avp_df['day_week'].replace([1,2,3,4,5,6,7],['Sun','Mon','Tues','Wed','Thus','Fri','Sat'])
avp_df['day_week'].unique()

array(['Wed', 'Thus', 'Fri', 'Mon', 'Tues', 'Sat', 'Sun'], dtype=object)

In [11]:
# Recode HOUR = if unknown, set at 17 (5 pm - most frequent time)
avp_df['hour'] = avp_df['hour'].replace(99,17)
avp_df['hour'].unique()

array([19, 15,  7,  6,  9, 22,  8, 17, 20, 12, 18, 13, 16, 21, 11,  2, 10,
       14,  0,  3,  5, 23,  4,  1], dtype=int64)

In [12]:
# Recode Alcoholinvolvement of the DRIVER 1= alcohol involved, 2,8,9 =  no alcohol
avp_df['alcohol'] = avp_df['alcohol'].replace([1,2,8,9],[1,0,0,0])
avp_df['alcohol'].unique()

array([0, 1], dtype=int64)

In [13]:
# Recode max severity - accident level 0,8,9 = none, 1,2,5 = minor, 3 = serious, 4,6 = fatal
avp_df['max_sev']=avp_df['max_sev'].replace([0,8,9,1,2,5,3,4,6],['none', 'none','none','minor','minor',
                                                                 'minor','serious','fatal','fatal'])
avp_df['max_sev'].unique()

array(['serious', 'minor', 'none', 'fatal'], dtype=object)

In [14]:
# Recode work zone 0 = no (0), 1,2,3,4 = yes (1)
avp_df['wrk_zone'] = avp_df['wrk_zone'].replace([0,1,2,3,4],[0,1,1,1,1])
avp_df['wrk_zone'].unique()

array([0, 1], dtype=int64)

In [15]:
# Recode lighting conditions 1= daylight, 2,3,6 = dark, 4=dawn 5=dusk, 7,8,9 = other
avp_df['lgt_cond']= avp_df['lgt_cond'].replace([1,2,3,6,4,5,7,8,9],['daylight','dark','dark','dark','dawn','dusk',
                                                                     'other','other','other'])
avp_df['lgt_cond'].unique()

array(['dark', 'daylight', 'dusk', 'dawn', 'other'], dtype=object)

In [16]:
# Recode weather 1 = clear 2,3 = rain/sleet/hail, 4,11 = snow, 5 = fog/smoke, 6=windy, 7=blowing dirt, 
# 10=cloudy, 12=freezing rain, 8, 98, 99 = other
avp_df['weather']=avp_df['weather'].replace([1,2,3,4,11,5,6,7,10,12,8,98,99],['clear','rain_sleet','rain_sleet',
                                                                              'snow_blowsnow','snow_blowsnow',
                                                                              'fog_smoke','windy','blowing_dirt','cloudy',
                                                                              'freezing_rain','other','other','other'])
avp_df['weather'].unique()

array(['clear', 'cloudy', 'other', 'rain_sleet', 'fog_smoke',
       'snow_blowsnow', 'freezing_rain', 'windy', 'blowing_dirt'],
      dtype=object)

In [17]:
avp_df['m_harmname'].unique()

array(['Motor Vehicle In-Transport', 'Pedalcyclist', 'Fire/Explosion',
       'Parked Motor Vehicle', 'Rollover/Overturn',
       'Utility Pole/Light Support', 'Guardrail Face',
       'Fell/Jumped from Vehicle', 'Fence', 'Tree (Standing Only)',
       'Traffic Signal Support', 'Post, Pole or Other Supports', 'Ditch',
       'Pedestrian',
       'Motor Vehicle In-Transport Strikes or is Struck by Cargo, Persons or Objects Set-in-Motion from/by Another Motor Vehicle In Transport',
       'Live Animal', 'Immersion or Partial Immersion', 'Cable Barrier',
       'Embankment', 'Other Fixed Object',
       'Non-Motorist on Personal Conveyance', 'Traffic Sign Support',
       'Guardrail End', 'Working Motor Vehicle',
       'Other Object (not fixed)', 'Concrete Traffic Barrier', 'Building',
       'Unknown Fixed Object', 'Railway Vehicle', 'Other Traffic Barrier',
       'Mail Box', 'Wall', 'Ground', 'Bridge Overhead Structure',
       'Reported as Unknown', 'Culvert', 'Curb', 'Fire Hydrant',

In [18]:
# Recode most harmful event - vehicle level
avp_df['m_harmname'] = avp_df['m_harmname'].replace([
   'Motor Vehicle In-Transport',
    'Motor Vehicle in Motion Outside the Trafficway',
    'Working Motor Vehicle',
    
    'Parked Motor Vehicle', 
    
    'Pedalcyclist',
    'Pedestrian',
    'Non-Motorist on Personal Conveyance',
    'Ridden Animal or Animal Drawn Conveyance',
    'Live Animal',
    
    'Traffic Sign Support',
    'Utility Pole/Light Support',
    'Bridge Pier or Support',
    'Guardrail End',
    'Post, Pole or Other Supports',
    'Impact Attenuator/Crash Cushion',
    'Fire Hydrant',
    'Other Fixed Object',
    'Unknown Fixed Object',
    'Mail Box',
    'Traffic Signal Support',
    'Bridge Overhead Structure',
    'Building',
    
    'Bridge Rail (Includes parapet)',
    'Curb', 
    'Guardrail Face',
    'Concrete Traffic Barrier',
    'Other Traffic Barrier',
    'Wall', 
    'Cable Barrier',
    'Fence',
    
    'Shrubbery',
    'Tree (Standing Only)',
    'Boulder',
    'Snow Bank',
    
    'Embankment',
    'Ditch', 
    'Ground',
    'Culvert',
    
    'Fire/Explosion',
    
    'Motor Vehicle In-Transport Strikes or is Struck by Cargo, Persons or Objects Set-in-Motion from/by Another Motor Vehicle In Transport',
    'Object That Had Fallen From Motor Vehicle In-Transport',
    'Unknown Object Not Fixed',
    'Thrown or Falling Object',
    'Other Object (not fixed)',
    
    'Immersion or Partial Immersion',
    
    'Reported as Unknown',
    'Harmful Event, Details Not Reported',
    
    'Pavement Surface Irregularity (Ruts, Potholes, Grates, etc.)',
    'Jackknife (harmful to this vehicle)',
    'Other Non-Collision',
    'Cargo/Equipment Loss, Shift, or Damage [harmful]',
    'Rollover/Overturn',
    
    'Injured In Vehicle (Non-Collision)',
    'Fell/Jumped from Vehicle', 
    
     'Railway Vehicle',
    'Road Vehicle on Rails'    
],
[
    'harm_moving_veh',
    'harm_moving_veh',
    'harm_moving_veh',
    
    'harm_parked_veh',
    
    'harm_ped_animal',
    'harm_ped_animal',
    'harm_ped_animal',
    'harm_ped_animal',
    'harm_ped_animal',
    
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    'harm_fixed_manmade',
    
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    'harm_barrier',
    
    'harm_nat_object',
    'harm_nat_object',
    'harm_nat_object',
    'harm_nat_object',
    
    'harm_terrain',
    'harm_terrain',
    'harm_terrain',
    'harm_terrain',
    
    'harm_fire',
    
    'harm_object',
    'harm_object',
    'harm_object',
    'harm_object',
    'harm_object',
    
    'harm_water',
    
    'harm_unknown',
    'harm_unknown',
    
    'harm_lost_control',
    'harm_lost_control',
    'harm_lost_control',
    'harm_lost_control',
    'harm_lost_control',
    
    'harm_injury_fallout',
    'harm_injury_fallout',
    
    'harm_train',
    'harm_train'    
])
avp_df['m_harmname'].unique()

array(['harm_moving_veh', 'harm_ped_animal', 'harm_fire',
       'harm_parked_veh', 'harm_lost_control', 'harm_fixed_manmade',
       'harm_barrier', 'harm_injury_fallout', 'harm_nat_object',
       'harm_terrain', 'harm_object', 'harm_water', 'harm_train',
       'harm_unknown'], dtype=object)

In [19]:
avp_df['makename'].unique()

array(['Peterbilt', 'Unknown Make', 'Nissan/Datsun', 'Chevrolet', 'KIA',
       'Audi', 'Dodge', 'Toyota', 'Honda', 'Gillig', 'Chrysler', 'Ford',
       'International Harvester/Navistar', 'Hyundai', 'Harley-Davidson',
       'Volkswagen', 'Subaru', 'Infiniti', 'Pontiac', 'GMC',
       'Freightliner', 'Mercedes-Benz',
       'Jeep / Kaiser-Jeep / Willys- Jeep', 'Bluebird', 'Mazda',
       'Not Reported', 'Lincoln', 'BMW', 'Cadillac', 'Acura', 'Volvo',
       'Lexus', 'Scion', 'Mitsubishi', 'Other Make', 'Saturn', 'Yamaha',
       'Buick / Opel', 'Ducati', 'Suzuki', 'Land Rover', 'Kenworth',
       'Kawasaki', 'Other Import', 'Jaguar', 'Mack', 'Mercury',
       'Oldsmobile', 'Isuzu', 'Triumph', 'White/Autocar White/GMC',
       'Fiat', 'Plymouth', 'AM General', 'Porsche',
       'Other Domestic Manufacturers', 'Saab', 'MCI', 'Victory', 'Smart',
       'Grumman', 'Alfa Romeo', 'Thomas Built', 'Moto-Guzzi', 'Daewoo',
       'American Motors', 'Eagle'], dtype=object)

In [20]:
# Recode vehicle make as make_country
avp_df['make_country'] = avp_df['makename'].replace([
    'Toyota',
    'Honda',
    'Subaru',
    'Nissan/Datsun',
    'Acura',
    'Suzuki',
    'Lexus',
    'Mazda', 
    'Mitsubishi', 
    'Infiniti', 
    'Isuzu',
    'Scion',
    
    'KIA',
    'Daewoo',
    'Hyundai',
    
    'Chevrolet', 
    'Ford',
    'Pontiac',
    'Cadillac',
    'Dodge', 
    'Chrysler', 
    'GMC',
    'Jeep / Kaiser-Jeep / Willys- Jeep', 
    'Buick / Opel',
    'Other Domestic Manufacturers', 
    'Lincoln',
    'Oldsmobile', 
    'Mercury',
    'Plymouth', 
    'Eagle', 
    'American Motors',
    'Saturn',
    
    'Freightliner', 
    'AM General', 
    'International Harvester/Navistar',
    'Peterbilt',
    'Mack',
    'Kenworth',
    'Thomas Built', 
    'Bluebird',
    'White/Autocar White/GMC',
    'Gillig', 
    'MCI',
    'Grumman',
    
    'Mercedes-Benz',
    'Volkswagen',
    'Audi',
    'BMW', 
    'Smart',
    'Porsche',
    
    'Victory',
    
    'Volvo',
    'Saab', 
    
    'Ducati', 
    'Harley-Davidson', 
    'Yamaha', 
    'Kawasaki',
    'Moto-Guzzi',
    
    'Jaguar',
    'Land Rover',
    'Triumph', 
    
    'Alfa Romeo',
    'Fiat',
    
    'Other Import',
    'Other Make',
    'Unknown Make',
    'Not Reported',
],
[
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    'make_Japan',
    
    'make_Korea',
    'make_Korea',
    'make_Korea',
    
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    'make_US',
    
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    'make_US_truck',
    
    'make_Germany',
    'make_Germany',
    'make_Germany',
    'make_Germany',
    'make_Germany',
    'make_Germany',
    
    'make_China',
    
    'make_Sweden',
    'make_Sweden',
    
    'make_motorcycle',
    'make_motorcycle',
    'make_motorcycle',
    'make_motorcycle',
    'make_motorcycle',
    
    'make_England',
    'make_England',
    'make_England',
    
    'make_Italy',
    'make_Italy',
    
    'make_other',
    'make_other',
    'make_other',
    'make_other',    
])
avp_df['make_country'].unique()

array(['make_US_truck', 'make_other', 'make_Japan', 'make_US',
       'make_Korea', 'make_Germany', 'make_motorcycle', 'make_Sweden',
       'make_England', 'make_Italy', 'make_China'], dtype=object)

In [21]:
# Recode tow_vehname - towing a vehichle 0 = no, 1 = yes
avp_df['tow_vehname']=avp_df['tow_vehname'].replace([
    'No Trailing Units',
    'One Trailing Unit',
    'Unknown',
    'Two Trailing Units',
    'Vehicle Towing Another Motor Vehicle - Non-Fixed Linkage',
    'Three or More Trailing Units',
    'Vehicle Towing Another Motor Vehicle - Fixed Linkage',
    'Yes, Number of Trailing Units Unknown'
],
[0,1,1,1,1,1,1,1])
avp_df['tow_vehname'].unique()

array([0, 1], dtype=int64)

In [22]:
# Determine median speed for imputing for unknown travel Speed
avp_df['trav_speed_temp'] = avp_df['trav_speed']
avp_df['trav_speed_temp'] = avp_df['trav_speed_temp'].replace([997,998,999],[np.NaN,np.NaN,np.NaN])
avp_df['trav_speed_temp'].describe()

count    126350.000000
mean         22.381559
std          22.198566
min           0.000000
25%           0.000000
50%          15.000000
75%          40.000000
max         150.000000
Name: trav_speed_temp, dtype: float64

In [23]:
# Replace unknown travel speed with median travel speed 
avp_df['trav_speed']=avp_df['trav_speed'].replace([997,998,999],[155,23,23])
avp_df['trav_speed'].describe()

count    256440.000000
mean         22.697348
std          15.593666
min           0.000000
25%          15.000000
50%          23.000000
75%          23.000000
max         155.000000
Name: trav_speed, dtype: float64

In [24]:
# Recode deformedname
avp_df['deformedname']=avp_df['deformedname'].replace([
    'Not Reported',
    'Reported as Unknown'
],
[
    'Minor Damage',
    'Minor Damage'
])
avp_df['deformedname'].unique()

array(['Minor Damage', 'Disabling Damage', 'Functional Damage',
       'No Damage'], dtype=object)

In [25]:
# Recode towed Name 0 = no 1 = yes
avp_df['towedname']=avp_df['towedname'].replace([
    'Towed Due to Disabling Damage',
    'Not Towed',
    'Towed Not Due to Disabling Damage',
    'Towed, Unknown Reason',
    'Not Reported',
    'Towed But Not Due to Disabling Damage',
    'Reported as Unknown'
],[1,0,1,1,0,1,0])
avp_df['towedname'].unique()

array([0, 1], dtype=int64)

In [26]:
avp_df['speedrelname'].unique()

array(['No', 'Reported as Unknown', 'Yes, Too Fast for Conditions',
       'Yes, Specifics Unknown', 'Yes, Exceeded Speed Limit',
       'No Driver Present/Unknown if Driver Present', 'Yes, Racing'],
      dtype=object)

In [27]:
# Recode speed related 0 = n 1 = y
avp_df['speedrelname']=avp_df['speedrelname'].replace([
    'No', 
    'Yes, Too Fast for Conditions',
    'Reported as Unknown',
    'Yes, Exceeded Speed Limit',
    'Yes, Specifics Unknown',
    'No Driver Present/Unknown if Driver Present', 
    'Yes, Racing'
],[0,1,0,1,1,0,1])
avp_df['speedrelname'].unique()

array([0, 1], dtype=int64)

In [28]:
avp_df['vtrafwayname'].unique()

array(['Two-Way, Not Divided', 'Not Reported',
       'Two-Way, Divided, Unprotected Median',
       'Two-Way,  Divided, Positive  Median Barrier',
       'Non-Trafficway or Driveway Access',
       'Two-Way, Not Divided With a Continuous Left-Turn Lane',
       'Entrance/Exit Ramp', 'One-Way Trafficway', 'Reported as Unknown'],
      dtype=object)

In [29]:
# Recode traffic way name 
avp_df['vtrafwayname']=avp_df['vtrafwayname'].replace([
    'Two-Way, Divided, Unprotected Median',
    'Not Reported',
    'Two-Way,  Divided, Positive  Median Barrier',
    'Two-Way, Not Divided',
    'Two-Way, Not Divided With a Continuous Left-Turn Lane',
    'Non-Trafficway or Driveway Access',
    'Entrance/Exit Ramp',
    'One-Way Trafficway', 
    'Reported as Unknown'
],
[
    'Two_way_div_med_nobar',
    'Two_way',
    'Two_way_div_med_bar',
    'Two_way',
    'Two-way',
    'Parking_lot_driveway',
    'Exit_on_ramp',
    'One-way',
    'Two-way'
])
avp_df['vtrafwayname'].unique()

array(['Two_way', 'Two_way_div_med_nobar', 'Two_way_div_med_bar',
       'Parking_lot_driveway', 'Two-way', 'Exit_on_ramp', 'One-way'],
      dtype=object)

In [30]:
avp_df['vspd_lim'].value_counts()

45    47465
35    42468
98    34694
40    24797
55    23671
25    20691
30    15468
65    12710
70    11755
50     7727
0      6416
60     3798
20     1461
15     1445
75      681
10      466
80      303
99      241
5       182
90        1
Name: vspd_lim, dtype: int64

In [31]:
# Recode speed limits (impute with the mode = 45)
avp_df['vspd_lim']=avp_df['vspd_lim'].replace([98,99],[45,45])
avp_df['vspd_lim'].value_counts()

45    82400
35    42468
40    24797
55    23671
25    20691
30    15468
65    12710
70    11755
50     7727
0      6416
60     3798
20     1461
15     1445
75      681
10      466
80      303
5       182
90        1
Name: vspd_lim, dtype: int64

In [32]:
avp_df['bdytyp_imname'].unique()

array(['Single-unit straight truck or Cab-Chassis (GVWR greater than 26,000 lbs.)',
       'Unknown body type',
       'Station Wagon (excluding van and truck based)',
       'Utility Vehicle, Unknown body type', '4-door sedan, hardtop',
       'Sedan/Hardtop, number of doors unknown',
       '5-door/4-door hatchback',
       'Medium/heavy Pickup (GVWR greater than 10,000 lbs.)',
       'Light Pickup', 'Transit Bus (City Bus)',
       'Compact Utility (Utility Vehicle Categories Small and Midsize)',
       'School Bus', 'Two Wheel Motorcycle (excluding motor scooters)',
       'Large utility (ANSI D16.1 Utility Vehicle Categories and Full Size and Large)',
       'Minivan (Chrysler Town and Country, Caravan, Grand Caravan, Voyager, Voyager, Honda-Odyssey, ...)',
       '2-door sedan,hardtop,coupe', 'Other or Unknown automobile type',
       'Single-unit straight truck or Cab-Chassis (GVWR range 19,501 to 26,000 lbs.)',
       'Large Van-Includes van-based buses (B150-B350, Sportsman, R

In [33]:
# Recode body type
avp_df['bdytyp_imname']=avp_df['bdytyp_imname'].replace([
    '4-door sedan, hardtop',
    '5-door/4-door hatchback', 
    'Sedan/Hardtop, number of doors unknown', 
    'Hatchback, number of doors unknown',
    'Auto-based pickup (includes E1 Camino, Caballero, Ranchero, SSR, G8-ST, Subaru Brat, Rabbit Pickup)',
    'Large Limousine-more than four side doors or stretched chassis',
    'Auto-based panel (cargo station wagon, auto-based ambulance or hearse)',
    'Station Wagon (excluding van and truck based)',
    'Other or Unknown automobile type',
    'Unknown body type', 
    'Not Reported',
    
    '2-door sedan,hardtop,coupe',
    '3-door/2-door hatchback',
    '3-door coupe',
    
    'Convertible(excludes sun-roof,t-bar)',
    
    'Light Pickup',
    'Unknown light truck type', 
    'Unknown (pickup style) light conventional truck type',
    'Unknown light vehicle type (automobile,utility vehicle, van, or light truck)',
    'Other light conventional truck type',
    'Compact Utility (Utility Vehicle Categories Small and Midsize)', 
    'Cab Chassis Based (includes Rescue Vehicle, Light Stake, Dump, and Tow Truck)',
    
    'Minivan (Chrysler Town and Country, Caravan, Grand Caravan, Voyager, Voyager, Honda-Odyssey, ...)',
    'Large Van-Includes van-based buses (B150-B350, Sportsman, Royal Maxiwagon, Ram, Tradesman,...)',
    'Van-Based Bus GVWR greater than 10,000 lbs.', 
    'Unknown van type',
    'Step van (GVWR greater than 10,000 lbs.)',
    'Other van type (Hi-Cube Van, Kary)', 
    'Step-van or walk-in van (GVWR less than or equal to 10,000 lbs.)',
    
    'Large utility (ANSI D16.1 Utility Vehicle Categories and Full Size and Large)',
    'Medium/heavy Pickup (GVWR greater than 10,000 lbs.)',
    'Utility Vehicle, Unknown body type', 
    'Utility station wagon (includes suburban limousines, Suburban, Travellall, Grand Wagoneer)',
    'Unknown truck type (light/medium/heavy)',
    
    'Truck-tractor (Cab only, or with any number of trailing unit; any weight)',
    'Single-unit straight truck or Cab-Chassis (GVWR range 19,501 to 26,000 lbs.)',
    'Single-unit straight truck or Cab-Chassis (GVWR unknown)',
    'Unknown medium/heavy truck type', 
    'Single-unit straight truck or Cab-Chassis (GVWR range 10,001 to 19,500 lbs.)',
    'Single-unit straight truck or Cab-Chassis (GVWR greater than 26,000 lbs.)',
    'Unknown if single-unit or combination unit Medium Truck (GVWR range 10,001 lbs. to 26,000 lbs.)',
    'Unknown if single-unit or combination unit Heavy Truck (GVWR greater than 26,000 lbs.)',
    
    'Two Wheel Motorcycle (excluding motor scooters)',
    'Motor Scooter',
    'Moped or motorized bicycle',
    'Unknown motored cycle type',
    'Off-road Motorcycle', 
    'Unenclosed Three Wheel Motorcycle / Unenclosed Autocycle (1 Rear Wheel)',
    'Three-wheel Motorcycle (2 Rear Wheels)',
    'Other motored cycle type (mini-bikes, pocket motorcycles pocket bikes)',
    'Unknown Three Wheel Motorcycle Type',
    
    'School Bus',
    'Transit Bus (City Bus)',
    'Other Bus Type', 
    'Unknown Bus Type',
    'Cross Country/Intercity Bus',
    
    'Construction equipment other than trucks (includes graders)',  
    'Farm equipment other than trucks',
    
    'Medium/heavy truck based motorhome',
    'Medium/Heavy Vehicle Based Motor Home',
    'Camper or motorhome, unknown truck type',
    'Light Vehicle Based Motor Home (chassis mounted)',
    'Light Truck Based Motorhome (Chassis Mounted)',
    
    'Recreational Off-Highway Vehicle',
    'Other vehicle type (includes go-cart, fork-lift, city street sweeper dunes/swamp buggy)',
    'Low Speed Vehicle (LSV) / Neighborhood Electric Vehicle (NEV)',
    'ATV/ATC [All-Terrain Cycle]',
    'Golf Cart',
     
     'Snowmobile'
    
],[
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    '4_door_sedan',
    
    '2_door_sedan',
    '2_door_sedan',
    '2_door_sedan',
    
    'Convertable',
    
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    'Small_SUV_light_truck',
    
    'Van',
    'Van',
    'Van',
    'Van',
    'Van',
    'Van',
    'Van',
    
    'Large_SUV',
    'Large_SUV',
    'Large_SUV',
    'Large_SUV',
    'Large_SUV',
    
    'Truck',
    'Truck',
    'Truck',
    'Truck',
    'Truck',
    'Truck',
    'Truck',
    'Truck',
    
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    'Motorcylcle_trike',
    
    'Bus',
    'Bus',
    'Bus',
    'Bus',
    'Bus',
    
    'Construction_farm_equip',
    'Construction_farm_equip',
    
    'Motorhome_RV',
    'Motorhome_RV',
    'Motorhome_RV',
    'Motorhome_RV',
    'Motorhome_RV',
    
    'ATV_rec_vehicle',
    'ATV_rec_vehicle',
    'ATV_rec_vehicle',
    'ATV_rec_vehicle',
    'ATV_rec_vehicle',
    
     'ATV_rec_vehicle',
    
])
avp_df['bdytyp_imname'].unique()


array(['Truck', '4_door_sedan', 'Large_SUV', 'Small_SUV_light_truck',
       'Bus', 'Motorcylcle_trike', 'Van', '2_door_sedan', 'Convertable',
       'ATV_rec_vehicle', 'Construction_farm_equip', 'Motorhome_RV'],
      dtype=object)

In [34]:
avp_df['p_crash1name'].unique()

array(['Backing Up (other than for Parking Position)', 'Turning Right',
       'Going Straight', 'Starting in Road', 'Decelerating in Road',
       'Changing Lanes', 'Turning Left', 'Negotiating a Curve',
       'Passing or Overtaking Another Vehicle', 'Stopped in Roadway',
       'Making a U-turn', 'Unknown', 'Leaving a Parking Position',
       'Successful Avoidance Maneuver to a Previous Critical Event',
       'No Driver Present / Unknown if Driver Present', 'Merging',
       'Other(specify:)', 'Accelerating in Road',
       'Disabled or Parked in Travel lane', 'Entering a Parking Position'],
      dtype=object)

In [35]:
# Recode prior to crash movement: some not binned. unknowns binned with going straight
avp_df['p_crash1name'] = avp_df['p_crash1name'].replace([
    'Going Straight', 
    'Other(specify:)',
    'Unknown', 
    'No Driver Present / Unknown if Driver Present',
        
    'Decelerating in Road',
    'Stopped in Roadway',
    'Backing Up (other than for Parking Position)',
    
    'Starting in Road',
    'Accelerating in Road', 
    
    'Disabled or Parked in Travel lane', 
    'Entering a Parking Position',
    'Leaving a Parking Position',
     
]
,[
    'Going straight',
    'Going straight',
    'Going straight',
    'Going straight',
    
    'Stopping_backup',
    'Stopping_backup',
    'Stopping_backup',
    
    'Start on road',
    'Start on road',
    
    'Diasbled_parked',
    'Diasbled_parked',
    'Diasbled_parked',
])
avp_df['p_crash1name'].unique()

array(['Stopping_backup', 'Turning Right', 'Going straight',
       'Start on road', 'Changing Lanes', 'Turning Left',
       'Negotiating a Curve', 'Passing or Overtaking Another Vehicle',
       'Making a U-turn', 'Diasbled_parked',
       'Successful Avoidance Maneuver to a Previous Critical Event',
       'Merging'], dtype=object)

In [36]:
# Recode restrictive device
avp_df['rest_usename'] = avp_df['rest_usename'].replace([
    'Shoulder and Lap Belt Used',
    'Shoulder Belt Only Used', 
    'Other',
    'Restraint Used - Type Unknown',
    'Lap Belt Only Used',
    'Reported as Unknown',
    'Not Reported',
    
    'None Used/Not Applicable', 

    'Child Restraint Type Unknown', 
    'Booster Seat', 
    'Child Restraint System - Forward Facing',
    'Child Restraint System  - Rear Facing',
    'Child Restraint System - Rear Facing',
    
    'Racing-Style Harness Used'
    
   
],[
    'Seatbelt',
    'Seatbelt',
    'Seatbelt',
    'Seatbelt',
    'Seatbelt',
    'Seatbelt',
    'Seatbelt',
    
    'No_seatbelt',
    
    'Child_restraint',
    'Child_restraint',
    'Child_restraint',
    'Child_restraint',
    'Child_restraint',
    
    'Harness'
])

avp_df['rest_usename'].unique()

array(['No_seatbelt', 'Seatbelt', 'Child_restraint', 'Harness'],
      dtype=object)

In [37]:
# Recode retrain misuse 0 = no, 1 = yes
avp_df['rest_misname'] = avp_df['rest_misname'].replace([
    'No Indication of Mis-Use', 
    'None Used/Not Applicable',
    'Yes, Indication of Mis-Use'
],[0,0,1])
avp_df['rest_misname'].unique()

array([0, 1], dtype=int64)

In [38]:
# Recode helmet wear 0= no, 1=yes
avp_df['helm_usename']=avp_df['helm_usename'].replace([
    'Not Applicable',
    'Helmet, Other than DOT-Compliant Motorcycle Helmet',
    'Helmet, Unknown if DOT-Compliant', 
    'No Helmet', 
    'Not Reported',
    'DOT-Compliant Motorcycle Helmet',
    'Reported as Unknown if Helmet Worn'
],[0,1,1,0,0,1,0])
avp_df['helm_usename'].unique()

array([0, 1], dtype=int64)

In [39]:
# Recode helmet misuse 0 = no 1 = yes
avp_df['helm_misname']=avp_df['helm_misname'].replace([
    'None Used/Not Applicable', 
    'No Indication of Mis-Use',
    'Yes, Indication of Mis-Use'
],[0,0,1])
avp_df['helm_misname'].unique()

array([0, 1], dtype=int64)

In [40]:
# Recode police reported drinking in the car 0 = no 1 = yes
avp_df['drinkingname'] = avp_df['drinkingname'].replace([
    'No (Alcohol Not Involved)', 
    'Not Reported', 
    'Reported as Unknown',
    'Yes (Alcohol Involved)'
],[0,0,0,1])
avp_df['drinkingname'].unique()

array([0, 1], dtype=int64)

In [41]:
avp_df['alc_resname'].unique()


array(['Test Not Given', 'Reported as Unknown if Tested', 'Not Reported',
       'AC Test Performed, Results Unknown', '0.200 % BAC', '0.220 % BAC',
       '0.170 % BAC', '0.000 % BAC', '0.070 % BAC', '0.100 % BAC',
       '0.144 % BAC', '0.176 % BAC', '0.050 % BAC', '0.162 % BAC',
       '0.116 % BAC', '0.181 % BAC', '0.060 % BAC', '0.180 % BAC',
       '0.080 % BAC', '0.094 % BAC', '0.210 % BAC', '0.157 % BAC',
       '0.152 % BAC', '0.130 % BAC', '0.128 % BAC', '0.190 % BAC',
       '.94 % or Greater', '0.168 % BAC', '0.159 % BAC', '0.196 % BAC',
       '0.245 % BAC', '0.310 % BAC', '0.140 % BAC', '0.072 % BAC',
       '0.221 % BAC', '0.244 % BAC', '0.296 % BAC', '0.255 % BAC',
       '0.125 % BAC', '0.229 % BAC', '0.250 % BAC', '0.191 % BAC',
       '0.142 % BAC', '0.040 % BAC', '0.270 % BAC', '0.121 % BAC',
       '0.172 % BAC', '0.160 % BAC', '0.115 % BAC', '0.101 % BAC',
       '0.361 % BAC', '0.150 % BAC', '0.350 % BAC', '0.076 % BAC',
       '0.167 % BAC', '0.090 % BAC', '0.23

In [42]:
# Recode is in alcohol percent >=0.08 for the person (not specifically driver) 0=no 1=yes
avp_df['alc_resname_08']=avp_df['alc_resname'].replace([
    
    'Test Not Given', 'Not Reported', 'Reported as Unknown if Tested',
       'AC Test Performed, Results Unknown', '0.220 % BAC', '0.000 % BAC',
       '0.070 % BAC', '0.176 % BAC', '0.200 % BAC', '0.080 % BAC',
       '0.244 % BAC', '0.170 % BAC', '0.020 % BAC', '0.125 % BAC',
       '0.216 % BAC', '0.140 % BAC', '0.090 % BAC', '0.190 % BAC',
       '0.230 % BAC', '0.150 % BAC',
       'Positive Reading with No Actual Value', '0.172 % BAC',
       '0.160 % BAC', '0.187 % BAC', '0.100 % BAC', '0.110 % BAC',
       '0.181 % BAC', '0.157 % BAC', '0.210 % BAC', '0.158 % BAC',
       '0.119 % BAC', '0.180 % BAC', '0.189 % BAC', '0.130 % BAC',
       '0.030 % BAC', '0.208 % BAC', '0.245 % BAC', '0.144 % BAC',
       '0.121 % BAC', '0.280 % BAC', '0.195 % BAC', '0.194 % BAC',
       '0.017 % BAC', '0.011 % BAC', '0.050 % BAC', '0.076 % BAC',
       '0.116 % BAC', '0.060 % BAC', '0.072 % BAC', '0.221 % BAC',
       '0.193 % BAC', '0.296 % BAC', '0.161 % BAC', '0.255 % BAC',
       '0.075 % BAC', '0.155 % BAC', '0.229 % BAC', '0.214 % BAC',
       '0.152 % BAC', '0.250 % BAC', '.94 % or Greater', '0.238 % BAC',
       '0.123 % BAC', '0.310 % BAC', '0.014 % BAC', '0.275 % BAC',
       '0.290 % BAC', '0.188 % BAC', '0.137 % BAC', '0.114 % BAC',
       '0.040 % BAC', '0.212 % BAC', '0.078 % BAC', '0.184 % BAC',
       '0.375 % BAC', '0.178 % BAC', '0.088 % BAC', '0.165 % BAC',
       '0.115 % BAC', '0.209 % BAC', '0.300 % BAC', '0.168 % BAC',
       '0.132 % BAC', '0.260 % BAC', '0.318 % BAC', '0.185 % BAC',
       '0.286 % BAC', '0.173 % BAC', '0.077 % BAC', '0.186 % BAC',
       '0.120 % BAC', '0.167 % BAC', '0.162 % BAC', '0.111 % BAC',
       '0.129 % BAC', '0.287 % BAC', '0.081 % BAC', '0.302 % BAC',
       '0.094 % BAC', '0.240 % BAC', '0.135 % BAC', '0.142 % BAC',
       '0.131 % BAC', '0.148 % BAC', '0.128 % BAC', '0.309 % BAC',
       '0.026 % BAC', '0.159 % BAC', '0.196 % BAC', '0.191 % BAC',
       '0.151 % BAC', '0.270 % BAC', '0.108 % BAC', '0.105 % BAC',
       '0.204 % BAC', '0.016 % BAC', '0.057 % BAC', '0.109 % BAC',
       '0.093 % BAC', '0.113 % BAC', '0.307 % BAC', '0.085 % BAC',
       '0.008 % BAC', '0.164 % BAC', '0.269 % BAC', '0.320 % BAC',
       '0.138 % BAC', '0.146 % BAC', '0.062 % BAC', '0.083 % BAC',
       '0.177 % BAC', '0.247 % BAC', '0.175 % BAC', '0.223 % BAC',
       '0.102 % BAC', '0.028 % BAC', '0.222 % BAC', '0.126 % BAC',
       '0.228 % BAC', '0.336 % BAC', '0.920 % BAC', '0.239 % BAC',
       '0.232 % BAC', '0.201 % BAC', '0.281 % BAC', '0.001 % BAC',
       '0.285 % BAC', '0.010 % BAC', '0.197 % BAC', '0.249 % BAC',
       '0.101 % BAC', '0.361 % BAC', '0.395 % BAC', '0.163 % BAC',
       '0.096 % BAC', '0.382 % BAC', '0.271 % BAC', '0.350 % BAC',
       '0.182 % BAC', '0.047 % BAC', '0.211 % BAC', '0.154 % BAC',
       '0.166 % BAC', '0.053 % BAC', '0.042 % BAC', '0.218 % BAC',
       '0.192 % BAC', '0.215 % BAC', '0.004 % BAC', '0.174 % BAC',
       '0.231 % BAC', '0.226 % BAC', '0.227 % BAC', '0.099 % BAC',
       '0.139 % BAC', '0.082 % BAC', '0.124 % BAC', '0.002 % BAC',
       '0.031 % BAC', '0.272 % BAC', '0.213 % BAC', '0.058 % BAC',
       '0.045 % BAC', '0.262 % BAC', '0.198 % BAC', '0.378 % BAC',
       '0.224 % BAC', '0.041 % BAC', '0.390 % BAC', '0.112 % BAC',
       '0.117 % BAC', '0.205 % BAC', '0.305 % BAC', '0.298 % BAC',
       '0.179 % BAC', '0.289 % BAC', '0.242 % BAC', '0.012 % BAC',
       '0.147 % BAC', '0.243 % BAC', '0.248 % BAC', '0.106 % BAC',
       '0.234 % BAC', '0.141 % BAC', '0.274 % BAC', '0.143 % BAC',
       '0.256 % BAC', '0.145 % BAC', '0.006 % BAC', '0.202 % BAC',
       '0.207 % BAC', '0.054 % BAC', '0.380 % BAC', '0.068 % BAC',
       '0.067 % BAC', '0.384 % BAC', '0.027 % BAC', '0.253 % BAC',
       '0.156 % BAC', '0.292 % BAC', '0.136 % BAC', '0.254 % BAC',
       '0.322 % BAC', '0.206 % BAC', '0.237 % BAC', '0.600 % BAC',
       '0.252 % BAC', '0.095 % BAC', '0.340 % BAC', '0.048 % BAC',
       '0.153 % BAC', '0.294 % BAC', '0.337 % BAC', '0.091 % BAC',
       '0.233 % BAC', '0.241 % BAC', '0.043 % BAC', '0.171 % BAC',
       '0.023 % BAC', '0.334 % BAC', '0.520 % BAC', '0.007 % BAC',
       '0.069 % BAC', '0.059 % BAC', '0.104 % BAC', '0.079 % BAC',
       '0.268 % BAC', '0.013 % BAC', '0.107 % BAC', '0.236 % BAC',
       '0.217 % BAC', '0.066 % BAC', '0.098 % BAC', '0.203 % BAC',
       '0.251 % BAC', '0.003 % BAC', '0.308 % BAC', '0.278 % BAC',
       '0.246 % BAC', '0.064 % BAC', '0.235 % BAC', '0.086 % BAC',
       '0.061 % BAC', '0.259 % BAC', '0.169 % BAC'
],[
   0,0,0,
    0,1,0,
    0,1,1,1,
    1,1,0,1,
    1,1,1,1,
    1,1,
    0,1,
    1,1,1,1,
    1,1,1,1,
    1,1,1,1,
    0,1,1,1,
    1,1,1,1,
    0,0,0,0,
    1,0,0,1,
    1,1,1,1,
    0,1,1,1,
    1,1,1,1,
    1,1,0,1,
    1,1,1,1,
    0,1,0,1,
    1,1,1,1,
    1,1,1,1,
    1,1,1,1,
    1,1,0,1,
    1,1,1,1,
    1,1,1,1,
    1,1,1,1,
    1,1,1,1,
    0,1,1,1,
    1,1,1,1,
    1,0,0,1,
    1,1,1,1,
    0,1,1,1,
    1,1,0,1,
    1,1,1,1,
    1,0,1,1,
    1,1,1,1,
    1,1,1,0,
    1,0,1,1,
    1,1,1,1,
    1,1,1,1,
    1,0,1,1,
    1,0,0,1,
    1,1,0,1,
    1,1,1,1,
    1,1,1,0,
    0,1,1,0,
    0,1,1,1,
    1,0,1,1,
    1,1,1,1,
    1,1,1,0,
    1,1,1,1,
    1,1,1,1,
    1,1,0,1,
    1,0,1,0,
    0,1,0,1,
    1,1,1,1,
    1,1,1,1,
    1,1,1,0,
    1,1,1,1,
    1,1,0,1,
    0,1,1,0,
    0,0,1,0,
    1,0,1,1,
    1,0,1,1,
    1,0,1,1,
    1,0,1,1,
    0,1,1   
])
avp_df['alc_resname_08'].unique()

array([0, 1], dtype=int64)

In [43]:
# Recode drugs involved with the driver in the car person is in 0 = no, 1 = yes
avp_df['drugsname']=avp_df['drugsname'].replace([
    'No (drugs not involved)', 
    'Not Reported', 
    'Reported as Unknown',
    'Yes (drugs involved)'
],[0,0,0,1])
avp_df['drugsname'].unique()

array([0, 1], dtype=int64)

In [44]:
# recode transfer to hospital 0 = no 1 = yes
avp_df['hospitalname']=avp_df['hospitalname'].replace([
    'EMS Ground', 
    'Not Transported', 
    'Other', 
    'EMS Air',
    'EMS Unknown Mode', 
    'Reported as Unknown',
    'Transported  Unknown Source', 
    'Not Reported', 
    'Law Enforcement',
    'Not Transported for Treatment'
],[1,0,1,1,1,0,1,0,1,0])
avp_df['hospitalname'].unique()

array([0, 1], dtype=int64)

In [45]:
avp_df['locationname'].unique()
# NOT USEFUL

array(['Occupant of a Motor Vehicle'], dtype=object)

In [46]:
avp_df['sex_imname'].value_counts()

Male                   135342
Female                 109803
Not Reported             7708
Reported as Unknown      3587
Name: sex_imname, dtype: int64

In [47]:
# Recode sex 0 = female, 1 = male : Assign not reported as male, unknown as female
avp_df['sex_imname']=avp_df['sex_imname'].replace([
    'Female', 
    'Male', 
    'Reported as Unknown', 
    'Not Reported'
],[0,1,0,1])

avp_df['sex_imname'].value_counts()

1    143050
0    113390
Name: sex_imname, dtype: int64

In [48]:
# Recode person level of injury
avp_df['injsev_imname'] = avp_df['injsev_imname'].replace([
    'Suspected Serious Injury (A)', 
    'No Apparent Injury (O)',
    'Possible Injury (C)', 
    'Unknown/Not Reported',
    'Suspected Minor Injury (B)',
    'Fatal Injury (K)',
    'Injured, Severity Unknown', 
    'Died Prior to Crash*'
],[
    'serious',
    'none',
    'minor',
    'none',
    'minor',
    'fatal',
    'minor',
    'fatal'
 ])
avp_df['injsev_imname'].unique()

array(['none', 'minor', 'serious', 'fatal'], dtype=object)

In [49]:
# Recode personal alcohol use - DUPLICATE OF DRINKING
avp_df['peralch_imname']=avp_df['peralch_imname'].replace([
    'No (Alcohol Not Involved)',
    'Yes (Alcohol Involved)'
],[0,1])
avp_df['peralch_imname'].unique()

array([0, 1], dtype=int64)

In [50]:
avp_df['seat_imname'].unique()

array(['Front Seat, Left Side', 'Front Seat, Right Side',
       'Second Seat, Left Side', 'Second Seat, Right Side',
       'Second Seat, Middle', 'Not Reported', 'Third Seat, Right Side',
       'Reported as Unknown',
       'Other Passenger in enclosed passenger or cargo area',
       'Front Seat, Middle', 'Sleeper Section of Cab (Truck)',
       'Third Seat, Left Side', 'Third Seat, Middle',
       'Front Seat, Unknown', 'Second Seat, Other',
       'Second Seat, Unknown', 'Third Seat, Unknown',
       'Appended to a Motor Vehicle for Motion',
       'Other Passenger in passenger or cargo area, unknown whether or not enclosed',
       'Front Seat, Other', 'Trailing Unit', 'Fourth Seat, Left Side',
       'Riding on Exterior of Vehicle',
       'Other Passenger in unenclosed passenger or cargo area',
       'Third Seat, Other', 'Fourth Seat, Right Side',
       'Fourth Seat, Middle', 'Fourth Seat, Other'], dtype=object)

In [51]:
avp_df['seat_imname']=avp_df['seat_imname'].replace([
    'Front Seat, Left Side', 
    
    'Front Seat, Right Side',
    'Front Seat, Middle',
    'Front Seat, Unknown',
    'Front Seat, Other', 
    'Not Reported',
    'Reported as Unknown', 
    
    'Second Seat, Left Side', 
    'Second Seat, Middle',
    'Second Seat, Right Side', 
    'Second Seat, Unknown', 
    'Second Seat, Other',
    
    'Third Seat, Left Side',
    'Third Seat, Middle', 
    'Third Seat, Right Side', 
    'Third Seat, Unknown', 
    'Third Seat, Other',
    'Fourth Seat, Left Side',
    'Fourth Seat, Right Side', 
    'Fourth Seat, Middle',
    'Fourth Seat, Other',
    
    'Riding on Exterior of Vehicle', 
    'Appended to a Motor Vehicle for Motion',
    
    'Other Passenger in enclosed passenger or cargo area',
    'Sleeper Section of Cab (Truck)',
    'Other Passenger in passenger or cargo area, unknown whether or not enclosed',
    'Other Passenger in unenclosed passenger or cargo area',
    
    'Trailing Unit',
    
],[
    'Driver',
    
    'Front_passenger',
    'Front_passenger',
    'Front_passenger',
    'Front_passenger',
    'Front_passenger',
    'Front_passenger',
    
    'Second_row',
    'Second_row',
    'Second_row',
    'Second_row',
    'Second_row',
    
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    'Third_or_4 Row',
    
    'Riding_outside',
    'Riding_outside',
    
    'Cargo_area',
    'Cargo_area',
    'Cargo_area',
    'Cargo_area',
    
    'Trailer'    
])
avp_df['seat_imname'].unique()

array(['Driver', 'Front_passenger', 'Second_row', 'Third_or_4 Row',
       'Cargo_area', 'Riding_outside', 'Trailer'], dtype=object)

In [52]:
avp_df['age_im_temp']=avp_df['age_im'].replace([997,998,999],[np.NaN,np.NaN,np.NaN])
avp_df['age_im_temp'].describe()

count    238805.000000
mean         37.247930
std          19.046642
min           0.000000
25%          22.000000
50%          34.000000
75%          51.000000
max         120.000000
Name: age_im_temp, dtype: float64

In [53]:
# Recode age - impute median age = 34 years for missing
avp_df['age_im']=avp_df['age_im'].replace([998,999],[34,34])
avp_df['age_im'].describe()

count    256440.000000
mean         37.024575
std          18.398439
min           0.000000
25%          23.000000
50%          34.000000
75%          50.000000
max         120.000000
Name: age_im, dtype: float64

In [54]:
# Recode Numoccs as single occupant if unknown
avp_df['numoccs']=avp_df['numoccs'].replace(99,1)
avp_df['numoccs'].unique()

array([ 1,  4,  2,  3,  9,  6,  5, 16, 48,  7, 18,  8, 11, 34, 13, 14, 19,
       21, 10, 31, 28, 37, 15, 29, 20, 25, 26, 17, 22, 39, 43, 23, 41, 46,
       12, 24, 45, 57, 36, 30, 65, 50, 77, 58, 33, 32, 54], dtype=int64)

In [55]:
# replace model years unknown with middle of the range years
avp_df['mod_year']=avp_df['mod_year'].replace([9998,9999],[2011,2010])
avp_df['mod_year'].unique()

array([2014, 2010, 2015, 2008, 2017, 2002, 2007, 2019, 2001, 2013, 2006,
       1998, 2016, 2005, 1997, 2018, 2011, 2000, 1993, 2004, 2003, 2012,
       1988, 1989, 1991, 2009, 1996, 1995, 1999, 1981, 1985, 1992, 1994,
       2020, 1967, 1963, 1990, 1980, 1953, 1987, 1986, 2021, 1977, 1983,
       1973, 1978, 1982, 1984, 1976, 1979, 1975, 1974, 1940, 1932, 1970,
       1965, 1966, 1971, 1950, 1964, 1972, 1957, 1934, 1955, 1958, 1968,
       1956, 1961, 1954, 1969, 1959, 1960, 1931], dtype=int64)

In [56]:
# OUTCOME CODING

In [57]:
avp_df['injsev_imname'].unique()

array(['none', 'minor', 'serious', 'fatal'], dtype=object)

In [58]:
# recode injsev 3 ways to reflect severity of X level or worse - 3 different potential outcomes
avp_df['any_inj']=avp_df['injsev_imname'].replace(['none', 'minor', 'serious', 'fatal'],[0,1,1,1])
avp_df['serious_inj']=avp_df['injsev_imname'].replace(['none', 'minor', 'serious', 'fatal'],[0,0,1,1])
avp_df['fatal_inj']=avp_df['injsev_imname'].replace(['none', 'minor', 'serious', 'fatal'],[0,0,0,1])

In [59]:
avp_df['any_inj'].value_counts()

0    185446
1     70994
Name: any_inj, dtype: int64

In [60]:
avp_df['serious_inj'].value_counts()

0    242364
1     14076
Name: serious_inj, dtype: int64

In [61]:
avp_df['fatal_inj'].value_counts()

0    254397
1      2043
Name: fatal_inj, dtype: int64

In [62]:
# Write to CSV SEMI CLEAN - NEEDS SOME FIELDS DROPPED - NOT THE FINAL SET FOR MACHINE LEARNING
avp_df.to_csv('all_gas_no_brakes_semiclean.csv')

# FINAL PROJECT MACHINE LEARNING CODE


In [63]:
# Drop fields not useful for ML
crash_df=avp_df.drop(['casenum','ve_total','ve_forms','permvit','num_inj','veh_no','towedname',
             'per_no','hospitalname','locationname','age_im_temp','trav_speed_temp','alc_resname',
                      'peralch_imname','year','injsev_imname','makename'],axis = 1)
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256440 entries, 0 to 256439
Data columns (total 34 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   urbancity       256440 non-null  int64 
 1   month           256440 non-null  object
 2   day_week        256440 non-null  object
 3   hour            256440 non-null  int64 
 4   alcohol         256440 non-null  int64 
 5   max_sev         256440 non-null  object
 6   wrk_zone        256440 non-null  int64 
 7   lgt_cond        256440 non-null  object
 8   weather         256440 non-null  object
 9   numoccs         256440 non-null  int64 
 10  m_harmname      256440 non-null  object
 11  tow_vehname     256440 non-null  int64 
 12  trav_speed      256440 non-null  int64 
 13  deformedname    256440 non-null  object
 14  speedrelname    256440 non-null  int64 
 15  vtrafwayname    256440 non-null  object
 16  vspd_lim        256440 non-null  int64 
 17  bdytyp_imname   256440 non-nu

In [64]:
# PREPROCESS THE DATA

In [65]:
crash_df.columns

Index(['urbancity', 'month', 'day_week', 'hour', 'alcohol', 'max_sev',
       'wrk_zone', 'lgt_cond', 'weather', 'numoccs', 'm_harmname',
       'tow_vehname', 'trav_speed', 'deformedname', 'speedrelname',
       'vtrafwayname', 'vspd_lim', 'bdytyp_imname', 'mod_year', 'p_crash1name',
       'rest_usename', 'rest_misname', 'helm_usename', 'helm_misname',
       'drinkingname', 'drugsname', 'sex_imname', 'seat_imname', 'age_im',
       'make_country', 'alc_resname_08', 'any_inj', 'serious_inj',
       'fatal_inj'],
      dtype='object')

In [66]:
# Check unique categorical 
crash_df.nunique()

urbancity           2
month              12
day_week            7
hour               24
alcohol             2
max_sev             4
wrk_zone            2
lgt_cond            5
weather             9
numoccs            47
m_harmname         14
tow_vehname         2
trav_speed        116
deformedname        4
speedrelname        2
vtrafwayname        7
vspd_lim           18
bdytyp_imname      12
mod_year           73
p_crash1name       12
rest_usename        4
rest_misname        2
helm_usename        2
helm_misname        2
drinkingname        2
drugsname           2
sex_imname          2
seat_imname         7
age_im            109
make_country       11
alc_resname_08      2
any_inj             2
serious_inj         2
fatal_inj           2
dtype: int64

In [67]:
occ_counts = crash_df.numoccs.value_counts()
occ_counts

1     147646
2      60596
3      24723
4      14008
5       6016
6       1974
7        700
8        369
9        112
11        61
10        56
12        36
13        20
14        17
17        13
16        10
19         9
21         6
18         6
22         6
15         5
37         5
33         4
24         4
34         3
48         3
23         3
31         3
58         2
32         2
41         2
29         2
39         2
25         2
20         2
43         1
46         1
26         1
45         1
57         1
36         1
30         1
65         1
50         1
77         1
28         1
54         1
Name: numoccs, dtype: int64

In [68]:
# Bin the number of occupants
#replace_occ_counts = list(occ_counts[occ_counts<120].index)
#for occ in replace_occ_counts:
    #crash_df.numoccs = crash_df.numoccs.replace(occ,'more_than_8')
#crash_df.numoccs.value_counts()

In [69]:
crash_df.vspd_lim.value_counts()

45    82400
35    42468
40    24797
55    23671
25    20691
30    15468
65    12710
70    11755
50     7727
0      6416
60     3798
20     1461
15     1445
75      681
10      466
80      303
5       182
90        1
Name: vspd_lim, dtype: int64

In [70]:
# Combine 90 with 80
crash_df['vspd_lim'] = crash_df['vspd_lim'].replace(90,80)
crash_df.vspd_lim.value_counts()

45    82400
35    42468
40    24797
55    23671
25    20691
30    15468
65    12710
70    11755
50     7727
0      6416
60     3798
20     1461
15     1445
75      681
10      466
80      304
5       182
Name: vspd_lim, dtype: int64

In [71]:
crash_df.bdytyp_imname.value_counts()

4_door_sedan               119347
Small_SUV_light_truck       78855
Large_SUV                   17363
Van                         12869
Truck                        8896
2_door_sedan                 8656
Motorcylcle_trike            7550
Convertable                  1587
Bus                          1014
ATV_rec_vehicle               152
Construction_farm_equip        90
Motorhome_RV                   61
Name: bdytyp_imname, dtype: int64

In [72]:
# Write to CSV CLEAN SET FOR MACHINE LEARNING
crash_df.to_csv('all_gas_no_brakes_clean.csv')

In [73]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256440 entries, 0 to 256439
Data columns (total 34 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   urbancity       256440 non-null  int64 
 1   month           256440 non-null  object
 2   day_week        256440 non-null  object
 3   hour            256440 non-null  int64 
 4   alcohol         256440 non-null  int64 
 5   max_sev         256440 non-null  object
 6   wrk_zone        256440 non-null  int64 
 7   lgt_cond        256440 non-null  object
 8   weather         256440 non-null  object
 9   numoccs         256440 non-null  int64 
 10  m_harmname      256440 non-null  object
 11  tow_vehname     256440 non-null  int64 
 12  trav_speed      256440 non-null  int64 
 13  deformedname    256440 non-null  object
 14  speedrelname    256440 non-null  int64 
 15  vtrafwayname    256440 non-null  object
 16  vspd_lim        256440 non-null  int64 
 17  bdytyp_imname   256440 non-nu

In [74]:
# Generate our categorical variable lists
crash_cat = crash_df.dtypes[crash_df.dtypes == "object"].index.tolist()

In [75]:
crash_cat

['month',
 'day_week',
 'max_sev',
 'lgt_cond',
 'weather',
 'm_harmname',
 'deformedname',
 'vtrafwayname',
 'bdytyp_imname',
 'p_crash1name',
 'rest_usename',
 'seat_imname',
 'make_country']

In [76]:
crash_df['make_country'].unique()

array(['make_US_truck', 'make_other', 'make_Japan', 'make_US',
       'make_Korea', 'make_Germany', 'make_motorcycle', 'make_Sweden',
       'make_England', 'make_Italy', 'make_China'], dtype=object)

In [77]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(crash_df[crash_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crash_cat)
encode_df.head()

Unnamed: 0,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,day_week_Fri,day_week_Mon,day_week_Sat,day_week_Sun,day_week_Thus,day_week_Tues,day_week_Wed,max_sev_fatal,max_sev_minor,max_sev_none,max_sev_serious,lgt_cond_dark,lgt_cond_dawn,lgt_cond_daylight,lgt_cond_dusk,lgt_cond_other,weather_blowing_dirt,weather_clear,weather_cloudy,weather_fog_smoke,weather_freezing_rain,weather_other,weather_rain_sleet,weather_snow_blowsnow,weather_windy,m_harmname_harm_barrier,m_harmname_harm_fire,m_harmname_harm_fixed_manmade,m_harmname_harm_injury_fallout,m_harmname_harm_lost_control,m_harmname_harm_moving_veh,m_harmname_harm_nat_object,m_harmname_harm_object,m_harmname_harm_parked_veh,m_harmname_harm_ped_animal,m_harmname_harm_terrain,m_harmname_harm_train,m_harmname_harm_unknown,m_harmname_harm_water,deformedname_Disabling Damage,deformedname_Functional Damage,deformedname_Minor Damage,deformedname_No Damage,vtrafwayname_Exit_on_ramp,vtrafwayname_One-way,vtrafwayname_Parking_lot_driveway,vtrafwayname_Two-way,vtrafwayname_Two_way,vtrafwayname_Two_way_div_med_bar,vtrafwayname_Two_way_div_med_nobar,bdytyp_imname_2_door_sedan,bdytyp_imname_4_door_sedan,bdytyp_imname_ATV_rec_vehicle,bdytyp_imname_Bus,bdytyp_imname_Construction_farm_equip,bdytyp_imname_Convertable,bdytyp_imname_Large_SUV,bdytyp_imname_Motorcylcle_trike,bdytyp_imname_Motorhome_RV,bdytyp_imname_Small_SUV_light_truck,bdytyp_imname_Truck,bdytyp_imname_Van,p_crash1name_Changing Lanes,p_crash1name_Diasbled_parked,p_crash1name_Going straight,p_crash1name_Making a U-turn,p_crash1name_Merging,p_crash1name_Negotiating a Curve,p_crash1name_Passing or Overtaking Another Vehicle,p_crash1name_Start on road,p_crash1name_Stopping_backup,p_crash1name_Successful Avoidance Maneuver to a Previous Critical Event,p_crash1name_Turning Left,p_crash1name_Turning Right,rest_usename_Child_restraint,rest_usename_Harness,rest_usename_No_seatbelt,rest_usename_Seatbelt,seat_imname_Cargo_area,seat_imname_Driver,seat_imname_Front_passenger,seat_imname_Riding_outside,seat_imname_Second_row,seat_imname_Third_or_4 Row,seat_imname_Trailer,make_country_make_China,make_country_make_England,make_country_make_Germany,make_country_make_Italy,make_country_make_Japan,make_country_make_Korea,make_country_make_Sweden,make_country_make_US,make_country_make_US_truck,make_country_make_motorcycle,make_country_make_other
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [78]:
# Merge one-hot encoded features and drop the originals
crash_df = crash_df.merge(encode_df,left_index=True, right_index=True)
crash_df = crash_df.drop(crash_cat,1)
crash_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,urbancity,hour,alcohol,wrk_zone,numoccs,tow_vehname,trav_speed,speedrelname,vspd_lim,mod_year,rest_misname,helm_usename,helm_misname,drinkingname,drugsname,sex_imname,age_im,alc_resname_08,any_inj,serious_inj,fatal_inj,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,day_week_Fri,day_week_Mon,day_week_Sat,day_week_Sun,day_week_Thus,day_week_Tues,day_week_Wed,max_sev_fatal,max_sev_minor,max_sev_none,max_sev_serious,lgt_cond_dark,lgt_cond_dawn,lgt_cond_daylight,lgt_cond_dusk,lgt_cond_other,weather_blowing_dirt,weather_clear,weather_cloudy,weather_fog_smoke,weather_freezing_rain,weather_other,weather_rain_sleet,weather_snow_blowsnow,weather_windy,m_harmname_harm_barrier,m_harmname_harm_fire,m_harmname_harm_fixed_manmade,m_harmname_harm_injury_fallout,m_harmname_harm_lost_control,m_harmname_harm_moving_veh,m_harmname_harm_nat_object,m_harmname_harm_object,m_harmname_harm_parked_veh,m_harmname_harm_ped_animal,m_harmname_harm_terrain,m_harmname_harm_train,m_harmname_harm_unknown,m_harmname_harm_water,deformedname_Disabling Damage,deformedname_Functional Damage,deformedname_Minor Damage,deformedname_No Damage,vtrafwayname_Exit_on_ramp,vtrafwayname_One-way,vtrafwayname_Parking_lot_driveway,vtrafwayname_Two-way,vtrafwayname_Two_way,vtrafwayname_Two_way_div_med_bar,vtrafwayname_Two_way_div_med_nobar,bdytyp_imname_2_door_sedan,bdytyp_imname_4_door_sedan,bdytyp_imname_ATV_rec_vehicle,bdytyp_imname_Bus,bdytyp_imname_Construction_farm_equip,bdytyp_imname_Convertable,bdytyp_imname_Large_SUV,bdytyp_imname_Motorcylcle_trike,bdytyp_imname_Motorhome_RV,bdytyp_imname_Small_SUV_light_truck,bdytyp_imname_Truck,bdytyp_imname_Van,p_crash1name_Changing Lanes,p_crash1name_Diasbled_parked,p_crash1name_Going straight,p_crash1name_Making a U-turn,p_crash1name_Merging,p_crash1name_Negotiating a Curve,p_crash1name_Passing or Overtaking Another Vehicle,p_crash1name_Start on road,p_crash1name_Stopping_backup,p_crash1name_Successful Avoidance Maneuver to a Previous Critical Event,p_crash1name_Turning Left,p_crash1name_Turning Right,rest_usename_Child_restraint,rest_usename_Harness,rest_usename_No_seatbelt,rest_usename_Seatbelt,seat_imname_Cargo_area,seat_imname_Driver,seat_imname_Front_passenger,seat_imname_Riding_outside,seat_imname_Second_row,seat_imname_Third_or_4 Row,seat_imname_Trailer,make_country_make_China,make_country_make_England,make_country_make_Germany,make_country_make_Italy,make_country_make_Japan,make_country_make_Korea,make_country_make_Sweden,make_country_make_US,make_country_make_US_truck,make_country_make_motorcycle,make_country_make_other
0,1,19,0,0,1,0,23,0,50,2014,0,0,0,0,0,1,59,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,15,0,0,1,0,23,0,45,2010,0,0,0,0,0,0,34,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,7,0,0,1,0,0,0,35,2015,0,0,0,0,0,0,39,0,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,15,0,0,1,0,23,0,35,2010,0,0,0,0,0,0,34,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,19,0,0,4,0,80,1,70,2008,1,0,0,0,1,1,17,0,1,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#  MODEL ANY INJURY (any_inj : remove serious_inj and fatal_inj as variables)

In [79]:
# Split our preprocessed data into our features and target arrays
y = crash_df["any_inj"].values
X = crash_df.drop(["any_inj","serious_inj","fatal_inj"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256440 entries, 0 to 256439
Columns: 129 entries, urbancity to make_country_make_other
dtypes: float64(108), int64(21)
memory usage: 252.4 MB


In [81]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [82]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  300
hidden_nodes_layer2 = 100
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 300)               38100     
                                                                 
 dense_1 (Dense)             (None, 100)               30100     
                                                                 
 dense_2 (Dense)             (None, 10)                1010      
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 69,221
Trainable params: 69,221
Non-trainable params: 0
_________________________________________________________________


In [83]:
# Import checkpoint dependencies 
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("crash_checkpoints/",exist_ok=True)
checkpoint_path = "crash_checkpoints/weights.{epoch:02d}.hdf5"

In [84]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [85]:
# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq='epoch')

In [86]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=40, callbacks=[cp_callback])

Epoch 1/40
Epoch 1: saving model to crash_checkpoints\weights.01.hdf5
Epoch 2/40
Epoch 2: saving model to crash_checkpoints\weights.02.hdf5
Epoch 3/40
Epoch 3: saving model to crash_checkpoints\weights.03.hdf5
Epoch 4/40
Epoch 4: saving model to crash_checkpoints\weights.04.hdf5
Epoch 5/40
Epoch 5: saving model to crash_checkpoints\weights.05.hdf5
Epoch 6/40
Epoch 6: saving model to crash_checkpoints\weights.06.hdf5
Epoch 7/40
Epoch 7: saving model to crash_checkpoints\weights.07.hdf5
Epoch 8/40
Epoch 8: saving model to crash_checkpoints\weights.08.hdf5
Epoch 9/40
Epoch 9: saving model to crash_checkpoints\weights.09.hdf5
Epoch 10/40
Epoch 10: saving model to crash_checkpoints\weights.10.hdf5
Epoch 11/40
Epoch 11: saving model to crash_checkpoints\weights.11.hdf5
Epoch 12/40
Epoch 12: saving model to crash_checkpoints\weights.12.hdf5
Epoch 13/40
Epoch 13: saving model to crash_checkpoints\weights.13.hdf5
Epoch 14/40
Epoch 14: saving model to crash_checkpoints\weights.14.hdf5
Epoch 15/4

Epoch 34/40
Epoch 34: saving model to crash_checkpoints\weights.34.hdf5
Epoch 35/40
Epoch 35: saving model to crash_checkpoints\weights.35.hdf5
Epoch 36/40
Epoch 36: saving model to crash_checkpoints\weights.36.hdf5
Epoch 37/40
Epoch 37: saving model to crash_checkpoints\weights.37.hdf5
Epoch 38/40
Epoch 38: saving model to crash_checkpoints\weights.38.hdf5
Epoch 39/40
Epoch 39: saving model to crash_checkpoints\weights.39.hdf5
Epoch 40/40
Epoch 40: saving model to crash_checkpoints\weights.40.hdf5


In [87]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2004/2004 - 2s - loss: 0.4599 - accuracy: 0.8385 - 2s/epoch - 831us/step
Loss: 0.4598594903945923, Accuracy: 0.8385119438171387


In [88]:
# Export our model to HDF5 file
nn.save("Crash.h5")
tfjs.converters.save_keras_model(nn, './tfjs_any')

# MODEL SERIOUS INJURY

In [89]:
# Split our preprocessed data into our features and target arrays
y = crash_df["serious_inj"].values
X = crash_df.drop(["any_inj","serious_inj","fatal_inj"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [90]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [91]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  300
hidden_nodes_layer2 = 100
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 300)               38100     
                                                                 
 dense_5 (Dense)             (None, 100)               30100     
                                                                 
 dense_6 (Dense)             (None, 10)                1010      
                                                                 
 dense_7 (Dense)             (None, 1)                 11        
                                                                 
Total params: 69,221
Trainable params: 69,221
Non-trainable params: 0
_________________________________________________________________


In [92]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("crash_checkpoints_serious/",exist_ok=True)
checkpoint_path = "crash_checkpoints_serious/weights.{epoch:02d}.hdf5"

In [93]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [94]:
# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq='epoch')

In [95]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=35, callbacks=[cp_callback])

Epoch 1/35
Epoch 1: saving model to crash_checkpoints_serious\weights.01.hdf5
Epoch 2/35
Epoch 2: saving model to crash_checkpoints_serious\weights.02.hdf5
Epoch 3/35
Epoch 3: saving model to crash_checkpoints_serious\weights.03.hdf5
Epoch 4/35
Epoch 4: saving model to crash_checkpoints_serious\weights.04.hdf5
Epoch 5/35
Epoch 5: saving model to crash_checkpoints_serious\weights.05.hdf5
Epoch 6/35
Epoch 6: saving model to crash_checkpoints_serious\weights.06.hdf5
Epoch 7/35
Epoch 7: saving model to crash_checkpoints_serious\weights.07.hdf5
Epoch 8/35
Epoch 8: saving model to crash_checkpoints_serious\weights.08.hdf5
Epoch 9/35
Epoch 9: saving model to crash_checkpoints_serious\weights.09.hdf5
Epoch 10/35
Epoch 10: saving model to crash_checkpoints_serious\weights.10.hdf5
Epoch 11/35
Epoch 11: saving model to crash_checkpoints_serious\weights.11.hdf5
Epoch 12/35
Epoch 12: saving model to crash_checkpoints_serious\weights.12.hdf5
Epoch 13/35
Epoch 13: saving model to crash_checkpoints_se

Epoch 33/35
Epoch 33: saving model to crash_checkpoints_serious\weights.33.hdf5
Epoch 34/35
Epoch 34: saving model to crash_checkpoints_serious\weights.34.hdf5
Epoch 35/35
Epoch 35: saving model to crash_checkpoints_serious\weights.35.hdf5


In [96]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2004/2004 - 1s - loss: 0.1789 - accuracy: 0.9639 - 1s/epoch - 714us/step
Loss: 0.1788676381111145, Accuracy: 0.9638901948928833


In [97]:
# Export our model to HDF5 file
nn.save("Crash_serious.h5")
tfjs.converters.save_keras_model(nn, './tfjs_serious')

In [98]:
# MODEL FATAL ACCIDENTS

In [99]:
# Split our preprocessed data into our features and target arrays
y = crash_df["fatal_inj"].values
X = crash_df.drop(["any_inj","serious_inj","fatal_inj"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [100]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [101]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  300
hidden_nodes_layer2 = 100
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 300)               38100     
                                                                 
 dense_9 (Dense)             (None, 100)               30100     
                                                                 
 dense_10 (Dense)            (None, 10)                1010      
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
Total params: 69,221
Trainable params: 69,221
Non-trainable params: 0
_________________________________________________________________


In [102]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("crash_checkpoints_fatal/",exist_ok=True)
checkpoint_path = "crash_checkpoints_fatal/weights.{epoch:02d}.hdf5"

In [103]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [104]:
# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq='epoch')

In [105]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=25, callbacks=[cp_callback])

Epoch 1/25
Epoch 1: saving model to crash_checkpoints_fatal\weights.01.hdf5
Epoch 2/25
Epoch 2: saving model to crash_checkpoints_fatal\weights.02.hdf5
Epoch 3/25
Epoch 3: saving model to crash_checkpoints_fatal\weights.03.hdf5
Epoch 4/25
Epoch 4: saving model to crash_checkpoints_fatal\weights.04.hdf5
Epoch 5/25
Epoch 5: saving model to crash_checkpoints_fatal\weights.05.hdf5
Epoch 6/25
Epoch 6: saving model to crash_checkpoints_fatal\weights.06.hdf5
Epoch 7/25
Epoch 7: saving model to crash_checkpoints_fatal\weights.07.hdf5
Epoch 8/25
Epoch 8: saving model to crash_checkpoints_fatal\weights.08.hdf5
Epoch 9/25
Epoch 9: saving model to crash_checkpoints_fatal\weights.09.hdf5
Epoch 10/25
Epoch 10: saving model to crash_checkpoints_fatal\weights.10.hdf5
Epoch 11/25
Epoch 11: saving model to crash_checkpoints_fatal\weights.11.hdf5
Epoch 12/25
Epoch 12: saving model to crash_checkpoints_fatal\weights.12.hdf5
Epoch 13/25
Epoch 13: saving model to crash_checkpoints_fatal\weights.13.hdf5
Epoc

In [106]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2004/2004 - 1s - loss: 0.0321 - accuracy: 0.9944 - 1s/epoch - 693us/step
Loss: 0.03209508955478668, Accuracy: 0.9944314360618591


In [107]:
# Export our model to HDF5 file
nn.save("Crash_fatal.h5")
tfjs.converters.save_keras_model(nn, './tfjs_fatal')