In [1]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned, not-yet-encoded 2014 accident records
accidents_csv_path = 'Resources/cleaned_data/accidents_2014_unencoded.csv'
accidents_df = pd.read_csv(accidents_csv_path)
accidents_df.head()

Unnamed: 0,ST_CASE,VE_TOTAL,PEDS,PERSONS,COUNTY,MONTH,DAY_WEEK,YEAR,HOUR,ROAD_FNC,...,ACC_TYPE_F,ACC_TYPE_G,ACC_TYPE_H,ACC_TYPE_I,ACC_TYPE_J,ACC_TYPE_K,ACC_TYPE_L,ACC_TYPE_M,MDRDSTRD,MVIOLATN
0,410001,1,1,1,67,1,2,2014,17,4,...,0,0,0,0,0,0,0,0,99,0
1,410002,1,0,1,35,1,5,2014,8,2,...,0,0,0,0,0,0,0,0,99,0
2,410003,2,0,3,53,1,5,2014,13,3,...,0,1,0,0,0,0,0,0,99,0
3,410004,1,0,1,29,1,2,2014,23,3,...,0,0,0,0,0,0,0,0,99,0
4,410005,2,0,2,35,1,4,2014,6,2,...,0,0,0,0,0,0,0,1,99,0


In [3]:
# Remove the two columns this notebook treats as duplicates
duplicate_columns = ['MAN_COLL', 'DR_DRINK']
accidents_df = accidents_df.drop(columns=duplicate_columns)
accidents_df

Unnamed: 0,ST_CASE,VE_TOTAL,PEDS,PERSONS,COUNTY,MONTH,DAY_WEEK,YEAR,HOUR,ROAD_FNC,...,ACC_TYPE_F,ACC_TYPE_G,ACC_TYPE_H,ACC_TYPE_I,ACC_TYPE_J,ACC_TYPE_K,ACC_TYPE_L,ACC_TYPE_M,MDRDSTRD,MVIOLATN
0,410001,1,1,1,67,1,2,2014,17,4,...,0,0,0,0,0,0,0,0,99,0
1,410002,1,0,1,35,1,5,2014,8,2,...,0,0,0,0,0,0,0,0,99,0
2,410003,2,0,3,53,1,5,2014,13,3,...,0,1,0,0,0,0,0,0,99,0
3,410004,1,0,1,29,1,2,2014,23,3,...,0,0,0,0,0,0,0,0,99,0
4,410005,2,0,2,35,1,4,2014,6,2,...,0,0,0,0,0,0,0,1,99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,410323,1,0,1,39,11,7,2014,12,6,...,0,0,0,0,0,0,0,1,99,0
318,410324,1,0,2,31,12,7,2014,18,5,...,0,0,0,0,0,0,0,1,99,0
319,410325,1,0,1,65,12,4,2014,9,2,...,0,0,0,0,0,0,0,1,99,0
320,410326,2,0,2,29,12,6,2014,13,13,...,0,0,0,0,0,0,1,0,99,0


In [4]:
# Use the ST_CASE column as the row index instead of the default integer index
accidents_df = accidents_df.set_index(keys='ST_CASE')
accidents_df

Unnamed: 0_level_0,VE_TOTAL,PEDS,PERSONS,COUNTY,MONTH,DAY_WEEK,YEAR,HOUR,ROAD_FNC,ROUTE,...,ACC_TYPE_F,ACC_TYPE_G,ACC_TYPE_H,ACC_TYPE_I,ACC_TYPE_J,ACC_TYPE_K,ACC_TYPE_L,ACC_TYPE_M,MDRDSTRD,MVIOLATN
ST_CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410001,1,1,1,67,1,2,2014,17,4,4,...,0,0,0,0,0,0,0,0,99,0
410002,1,0,1,35,1,5,2014,8,2,2,...,0,0,0,0,0,0,0,0,99,0
410003,2,0,3,53,1,5,2014,13,3,3,...,0,1,0,0,0,0,0,0,99,0
410004,1,0,1,29,1,2,2014,23,3,3,...,0,0,0,0,0,0,0,0,99,0
410005,2,0,2,35,1,4,2014,6,2,3,...,0,0,0,0,0,0,0,1,99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410323,1,0,1,39,11,7,2014,12,6,8,...,0,0,0,0,0,0,0,1,99,0
410324,1,0,2,31,12,7,2014,18,5,3,...,0,0,0,0,0,0,0,1,99,0
410325,1,0,1,65,12,4,2014,9,2,2,...,0,0,0,0,0,0,0,1,99,0
410326,2,0,2,29,12,6,2014,13,13,6,...,0,0,0,0,0,0,1,0,99,0


In [5]:
# HARM_EV buckets (HARM_EV codes: meaning -> bucket letter)
#   1-7:            non-collision            -> a
#   8, 9, 15:       pedestrian, cyclist      -> b
#   10:             train                    -> c
#   11:             animal                   -> d
#   12-14:          motor vehicle            -> e
#   16-18:          falling object           -> f
#   19-53, 57-59:   into object, off road    -> g
#   54, 55, 72, 73: cargo                    -> h
#   anything else:  other/unknown            -> i

In [6]:
# Bucketing HARM_EV: create one indicator column per bucket (a-i), all starting at 0
for bucket_letter in 'abcdefghi':
    accidents_df['HARM_EV_' + bucket_letter] = 0

In [7]:
def harm_ev_bucket(event):
    """Return the bucket letter ('a'-'i') for a single HARM_EV code.

    Buckets (as specified in this notebook's bucket description):
        a: codes up to 7            (non-collision)
        b: 8, 9, 15                 (pedestrian, cyclist)
        c: 10                       (train)
        d: 11                       (animal)
        e: 12-14                    (motor vehicle)
        f: 16-18                    (falling object)
        g: 19-53, 57-59             (into object, off road)
        h: 54, 55, 72, 73           (cargo)
        i: anything else            (other/unknown)

    A missing value (NaN) fails every comparison and therefore lands in 'i'.
    """
    if event <= 7:
        return 'a'
    if event <= 9 or event == 15:
        return 'b'
    if event == 10:
        return 'c'
    if event == 11:
        return 'd'
    if event <= 14:
        return 'e'
    if event <= 18:
        return 'f'
    if event <= 53 or event in (57, 58, 59):
        return 'g'
    # Only the four listed cargo codes belong here; a plain `event <= 73` test
    # would also sweep codes 56 and 60-71 into the cargo bucket.
    if event in (54, 55, 72, 73):
        return 'h'
    return 'i'


# Classify every row once, then turn the letters into 0/1 indicator columns.
# Assigning whole columns avoids a per-row Python loop and does not depend on
# the index labels being unique.
harm_ev_letters = accidents_df['HARM_EV'].map(harm_ev_bucket)
for letter in 'abcdefghi':
    accidents_df['HARM_EV_' + letter] = (harm_ev_letters == letter).astype('int64')

accidents_df.head()

Unnamed: 0_level_0,VE_TOTAL,PEDS,PERSONS,COUNTY,MONTH,DAY_WEEK,YEAR,HOUR,ROAD_FNC,ROUTE,...,MVIOLATN,HARM_EV_a,HARM_EV_b,HARM_EV_c,HARM_EV_d,HARM_EV_e,HARM_EV_f,HARM_EV_g,HARM_EV_h,HARM_EV_i
ST_CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410001,1,1,1,67,1,2,2014,17,4,4,...,0,0,1,0,0,0,0,0,0,0
410002,1,0,1,35,1,5,2014,8,2,2,...,0,0,0,0,0,0,0,1,0,0
410003,2,0,3,53,1,5,2014,13,3,3,...,0,0,0,0,0,1,0,0,0,0
410004,1,0,1,29,1,2,2014,23,3,3,...,0,0,0,0,0,0,0,1,0,0
410005,2,0,2,35,1,4,2014,6,2,3,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Remove the raw HARM_EV code column now that the HARM_EV_* bucket columns are filled
accidents_df = accidents_df.drop(columns='HARM_EV')

In [9]:
# One-hot encode the categorical code columns with get_dummies()
categorical_columns = [
    'ROAD_FNC', 'ROUTE', 'RELJCT2', 'TYP_INT', 'REL_ROAD', 'WRK_ZONE',
    'LGT_COND', 'WEATHER', 'DRUGS', 'OOS_REG', 'MDRDSTRD', 'MVIOLATN',
]
encoded_df = pd.get_dummies(accidents_df, columns=categorical_columns)
encoded_df.head()

Unnamed: 0_level_0,VE_TOTAL,PEDS,PERSONS,COUNTY,MONTH,DAY_WEEK,YEAR,HOUR,LATITUDE,LONGITUD,...,MVIOLATN_0,MVIOLATN_1,MVIOLATN_2,MVIOLATN_11,MVIOLATN_46,MVIOLATN_62,MVIOLATN_72,MVIOLATN_92,MVIOLATN_98,MVIOLATN_99
ST_CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410001,1,1,1,67,1,2,2014,17,45.379664,-122.867736,...,1,0,0,0,0,0,0,0,0,0
410002,1,0,1,35,1,5,2014,8,42.635964,-121.879697,...,1,0,0,0,0,0,0,0,0,0
410003,2,0,3,53,1,5,2014,13,44.883883,-123.331422,...,1,0,0,0,0,0,0,0,0,0
410004,1,0,1,29,1,2,2014,23,42.287886,-122.993594,...,1,0,0,0,0,0,0,0,0,0
410005,2,0,2,35,1,4,2014,6,42.19495,-121.598747,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Columns we do not want scaled: county, year, latitude, longitude
unscaled_columns = ['COUNTY', 'YEAR', 'LATITUDE', 'LONGITUD']
prescaled_df = encoded_df.drop(columns=unscaled_columns)
prescaled_df

Unnamed: 0_level_0,VE_TOTAL,PEDS,PERSONS,MONTH,DAY_WEEK,HOUR,SCH_BUS,FATALS,DRUNK_DR,AGE,...,MVIOLATN_0,MVIOLATN_1,MVIOLATN_2,MVIOLATN_11,MVIOLATN_46,MVIOLATN_62,MVIOLATN_72,MVIOLATN_92,MVIOLATN_98,MVIOLATN_99
ST_CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410001,1,1,1,1,2,17,0,1,0,46,...,1,0,0,0,0,0,0,0,0,0
410002,1,0,1,1,5,8,0,1,0,73,...,1,0,0,0,0,0,0,0,0,0
410003,2,0,3,1,5,13,0,1,0,53,...,1,0,0,0,0,0,0,0,0,0
410004,1,0,1,1,2,23,0,1,1,42,...,1,0,0,0,0,0,0,0,0,0
410005,2,0,2,1,4,6,0,1,0,59,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410323,1,0,1,11,7,12,0,1,0,53,...,1,0,0,0,0,0,0,0,0,0
410324,1,0,2,12,7,18,0,1,0,81,...,1,0,0,0,0,0,0,0,0,0
410325,1,0,1,12,4,9,0,1,0,38,...,1,0,0,0,0,0,0,0,0,0
410326,2,0,2,12,6,13,0,1,0,77,...,1,0,0,0,0,0,0,0,0,0


In [11]:
# Replace the unknown-age code with a stand-in 'average' age.
# A boolean mask updates every matching row in a single assignment, which
# avoids a per-row Python loop and chained indexing (`.loc[index]['AGE']`).
UNKNOWN_AGE_CODE = 999
ASSUMED_AVERAGE_AGE = 40
prescaled_df.loc[prescaled_df['AGE'] == UNKNOWN_AGE_CODE, 'AGE'] = ASSUMED_AVERAGE_AGE

prescaled_df.head()

Unnamed: 0_level_0,VE_TOTAL,PEDS,PERSONS,MONTH,DAY_WEEK,HOUR,SCH_BUS,FATALS,DRUNK_DR,AGE,...,MVIOLATN_0,MVIOLATN_1,MVIOLATN_2,MVIOLATN_11,MVIOLATN_46,MVIOLATN_62,MVIOLATN_72,MVIOLATN_92,MVIOLATN_98,MVIOLATN_99
ST_CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410001,1,1,1,1,2,17,0,1,0,46,...,1,0,0,0,0,0,0,0,0,0
410002,1,0,1,1,5,8,0,1,0,73,...,1,0,0,0,0,0,0,0,0,0
410003,2,0,3,1,5,13,0,1,0,53,...,1,0,0,0,0,0,0,0,0,0
410004,1,0,1,1,2,23,0,1,1,42,...,1,0,0,0,0,0,0,0,0,0
410005,2,0,2,1,4,6,0,1,0,59,...,1,0,0,0,0,0,0,0,0,0


In [12]:
# Scaling: create the scaler object; it is fitted to prescaled_df in the next cell.
# With default settings StandardScaler standardizes each column to zero mean
# and unit variance.
data_scaler = StandardScaler()

In [13]:
# Fit the scaler on prescaled_df, then standardize that same data
scaled_data = data_scaler.fit(prescaled_df).transform(prescaled_df)
scaled_data

array([[-0.66323157,  1.71846589, -0.69524839, ..., -0.05581456,
        -0.05581456, -0.12559009],
       [-0.66323157, -0.47735163, -0.69524839, ..., -0.05581456,
        -0.05581456, -0.12559009],
       [ 0.67991664, -0.47735163,  0.4646997 , ..., -0.05581456,
        -0.05581456, -0.12559009],
       ...,
       [-0.66323157, -0.47735163, -0.69524839, ..., -0.05581456,
        -0.05581456, -0.12559009],
       [ 0.67991664, -0.47735163, -0.11527434, ..., -0.05581456,
        -0.05581456, -0.12559009],
       [-0.66323157,  1.71846589, -0.69524839, ..., -0.05581456,
        -0.05581456, -0.12559009]])

In [14]:
# Keep the feature column labels: the scaler returned a plain array without them
df_columns = prescaled_df.columns

In [15]:
# Rebuild a labelled DataFrame around the scaled array
scaled_df = pd.DataFrame(scaled_data, columns=df_columns)
scaled_df

Unnamed: 0,VE_TOTAL,PEDS,PERSONS,MONTH,DAY_WEEK,HOUR,SCH_BUS,FATALS,DRUNK_DR,AGE,...,MVIOLATN_0,MVIOLATN_1,MVIOLATN_2,MVIOLATN_11,MVIOLATN_46,MVIOLATN_62,MVIOLATN_72,MVIOLATN_92,MVIOLATN_98,MVIOLATN_99
0,-0.663232,1.718466,-0.695248,-1.843891,-1.003938,0.007990,-0.055815,-0.303889,-0.661438,0.054124,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
1,-0.663232,-0.477352,-0.695248,-1.843891,0.445694,-0.495400,-0.055815,-0.303889,-0.661438,1.627887,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
2,0.679917,-0.477352,0.464700,-1.843891,0.445694,-0.215739,-0.055815,-0.303889,-0.661438,0.462137,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
3,-0.663232,-0.477352,-0.695248,-1.843891,-1.003938,0.343584,-0.055815,-0.303889,1.511858,-0.179026,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
4,0.679917,-0.477352,-0.115274,-1.843891,-0.037516,-0.607265,-0.055815,-0.303889,-0.661438,0.811862,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,-0.663232,-0.477352,-0.695248,1.151734,1.412116,-0.271671,-0.055815,-0.303889,-0.661438,0.462137,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
318,-0.663232,-0.477352,-0.115274,1.451297,1.412116,0.063923,-0.055815,-0.303889,-0.661438,2.094188,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
319,-0.663232,-0.477352,-0.695248,1.451297,-0.037516,-0.439468,-0.055815,-0.303889,-0.661438,-0.412176,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559
320,0.679917,-0.477352,-0.115274,1.451297,0.928905,-0.215739,-0.055815,-0.303889,-0.661438,1.861038,...,0.296374,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559


In [16]:
# Put the ST_CASE identifiers (the index of prescaled_df) back as a regular column
scaled_df = scaled_df.assign(ST_CASE=prescaled_df.index)
scaled_df

Unnamed: 0,VE_TOTAL,PEDS,PERSONS,MONTH,DAY_WEEK,HOUR,SCH_BUS,FATALS,DRUNK_DR,AGE,...,MVIOLATN_1,MVIOLATN_2,MVIOLATN_11,MVIOLATN_46,MVIOLATN_62,MVIOLATN_72,MVIOLATN_92,MVIOLATN_98,MVIOLATN_99,ST_CASE
0,-0.663232,1.718466,-0.695248,-1.843891,-1.003938,0.007990,-0.055815,-0.303889,-0.661438,0.054124,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410001
1,-0.663232,-0.477352,-0.695248,-1.843891,0.445694,-0.495400,-0.055815,-0.303889,-0.661438,1.627887,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410002
2,0.679917,-0.477352,0.464700,-1.843891,0.445694,-0.215739,-0.055815,-0.303889,-0.661438,0.462137,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410003
3,-0.663232,-0.477352,-0.695248,-1.843891,-1.003938,0.343584,-0.055815,-0.303889,1.511858,-0.179026,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410004
4,0.679917,-0.477352,-0.115274,-1.843891,-0.037516,-0.607265,-0.055815,-0.303889,-0.661438,0.811862,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,-0.663232,-0.477352,-0.695248,1.151734,1.412116,-0.271671,-0.055815,-0.303889,-0.661438,0.462137,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410323
318,-0.663232,-0.477352,-0.115274,1.451297,1.412116,0.063923,-0.055815,-0.303889,-0.661438,2.094188,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410324
319,-0.663232,-0.477352,-0.695248,1.451297,-0.037516,-0.439468,-0.055815,-0.303889,-0.661438,-0.412176,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410325
320,0.679917,-0.477352,-0.115274,1.451297,0.928905,-0.215739,-0.055815,-0.303889,-0.661438,1.861038,...,-0.179029,-0.096976,-0.055815,-0.096976,-0.055815,-0.055815,-0.055815,-0.055815,-0.12559,410326


In [17]:
# Write the scaled features (plus ST_CASE) to the cleaned-data folder
scaled_output_path = 'Resources/cleaned_data/2014_scaled_data.csv'
scaled_df.to_csv(scaled_output_path)