# Road Accident Risk

## Initialisation

In [118]:
import pandas as pd
import numpy as np
import fuzzywuzzy
from sklearn.preprocessing import OneHotEncoder

### Path

In [119]:
data_folder = 'resources/'
output_folder = 'output/'
plots_folder = 'output/plots/'

## Train Data loading

In [120]:
train = pd.read_csv(data_folder + "train.csv")

train.head(5)

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [121]:
y_train = pd.DataFrame(train["accident_risk"].copy())

y_train.head(5)

Unnamed: 0,accident_risk
0,0.13
1,0.35
2,0.3
3,0.21
4,0.56


## Preprocessing

### Missing values

In [122]:
X_train.isna().sum()

road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
dtype: int64

### Data analysis

In [123]:
# id not necessary in this case (don't give information about accident rick)
X_train = pd.DataFrame(train.drop(["accident_risk", "id"], axis=1))

X_train.head(5)

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1


In [124]:
X_train.dtypes

road_type                  object
num_lanes                   int64
curvature                 float64
speed_limit                 int64
lighting                   object
weather                    object
road_signs_present           bool
public_road                  bool
time_of_day                object
holiday                      bool
school_season                bool
num_reported_accidents      int64
dtype: object

Bool data can be easily interpreted by integer (0 = False and 1 = True)

### Bool type processing

In [125]:
SRP = X_train["road_signs_present"].astype(int)
PR = X_train["public_road"] = X_train["public_road"].astype(int)
H = X_train["holiday"] = X_train["holiday"].astype(int)
SS = X_train["school_season"] = X_train["school_season"].astype(int)

X_train.dtypes

road_type                  object
num_lanes                   int64
curvature                 float64
speed_limit                 int64
lighting                   object
weather                    object
road_signs_present           bool
public_road                 int32
time_of_day                object
holiday                     int32
school_season               int32
num_reported_accidents      int64
dtype: object

### Object type processing

In [126]:
objType = X_train.select_dtypes(include="object").columns

objType

Index(['road_type', 'lighting', 'weather', 'time_of_day'], dtype='object')

#### road_type

In [127]:
X_train['road_type'].unique()

array(['urban', 'rural', 'highway'], dtype=object)

Only 3 value, we can encode them with Dummies variable without adding to much colunms.

In [128]:
RTdummies = pd.get_dummies(X_train['road_type']).astype(int)

print(RTdummies.shape[0] == X_train.shape[0])

RTdummies.head(5)

True


Unnamed: 0,highway,rural,urban
0,0,0,1
1,0,0,1
2,0,1,0
3,1,0,0
4,0,1,0


#### lighting

In [129]:
X_train['lighting'].unique()

array(['daylight', 'dim', 'night'], dtype=object)

Same processing as 'road_type'

In [130]:
Ldummies = pd.get_dummies(X_train['lighting']).astype(int)

print(Ldummies.shape[0] == X_train.shape[0])

Ldummies.head(5)

True


Unnamed: 0,daylight,dim,night
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,1,0,0


#### weather

In [131]:
X_train['weather'].unique()

array(['rainy', 'clear', 'foggy'], dtype=object)

Same processing as 'road_type'

In [132]:
Wdummies = pd.get_dummies(X_train['weather']).astype(int)

print(Wdummies.shape[0] == X_train.shape[0])

Wdummies.head(5)

True


Unnamed: 0,clear,foggy,rainy
0,0,0,1
1,1,0,0
2,1,0,0
3,0,0,1
4,0,1,0


#### time_of_day

In [133]:
X_train['time_of_day'].unique()

array(['afternoon', 'evening', 'morning'], dtype=object)

Same processing as 'road_type'

In [134]:
TODdummies = pd.get_dummies(X_train['time_of_day']).astype(int)

print(TODdummies.shape[0] == X_train.shape[0])

TODdummies.head(5)

True


Unnamed: 0,afternoon,evening,morning
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,0


####

In [135]:
X_train.head(5)

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,urban,2,0.06,35,daylight,rainy,False,1,afternoon,0,1,1
1,urban,4,0.99,35,daylight,clear,True,0,evening,1,1,0
2,rural,4,0.63,70,dim,clear,False,1,morning,1,0,2
3,highway,4,0.07,35,dim,rainy,True,1,morning,0,0,1
4,rural,1,0.58,60,daylight,foggy,False,0,evening,1,0,1


In [None]:
XTotal = pd.concat([TODdummies, Wdummies, Ldummies, RTdummies, 
                    SRP, PR, H, SS, 
                    X_train['num_lanes'], X_train['curvature'], X_train['speed_limit'], X_train['num_reported_accidents']]
                    , axis=1)

XTotal

Unnamed: 0,afternoon,evening,morning,clear,foggy,rainy,daylight,dim,night,highway,rural,urban,road_signs_present,public_road,holiday,school_season,num_lanes,curvature,speed_limit,num_reported_accidents
0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,2,0.06,35,1
1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,1,1,4,0.99,35,0
2,0,0,1,1,0,0,0,1,0,0,1,0,0,1,1,0,4,0.63,70,2
3,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,0,4,0.07,35,1
4,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0.58,60,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,4,0.10,70,2
517750,0,0,1,0,0,1,1,0,0,0,1,0,1,1,0,0,4,0.47,35,1
517751,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,4,0.62,25,0
517752,1,0,0,1,0,0,0,0,1,1,0,0,1,0,1,1,3,0.63,25,3


In [141]:
print(X_train.shape[0] == XTotal.shape[0])

True
