# Road Accident Risk

## Initialisation

In [682]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler

### Path

In [683]:
data_folder = 'resources/'
output_folder = 'output/'

## Train Data loading

In [684]:
train = pd.read_csv(data_folder + "train.csv")

train.head(5)

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [685]:
y_train = pd.DataFrame(train["accident_risk"].copy())

y_train.head(5)

Unnamed: 0,accident_risk
0,0.13
1,0.35
2,0.3
3,0.21
4,0.56


## Train Preprocessing

### Missing values

In [686]:
X_train.isna().sum()

road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
dtype: int64

### Data analysis

In [687]:
# id not necessary in this case (don't give information about accident rick)
X_train = pd.DataFrame(train.drop(["accident_risk", "id"], axis=1))

X_train.head(5)

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1


In [688]:
X_train.dtypes

road_type                  object
num_lanes                   int64
curvature                 float64
speed_limit                 int64
lighting                   object
weather                    object
road_signs_present           bool
public_road                  bool
time_of_day                object
holiday                      bool
school_season                bool
num_reported_accidents      int64
dtype: object

Bool data can be easily interpreted by integer (0 = False and 1 = True)

### Bool type processing

In [689]:
SRP = X_train["road_signs_present"].astype(int)
PR = X_train["public_road"].astype(int)
H = X_train["holiday"].astype(int)
SS = X_train["school_season"].astype(int)

X_train.dtypes

road_type                  object
num_lanes                   int64
curvature                 float64
speed_limit                 int64
lighting                   object
weather                    object
road_signs_present           bool
public_road                  bool
time_of_day                object
holiday                      bool
school_season                bool
num_reported_accidents      int64
dtype: object

### Object type processing

In [690]:
objType = X_train.select_dtypes(include="object").columns

objType

Index(['road_type', 'lighting', 'weather', 'time_of_day'], dtype='object')

#### road_type

In [691]:
X_train['road_type'].unique()

array(['urban', 'rural', 'highway'], dtype=object)

Only 3 value, we can encode them with Dummies variable without adding to much colunms.

In [692]:
RTdummies = pd.get_dummies(X_train['road_type']).astype(int)

print(RTdummies.shape[0] == X_train.shape[0])

RTdummies.head(5)

True


Unnamed: 0,highway,rural,urban
0,0,0,1
1,0,0,1
2,0,1,0
3,1,0,0
4,0,1,0


#### lighting

In [693]:
X_train['lighting'].unique()

array(['daylight', 'dim', 'night'], dtype=object)

Same processing as 'road_type'

In [694]:
Ldummies = pd.get_dummies(X_train['lighting']).astype(int)

print(Ldummies.shape[0] == X_train.shape[0])

Ldummies.head(5)

True


Unnamed: 0,daylight,dim,night
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,1,0,0


#### weather

In [695]:
X_train['weather'].unique()

array(['rainy', 'clear', 'foggy'], dtype=object)

Same processing as 'road_type'

In [696]:
Wdummies = pd.get_dummies(X_train['weather']).astype(int)

print(Wdummies.shape[0] == X_train.shape[0])

Wdummies.head(5)

True


Unnamed: 0,clear,foggy,rainy
0,0,0,1
1,1,0,0
2,1,0,0
3,0,0,1
4,0,1,0


#### time_of_day

In [697]:
X_train['time_of_day'].unique()

array(['afternoon', 'evening', 'morning'], dtype=object)

Same processing as 'road_type'

In [698]:
TODdummies = pd.get_dummies(X_train['time_of_day']).astype(int)

print(TODdummies.shape[0] == X_train.shape[0])

TODdummies.head(5)

True


Unnamed: 0,afternoon,evening,morning
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,0


### Concat

In [699]:
XTotal = pd.concat([TODdummies, Wdummies, Ldummies, RTdummies, 
                    SRP, PR, H, SS, 
                    X_train['num_lanes'], X_train['curvature'], X_train['speed_limit'], X_train['num_reported_accidents']]
                    , axis=1)

XTotal.head(5)

Unnamed: 0,afternoon,evening,morning,clear,foggy,rainy,daylight,dim,night,highway,rural,urban,road_signs_present,public_road,holiday,school_season,num_lanes,curvature,speed_limit,num_reported_accidents
0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,2,0.06,35,1
1,0,1,0,1,0,0,1,0,0,0,0,1,1,0,1,1,4,0.99,35,0
2,0,0,1,1,0,0,0,1,0,0,1,0,0,1,1,0,4,0.63,70,2
3,0,0,1,0,0,1,0,1,0,1,0,0,1,1,0,0,4,0.07,35,1
4,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0.58,60,1


In [700]:
print(X_train.shape[0] == XTotal.shape[0])

True


### Scaler

In [None]:
"""
scaler = StandardScaler()
scaler.fit(XTotal)
XTotal = pd.DataFrame(scaler.transform(XTotal))

XTotal.head(5)
"""

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.420863,-0.707882,-0.709644,-0.727866,-0.734575,1.515952,1.381479,-0.741954,-0.656421,-0.710451,-0.70752,1.421755,-0.99841,0.995498,-1.00702,1.004984,-0.43868,-1.572918,-0.70384,-0.209797
1,-0.703798,1.412664,-0.709644,1.373879,-0.734575,-0.659651,1.381479,-0.741954,-0.656421,-0.710451,-0.70752,1.421755,1.001593,-1.004522,0.993029,1.004984,1.346344,1.839137,-0.70384,-1.325918
2,-0.703798,-0.707882,1.409156,1.373879,-0.734575,-0.659651,-0.723862,1.347792,-0.656421,-0.710451,1.413388,-0.703356,-0.99841,0.995498,0.993029,-0.995041,1.346344,0.518342,1.512963,0.906324
3,-0.703798,-0.707882,1.409156,-0.727866,-0.734575,1.515952,-0.723862,1.347792,-0.656421,1.407557,-0.70752,-0.703356,1.001593,0.995498,-1.00702,-0.995041,1.346344,-1.536229,-0.70384,-0.209797
4,-0.703798,1.412664,-0.709644,-0.727866,1.361331,-0.659651,1.381479,-0.741954,-0.656421,-0.710451,1.413388,-0.703356,-0.99841,-1.004522,0.993029,-0.995041,-1.331192,0.334898,0.879591,-0.209797


## Model fitting

In [702]:
model = LinearRegression()
model.fit(XTotal, y_train)

## Test Preprocessing

In [703]:
test = pd.read_csv(data_folder + "test.csv")
id = test["id"]
test = test.drop("id", axis=1)
test.head(5)

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3


### Missing value

In [704]:
test.isna().sum()

road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
dtype: int64

### Bool type processing

In [705]:
SRP = test["road_signs_present"].astype(int)
PR = test["public_road"].astype(int)
H = test["holiday"].astype(int)
SS = test["school_season"].astype(int)

### Object type processing

In [706]:
RTdummies = pd.get_dummies(test['road_type']).astype(int)
Ldummies = pd.get_dummies(test['lighting']).astype(int)
Wdummies = pd.get_dummies(test['weather']).astype(int)
TODdummies = pd.get_dummies(test['time_of_day']).astype(int)

### Concat

In [707]:
test = pd.concat([TODdummies, Wdummies, Ldummies, RTdummies, 
                    SRP, PR, H, SS, 
                    test['num_lanes'], test['curvature'], test['speed_limit'], test['num_reported_accidents']]
                    , axis=1)

test.isna().sum()

afternoon                 0
evening                   0
morning                   0
clear                     0
foggy                     0
rainy                     0
daylight                  0
dim                       0
night                     0
highway                   0
rural                     0
urban                     0
road_signs_present        0
public_road               0
holiday                   0
school_season             0
num_lanes                 0
curvature                 0
speed_limit               0
num_reported_accidents    0
dtype: int64

## Prediction

In [708]:
prediction = model.predict(test)

prediction



array([[1.02870256e+09],
       [1.06107327e+09],
       [1.06125164e+09],
       ...,
       [1.06123111e+09],
       [1.05501657e+09],
       [1.02850366e+09]])

### Saving data

In [None]:
filename = output_folder + 'submission.csv'

pred = []
for i in prediction:
    pred.append(round(i[0], 3))

submission = pd.DataFrame({
    'id': id,
    'accident_risk': pred
})

submission.to_csv(filename, index=False)
print(f"Saved: {filename}")

Saved: output/submissionScaler.csv
