In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os

# Absolute location of the data directory; the relative "../dataset" part is
# resolved against the current working directory by os.path.abspath.
dataset_path = os.path.abspath(os.path.join("..", "dataset"))

# Both CSV files sit directly inside the dataset directory.
train_path, test_path = (
    os.path.join(dataset_path, file_name) for file_name in ("train.csv", "test.csv")
)

In [3]:
# Read the training table and use its "id" column as the row index.
train = pd.read_csv(train_path, index_col="id")

# Preview the first rows (rich display as the cell's last expression).
train.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
# Columns with object dtype are treated as categorical and one-hot encoded.
# The selection runs on `train`, which still holds the target column; the
# target is numeric in the preview above, so it is not picked up here.
categorical_columns = train.select_dtypes(include="object").columns

# Encode the categorical columns and forward every other column unchanged.
# handle_unknown="ignore" keeps transform from failing on categories that
# were not present when the encoder was fitted.
preprocess = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)],
    remainder="passthrough",
)

# Random forest regressor: 1000 depth-limited trees, fixed seed, all CPU cores.
rf_reg = RandomForestRegressor(
    n_estimators=1000,
    criterion="squared_error",
    max_depth=8,
    n_jobs=-1,
    random_state=64,
    verbose=0,
)

# Chain preprocessing and the model so both are fitted together on each fit call.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("random_forest", rf_reg),
    ]
)

In [6]:
from sklearn.model_selection import KFold, cross_val_score

In [7]:
# Separate the regression target from the feature columns.
y = train.loc[:, "accident_risk"]
X = train.drop(columns=y.name)

# 5-fold cross-validation with shuffled, seeded splits; each fold is scored with R^2.
kf = KFold(n_splits=5, shuffle=True, random_state=64)
cvs = cross_val_score(pipeline, X, y, scoring="r2", cv=kf)

In [8]:
# Report the per-fold R^2 scores returned by cross_val_score.
cv_report = f"Cross Val Score: {cvs}"
print(cv_report)

Cross Val Score: [0.8857902  0.88600823 0.88399767 0.88366626 0.8863468 ]


In [9]:
# Refit the whole pipeline on the full training set before predicting on the test set.
# Left as the last expression so the fitted pipeline's repr is displayed.
pipeline.fit(X=X, y=y)

0,1,2
,steps,"[('preprocess', ...), ('random_forest', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,1000
,criterion,'squared_error'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
# Load the test features with "id" as the index, matching how `train` was loaded.
test_features = pd.read_csv(test_path).set_index("id")

# Predict with the fitted pipeline and keep only the id-indexed predictions,
# named "accident_risk".
test = pd.Series(
    pipeline.predict(test_features),
    index=test_features.index,
    name="accident_risk",
)
test

id
517754    0.295977
517755    0.125151
517756    0.177261
517757    0.347012
517758    0.413239
            ...   
690334    0.103655
690335    0.523049
690336    0.252467
690337    0.130480
690338    0.495331
Name: accident_risk, Length: 172585, dtype: float64

In [12]:
# Write the id-indexed accident_risk predictions to ../predictions/random_forest.csv.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True makes re-running this cell harmless.
predictions_dir = os.path.join("..", "predictions")
os.makedirs(predictions_dir, exist_ok=True)
test.to_csv(os.path.join(predictions_dir, "random_forest.csv"))