In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os

# Absolute path of the `dataset` directory located one level above the
# current working directory.
dataset_path = os.path.abspath(os.path.join("..", "dataset"))

# The train and test CSV files sit directly inside the dataset directory.
train_path, test_path = (
    os.path.join(dataset_path, csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Read the training CSV, using its `id` column as the row index, and
# preview the first rows.
train = pd.read_csv(train_path, index_col="id")
train.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [4]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
# Hyperparameters for the gradient-boosted regressor, gathered in one mapping
# so they can be read and adjusted in a single place.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 64,
    "verbosity": 0,
    "tree_method": "hist",
    "device": "cuda:0",
}
xgb = XGBRegressor(**xgb_params)

# One-hot encode the object-dtype columns of the training frame; all other
# columns are passed through to the model unchanged.
categorical_columns = train.select_dtypes(include="object").columns
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessing = ColumnTransformer(
    [("cat", one_hot, categorical_columns)],
    remainder="passthrough",
)

# Chain preprocessing and the regressor so both are fitted together.
pipeline = Pipeline(steps=[("preprocess", preprocessing), ("xgboost", xgb)])

In [6]:
from sklearn.model_selection import KFold, cross_val_score

In [7]:
# Separate the regression target from the feature columns.
target_column = "accident_risk"
y = train[target_column]
X = train.drop(columns=[target_column])

# Score the whole pipeline with shuffled 5-fold cross-validation; one R^2
# value is produced per fold.
# NOTE(review): n_jobs=-1 requests all available processors for the folds,
# while the regressor in this notebook targets a single CUDA device — confirm
# the GPU has enough memory for concurrent fits.
kf = KFold(n_splits=5, shuffle=True, random_state=64)
cvs = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring="r2",
    cv=kf,
    n_jobs=-1,
)
print(f"Cross Val Score: {cvs}")

Cross Val Score: [0.88680899 0.88702518 0.88530761 0.88481019 0.88746915]


In [8]:
# Refit the pipeline on the full training set before predicting on test data.
pipeline = pipeline.fit(X, y)

In [9]:
# Load the test features keyed by `id`, then predict with the fitted pipeline.
test_features = pd.read_csv(test_path).set_index("id")

# Keep only the predictions, as a Series named `accident_risk` that shares
# the test frame's `id` index.
test = pd.Series(
    pipeline.predict(test_features),
    index=test_features.index,
    name="accident_risk",
)
test

id
517754    0.292809
517755    0.119494
517756    0.186066
517757    0.309710
517758    0.392902
            ...   
690334    0.106740
690335    0.514902
690336    0.255621
690337    0.130964
690338    0.485639
Name: accident_risk, Length: 172585, dtype: float32

In [10]:
# Write the predictions to disk as CSV.
# `to_csv` does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh checkout fails here
# after all the training work has already been done.
output_path = "../predictions/xgboost.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test.to_csv(output_path)