In [2]:

import kagglehub
# Kaggle dataset handle; the value returned by the download call is later
# joined with the CSV file names when the data is read.
KAGGLE_DATASET = "ianktoo/simulated-roads-accident-data"
path = kagglehub.dataset_download(KAGGLE_DATASET)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import joblib

import os

# Read the three CSV files from the downloaded dataset location, in order.
df1, df2, df3 = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in (
        "synthetic_road_accidents_2k.csv",
        "synthetic_road_accidents_10k.csv",
        "synthetic_road_accidents_100k.csv",
    )
)

# Stack the frames, remove fully duplicated rows, and renumber the index
# from zero so row labels are unique and contiguous.
_df = (
    pd.concat([df1, df2, df3])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Separate copy used by the cells below; _df itself is not modified further here.
_df_copy = _df.copy()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Regression target, and the feature matrix with the target removed.
# num_reported_accidents is excluded from the features as well.
y = _df_copy["accident_risk"]
X = _df_copy.drop(columns=["accident_risk", "num_reported_accidents"])

In [4]:
# Replace the numeric speed_limit column with a boolean high_speed flag.
# The threshold is named so it can be tuned in one place.
HIGH_SPEED_THRESHOLD = 45

# Guarded so the cell can be re-executed: after the first run speed_limit is
# already gone from X, and an unguarded second run would raise a KeyError.
if "speed_limit" in X.columns:
    X = (
        X.assign(high_speed=(X["speed_limit"] > HIGH_SPEED_THRESHOLD).astype(bool))
        .drop(columns="speed_limit")
    )

# Fail loudly if neither the source column nor the derived flag is present.
assert "high_speed" in X.columns, "expected 'speed_limit' or 'high_speed' in X"

In [5]:
# Group the feature columns by dtype; each group is routed to its own
# transformer when the preprocessor is built.
_categorical_dtypes = ["object", "category"]

bool_cols = X.select_dtypes(include="bool").columns
cat_cols = X.select_dtypes(include=_categorical_dtypes).columns
# Everything that is neither categorical/text nor boolean is treated as numeric.
num_cols = X.select_dtypes(exclude=[*_categorical_dtypes, "bool"]).columns

In [6]:
# Preprocessing: one-hot encode the categorical columns (categories not seen
# during fit are ignored at transform time) and pass the numeric and boolean
# columns through unchanged.
_feature_transformers = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols),
    ("bool", "passthrough", bool_cols),
]

preprocessor = ColumnTransformer(transformers=_feature_transformers)

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Training features and target side by side in a single frame.
f_df = pd.concat([X_train, y_train], axis="columns")
print(f_df.shape)

(78372, 12)


In [8]:
# Preview the first ten rows of the training features.
X_train.head(n=10)


Unnamed: 0,road_type,num_lanes,curvature,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,high_speed
80965,rural,2,0.69,daylight,clear,True,True,afternoon,False,True,False
73776,urban,1,0.97,night,clear,False,True,morning,False,True,False
22848,highway,3,0.31,dim,foggy,False,True,morning,True,False,False
31456,highway,3,0.74,night,clear,True,False,afternoon,True,False,False
19772,urban,3,0.86,daylight,rainy,True,False,afternoon,True,False,True
77587,urban,3,0.57,night,rainy,False,True,morning,True,False,False
64063,urban,3,0.95,dim,rainy,False,False,morning,True,True,True
66122,urban,2,0.97,dim,rainy,False,True,afternoon,True,True,True
92821,highway,2,0.48,daylight,foggy,False,True,afternoon,True,False,False
32240,urban,3,0.8,dim,rainy,False,True,evening,False,True,False


In [53]:
# Independent (deep) copy of the combined training frame.
df_train = f_df.copy(deep=True)


In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.pipeline import Pipeline

# Baseline model: the shared preprocessor followed by ordinary least squares.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)
model.fit(X_train, y_train)

# Predict on the held-out split and floor negative predictions at zero
# before scoring.
y_pred_lr = np.maximum(model.predict(X_test), 0)

# Error metrics on the clipped predictions; RMSE is the square root of the MSE.
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression MAE: {mae_lr:.5f}")
print(f"Linear Regression RMSE: {rmse_lr:.5f}")
print(f"Linear Regression R2 Score: {r2_lr:.5f}")

Linear Regression MAE: 0.05022
Linear Regression RMSE: 0.06320
Linear Regression R2 Score: 0.87402
