In [1]:
#If you don't have kagglehub, install it with pip install kagglehub
#also install, pip install kaggle
%pip install kagglehub kaggle
import kagglehub
path = kagglehub.dataset_download("ianktoo/simulated-roads-accident-data")

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns


In [3]:
import os

df1 = pd.read_csv(os.path.join(path, "synthetic_road_accidents_2k.csv"))
df2 = pd.read_csv(os.path.join(path, "synthetic_road_accidents_10k.csv"))
df3 = pd.read_csv(os.path.join(path, "synthetic_road_accidents_100k.csv"))

In [4]:
_df = pd.concat([df1, df2, df3])
print(_df.shape)

(112000, 13)


In [5]:
#Final combined dataframe
_df = _df.drop_duplicates()
print(f"Number of duplicates after removal: {_df.duplicated().sum()}")
print(f"Shape of the df after removal: {_df.shape}")

Number of duplicates after removal: 0
Shape of the df after removal: (111961, 13)


In [6]:
#final_df with corrected column indexes.
_df.reset_index(drop = True, inplace = True)
print(_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111961 entries, 0 to 111960
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   road_type               111961 non-null  object 
 1   num_lanes               111961 non-null  int64  
 2   curvature               111961 non-null  float64
 3   speed_limit             111961 non-null  int64  
 4   lighting                111961 non-null  object 
 5   weather                 111961 non-null  object 
 6   road_signs_present      111961 non-null  bool   
 7   public_road             111961 non-null  bool   
 8   time_of_day             111961 non-null  object 
 9   holiday                 111961 non-null  bool   
 10  school_season           111961 non-null  bool   
 11  num_reported_accidents  111961 non-null  int64  
 12  accident_risk           111961 non-null  float64
dtypes: bool(4), float64(2), int64(3), object(4)
memory usage: 8.1+ MB
None


In [7]:
_df_copy = _df.copy()

In [8]:

X = _df_copy.drop(["accident_risk", "num_reported_accidents"], axis=1)
y = _df_copy["accident_risk"]


In [9]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
f_df = pd.concat([X_train, y_train], axis = 1)
print(f_df.shape)

(78372, 12)


In [10]:
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import LabelEncoder


In [11]:
# separate features (no target)
X_full = _df_copy.drop(["accident_risk", "num_reported_accidents"], axis=1)

# detect categorical columns
categorical_cols = X_full.select_dtypes(include=["object", "bool"]).columns.tolist()
numerical_cols = X_full.select_dtypes(exclude=["object", "bool"]).columns.tolist()

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)


Categorical: ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']
Numerical: ['num_lanes', 'curvature', 'speed_limit']


In [12]:
X_encoded = X_full.copy()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_full[col])
    label_encoders[col] = le


In [13]:
# get categorical column indices
cat_idx = [X_encoded.columns.get_loc(col) for col in categorical_cols]

X_train_enc = X_encoded.loc[X_train.index]
X_test_enc  = X_encoded.loc[X_test.index]

kproto = KPrototypes(
    n_clusters=3,
    init="Cao",
    n_init=5,
    random_state=42,
    verbose=1
)

train_clusters = kproto.fit_predict(
    X_train_enc.to_numpy(),
    categorical=cat_idx
)

test_clusters = kproto.predict(
    X_test_enc.to_numpy(),
    categorical=cat_idx
)


Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 9680, ncost: 2748581.566375378
Run: 1, iteration: 2/100, moves: 0, ncost: 2748581.566375378
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 1937, ncost: 4307340.134535434
Run: 2, iteration: 2/100, moves: 0, ncost: 4307340.134535434
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 6130, ncost: 2748581.566375378
Run: 3, iteration: 2/100, moves: 0, ncost: 2748581.566375378
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 3528, ncost: 2750330.0221200916
Run: 4, iteration: 2/100, moves: 0, ncost: 2750330.0221200916
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 5, iteration: 1/1

In [14]:
X_train_clustered = X_train.copy()
X_test_clustered  = X_test.copy()


X_train_clustered["cluster"] = train_clusters
X_test_clustered["cluster"]  = test_clusters


In [15]:
train_df_clustered = pd.concat(
    [X_train_clustered, y_train], axis=1
)

train_df_clustered.groupby("cluster")["accident_risk"].mean()


cluster
0    0.302638
1    0.503213
2    0.302463
Name: accident_risk, dtype: float64

In [16]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error


In [17]:
# categorical features for CatBoost
cat_features_cb = [
    X_train_clustered.columns.get_loc(col)
    for col in categorical_cols
]


In [18]:
model_no_cluster = CatBoostRegressor(
    iterations=600,
    depth=6,
    learning_rate=0.05,
    loss_function="RMSE",
    random_state=42,
    verbose=False
)

model_no_cluster.fit(
    X_train,
    y_train,
    cat_features=cat_features_cb
)

y_pred_nc = model_no_cluster.predict(X_test)
y_train_pred_nc = model_no_cluster.predict(X_train)

print("Test R² without cluster:", r2_score(y_test, y_pred_nc))
print("Train R² without cluster:", r2_score(y_train, y_train_pred_nc))

Test R² without cluster: 0.8737510120017573
Train R² without cluster: 0.8754838431225889


In [19]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Predictions
y_pred_no_cluster = model_no_cluster.predict(X_test)

# RMSE
rmse_no_cluster = np.sqrt(mean_squared_error(y_test, y_pred_no_cluster))
rmse_train_nc = np.sqrt(mean_squared_error(y_train, y_train_pred_nc))

print(f"RMSE (Test No Clusters): {rmse_no_cluster:.4f}")
print(f"RMSE (Train No Clusters): {rmse_train_nc:.4f}")


RMSE (Test No Clusters): 0.0633
RMSE (Train No Clusters): 0.0633


In [20]:
from sklearn.metrics import r2_score

cluster_models = {}

for c in sorted(train_df_clustered['cluster'].unique()):
    
    sub_df = train_df_clustered[train_df_clustered['cluster'] == c]

    X_c = sub_df.drop("accident_risk", axis=1)
    y_c = sub_df["accident_risk"]

    model_c = CatBoostRegressor(
        iterations=400,
        depth=5,
        learning_rate=0.05,
        loss_function="RMSE",
        random_state=42,
        verbose=False
    )

    model_c.fit(
        X_c,
        y_c,
        cat_features=cat_features_cb
    )

    cluster_models[c] = model_c

    print(f"Cluster {c} | Train size = {len(X_c)}")


Cluster 0 | Train size = 15654
Cluster 1 | Train size = 31491
Cluster 2 | Train size = 31227


In [21]:
test_df_clustered = pd.concat(
    [X_test_clustered, y_test], axis=1
)


In [22]:
from sklearn.metrics import r2_score

r2_per_cluster = {}

for c, model in cluster_models.items():
    
    test_sub = test_df_clustered[test_df_clustered["cluster"] == c]

    # skip if too small (important for stability)
    if len(test_sub) < 50:
        print(f"Cluster {c}: too few samples ({len(test_sub)})")
        continue

    X_c_test = test_sub.drop("accident_risk", axis=1)
    y_c_test = test_sub["accident_risk"]

    y_pred_c = model.predict(X_c_test)

    r2 = r2_score(y_c_test, y_pred_c)
    r2_per_cluster[c] = r2

    print(f"Cluster {c} | Test R² = {r2:.4f} | n = {len(test_sub)}")


Cluster 0 | Test R² = 0.8234 | n = 6817
Cluster 1 | Test R² = 0.8190 | n = 13434
Cluster 2 | Test R² = 0.8191 | n = 13338


In [23]:
for cluster_id in sorted(test_df_clustered["cluster"].unique()):
    
    test_sub = test_df_clustered[test_df_clustered["cluster"] == cluster_id]
    
    X_c_test = test_sub.drop("accident_risk", axis=1)
    y_c_test = test_sub["accident_risk"]
    
    y_pred_c = cluster_models[cluster_id].predict(X_c_test)
    
    rmse_c = np.sqrt(mean_squared_error(y_c_test, y_pred_c))
    
    print(f"Cluster {cluster_id} RMSE: {rmse_c:.4f}")


Cluster 0 RMSE: 0.0634
Cluster 1 RMSE: 0.0637
Cluster 2 RMSE: 0.0628


In [24]:
total_n = sum(len(test_df_clustered[test_df_clustered["cluster"] == c])
              for c in r2_per_cluster)

weighted_r2 = sum(
    r2_per_cluster[c] * len(test_df_clustered[test_df_clustered["cluster"] == c])
    for c in r2_per_cluster
) / total_n

print("Weighted R² (cluster models):", weighted_r2)


Weighted R² (cluster models): 0.8199341492687116


In [28]:
print("\nCLUSTERED MODEL – TRAIN RESULTS (INDIVIDUAL)")

for c, model in cluster_models.items():
    sub = train_df_clustered[train_df_clustered["cluster"] == c]

    X_c = sub.drop("accident_risk", axis=1)
    y_c = sub["accident_risk"]

    y_pred = model.predict(X_c)

    rmse = np.sqrt(mean_squared_error(y_c, y_pred))
    r2 = r2_score(y_c, y_pred)

    print(f"\nCluster {c}")
    print(f"Samples   : {len(sub)}")
    print(f"Train RMSE: {rmse:.4f}")
    print(f"Train R²  : {r2:.4f}")



CLUSTERED MODEL – TRAIN RESULTS (INDIVIDUAL)

Cluster 0
Samples   : 15654
Train RMSE: 0.0625
Train R²  : 0.8240

Cluster 1
Samples   : 31491
Train RMSE: 0.0639
Train R²  : 0.8215

Cluster 2
Samples   : 31227
Train RMSE: 0.0632
Train R²  : 0.8211


In [29]:
print("\nCLUSTERED MODEL – TEST RESULTS (INDIVIDUAL)")

for c, model in cluster_models.items():
    sub = test_df_clustered[test_df_clustered["cluster"] == c]

    if len(sub) < 30:
        print(f"\nCluster {c} skipped (too few samples: {len(sub)})")
        continue

    X_c = sub.drop("accident_risk", axis=1)
    y_c = sub["accident_risk"]

    y_pred = model.predict(X_c)

    rmse = np.sqrt(mean_squared_error(y_c, y_pred))
    r2 = r2_score(y_c, y_pred)

    print(f"\nCluster {c}")
    print(f"Samples  : {len(sub)}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test R²  : {r2:.4f}")



CLUSTERED MODEL – TEST RESULTS (INDIVIDUAL)

Cluster 0
Samples  : 6817
Test RMSE: 0.0634
Test R²  : 0.8234

Cluster 1
Samples  : 13434
Test RMSE: 0.0637
Test R²  : 0.8190

Cluster 2
Samples  : 13338
Test RMSE: 0.0628
Test R²  : 0.8191


In [25]:
rmse_train_c = []
r2_train_c = []
weights_train = []

for c, model in cluster_models.items():
    sub = train_df_clustered[train_df_clustered["cluster"] == c]
    X_c = sub.drop("accident_risk", axis=1)
    y_c = sub["accident_risk"]
    
    y_pred = model.predict(X_c)
    
    rmse_train_c.append(
        np.sqrt(mean_squared_error(y_c, y_pred)) * len(sub)
    )
    r2_train_c.append(
        r2_score(y_c, y_pred) * len(sub)
    )
    weights_train.append(len(sub))

rmse_train_clustered = sum(rmse_train_c) / sum(weights_train)
r2_train_clustered   = sum(r2_train_c) / sum(weights_train)


In [26]:
rmse_test_c = []
r2_test_c = []
weights_test = []

for c, model in cluster_models.items():
    sub = test_df_clustered[test_df_clustered["cluster"] == c]
    
    if len(sub) < 30:
        continue
    
    X_c = sub.drop("accident_risk", axis=1)
    y_c = sub["accident_risk"]
    
    y_pred = model.predict(X_c)
    
    rmse_test_c.append(
        np.sqrt(mean_squared_error(y_c, y_pred)) * len(sub)
    )
    r2_test_c.append(
        r2_score(y_c, y_pred) * len(sub)
    )
    weights_test.append(len(sub))

rmse_test_clustered = sum(rmse_test_c) / sum(weights_test)
r2_test_clustered   = sum(r2_test_c) / sum(weights_test)


In [27]:
print("\nCLUSTERED MODEL")
print(f"Train RMSE: {rmse_train_clustered:.4f}")
print(f"Train R²  : {r2_train_clustered:.4f}")
print(f"Test RMSE : {rmse_test_clustered:.4f}")
print(f"Test R²   : {r2_test_clustered:.4f}")



CLUSTERED MODEL
Train RMSE: 0.0633
Train R²  : 0.8218
Test RMSE : 0.0633
Test R²   : 0.8199
