In [29]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

In [37]:

#Load the datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [38]:
#Check dataset shape

print("Train shape:", train.shape)
print("Test shape:", test.shape)

print(train.columns)

Train shape: (517754, 14)
Test shape: (172585, 13)
Index(['id', 'road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting',
       'weather', 'road_signs_present', 'public_road', 'time_of_day',
       'holiday', 'school_season', 'num_reported_accidents', 'accident_risk'],
      dtype='object')


In [39]:
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(

In [40]:
#Check missing values

train.isnull().sum()


id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [41]:
#Handle missing values

train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

In [42]:

target_column = "accident_risk"


X = train.drop(columns=["accident_risk"])
y = train["accident_risk"]


In [43]:
#Define column types

cat_cols = ["road_type", "lighting", "weather", "time_of_day"]

num_cols = [
    "id",
    "num_lanes",
    "curvature",
    "speed_limit",
    "road_signs_present",
    "public_road",
    "holiday",
    "school_season",
    "num_reported_accidents"
]



In [44]:
#Create preprocessing pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)



In [45]:
#Create Random Forest model

rf = RandomForestRegressor(
    n_estimators=150,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)



In [46]:
#Combine into ONE pipeline

pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", rf)
])



In [47]:
#Train + Evaluate (RMSE)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)

val_pred = pipeline.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, val_pred))

print("Validation RMSE:", rmse)



Validation RMSE: 0.05658162787695726


In [48]:
#Train on full train data

pipeline.fit(X, y)



0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,150
,criterion,'squared_error'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [53]:

joblib.dump(pipeline, "accident_random_forest_model.pkl", compress=3)


['accident_random_forest_model.pkl']

In [50]:
#submission file create

test_df = pd.read_csv("test.csv")
model = joblib.load("accident_random_forest_model.pkl")

predictions = model.predict(test_df)

In [51]:
predictions = model.predict(test_df)
submission = pd.read_csv("sample_submission.csv")
submission['accident_risk'] = predictions

In [52]:
submission.to_csv("random_forest_submission.csv", index=False)