In [33]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten, Input
from tensorflow.keras.models import Sequential

In [34]:
# Load the datasets
# Read the training and test CSV files (in that order) from the working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

In [35]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(2), int64(4), object(4)

In [36]:
# Check missing values
# Count the null entries in every column of the training frame.
train.isna().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [37]:
# Handle missing values

# Compute the imputation values once, from the training data only, and apply
# the same values to both frames. Filling `test` with its own column means
# would impute the two sets inconsistently, so the model would be fed values
# derived from a different distribution at prediction time.
# Only int/float columns are averaged: boolean flags are excluded so a flag
# is never filled with a fractional mean.
train_means = train.select_dtypes(include="number").mean()

# Keys of `train_means` that do not exist in a frame (e.g. the target column
# in `test`) are simply ignored by fillna.
train = train.fillna(train_means)
test = test.fillna(train_means)

In [38]:

# Regression target, and the feature frame without the target and the row id.
y = train["accident_risk"]
X = train.drop(columns=["accident_risk", "id"])

In [39]:
# Define column groups

# String-valued (object dtype) features.
cat_cols = [
    "road_type",
    "lighting",
    "weather",
    "time_of_day",
]

# Integer / float features.
num_cols = ["num_lanes", "curvature", "speed_limit", "num_reported_accidents"]

# Boolean flag features.
bool_cols = ["road_signs_present", "public_road", "holiday", "school_season"]


In [40]:
# Preprocessing

# Standardise the numeric columns, one-hot encode the categorical columns
# (categories unseen at fit time are encoded as all zeros), and pass the
# boolean flags through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("bool", "passthrough", bool_cols)
    ],
    # OneHotEncoder produces sparse output, and ColumnTransformer returns a
    # sparse matrix whenever the overall density drops below its threshold
    # (0.3 by default). The CNN pipeline reshapes the result to 3-D and feeds
    # it to Keras, which requires a dense ndarray, so always return dense.
    sparse_threshold=0,
)



In [41]:
# Train / validation split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [42]:
#Transform data

# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the validation split so that no validation statistics are
# used when learning the scaling / encoding.
X_train_p = preprocessor.fit_transform(X_train)
X_val_p = preprocessor.transform(X_val)



In [43]:
# Reshape for CNN

# Conv1D expects (samples, steps, channels): append a trailing channel axis of
# size 1 so each preprocessed feature becomes one "step" with a single channel.
X_train_cnn = np.expand_dims(X_train_p, axis=-1)
X_val_cnn = np.expand_dims(X_val_p, axis=-1)



In [44]:
# Build CNN model

# Small 1-D CNN over the preprocessed feature vector, treated as a sequence of
# length n_features with one channel, ending in a single linear output unit
# for regression.
model = Sequential([
    # Declare the input shape with an explicit Input layer rather than passing
    # `input_shape=` to the first Conv1D: Keras warns about the latter and
    # recommends an Input object as the first layer of a Sequential model.
    Input(shape=(X_train_cnn.shape[1], 1)),
    Conv1D(32, 3, activation='relu'),
    Conv1D(64, 3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)  # no activation: raw regression output
])

# Mean squared error loss with the Adam optimizer.
model.compile(
    optimizer='adam',
    loss='mse'
)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [45]:
# Train CNN

# Keep the History object returned by fit() so the per-epoch loss / val_loss
# values stay available to later cells (e.g. for plotting) instead of only
# being echoed as the cell's repr.
history = model.fit(
    X_train_cnn, y_train,
    epochs=20,
    batch_size=32,
    # A 10% slice of the training arrays is held out for per-epoch monitoring;
    # the separate X_val split is not used here and stays reserved for the
    # final evaluation.
    validation_split=0.1,
    verbose=1
)



Epoch 1/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 39s 3ms/step - loss: 0.0044 - val_loss: 0.0035
Epoch 2/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 34s 3ms/step - loss: 0.0040 - val_loss: 0.0034
Epoch 3/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 35s 3ms/step - loss: 0.0039 - val_loss: 0.0034
Epoch 4/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 36s 3ms/step - loss: 0.0039 - val_loss: 0.0033
Epoch 5/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 36s 3ms/step - loss: 0.0039 - val_loss: 0.0033
Epoch 6/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 35s 3ms/step - loss: 0.0039 - val_loss: 0.0034
Epoch 7/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 37s 3ms/step - loss: 0.0039 - val_loss: 0.0033
Epoch 8/20
11650/11650 ━━━━━━━━━━━━━━━━━━━━ 36s 3ms/step - loss: 0.0039 - val_loss: 0.0033


<keras.src.callbacks.history.History at 0x1c63c9df7c0>

In [46]:
# Evaluate RMSE

# Predict on the held-out validation split, then report the root of the mean
# squared error between the true targets and the predictions.
pred = model.predict(X_val_cnn)
mse = mean_squared_error(y_val, pred)
rmse = np.sqrt(mse)
print("CNN RMSE:", rmse)

[1m3236/3236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
CNN RMSE: 0.05728487957612289


In [53]:
#save model

# Persist the trained network to disk; the submission cell further down
# reloads the model from this same path.
# NOTE(review): the ".h5" suffix presumably selects Keras's legacy HDF5 format —
# confirm whether the native ".keras" format is preferred, and if the path is
# changed, change the matching load_model call at the same time.
model.save("accident_cnn_model.h5")





In [56]:
#submission file create

from tensorflow.keras.models import load_model

model = load_model("accident_cnn_model.h5", compile=False)
