# Kaggle - Predicting Road Accident Risk
### Playground Series - Season 5, Episode 10

**Model: XGBoost**

MAE on validation data: 0.04362

Kaggle score = 0.05569 (42nd percentile) (rank #1 score = 0.05537)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

filePath = '/Users/samtrustrum/Desktop/X/KaggleRoadAccidentRisk/data'

for dirname, _, filenames in os.walk(filePath):
    for filename in filenames:
        print(os.path.join(dirname, filename))
print("Done.")

/Users/samtrustrum/Desktop/X/KaggleRoadAccidentRisk/data/test.csv
/Users/samtrustrum/Desktop/X/KaggleRoadAccidentRisk/data/train.csv
/Users/samtrustrum/Desktop/X/KaggleRoadAccidentRisk/data/sample_submission.csv
Done.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
train_data = pd.read_csv(filePath + '/train.csv', index_col='id')
test_data = pd.read_csv(filePath + '/test.csv', index_col='id')


In [3]:
# Get names of columns with missing values
cols_with_missing = [col for col in train_data.columns if train_data[col].isnull().any()]
print(cols_with_missing)

[]


In [5]:
train_data.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [31]:
# Remove rows with missing target
X = train_data.copy()
X.dropna(axis=0, subset=['accident_risk'], inplace=True)

# Separate target from predictors
y = X.accident_risk              
X.drop(['accident_risk'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_final = X[my_cols].copy()
X_test = test_data[my_cols].copy()

# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_final = pd.get_dummies(X_final)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_final = X_train.align(X_final, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [12]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [26]:
from xgboost import XGBRegressor

# Define the model
model_1 = XGBRegressor(n_estimators=1000, learning_rate=0.05, 
                       early_stopping_rounds=5,
                       random_state=2)

# Fit the model_1
model_1.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)], 
             verbose=False)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,5
,enable_categorical,False


In [None]:
from sklearn.metrics import mean_absolute_error

# Get predictions
predictions = model_1.predict(X_valid)

# Calculate MAE
mae_1 = mean_absolute_error(predictions, y_valid)

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 0.04368778380572639


## Finding Optimal Model Parameters
Execution time on Macbook = 42 mins

In [28]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import numpy as np
# Define the parameter grid for optimization
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# Initialize the model
xgb_model = XGBRegressor(random_state=0)
# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error', cv=3, verbose=1)
grid_search.fit(X_train, y_train)
# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# Print the best parameters
print("Best Parameters:", best_params)
# Evaluate the optimized model on the validation set
optimized_predictions = best_model.predict(X_valid)
optimized_mae = mean_absolute_error(optimized_predictions, y_valid)
print("Optimized Mean Absolute Error:", optimized_mae)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 1000, 'subsample': 0.8}
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 1000, 'subsample': 0.8}
Optimized Mean Absolute Error: 0.04362332090199869
Optimized Mean Absolute Error: 0.04362332090199869


## Final Model

In [32]:
# Define the model
model_final = XGBRegressor(n_estimators=1000, 
                       learning_rate=0.01, 
                       max_depth=7,
                       min_child_weight=5,
						subsample=0.8,
                       random_state=0,
                       colsample_bytree = 1.0)

# Fit the model_1
model_final.fit(X_final, y)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
# Get predictions
test_preds = model_final.predict(X_test)

output = pd.DataFrame({'id': test_data.index,
                       'accident_risk': test_preds})
output.to_csv('submissionV2.csv', index=False)