In [None]:
# Download the competition archive with the Kaggle CLI, unpack it into
# ./playground-series-s5e10/ (unzip -o overwrites files left by an earlier run
# without prompting), then delete the archive.
!kaggle competitions download -c playground-series-s5e10 
!unzip -o playground-series-s5e10.zip -d ./playground-series-s5e10/
!rm playground-series-s5e10.zip

In [None]:
import pandas as pd
from sklearn.model_selection import RepeatedKFold
import optuna
import numpy as np
import xgboost as xgb   
from app import downcasting

In [None]:
# Read the labelled and unlabelled feature tables, both keyed by the `id`
# column, followed by the submission template (left with its default index).
train, test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        './playground-series-s5e10/train.csv',
        './playground-series-s5e10/test.csv',
    )
)
submission = pd.read_csv('./playground-series-s5e10/sample_submission.csv')

In [None]:
# Cast the string features to pandas categoricals using ONE dtype per column
# that is shared by train and test.
#
# A bare astype("category") infers the category list from each frame's own
# values, so a label that is missing from one frame shifts the integer codes in
# that frame only and the same label can end up with different codes in train
# and test. Building the dtype from the union of both frames keeps the encoding
# identical; when both frames already contain the same labels the result is the
# same as the per-frame cast.
categorical_columns = ["road_type", "lighting", "weather", "time_of_day"]
for cc in categorical_columns:
    # Sorted so the category order is deterministic across runs.
    labels = sorted(set(train[cc].dropna().unique()) | set(test[cc].dropna().unique()))
    shared_dtype = pd.CategoricalDtype(categories=labels)
    train[cc] = train[cc].astype(shared_dtype)
    test[cc] = test[cc].astype(shared_dtype)

In [None]:
# Run both frames through the project's `downcasting` helper (imported from
# `app`), train first and then test, rebinding each name to the helper's result.
# NOTE(review): presumably this shrinks numeric dtypes to save memory — confirm
# against app.downcasting.
train, test = (downcasting(frame) for frame in (train, test))

In [None]:
# XGBRegressor keyword arguments shared by the tuning objective and the final
# models; cells further down copy this dict and override individual entries.
base_params = dict(
    tree_method='hist',
    device='cuda',
    eval_metric='rmse',
    enable_categorical=True,
    n_estimators=1000,         # upper bound on boosting rounds
    random_state=42,
    learning_rate=0.1,
    early_stopping_rounds=20,
)

In [None]:
def objective(trial):
    """Optuna objective: mean validation score of XGBoost under repeated K-fold CV.

    Args:
        trial: Optuna trial used to sample ``max_depth`` (int in [6, 12]) and
            ``reg_lambda`` (float in [0, 20]). Every other model setting is
            taken from the module-level ``base_params``.

    Returns:
        The mean of ``best_score`` over all folds (5 splits x 2 repeats).
        ``best_score`` is the early-stopping metric on the held-out fold;
        ``base_params`` sets that metric to ``'rmse'``, so lower is better.
    """
    xgb_params = {
        **base_params,
        "max_depth": trial.suggest_int("max_depth", 6, 12),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 20),
    }

    # Separate features and target once instead of re-dropping the column for
    # every fold.
    features = train.drop(columns="accident_risk")
    target = train["accident_risk"]

    rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)
    fold_scores = []
    for fit_idx, val_idx in rkf.split(features):
        # split() yields positional row numbers, so rows must be selected with
        # .iloc. `train` is indexed by the `id` column; .loc would treat the
        # positions as id labels and only work if the ids are exactly 0..n-1.
        reg = xgb.XGBRegressor(**xgb_params)
        reg.fit(
            features.iloc[fit_idx],
            target.iloc[fit_idx],
            eval_set=[(features.iloc[val_idx], target.iloc[val_idx])],
            verbose=False,
        )
        fold_scores.append(reg.best_score)

    return sum(fold_scores) / len(fold_scores)

In [None]:
# Search max_depth / reg_lambda by minimising the CV score returned by
# `objective`.
#
# The sampler is seeded so the hyperparameter draws do not change on every
# Restart & Run All. TPESampler is Optuna's default single-objective sampler,
# so the search algorithm itself is unchanged.
# NOTE(review): trials run concurrently with n_jobs=4, so the order in which
# they consume the sampler's random stream can still vary between runs; switch
# to n_jobs=1 if exact reproducibility is required.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10, n_jobs=4)

In [None]:
# Shared settings with the study's winning hyperparameters layered on top
# (tuned values win on any key collision).
best_params = {**base_params, **study.best_params}

In [None]:
# Re-evaluate the tuned configuration with 10-fold CV repeated twice. For every
# fold we keep the validation score, the early-stopping round, and that fold
# model's predictions on the competition test set.
rkf = RepeatedKFold(n_splits=10, n_repeats=2, random_state=42)

# Separate features and target once instead of re-dropping the column per fold.
X_full = train.drop(columns="accident_risk")
y_full = train["accident_risk"]

fold_scores = []
fold_best_iterations = []
test_predictions = []
for train_index, test_index in rkf.split(X_full):
    # split() yields positional row numbers, so rows must be selected with
    # .iloc. `train` is indexed by the `id` column; .loc would treat the
    # positions as id labels and only work if the ids are exactly 0..n-1.
    X_train, y_train = X_full.iloc[train_index], y_full.iloc[train_index]
    X_test, y_test = X_full.iloc[test_index], y_full.iloc[test_index]
    reg = xgb.XGBRegressor(**best_params)
    reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    fold_scores.append(reg.best_score)
    fold_best_iterations.append(reg.best_iteration)
    test_predictions.append(reg.predict(test))

# Means over all folds; best_iteration is therefore a float.
avg_score = sum(fold_scores) / len(fold_scores)
best_iteration = sum(fold_best_iterations) / len(fold_best_iterations)

In [None]:
# Display the mean early-stopping validation score across the CV folds.
avg_score

In [None]:
# Average the per-fold test predictions into one vector (one value per test row).
# This cell rebinds `test_predictions` from a list of arrays to the averaged
# array, so it is written to be safe to re-run: atleast_2d turns an
# already-averaged 1-D vector into a single "fold", and the mean over axis 0
# returns it unchanged instead of collapsing it to a scalar.
test_predictions = np.atleast_2d(np.asarray(test_predictions)).mean(axis=0)

In [None]:
# Number of boosting rounds for the model trained on the full data set.
# `best_iteration` is the mean of XGBoost's 0-based best round index, so add 1
# to turn it into a tree count (this also guarantees at least one tree).
# The 10/9 factor scales that count up because each CV model above was fitted
# on 9/10 of the rows (n_splits=10), while the final model sees all of them.
full_ds_iterations = int(10 / 9 * (best_iteration + 1))

In [None]:
# Train the final model on every labelled row. There is no held-out set here,
# so early stopping is switched off and the number of rounds is fixed to the
# value derived from cross-validation.
full_fit_params = {
    **best_params,
    "early_stopping_rounds": None,
    "n_estimators": full_ds_iterations,
}
full_features = train.drop(columns="accident_risk")
full_target = train["accident_risk"]
reg = xgb.XGBRegressor(**full_fit_params)
reg.fit(full_features, full_target)
# Persist the fitted booster next to the notebook.
reg.save_model("accident_risk_model.json")

In [None]:
# Put the full-data model's predictions into the submission template.
# The values are assigned positionally, which relies on the template rows being
# in the same order as `test`.
submission = submission.assign(accident_risk=reg.predict(test))

In [None]:
# Write the submission file; index=False keeps the DataFrame's row index out of
# the CSV.
submission.to_csv("submission.csv", index=False)

In [None]:
# Upload submission.csv to the competition through the Kaggle CLI; -m sets the
# description attached to the submission.
!kaggle competitions submit -c playground-series-s5e10 -f submission.csv -m "XGBoost RepeatedKFold with Optuna Hyperparameter Tuning final model trained on full dataset via api"