In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


In [2]:
# Read the competition's train and test tables from the read-only input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e10/train.csv",
        "/kaggle/input/playground-series-s5e10/test.csv",
    )
)

In [3]:

# --- Dataset dimensions
print("TRAIN shape:", train.shape)
print("TEST  shape:", test.shape)
print()

# --- Leading rows of train (rich display)
print("=== Train head ===")
display(train.head())

# --- Per-column dtype and non-null count
print("\n=== Train info ===")
train.info(verbose=True)

# --- Column inventory; only the first 30 names are printed
cols = list(train.columns)
print("\nTotal columns:", len(cols))
print("First 30 columns:", cols[:30])

# --- Columns that contain at least one missing value, largest count first
null_counts = train.isna().sum()
missing = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
print("\nColumns with missing values (count):")
if missing.empty:
    print("No missing values detected in train")
else:
    print(missing)

# --- Cardinality of object-dtype columns (NaN counted as its own value); top 20 shown
obj_cols = list(train.select_dtypes(include=['object']).columns)
print("\nObject-type columns (count):", len(obj_cols))
if obj_cols:
    uniq_counts = {}
    for c in obj_cols:
        uniq_counts[c] = train[c].nunique(dropna=False)
    uniq_df = pd.Series(uniq_counts).sort_values(ascending=False)
    print("Top object columns by cardinality:")
    print(uniq_df.head(20))

# --- Target distribution, reported only when the column exists
target = "accident_risk"
if target not in train.columns:
    print(f"\nTarget column '{target}' not found in train.")
else:
    target_values = train[target]
    print("\n=== Target summary ===")
    print("Min / median / mean / max:")
    print(target_values.min(), target_values.median(), target_values.mean(), target_values.max())
    print("Quantiles:")
    print(target_values.quantile([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99]))

# --- Write a one-row-per-column summary (name, dtype, unique count) to CSV
preview = pd.DataFrame(
    {
        "column": train.columns,
        "dtype": train.dtypes.astype(str),
        "n_unique": train.nunique(dropna=False).tolist(),
    }
)
preview.to_csv("columns_preview.csv", index=False)
print("\nWrote columns_preview.csv with dtype and unique-count info.")

TRAIN shape: (517754, 14)
TEST  shape: (172585, 13)

=== Train head ===


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56



=== Train info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtype

In [4]:
# Notebook cell: unified transformation for both train and test
import numpy as np
import pandas as pd

def transform_accident_data(
    df: pd.DataFrame,
    is_train: bool = True,
    target_col: str = "accident_risk",
    time_order=("morning", "afternoon", "evening"),
):
    """
    Transform accident dataset (works for both train and test).

    Steps:
      - Copies ``df`` (the input frame is never modified).
      - Converts bool columns to int.
      - Adds cyclic sine/cosine features for 'time_of_day', placing the labels
        of ``time_order`` at equally spaced angles on the unit circle.
      - Groups the columns into categorical / numeric / cyclic lists.

    Args:
        df (pd.DataFrame): train or test DataFrame.
        is_train (bool): if True, ``target_col`` is left out of the column lists.
        target_col (str): name of the target column in train.
        time_order (sequence of str): 'time_of_day' labels in cyclic order.
            Defaults to morning → afternoon → evening.

    Returns:
        transformed_df (pd.DataFrame)
        meta (dict): {
            'bool_cols', 'cat_cols', 'num_cols', 'cyclic_cols'
        }

    Warns:
        UserWarning: if 'time_of_day' holds missing values or labels that are
            not in ``time_order``. Those rows get sin = cos = 0.0 (the centre of
            the circle) instead of being silently encoded as the first slot.
    """
    import warnings  # local import: not provided by the cell's import block

    data = df.copy()

    # --- 1. Convert bool → int
    bool_cols = [c for c in data.columns if data[c].dtype == "bool"]
    if bool_cols:
        data = data.astype({c: int for c in bool_cols})

    # --- 2. Add cyclical features for 'time_of_day'
    time_col = "time_of_day"
    if time_col in data.columns:
        n_slots = len(time_order)
        angle_map = {label: 2 * np.pi * i / n_slots for i, label in enumerate(time_order)}

        # Labels missing from angle_map (and NaN) come back as NaN angles.
        angles = data[time_col].map(angle_map).astype(float)
        unmapped = angles.isna()
        if unmapped.any():
            unseen = sorted(data.loc[unmapped, time_col].dropna().astype(str).unique())
            warnings.warn(
                f"{int(unmapped.sum())} row(s) in '{time_col}' are missing or not in "
                f"time_order {list(time_order)} (unexpected labels: {unseen}); "
                "their sin/cos features are set to 0.0.",
                stacklevel=2,
            )

        # Unmapped rows get (0, 0) so they do not alias any real time slot.
        data[time_col + "_sin"] = np.sin(angles).where(~unmapped, 0.0)
        data[time_col + "_cos"] = np.cos(angles).where(~unmapped, 0.0)

    # --- 3. Identify column types
    exclude = {"id"}
    if is_train:
        exclude.add(target_col)

    def _is_categorical(series: pd.Series) -> bool:
        # object, pandas string dtype, and category all count as categorical.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    cat_cols = [
        c for c in data.columns
        if c not in exclude and _is_categorical(data[c])
    ]
    if time_col in data.columns and time_col not in cat_cols:
        cat_cols.append(time_col)

    cyclic_cols = [c for c in [time_col + "_sin", time_col + "_cos"] if c in data.columns]

    # Everything that is not categorical, cyclic, or excluded is treated as numeric.
    non_numeric = set(cat_cols) | set(cyclic_cols) | exclude
    num_cols = [c for c in data.columns if c not in non_numeric]

    meta = {
        "bool_cols": bool_cols,
        "cat_cols": cat_cols,
        "num_cols": num_cols,
        "cyclic_cols": cyclic_cols,
    }

    print(f"\n=== Transform summary ({'train' if is_train else 'test'}) ===")
    print("Bool → int:", bool_cols)
    print("Categorical columns:", cat_cols)
    print("Numeric columns:", num_cols)
    print("Cyclic columns:", cyclic_cols)
    print("Final shape:", data.shape)

    return data, meta

# --- Usage example ---
# Run the same transform on both splits; only the train-side column metadata is kept.
train_transformed, meta = transform_accident_data(df=train, is_train=True)
test_transformed, _ = transform_accident_data(df=test, is_train=False)

display(train_transformed.head())



=== Transform summary (train) ===
Bool → int: ['road_signs_present', 'public_road', 'holiday', 'school_season']
Categorical columns: ['road_type', 'lighting', 'weather', 'time_of_day']
Numeric columns: ['num_lanes', 'curvature', 'speed_limit', 'road_signs_present', 'public_road', 'holiday', 'school_season', 'num_reported_accidents']
Cyclic columns: ['time_of_day_sin', 'time_of_day_cos']
Final shape: (517754, 16)

=== Transform summary (test) ===
Bool → int: ['road_signs_present', 'public_road', 'holiday', 'school_season']
Categorical columns: ['road_type', 'lighting', 'weather', 'time_of_day']
Numeric columns: ['num_lanes', 'curvature', 'speed_limit', 'road_signs_present', 'public_road', 'holiday', 'school_season', 'num_reported_accidents']
Cyclic columns: ['time_of_day_sin', 'time_of_day_cos']
Final shape: (172585, 15)


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk,time_of_day_sin,time_of_day_cos
0,0,urban,2,0.06,35,daylight,rainy,0,1,afternoon,0,1,1,0.13,0.866025,-0.5
1,1,urban,4,0.99,35,daylight,clear,1,0,evening,1,1,0,0.35,-0.866025,-0.5
2,2,rural,4,0.63,70,dim,clear,0,1,morning,1,0,2,0.3,0.0,1.0
3,3,highway,4,0.07,35,dim,rainy,1,1,morning,0,0,1,0.21,0.0,1.0
4,4,rural,1,0.58,60,daylight,foggy,0,0,evening,1,0,1,0.56,-0.866025,-0.5


In [5]:
# Separate the regression target from the model inputs; 'id' is dropped from both splits.
y_train = train_transformed["accident_risk"]
X_train = train_transformed.drop(columns=["id", "accident_risk"])
X_test = test_transformed.drop(columns=["id"])

In [6]:
# Notebook cell: Prepare training/validation/test splits
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

# --- 1. Split the transformed frames into features, target, and test ids
TARGET = "accident_risk"

y = train_transformed[TARGET].copy()
X = train_transformed.drop(columns=["id", TARGET])

X_test = test_transformed.drop(columns=["id"])
test_ids = test_transformed["id"].copy()

for caption, size in (
    ("Feature matrix shape:", X.shape),
    ("Target vector shape:", y.shape),
    ("Test matrix shape:", X_test.shape),
    ("Stored test_ids length:", len(test_ids)),
):
    print(caption, size)

# --- 2. Stratified K-fold over a quantile-binned copy of the continuous target
N_SPLITS = 5
SEED = 42

# qcut turns the target into integer bin labels that StratifiedKFold can stratify on
y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop')
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

fold_indices = [(tr_rows, va_rows) for tr_rows, va_rows in skf.split(X, y_bins)]

# --- 3. Example: pull out the first fold's train/validation partitions
train_idx, val_idx = fold_indices[0]
X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

print("\nFold 1 sample shapes:")
print("  Train fold:", X_train_fold.shape, "Val fold:", X_val_fold.shape)

# --- 4. Optional: persist fold indices and test ids for reference or inspection
import joblib

# Fold indices (train/val index arrays per fold) go to a joblib pickle
joblib.dump(fold_indices, "fold_indices.pkl")
print("Saved fold_indices.pkl (contains train/val indices for all folds).")

# Test ids go to CSV so a submission can be rebuilt later
test_ids.to_csv("test_ids.csv", index=False)
print("Saved test_ids.csv")

# Sanity check: fold count, plus partition sizes for the first two folds
print("Total folds saved:", len(fold_indices))
for fold_no, fold_pair in enumerate(fold_indices[:2], start=1):
    print(f"Fold {fold_no}: train={len(fold_pair[0])} val={len(fold_pair[1])}")


Feature matrix shape: (517754, 14)
Target vector shape: (517754,)
Test matrix shape: (172585, 14)
Stored test_ids length: 172585

Fold 1 sample shapes:
  Train fold: (414203, 14) Val fold: (103551, 14)
Saved fold_indices.pkl (contains train/val indices for all folds).
Saved test_ids.csv
Total folds saved: 5
Fold 1: train=414203 val=103551
Fold 2: train=414203 val=103551


In [7]:
# Notebook cell: CatBoost cross-validation training
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib
import gc

SEED = 42
N_FOLDS = 5
TARGET = "accident_risk"

# --- Load folds if not in memory
try:
    fold_indices
except NameError:
    # NOTE(review): joblib.load unpickles arbitrary objects — only load a file this notebook wrote.
    fold_indices = joblib.load("fold_indices.pkl")

# Folds actually trained on; may be fewer than N_FOLDS if fewer were saved.
cv_folds = fold_indices[:N_FOLDS]
n_folds = len(cv_folds)
if n_folds == 0:
    raise ValueError("No CV folds available (check fold_indices and N_FOLDS).")

# --- Retrieve column info from the previous meta dict
cat_cols = meta["cat_cols"]
cyclic_cols = meta["cyclic_cols"]
num_cols = meta["num_cols"]
all_features = cat_cols + num_cols + cyclic_cols

print(f"Using {len(cat_cols)} categorical, {len(num_cols)} numeric, "
      f"{len(cyclic_cols)} cyclic features ({len(all_features)} total).")

# --- Prepare arrays
X_full = train_transformed.drop(columns=["id", TARGET])
y_full = train_transformed[TARGET].values
# Align test columns to the train feature order; a missing column fails here with a KeyError.
X_test = test_transformed.drop(columns=["id"])[X_full.columns]

# --- CatBoost parameters
CAT_PARAMS = {
    "iterations": 20000,
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "task_type": "GPU",
    "random_seed": SEED,
    "early_stopping_rounds": 200,
    "verbose": 500,
    "thread_count": -1,
}

# --- CV training loop
oof = np.zeros(len(X_full))
oof_filled = np.zeros(len(X_full), dtype=bool)  # rows that received an out-of-fold prediction
preds = np.zeros(len(X_test))
fold_rmse = []
models = []

# The test pool does not depend on the fold, so build it once.
test_pool = Pool(X_test, cat_features=cat_cols)

for fold, (train_idx, val_idx) in enumerate(cv_folds, 1):
    print(f"\n================ Fold {fold}/{n_folds} ================")
    X_tr, X_val = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_val = y_full[train_idx], y_full[val_idx]

    train_pool = Pool(X_tr, label=y_tr, cat_features=cat_cols)
    val_pool   = Pool(X_val, label=y_val, cat_features=cat_cols)

    model = CatBoostRegressor(**CAT_PARAMS)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    val_pred = model.predict(val_pool)
    test_pred = model.predict(test_pool)

    oof[val_idx] = val_pred
    oof_filled[val_idx] = True
    # Average over the folds actually run, not the configured constant.
    preds += test_pred / n_folds

    # sqrt(MSE) instead of squared=False: that keyword was removed from newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_val, val_pred)))
    fold_rmse.append(rmse)
    print(f"Fold {fold} RMSE: {rmse:.6f}")

    models.append(model)
    joblib.dump(model, f"catboost_fold{fold}.pkl")
    del X_tr, X_val, y_tr, y_val, train_pool, val_pool
    gc.collect()

# --- OOF and final scores
# Score only rows that were predicted; with a full set of folds that is every row.
oof_rmse = float(np.sqrt(mean_squared_error(y_full[oof_filled], oof[oof_filled])))
print("\nFold RMSEs:", [round(s, 5) for s in fold_rmse])
print("Mean OOF RMSE:", round(oof_rmse, 6))

# --- Clip predictions to [0,1]
preds = np.clip(preds, 0.0, 1.0)

# --- Save outputs
oof_df = pd.DataFrame({"id": train_transformed["id"], "oof_pred": oof, "y_true": y_full})
oof_df.to_csv("oof_predictions.csv", index=False)

# Ids are read from test_transformed so this cell does not rely on an earlier cell's test_ids.
submission = pd.DataFrame({"id": test_transformed["id"], "accident_risk": preds})
submission.to_csv("submission_catboost.csv", index=False)

print("\nSaved:")
print("  • oof_predictions.csv")
print("  • submission_catboost.csv")
print("  • catboost_fold*.pkl (models per fold)")


Using 4 categorical, 8 numeric, 2 cyclic features (14 total).

0:	learn: 0.1591494	test: 0.1592279	best: 0.1592279 (0)	total: 134ms	remaining: 44m 45s
500:	learn: 0.0557412	test: 0.0562401	best: 0.0562400 (499)	total: 14.7s	remaining: 9m 32s
1000:	learn: 0.0552848	test: 0.0562121	best: 0.0562098 (924)	total: 29.3s	remaining: 9m 15s
bestTest = 0.05620601544
bestIteration = 1212
Shrink model to first 1213 iterations.
Fold 1 RMSE: 0.056206

0:	learn: 0.1591875	test: 0.1590649	best: 0.1590649 (0)	total: 43.8ms	remaining: 14m 36s
500:	learn: 0.0557721	test: 0.0560412	best: 0.0560410 (494)	total: 14.7s	remaining: 9m 33s
1000:	learn: 0.0553297	test: 0.0560071	best: 0.0560064 (995)	total: 29.4s	remaining: 9m 17s
bestTest = 0.05600640919
bestIteration = 995
Shrink model to first 996 iterations.
Fold 2 RMSE: 0.056006

0:	learn: 0.1591026	test: 0.1594054	best: 0.1594054 (0)	total: 29.2ms	remaining: 9m 43s
500:	learn: 0.0557699	test: 0.0561181	best: 0.0561176 (491)	total: 14.4s	remaining: 9m 19s
1