In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
# NOTE: this runs at notebook scope, so the loop variables (dirname, filenames,
# filename) stay defined in the global namespace after the loop finishes.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


# Imports

In [2]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor


warnings.filterwarnings('ignore')

# Data

In [3]:
# Load the three competition CSVs (train, test, sample submission) in that order
# and bind them to the notebook-level names used by the cells below.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e10/train.csv",
        "/kaggle/input/playground-series-s5e10/test.csv",
        "/kaggle/input/playground-series-s5e10/sample_submission.csv",
    )
)

In [4]:
# Preview the first rows of the training frame (it includes the accident_risk target).
train.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [5]:
# Preview the first rows of the test frame (same columns as train, minus accident_risk).
test.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,517755,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,517756,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,517757,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,517758,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3


# Feature Engineering

In [6]:
def create_features(df):
    """Return a copy of ``df`` with engineered road-risk features appended.

    The input frame is not modified. It must contain the columns
    ``curvature``, ``speed_limit``, ``num_lanes``, ``num_reported_accidents``,
    ``weather``, ``lighting``, ``road_type``, ``time_of_day``, ``holiday`` and
    ``road_signs_present``.

    Args:
        df: pandas DataFrame holding the raw columns listed above.

    Returns:
        A new DataFrame with every original column plus the polynomial,
        binned, interaction, indicator, score and ratio features built below
        (appended in the order they are created).

    Notes:
        ``curvature_bin`` and ``speed_category`` are categorical columns with
        labels 0/1/2. Values exactly on the lowest bin edge (0) are placed in
        bin 0; values outside the overall bin range still become NaN.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Polynomial features
    df['curvature_squared'] = df['curvature'] ** 2
    df['curvature_cubed'] = df['curvature'] ** 3
    df['speed_squared'] = df['speed_limit'] ** 2

    # Binned features.
    # pd.cut builds right-closed intervals, so without include_lowest=True a
    # value equal to the first edge (e.g. curvature == 0) falls outside every
    # bin and becomes NaN, which later turns into a spurious 'nan' category.
    df['curvature_bin'] = pd.cut(
        df['curvature'],
        bins=[0, 0.3, 0.6, 1.0],
        labels=[0, 1, 2],
        include_lowest=True,
    )
    df['speed_category'] = pd.cut(
        df['speed_limit'],
        bins=[0, 30, 50, 100],
        labels=[0, 1, 2],
        include_lowest=True,
    )

    # Interaction features
    df['speed_curvature'] = df['speed_limit'] * df['curvature']
    df['lanes_curvature'] = df['num_lanes'] * df['curvature']
    df['speed_lanes'] = df['speed_limit'] * df['num_lanes']
    df['accidents_curvature'] = df['num_reported_accidents'] * df['curvature']
    df['accidents_speed'] = df['num_reported_accidents'] * df['speed_limit']

    # Risk combinations
    df['high_risk_combo'] = ((df['curvature'] > 0.5) & (df['speed_limit'] >= 60)).astype(int)
    df['weather_lighting_risk'] = (
        ((df['weather'] == 'foggy') | (df['weather'] == 'rainy')) &
        ((df['lighting'] == 'dim') | (df['lighting'] == 'night'))
    ).astype(int)

    # Derived categorical indicators
    df['is_night'] = (df['lighting'] == 'night').astype(int)
    df['is_bad_weather'] = df['weather'].isin(['foggy', 'rainy']).astype(int)
    df['is_highway'] = (df['road_type'] == 'highway').astype(int)
    df['is_urban'] = (df['road_type'] == 'urban').astype(int)

    # Time-based and holiday proxies
    df['is_peak_time'] = df['time_of_day'].isin(['morning', 'evening']).astype(int)
    # NOTE: despite its name this is only the holiday flag cast to int; the
    # column name is kept so the feature set stays the same.
    df['is_weekend'] = df['holiday'].astype(int)

    # Safety score: road signs count double, daylight and clear weather count once
    df['safety_score'] = (
        df['road_signs_present'].astype(int) * 2 +
        (df['lighting'] == 'daylight').astype(int) +
        (df['weather'] == 'clear').astype(int)
    )

    # Danger score: number of risk conditions met (0 to 5)
    df['danger_score'] = (
        (df['curvature'] > 0.6).astype(int) +
        (df['speed_limit'] >= 60).astype(int) +
        df['is_bad_weather'] +
        df['is_night'] +
        (df['num_reported_accidents'] >= 2).astype(int)
    )

    # Ratio and intensity features (the +1 keeps the denominator non-zero for 0 lanes)
    df['accidents_per_lane'] = df['num_reported_accidents'] / (df['num_lanes'] + 1)
    df['risk_intensity'] = df['curvature'] * df['speed_limit'] / 50

    return df


# Models

In [7]:
def train_models(X, y, X_test, n_folds=5):
    """Cross-validate four boosted-tree regressors and collect their predictions.

    The continuous target is cut into (up to) ten quantile bins so that
    ``StratifiedKFold`` can stratify the folds on it. Each model is refit on
    every fold; its validation predictions fill an out-of-fold (OOF) vector
    and its test predictions are averaged over the folds.

    Args:
        X: training feature frame (indexed positionally with ``.iloc``).
        y: training target Series (indexed positionally with ``.iloc``).
        X_test: test feature frame passed to each fitted model's ``predict``.
        n_folds: number of cross-validation folds.

    Returns:
        Tuple ``(results, oof_predictions, test_predictions)``, each a dict
        keyed by model name:
          * ``results[name]`` holds ``'oof_score'`` (overall OOF RMSE),
            ``'fold_scores'`` (per-fold RMSE list) and ``'std'``.
          * ``oof_predictions[name]`` is an array of length ``len(X)``.
          * ``test_predictions[name]`` is the fold-averaged test prediction.
    """
    # Quantile bins of the target act as the stratification labels
    strata = pd.qcut(y, q=10, labels=False, duplicates='drop')
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    models = {
        'LightGBM': LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=7,
            num_leaves=31,
            min_child_samples=20,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            verbose=-1,
        ),
        'CatBoost': CatBoostRegressor(
            iterations=1000, learning_rate=0.05, depth=7, l2_leaf_reg=3.0,
            subsample=0.8, loss_function='RMSE', eval_metric='RMSE',
            random_seed=42, verbose=0,
        ),
        'GradientBoosting': GradientBoostingRegressor(
            n_estimators=1000, learning_rate=0.05, max_depth=7, subsample=0.8,
            min_samples_split=20, min_samples_leaf=10, max_features='sqrt',
            random_state=42, verbose=0,
        ),
        'XGBoost': XGBRegressor(
            n_estimators=1000, learning_rate=0.05, max_depth=7, subsample=0.8,
            colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=0.1,
            random_state=42, verbosity=0,
        ),
    }

    # Per-model outputs, keyed by model name
    results, oof_predictions, test_predictions = {}, {}, {}

    for name, model in models.items():
        print(f"\n{'='*60}\nTraining {name}\n{'='*60}")

        oof = np.zeros(len(X))
        test_avg = np.zeros(len(X_test))
        fold_scores = []

        for fold, (fit_idx, hold_idx) in enumerate(splitter.split(X, strata), start=1):
            X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
            X_hold, y_hold = X.iloc[hold_idx], y.iloc[hold_idx]

            # Refit on this fold's training part
            model.fit(X_fit, y_fit)

            # Held-out rows go into the OOF vector; test predictions are
            # accumulated as a running mean over the folds
            oof[hold_idx] = model.predict(X_hold)
            test_avg += model.predict(X_test) / n_folds

            # Fold-level RMSE
            fold_rmse = np.sqrt(mean_squared_error(y_hold, oof[hold_idx]))
            fold_scores.append(fold_rmse)
            print(f"Fold {fold}: RMSE = {fold_rmse:.6f}")

        # RMSE over all out-of-fold predictions
        oof_rmse = np.sqrt(mean_squared_error(y, oof))
        oof_predictions[name] = oof
        test_predictions[name] = test_avg
        results[name] = {
            'oof_score': oof_rmse,
            'fold_scores': fold_scores,
            'std': np.std(fold_scores),
        }

        print(f"Overall OOF RMSE: {oof_rmse:.6f} (+/- {np.std(fold_scores):.6f})")

    return results, oof_predictions, test_predictions


# Ensemble

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def create_ensemble(results, oof_predictions, test_predictions, y, X, X_test):
    """Blend base-model predictions with a linear stacking meta-learner.

    A linear regression is fitted on the base models' out-of-fold (OOF)
    predictions against the continuous target and then applied to their test
    predictions. (A logistic regression cannot be used here: it is a
    classifier and rejects a continuous target.)

    NOTE: a later cell in this notebook defines another ``create_ensemble``
    with the same signature; once that cell runs, it shadows this function.

    Args:
        results: dict keyed by model name. Each value is either a dict with an
            ``'oof_score'`` entry or a bare numeric RMSE.
        oof_predictions: dict of model name -> OOF prediction array.
        test_predictions: dict of model name -> test prediction array.
        y: training target aligned with the OOF predictions.
        X: unused; kept for signature compatibility.
        X_test: unused; kept for signature compatibility.

    Returns:
        Tuple ``(ensemble_test_pred, ensemble_rmse, results)`` where
        ``results`` is the mapping that was passed in, unchanged.
    """
    # Function-scope import: the notebook's import cells do not provide LinearRegression
    from sklearn.linear_model import LinearRegression

    # Base-model predictions become the meta-learner's features
    oof_preds_df = pd.DataFrame(oof_predictions)
    # Align test columns with the OOF columns so both frames share one feature order
    test_preds_df = pd.DataFrame(test_predictions)[oof_preds_df.columns]

    # Regression meta-learner for the continuous target
    meta_learner = LinearRegression()
    meta_learner.fit(oof_preds_df, y)

    ensemble_oof_pred = meta_learner.predict(oof_preds_df)
    ensemble_test_pred = meta_learner.predict(test_preds_df)

    # The meta-learner is scored on the same rows it was fitted on, so this
    # RMSE is somewhat optimistic compared with a nested cross-validation.
    ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof_pred))

    # Accept both {name: {'oof_score': ...}} and {name: rmse} result layouts
    single_scores = {
        name: (entry['oof_score'] if isinstance(entry, dict) else entry)
        for name, entry in results.items()
    }
    best_single_model = min(single_scores, key=single_scores.get)
    best_single_rmse = single_scores[best_single_model]

    improvement = ((best_single_rmse - ensemble_rmse) / best_single_rmse) * 100

    print(f"\nStacking Ensemble Results\n")
    print(f"Best Single Model RMSE: {best_single_rmse:.6f} ({best_single_model})")
    print(f"Stacking Ensemble OOF RMSE: {ensemble_rmse:.6f}")
    print(f"Improvement over best single model: {improvement:.2f}%")

    return ensemble_test_pred, ensemble_rmse, results


# Weighted Ensemble and Main Function

In [9]:
def create_ensemble(results, oof_predictions, test_predictions, y, X, X_test):
    """Blend the base models with an inverse-RMSE weighted average.

    Each model's weight is ``1 / oof_score`` normalised so the weights sum to
    one. The same weights are applied to the out-of-fold (OOF) predictions and
    to the test predictions.

    Args:
        results: dict of model name -> dict containing an ``'oof_score'`` entry.
        oof_predictions: dict of model name -> OOF prediction array.
        test_predictions: dict of model name -> test prediction array.
        y: training target used to score the blended OOF predictions.
        X: training features; only ``len(X)`` is used.
        X_test: test features; only ``len(X_test)`` is used.

    Returns:
        Tuple ``(ensemble_test, ensemble_rmse, results_df)`` where
        ``results_df`` has one row per model sorted by ascending ``oof_score``.
    """
    # One row per model, lowest OOF RMSE first
    results_df = pd.DataFrame(results).T.sort_values('oof_score')

    # Inverse-RMSE weights, normalised to sum to 1
    inverse_rmse = 1 / results_df['oof_score'].values
    weights = inverse_rmse / inverse_rmse.sum()
    weighted_models = list(zip(results_df.index, weights))

    # Report the weight given to each model
    print(f"\n\nEnsemble Weights\n")
    for model, weight in weighted_models:
        print(f"{model}: {weight:.4f} ({weight*100:.2f}%)")

    # Accumulate the weighted average of OOF and test predictions
    ensemble_oof = np.zeros(len(X))
    ensemble_test = np.zeros(len(X_test))
    for model, weight in weighted_models:
        ensemble_oof += oof_predictions[model] * weight
        ensemble_test += test_predictions[model] * weight

    # Score the blend and compare it with the best single model (first row)
    best_rmse = results_df['oof_score'].iloc[0]
    ensemble_rmse = np.sqrt(mean_squared_error(y, ensemble_oof))
    improvement = (best_rmse - ensemble_rmse) / best_rmse * 100

    print(f"\nEnsemble Results\n")
    print(f"Best Single Model RMSE: {results_df['oof_score'].iloc[0]:.6f} ({results_df.index[0]})")
    print(f"Ensemble OOF RMSE: {ensemble_rmse:.6f}")
    print(f"Improvement over best single model: {improvement:.2f}%")

    return ensemble_test, ensemble_rmse, results_df

In [10]:
def main():
    """Run the whole pipeline and write ``submission.csv``.

    Steps: engineer features for the notebook-level ``train`` and ``test``
    frames, label-encode categorical columns, cast booleans to int,
    standardise numeric columns, cross-validate the base models, blend them
    with ``create_ensemble`` and save the clipped predictions.

    Returns:
        Tuple ``(results_df, ensemble_rmse)`` as returned by ``create_ensemble``.
    """
    # Feature engineering (train first, then test)
    train_fe, test_fe = create_features(train), create_features(test)

    print(f"\nFeatures created successfully!")
    print(f"Train shape after FE: {train_fe.shape}")
    print(f"Test shape after FE: {test_fe.shape}")

    # Split target from features; the id column is not a feature
    y = train_fe['accident_risk']
    X = train_fe.drop(columns=['id', 'accident_risk'])
    X_test = test_fe.drop(columns=['id'])

    # Label-encode string / categorical columns; each encoder is fitted on train only
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col].astype(str))
        X_test[col] = encoder.transform(X_test[col].astype(str))

    print(f"\nEncoded {len(categorical_cols)} categorical columns.")

    # Booleans -> 0/1 integers, in both frames
    bool_cols = X.select_dtypes(include=['bool']).columns
    for frame in (X, X_test):
        frame[bool_cols] = frame[bool_cols].astype(int)

    print(f"Converted {len(bool_cols)} boolean columns to integers.")

    # Standardise every int64/float64 column using statistics from train only
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
    standardizer = StandardScaler()
    X[numeric_cols] = standardizer.fit_transform(X[numeric_cols])
    X_test[numeric_cols] = standardizer.transform(X_test[numeric_cols])

    print(f"Applied StandardScaler to {len(numeric_cols)} numeric columns.")
    print(f"\nFinal feature count: {X.shape[1]}")

    # Cross-validated base models
    results, oof_predictions, test_predictions = train_models(X, y, X_test)

    # Blend the base models
    ensemble_test, ensemble_rmse, results_df = create_ensemble(
        results, oof_predictions, test_predictions, y, X, X_test
    )

    # Build the submission on a copy of the sample file, clipped to [0, 1]
    submission = sample_submission.assign(accident_risk=ensemble_test)
    submission['accident_risk'] = submission['accident_risk'].clip(lower=0, upper=1)

    # Write the submission file
    submission.to_csv('submission.csv', index=False)

    print(f"\nSubmission Summary\n")
    print(f"Best Single Model: {results_df.index[0]} (RMSE: {results_df['oof_score'].iloc[0]:.6f})")
    print(f"Ensemble RMSE: {ensemble_rmse:.6f}")
    print(f"\nPrediction Statistics:")
    print(f"  Min: {submission['accident_risk'].min():.6f}")
    print(f"  Max: {submission['accident_risk'].max():.6f}")
    print(f"  Mean: {submission['accident_risk'].mean():.6f}")
    print(f"  Median: {submission['accident_risk'].median():.6f}")
    print(f"\nSubmission saved to 'submission.csv'")

    return results_df, ensemble_rmse


In [11]:
# Entry point: run the full pipeline. main() returns (results_df, ensemble_rmse),
# which is discarded here; the function prints its progress and writes submission.csv.
if __name__ == "__main__":
    main()


Features created successfully!
Train shape after FE: (517754, 36)
Test shape after FE: (172585, 35)

Encoded 6 categorical columns.
Converted 4 boolean columns to integers.
Applied StandardScaler to 34 numeric columns.

Final feature count: 34

Training LightGBM
Fold 1: RMSE = 0.056196
Fold 2: RMSE = 0.056032
Fold 3: RMSE = 0.056069
Fold 4: RMSE = 0.056287
Fold 5: RMSE = 0.056158
Overall OOF RMSE: 0.056148 (+/- 0.000091)

Training CatBoost
Fold 1: RMSE = 0.056176
Fold 2: RMSE = 0.056001
Fold 3: RMSE = 0.056044
Fold 4: RMSE = 0.056236
Fold 5: RMSE = 0.056161
Overall OOF RMSE: 0.056124 (+/- 0.000087)

Training GradientBoosting
Fold 1: RMSE = 0.056261
Fold 2: RMSE = 0.056078
Fold 3: RMSE = 0.056146
Fold 4: RMSE = 0.056323
Fold 5: RMSE = 0.056227
Overall OOF RMSE: 0.056207 (+/- 0.000086)

Training XGBoost
Fold 1: RMSE = 0.056306
Fold 2: RMSE = 0.056145
Fold 3: RMSE = 0.056193
Fold 4: RMSE = 0.056422
Fold 5: RMSE = 0.056276
Overall OOF RMSE: 0.056269 (+/- 0.000096)


Ensemble Weights

CatB