Welcome to the 2025 Kaggle Playground Series! We plan to continue in the spirit of previous playgrounds, providing interesting and approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

Your Goal: Predict the likelihood of accidents on different types of roads.

For this Playground Series challenge, we have teamed up with Stack Overflow to give you a two-part challenge. The Stack Overflow Challenge is the second part and builds upon this one by having participants develop a web application. We encourage you to check out the Stack Overflow Challenge!

If you complete both challenges, we’ll recognize your breadth of skills with a special “Code Scientist” badge which will appear on both Kaggle and Stack Overflow.

# Dataset Description

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Simulated Roads Accident dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.

# Ensamble

In [1]:
# Retrain XGBoost and KNN with Best Parameters Found
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd 
import numpy as np


train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Prepare features and target using processed data
X = train.drop(['id', 'accident_risk'], axis=1)
y = train['accident_risk']
X_test_final = test.drop(['id'], axis=1)

print(f"Training data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Test data shape: {X_test_final.shape}")

# Apply encoding to all data
print("\nApplying label encoding...")
X_encoded = X.copy()
X_test_encoded = X_test_final.copy()

# Use the same label encoders from previous training
# cols = ['num_lanes', 'num_reported_accidents']
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cols)
#     ],
#     remainder='passthrough'
# )
# # Fit e transform
# X_encoded = preprocessor.fit_transform(X_encoded)
# X_test_encoded = preprocessor.transform(X_test_encoded)

# feature_names = preprocessor.get_feature_names_out()
# feature_names = [x.split("__")[-1] for x in feature_names]
# X_encoded = pd.DataFrame(X_encoded, columns=feature_names)
# X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names)

categorical_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    X_test_encoded[col] = le.transform(X_test_encoded[col].astype(str))

# X_encoded['speed_limit'] = X_encoded['speed_limit'].astype(int)
# X_encoded['curvature'] = X_encoded['curvature'].astype(float)

# X_test_encoded['speed_limit'] = X_test_encoded['speed_limit'].astype(int)
# X_test_encoded['curvature'] = X_test_encoded['curvature'].astype(float)

print(f"Encoded training data shape: {X_encoded.shape}")
print(f"Encoded test data shape: {X_test_encoded.shape}")

# Split data for validation
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

print(f"\nFinal training set shape: {X_train_final.shape}")
print(f"Final validation set shape: {X_val_final.shape}")

# Train XGBoost with best parameters
print("\n1. Training XGBoost with Best Parameters")
print("-" * 40)

# Best XGBoost parameters from previous training
best_xgb_params = {
    'device': 'cuda',
    'random_state': 42,
    'enable_categorical': True,
    'max_depth': 7,
    'learning_rate': 0.01,
    'n_estimators': 1000
}

print(f"XGBoost parameters: {best_xgb_params}")

# Train XGBoost
xgb_final = XGBRegressor(**best_xgb_params)
print("Training XGBoost...")
xgb_final.fit(X_train_final, y_train_final)

# Make predictions on validation set
xgb_val_pred = xgb_final.predict(X_val_final)

# Calculate metrics
xgb_final_mse = mean_squared_error(y_val_final, xgb_val_pred)
xgb_final_mae = mean_absolute_error(y_val_final, xgb_val_pred)
xgb_final_r2 = r2_score(y_val_final, xgb_val_pred)
xgb_final_rmse = np.sqrt(xgb_final_mse)

print(f"\nXGBoost Final Performance:")
print(f"MSE: {xgb_final_mse:.6f}")
print(f"RMSE: {xgb_final_rmse:.6f}")
print(f"MAE: {xgb_final_mae:.6f}")
print(f"R²: {xgb_final_r2:.6f}")

# Train KNN with best parameters
print("\n2. Training KNN with Best Parameters")
print("-" * 40)

# Best KNN parameters from previous training
best_knn_params = {'n_neighbors': 20}

print(f"KNN parameters: {best_knn_params}")

# Create KNN pipeline with StandardScaler
knn_pipeline_final = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(**best_knn_params))
])

print("Training KNN...")
knn_pipeline_final.fit(X_train_final, y_train_final)

# Make predictions on validation set
knn_val_pred = knn_pipeline_final.predict(X_val_final)

# Calculate metrics
knn_final_mse = mean_squared_error(y_val_final, knn_val_pred)
knn_final_mae = mean_absolute_error(y_val_final, knn_val_pred)
knn_final_r2 = r2_score(y_val_final, knn_val_pred)
knn_final_rmse = np.sqrt(knn_final_mse)

print(f"\nKNN Final Performance:")
print(f"MSE: {knn_final_mse:.6f}")
print(f"RMSE: {knn_final_rmse:.6f}")
print(f"MAE: {knn_final_mae:.6f}")
print(f"R²: {knn_final_r2:.6f}")

# xgb_final = xgb_final.fit(X_encoded, y)
# knn_pipeline_final = knn_pipeline_final.fit(X_encoded, y)

# xgb_val_pred = xgb_final.predict(X_test_encoded)
# knn_val_pred = knn_pipeline_final.predict(X_test_encoded)

# Store final models and results
final_models = {
    'XGBoost': {
        'model': xgb_final,
        'mse': xgb_final_mse,
        'rmse': xgb_final_rmse,
        'mae': xgb_final_mae,
        'r2': xgb_final_r2,
        'predictions': xgb_val_pred
    },
    'KNN': {
        'model': knn_pipeline_final,
        'mse': knn_final_mse,
        'rmse': knn_final_rmse,
        'mae': knn_final_mae,
        'r2': knn_final_r2,
        'predictions': knn_val_pred
    }
}

# Compare final models
print(f"\nFinal Model Comparison:")
print("=" * 30)
print(f"{'Model':<10} {'RMSE':<12} {'R²':<12}")
print("-" * 35)
print(f"{'XGBoost':<10} {xgb_final_rmse:<12.6f} {xgb_final_r2:<12.6f}")
print(f"{'KNN':<10} {knn_final_rmse:<12.6f} {knn_final_r2:<12.6f}")

# Determine best model
best_model_name = 'XGBoost' if xgb_final_rmse < knn_final_rmse else 'KNN'
best_model_rmse = min(xgb_final_rmse, knn_final_rmse)

print(f"\nBest model: {best_model_name}")
print(f"Best RMSE: {best_model_rmse:.6f}")

print(f"\nModels trained successfully with best parameters!")


Training data shape: (517754, 12)
Target shape: (517754,)
Test data shape: (172585, 12)

Applying label encoding...
Encoded training data shape: (517754, 12)
Encoded test data shape: (172585, 12)

Final training set shape: (414203, 12)
Final validation set shape: (103551, 12)

1. Training XGBoost with Best Parameters
----------------------------------------
XGBoost parameters: {'device': 'cuda', 'random_state': 42, 'enable_categorical': True, 'max_depth': 7, 'learning_rate': 0.01, 'n_estimators': 1000}
Training XGBoost...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



XGBoost Final Performance:
MSE: 0.003156
RMSE: 0.056178
MAE: 0.043590
R²: 0.885703

2. Training KNN with Best Parameters
----------------------------------------
KNN parameters: {'n_neighbors': 20}
Training KNN...

KNN Final Performance:
MSE: 0.004129
RMSE: 0.064256
MAE: 0.050093
R²: 0.850473

Final Model Comparison:
Model      RMSE         R²          
-----------------------------------
XGBoost    0.056178     0.885703    
KNN        0.064256     0.850473    

Best model: XGBoost
Best RMSE: 0.056178

Models trained successfully with best parameters!


In [2]:
# Ensemble Methods: Combining KNN and XGBoost
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

print("Implementing Ensemble Methods for KNN + XGBoost...")
print("=" * 60)

# Method 1: Weighted Average Ensemble
print("\n1. Weighted Average Ensemble")
print("-" * 30)

# Calculate weights based on RMSE (inverse relationship)
xgb_rmse = final_models['XGBoost']['rmse']
knn_rmse = final_models['KNN']['rmse']

# Calculate weights based on RMSE (inverse relationship)
# Lower RMSE should get higher weight
xgb_weight = 1 / xgb_rmse
knn_weight = 1 / knn_rmse

# Normalize weights
total_weight = xgb_weight + knn_weight
xgb_weight = xgb_weight / total_weight
knn_weight = knn_weight / total_weight

print(f"XGBoost RMSE: {xgb_rmse:.4f} - Weight: {xgb_weight:.4f}")
print(f"KNN RMSE: {knn_rmse:.4f} - Weight: {knn_weight:.4f}")
print(f"Total weight: {xgb_weight + knn_weight:.4f}")

# Create weighted ensemble predictions on validation set
weighted_val_predictions = (xgb_weight * final_models['XGBoost']['predictions'] + 
                           knn_weight * final_models['KNN']['predictions'])

# Calculate ensemble metrics
ensemble_mse = mean_squared_error(y_val_final, weighted_val_predictions)
ensemble_mae = mean_absolute_error(y_val_final, weighted_val_predictions)
ensemble_r2 = r2_score(y_val_final, weighted_val_predictions)

print(f"\nWeighted Ensemble Validation Performance:")
print(f"MSE: {ensemble_mse:.4f}")
print(f"MAE: {ensemble_mae:.4f}")
print(f"R²: {ensemble_r2:.4f}")

# Method 2: Stacking Regressor
print("\n2. Stacking Regressor Ensemble")
print("-" * 30)

# Create base models
base_models = [
    ('xgb', final_models['XGBoost']['model']),
    ('knn', final_models['KNN']['model'])
]

# Create stacking regressor with linear regression as meta-learner
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    cv=5,
    n_jobs=-1
)

print("Training Stacking Regressor...")
stacking_regressor.fit(X_train_final, y_train_final)

# Make predictions on validation set

stacking_val_predictions = stacking_regressor.predict(X_val_final)

# Calculate stacking metrics
stacking_mse = mean_squared_error(y_val_final, stacking_val_predictions)
stacking_mae = mean_absolute_error(y_val_final, stacking_val_predictions)
stacking_r2 = r2_score(y_val_final, stacking_val_predictions)

print(f"\nStacking Regressor Validation Performance:")
print(f"MSE: {stacking_mse:.4f}")
print(f"MAE: {stacking_mae:.4f}")
print(f"R²: {stacking_r2:.4f}")

# Store ensemble results
ensemble_results = {
    'weighted_average': {
        'model': None,  # No actual model, just weights
        'weights': {'xgb': xgb_weight, 'knn': knn_weight},
        'mse': ensemble_mse,
        'mae': ensemble_mae,
        'r2': ensemble_r2,
        'predictions': weighted_val_predictions
    },
    'stacking': {
        'model': stacking_regressor,
        'mse': stacking_mse,
        'mae': stacking_mae,
        'r2': stacking_r2,
        'predictions': stacking_val_predictions
    }
}

print(f"\nEnsemble implementation completed successfully!")


Implementing Ensemble Methods for KNN + XGBoost...

1. Weighted Average Ensemble
------------------------------
XGBoost RMSE: 0.0562 - Weight: 0.5335
KNN RMSE: 0.0643 - Weight: 0.4665
Total weight: 1.0000

Weighted Ensemble Validation Performance:
MSE: 0.0034
MAE: 0.0452
R²: 0.8780

2. Stacking Regressor Ensemble
------------------------------
Training Stacking Regressor...

Stacking Regressor Validation Performance:
MSE: 0.0032
MAE: 0.0436
R²: 0.8857

Ensemble implementation completed successfully!


## Predict with full data

In [3]:


xgb_final = xgb_final.fit(X_encoded, y)
knn_pipeline_final = knn_pipeline_final.fit(X_encoded, y)

xgb_val_pred = xgb_final.predict(X_test_encoded)
knn_val_pred = knn_pipeline_final.predict(X_test_encoded)


# Method 1: Weighted Average Ensemble
xgb_rmse = final_models['XGBoost']['rmse']
knn_rmse = final_models['KNN']['rmse']

# Calculate weights based on RMSE (inverse relationship)
# Lower RMSE should get higher weight
xgb_weight = 1 / xgb_rmse
knn_weight = 1 / knn_rmse

# Normalize weights
total_weight = xgb_weight + knn_weight
xgb_weight = xgb_weight / total_weight
knn_weight = knn_weight / total_weight

print(f"XGBoost RMSE: {xgb_rmse:.4f} - Weight: {xgb_weight:.4f}")
print(f"KNN RMSE: {knn_rmse:.4f} - Weight: {knn_weight:.4f}")
print(f"Total weight: {xgb_weight + knn_weight:.4f}")

# Create weighted ensemble predictions on validation set
weighted_val_predictions = (xgb_weight * xgb_val_pred + 
                           knn_weight * knn_val_pred)


# Method 2: Stacking Regressor
print("\n2. Stacking Regressor Ensemble")
print("-" * 30)

# Create base models
base_models = [
    ('xgb', xgb_final),
    ('knn', knn_pipeline_final)
]

# Create stacking regressor with linear regression as meta-learner
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    cv=5,
    n_jobs=-1
)

print("Training Stacking Regressor...")
stacking_regressor.fit(X_encoded, y)

# Make predictions on validation set

stacking_val_predictions = stacking_regressor.predict(X_test_encoded)



XGBoost RMSE: 0.0562 - Weight: 0.5335
KNN RMSE: 0.0643 - Weight: 0.4665
Total weight: 1.0000

2. Stacking Regressor Ensemble
------------------------------
Training Stacking Regressor...


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Generate Final Predictions and Create Submission Files
print("Generating Final Predictions and Submissions")
print("=" * 50)

stacking_submission_df = pd.DataFrame({
    'id': test['id'],
    'accident_risk': stacking_val_predictions
})

weighted_submission_df = pd.DataFrame({
    'id': test['id'],
    'accident_risk': weighted_val_predictions
})

stacking_submission_df.to_csv('stacking_submission_df.csv', index=False)
print("Stacking submission saved as 'stacking_submission_df.csv'")

weighted_submission_df.to_csv('weighted_submission_df.csv', index=False)
print("Weighted submission saved as 'weighted_submission_df.csv'")

# Statistical comparison of test predictions
print(f"\nTest Predictions Statistics:")
print("=" * 40)
print(f"{'Model':<10} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12}")
print("-" * 60)
print(f"{'Stacking':<10} {stacking_val_predictions.mean():<12.6f} {stacking_val_predictions.std():<12.6f} {stacking_val_predictions.min():<12.6f} {stacking_val_predictions.max():<12.6f}")
print(f"{'Weighted':<10} {weighted_val_predictions.mean():<12.6f} {weighted_val_predictions.std():<12.6f} {weighted_val_predictions.min():<12.6f} {weighted_val_predictions.max():<12.6f}")

# Correlation between predictions
prediction_correlation = np.corrcoef(stacking_val_predictions, weighted_val_predictions)[0, 1]
print(f"\nCorrelation between Stacking and Weighted test predictions: {prediction_correlation:.6f}")

# Visualize prediction distributions
plt.figure(figsize=(15, 5))

# Stacking predictions distribution
plt.subplot(1, 3, 1)
plt.hist(stacking_val_predictions, bins=50, alpha=0.7, edgecolor='black', color='blue')
plt.title('Stacking Test Predictions')
plt.xlabel('Predicted Accident Risk')
plt.ylabel('Frequency')

# Weighted predictions distribution
plt.subplot(1, 3, 2)
plt.hist(weighted_val_predictions, bins=50, alpha=0.7, edgecolor='black', color='orange')
plt.title('Weighted Test Predictions')
plt.xlabel('Predicted Accident Risk')
plt.ylabel('Frequency')

# Overlay comparison
plt.subplot(1, 3, 3)
plt.hist(stacking_val_predictions, bins=50, alpha=0.5, edgecolor='black', color='blue', label='Stacking')
plt.hist(weighted_val_predictions, bins=50, alpha=0.5, edgecolor='black', color='orange', label='Weighted')
plt.title('Prediction Distributions Comparison')
plt.xlabel('Predicted Accident Risk')
plt.ylabel('Frequency')
plt.legend()

plt.tight_layout()
plt.show()

Generating Final Predictions and Submissions
Stacking submission saved as 'stacking_submission_df.csv'
Weighted submission saved as 'weighted_submission_df.csv'

Test Predictions Statistics:
Model      Mean         Std          Min          Max         
------------------------------------------------------------
Stacking   0.351666     0.156860     -0.000155    0.880733    
Weighted   0.350851     0.149560     0.032875     0.854899    

Correlation between Stacking and Weighted test predictions: 0.996827


NameError: name 'plt' is not defined