# NYC Crash Risk Prediction - Model Experiments

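Compares several machine learning models for predicting crash counts, evaluated on a held-out test set and with time-series cross-validation, followed by uncertainty quantification and SHAP-based explanations for the best model.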

In [1]:
import sys
# Put ../src at the front of the module search path before any project import.
# NOTE(review): presumably this is where the project-local modules imported in
# later cells (advanced_models, validation, uncertainty, explainability) live.
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# quiet but also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Apply the seaborn-v0_8 whitegrid style to all figures created afterwards.
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Data

In [None]:
# Upper bounds on the number of rows kept from each split
# (presumably to keep notebook runtime manageable).
TRAIN_SAMPLE = 100_000
TEST_SAMPLE = 20_000


def _load_sample(csv_path, max_rows):
    """Read a processed CSV and return a seeded random sample of it.

    The 'hour' column is parsed as datetimes. At most ``max_rows`` rows are
    kept; a file with fewer rows is returned in full (in shuffled order).
    """
    frame = pd.read_csv(csv_path, parse_dates=['hour'])
    return frame.sample(n=min(max_rows, len(frame)), random_state=42)


train_df = _load_sample('../data/processed/train.csv', TRAIN_SAMPLE)
test_df = _load_sample('../data/processed/test.csv', TEST_SAMPLE)

# Model inputs, in the order every downstream cell expects them.
FEATURE_COLS = [
    # weather
    'temperature',
    'precipitation',
    'wind_speed',
    'snow_depth',
    # calendar
    'hour_of_day',
    'day_of_week',
    'month',
    'year',
    # recent accident history
    'accidents_1h_ago',
    'accidents_24h_ago',
    'rolling_mean_7d',
    # flags
    'is_holiday',
    'is_weekend',
]

# Cast the two flag columns to integers in both splits.
for frame in (train_df, test_df):
    for flag in ('is_holiday', 'is_weekend'):
        frame[flag] = frame[flag].astype(int)

# Missing feature values are replaced with 0; the target is left untouched.
X_train, y_train = train_df[FEATURE_COLS].fillna(0), train_df['accident_count']
X_test, y_test = test_df[FEATURE_COLS].fillna(0), test_df['accident_count']

print(f"Train: {len(X_train):,}, Test: {len(X_test):,}")

## 2. Train Model Suite

In [None]:
# ModelSuite and compare_feature_importance are project-local helpers
# from the advanced_models module.
from advanced_models import ModelSuite, compare_feature_importance

# Build the suite for the chosen feature list, then call train_all_models with
# both the training split and the held-out test split.
# NOTE(review): presumably this fits each model and returns a per-model
# metrics table — confirm in advanced_models.
suite = ModelSuite(FEATURE_COLS)
comparison = suite.train_all_models(X_train, y_train, X_test, y_test)

In [None]:
# Display the object returned by suite.train_all_models.
comparison

## 3. Feature Importance Comparison

In [None]:
# Build the cross-model feature-importance comparison from the trained suite.
# The plotting cell drops an 'Avg_Rank' column from this result, so it is a
# DataFrame-like table containing that column.
# NOTE(review): presumably one row per feature and one column per model — confirm.
importance_comparison = compare_feature_importance(suite)
importance_comparison

In [None]:
# Horizontal bar chart of every column except 'Avg_Rank'.
fig, ax = plt.subplots(figsize=(12, 8))
per_model_importance = importance_comparison.drop(columns='Avg_Rank')
per_model_importance.plot(kind='barh', ax=ax)
ax.set(
    xlabel='Importance (%)',
    title='Feature Importance Comparison Across Models',
)
plt.tight_layout()
# Write the PNG before show() so the saved file contains the rendered figure.
plt.savefig('../models/feature_importance_comparison.png', dpi=150)
plt.show()

## 4. Time-Series Cross-Validation

In [None]:
from validation import TimeSeriesValidator
import xgboost as xgb

# Project-local validator configured for five splits with the 'expanding' method.
# NOTE(review): presumably each fold trains on a growing window of earlier data
# and validates on the period that follows — confirm in validation.TimeSeriesValidator.
validator = TimeSeriesValidator(n_splits=5, method='expanding')

def create_xgb():
    """Return a new, unfitted XGBoost regressor with a Poisson count objective.

    This is a zero-argument factory: the function itself, not an instance, is
    what gets passed to the cross-validator, so a fresh estimator can be built
    on demand (presumably once per fold).
    """
    hyperparams = dict(
        objective='count:poisson',
        n_estimators=100,
        max_depth=7,
        learning_rate=0.1,
        random_state=42,  # fixed seed for repeatable fits
        n_jobs=-1,        # use all available cores
    )
    return xgb.XGBRegressor(**hyperparams)

print("Running Time-Series Cross-Validation...")
# Hand the validator the model factory (not a fitted model), the sampled
# training frame, its target column, the feature list and the timestamp column.
# NOTE(review): train_df was randomly sampled when it was loaded, so its rows
# are not in chronological order; presumably the validator orders by
# `time_column` before splitting — confirm in validation.TimeSeriesValidator.
cv_metrics, cv_results = validator.cross_validate(
    create_xgb,
    train_df,
    train_df['accident_count'],
    FEATURE_COLS,
    time_column='hour'
)

In [None]:
# Display the cross-validation metrics; the summary cell later reads the
# mean and standard deviation of its 'RMSE' entry.
cv_metrics

## 5. Uncertainty Quantification

In [None]:
from uncertainty import create_uncertainty_pipeline

# Ask the suite for its best model; the selection criterion is defined inside
# ModelSuite.get_best_model and is not visible here.
best_name, best_model = suite.get_best_model()
print(f"Best model: {best_name}")

# Wrap the best model with the project's uncertainty pipeline, requesting a
# calibration fraction of 0.2 and a confidence level of 0.90.
# NOTE(review): presumably 20% of the training data is held out for conformal
# calibration so that intervals target 90% coverage — confirm in
# uncertainty.create_uncertainty_pipeline.
conformal = create_uncertainty_pipeline(
    best_model, X_train, y_train,
    calibration_fraction=0.2,
    confidence_level=0.90
)

In [None]:
# Score the prediction intervals on the held-out test split and print each
# returned metric to four decimal places.
coverage_metrics = conformal.evaluate_coverage(X_test, y_test)
for metric_name, metric_value in coverage_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

## 6. SHAP Explainability

In [None]:
from explainability import SHAPExplainer

# NOTE(review): model_type='tree' is hard-coded; confirm that every model
# suite.get_best_model() can return is tree-based.
explainer = SHAPExplainer(best_model, FEATURE_COLS, model_type='tree')

# Both subsamples are seeded so the reported importances are identical on every
# run, consistent with the seeded sampling used when the data was loaded.
# Each sample is capped at the frame's length so small datasets do not raise.
explainer.fit(X_train.sample(min(500, len(X_train)), random_state=42))

importance = explainer.get_global_importance(
    X_test.sample(min(1000, len(X_test)), random_state=42)
)
importance

In [None]:
# Seed the subsample so the saved SHAP summary figure is reproducible between
# runs; the sample size is capped at the number of available test rows.
explainer.plot_summary(
    X_test.sample(min(500, len(X_test)), random_state=42),
    save_path='../models/shap_summary.png',
)

## 7. Summary

In [None]:
# Final report: best model's test metrics, time-series CV RMSE, interval
# coverage, and the five highest-ranked features from the SHAP importance table.
banner = "=" * 60
print(banner)
print("MODEL EXPERIMENTS SUMMARY")
print(banner)

best_metrics = suite.results[best_name].metrics
print(f"\nBest Model: {best_name}")
print(f"Test RMSE: {best_metrics['RMSE']:.4f}")
print(f"Test MAE: {best_metrics['MAE']:.4f}")

cv_rmse = cv_metrics['RMSE']
print(f"\nTime-Series CV RMSE: {cv_rmse.mean():.4f} (+/- {cv_rmse.std():.4f})")
print(f"Prediction Interval Coverage: {coverage_metrics['empirical_coverage']*100:.1f}%")
print("")

print("Top 5 Features:")
top_features = importance.head(5)
for feature_name, share_pct in zip(top_features['feature'], top_features['importance_pct']):
    print(f"  - {feature_name}: {share_pct:.1f}%")