In [None]:
# ============================================================
# Notebook setup
# ============================================================
%load_ext autoreload
%autoreload 2
from util import util
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

# ============================================================
# WP1: Data & Baseline (Student A)
# ============================================================

# 1. Load Data
# The competition file is expected to be on disk already (see the
# 'Methods and Tools' setup).
data_path = '../data/playground-series-s5e10/train.csv'
data = pd.read_csv(data_path)

# The regression target is the numerical 'accident_risk' column.
# 'id' is removed together with the target so neither ends up in X.
target_col = 'accident_risk'
y = data[target_col]
X = data.drop(columns=['id', target_col])

# 2. Preprocessing
# From "Biomedical Data Analysis": Handling mixed types
# Columns are partitioned by dtype: object/bool columns are one-hot encoded,
# numeric columns are standardized further below.
cat_cols = X.select_dtypes(include=['object', 'bool']).columns  # bool fits here too
num_cols = X.select_dtypes(include=['number']).columns

# One-hot encoding for categoricals (urban/rural, rainy/clear).
# drop_first=True drops one level per encoded column.
X_encoded = pd.get_dummies(X, drop_first=True, columns=cat_cols)

# Split Training and Validation
# From "Anomaly Detection": Standard split to validate generalization
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

# Scaling
# From "Non-Linear Models" (Slide: NNs and Standardization):
# Lasso requires standardized features to penalize weights fairly.
# The scaler is fitted on the training split only and then applied to both
# splits. Only the numeric columns are standardized; the one-hot columns
# are left as produced by get_dummies.
scaler = StandardScaler().fit(X_train[num_cols])
X_train_s, X_val_s = X_train.copy(), X_val.copy()
X_train_s[num_cols] = scaler.transform(X_train[num_cols])
X_val_s[num_cols] = scaler.transform(X_val[num_cols])

# 3. Lasso Baseline
# From "Non-Linear Models" (Slide: Lasso):
# "The Lasso weights are sparse, i.e. only a few attributes will have impact"
# The baseline is trained on the standardized copy of the training split.
print("Training Lasso Baseline...")
lasso = Lasso(alpha=0.001).fit(X_train_s, y_train)  # Low alpha since we have few features

# Evaluation
# RMSE on the (standardized) validation split.
pred_lasso = lasso.predict(X_val_s)
rmse_lasso = np.sqrt(mean_squared_error(y_val, pred_lasso))
print(f"Baseline Lasso RMSE: {rmse_lasso:.4f}")

# From "Non-Linear Models": Inspecting weights to find correlates
# Label each coefficient with its feature name and plot only the weights
# that Lasso did not shrink to exactly zero, sorted by value.
coeffs = pd.Series(lasso.coef_, index=X_train.columns)
coeffs.loc[coeffs.abs() > 0].sort_values().plot.barh(figsize=(10, 6))
plt.title("Lasso Coefficients (Linear Drivers of Risk)")
plt.show()

# ============================================================
# WP2: Advanced Modeling (Student B)
# ============================================================

# 1. XGBoost Regressor
# From "Non-Linear Models" (Slide: Gradient Boosted Trees Model):
# We use GBT to capture non-linearities (e.g. speed_limit * curvature interactions)
# The model is fitted on the unscaled splits (X_train / X_val), not on the
# standardized copies used for the Lasso baseline.
print("\nTraining XGBoost...")
model = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation RMSE, computed the same way as for the baseline.
pred_xgb = model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, pred_xgb))
print(f"XGBoost RMSE: {rmse_xgb:.4f}")

# ============================================================
# WP3: Explainability & Business Evaluation (Student C)
# ============================================================

# 1. SHAP Analysis
# From "Additive Feature Attribution"
# Attribute the XGBoost predictions on the validation split (X_val) to the
# individual input features.
print("\nCalculating SHAP values...")
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

# Open a new figure, then draw the summary plot of the per-feature
# attributions over the validation split.
plt.figure()
shap.summary_plot(shap_values, features=X_val)

# 2. Asymmetric Cost Evaluation
# From "RUL Prediction" (Slide: Cost Model):
# "A failure costs C units more than maintenance".
# In our case: Underestimating risk (Safety Hazard) costs more than Overestimating (Caution).

def asymmetric_risk_loss(y_true, y_pred, penalty_underestimate=5.0):
    """
    Custom metric for 'AI in Industry': a mean absolute error in which
    underestimated risk is penalized more heavily than overestimated risk.

    For every sample the error ``y_pred - y_true`` is computed. Errors where
    the prediction is below the true value (underestimates) are multiplied by
    ``penalty_underestimate``; all other errors are kept as they are. The
    result is the mean of the absolute weighted errors.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.
    penalty_underestimate : float, default 5.0
        Factor applied to errors where ``y_pred < y_true``.

    Returns
    -------
    numpy.float64
        Mean of the weighted absolute errors.
    """
    # Coerce both inputs to plain float arrays. This makes Python lists work
    # and guarantees a position-by-position comparison: two pandas Series
    # with different index labels would otherwise be aligned on their labels
    # by the subtraction and silently produce NaN.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    diff = y_pred - y_true
    # Where diff < 0 (Underestimate), multiply error by penalty
    # Where diff >= 0 (exact or Overestimate), keep error as is
    weighted_diff = np.where(diff < 0, diff * penalty_underestimate, diff)
    return np.mean(np.abs(weighted_diff))

# Compare models using this "Industrial" metric
# Both models are scored on the same validation targets with the default
# underestimation penalty.
cost_lasso, cost_xgb = (
    asymmetric_risk_loss(y_val, preds) for preds in (pred_lasso, pred_xgb)
)

print("\n--- Industrial Evaluation (Safety Weighted Cost) ---")
print(f"Lasso Cost: {cost_lasso:.4f}")
print(f"XGBoost Cost: {cost_xgb:.4f}")
# Relative cost reduction of XGBoost with respect to the Lasso baseline, in percent.
print(f"XGBoost Improvement: {((cost_lasso - cost_xgb)/cost_lasso)*100:.1f}%")