In [None]:
# ============================================================
# Notebook setup
# From "Methods and Tools":
# We use autoreload to update the 'util' module without restarting the kernel
# ============================================================
%load_ext autoreload
%autoreload 2

# From "Methods and Tools":
# We organize our project using a local module
from util import util

import pandas as pd
import numpy as np
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

# ============================================================
# WP1: Data & Baseline (Student 1)
# ============================================================

# 1. Load data
# From "Methods and Tools": data loading goes through the shared util module
# so that every run reads the training file the same way.
target_col = 'Accident_Risk'
data = util.load_data('../data/playground-series-s5e10/train.csv')

# 2. Preprocessing
# Separate the label from the predictors. The 'id' column is dropped together
# with the target so that neither ends up in the feature matrix.
y = data[target_col]
X = data.drop(columns=['id', target_col])

# From "Biomedical Data Analysis":
# Record which features are numerical and which are categorical *before*
# encoding, so later steps can still address the original numeric columns by
# name once the dummy columns have been added.
# NOTE(review): only object-dtype columns are treated as categorical here;
# 'category' / string / bool columns (if any) are left as they are -- confirm
# against the dtypes that util.load_data actually returns.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['number']).columns

# Simple one-hot encoding of the categorical columns. drop_first=True removes
# the first level of every encoded column.
X = pd.get_dummies(X, drop_first=True, columns=cat_cols)

# From "Biomedical Data Analysis" / "Anomaly Detection":
# Hold out 20% of the rows as a validation set so that we measure
# generalization instead of performance on the data the models were fit on.
# stratify=y keeps the class proportions of the target the same in both
# partitions. Without it, a skewed target can leave the validation set with a
# different positive rate than the training set, which distorts both the AUC
# and the cost-based threshold search that is run on the validation data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# From "Non-Linear Models" (Slide: NNs and Standardization):
# "Normalization is important... expected far less reliable results [without it]"
# This is crucial for the Lasso baseline to converge and for coefficient
# interpretation.
# Only the originally numeric columns are standardized; the one-hot columns
# are left untouched. The scaler statistics are estimated on the training
# partition alone and then re-used on the validation partition, so no
# information from the held-out rows leaks into the preprocessing.
scaler = StandardScaler()
# Work on copies so that the unscaled X_train / X_val stay available for the
# tree-based model.
X_train_s = X_train.copy()
X_val_s = X_val.copy()
X_train_s[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val_s[num_cols] = scaler.transform(X_val[num_cols])

# 3. Lasso Baseline
# From "Non-Linear Models" (Slide: Lasso):
# "We will use L1 regularization... to sparsify the weights"
# This provides our explainable baseline (Occam's Razor from "Anomaly Detection").
# The model is a logistic regression with an L1 penalty; C=0.1 is the inverse
# regularization strength. random_state is pinned because the liblinear solver
# shuffles the data internally -- this matches the seed used for the
# train/validation split and for XGBoost, so the baseline is reproducible.
lasso = LogisticRegression(
    penalty='l1', solver='liblinear', C=0.1, random_state=42
)
lasso.fit(X_train_s, y_train)

# Score the validation rows with the probability assigned to the second class
# (column 1 of predict_proba) and report the ROC AUC.
baseline_probs = lasso.predict_proba(X_val_s)[:, 1]
print(f"Baseline AUC: {roc_auc_score(y_val, baseline_probs):.3f}")

# From "Non-Linear Models" (Slide: Important Attributes in Lasso):
# We inspect weights to find the most relevant correlates.
# The coefficients are labelled with the columns of the frame the model was
# actually fit on (X_train_s), so labels and weights cannot drift apart.
coeffs = pd.Series(lasso.coef_[0], index=X_train_s.columns)
coeffs.sort_values().plot(kind='barh', figsize=(10, 8))
plt.title("Lasso Coefficients (Feature Importance Baseline)")
plt.show()

# ============================================================
# WP2: Advanced Modeling (Student 2)
# ============================================================

# 1. XGBoost
# From "Non-Linear Models" (Slide: Gradient Boosted Trees Model):
# A non-linear model is used to "account for interactions among variables".
# Unlike the linear baseline, the trees are trained on the unscaled X_train
# (trees are scale-invariant).
xgb_params = dict(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# From "Neuro-Probabilistic Models":
# Keep the predicted probabilities (not hard class labels) for the validation
# set; the threshold optimization further down needs a continuous score.
# Column 1 of predict_proba is taken as the score.
val_scores = model.predict_proba(X_val)
val_probs = val_scores[:, 1]
xgb_auc = roc_auc_score(y_val, val_probs)
print(f"XGBoost AUC: {xgb_auc:.3f}")

# ============================================================
# WP3: Explainability & Decision (Student 3)
# ============================================================

# 1. SHAP analysis
# From "Additive Feature Attribution" (Slide: SHAP in Action):
# "The authors of the SHAP paper maintain a nice Python package... used to
# explain our non-linear model"
# A tree explainer is built around the fitted XGBoost model and SHAP values
# are computed for every row of the validation set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

# From "Additive Feature Attribution" (Slide: Beeswarm Plot):
# "We can stack multiple scatter plots to obtain a beeswarm plot"
# A fresh figure is opened first so the summary plot does not draw onto
# whatever figure happened to be current.
plt.figure()
shap.summary_plot(shap_values, features=X_val)

# 2. Optimization / Decision Policy
# From "Anomaly Detection" (Slide: Cost Model):
# "It is much better to devise a cost model... focus on the value for our customer"
# The two error types are priced asymmetrically: a missed accident is 50 times
# as expensive as an unnecessary patrol.
C_FALSE_ALARM = 1        # Cost of sending patrol when no accident happens
C_MISSED_ACCIDENT = 50   # Cost of an accident occurring without patrol

# From "Anomaly Detection" (Slide: Threshold Optimization):
# "We can now optimize the threshold over the validation set" using line search.
# util.opt_threshold is unpacked into the chosen threshold and the cost it
# achieves. NOTE(review): the search itself lives in util -- confirm there
# that it scans thresholds over val_probs and minimizes total cost.
threshold_search = util.opt_threshold(
    y_val,
    val_probs,
    C_FALSE_ALARM,
    C_MISSED_ACCIDENT,
)
best_thr, min_cost = threshold_search

print(f"Optimal Threshold: {best_thr:.3f}")
print(f"Business Cost at Optimal Threshold: {min_cost}")

# Compare with the default threshold (0.5).
# This demonstrates the "Value Generated" by using AI properly
# (Prediction + Optimization).
# NOTE(review): the threshold is tuned and evaluated on the same validation
# set, so the reported saving is presumably optimistic -- consider confirming
# it on a separate hold-out split.
default_cost = util.calculate_cost(
    y_val,
    val_probs,
    0.5,
    C_FALSE_ALARM,
    C_MISSED_ACCIDENT,
)
print(f"Business Cost at Default Threshold (0.5): {default_cost}")
print(f"Value Generated by Optimization: {default_cost - min_cost} units")