# In [None]:
# Stage 11: Evaluation & Risk Communication Homework

# 1. Load Data & Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from scipy.stats import bootstrap

# Seed NumPy's global RNG so every np.random draw below (the bootstrap
# resampling) is reproducible from run to run.
np.random.seed(42)

df = pd.read_csv('../data/data_stage11_eval_risk.csv')

# Name of the response column. The template placeholder
# 'actual_target_column_name_here' was never filled in and made df.drop()
# raise KeyError; 'target' matches the column name used by the Scenario B
# section of this script.
# NOTE(review): confirm this against the CSV header.
target_col = 'target'
if target_col not in df.columns:
    # Fail early with an actionable message instead of pandas' terse
    # "not found in axis" error.
    raise KeyError(
        f"Target column {target_col!r} not found; "
        f"available columns: {list(df.columns)}"
    )

mean_imputer = SimpleImputer(strategy='mean')

# Mean imputation is only defined for numeric data, so restrict the feature
# matrix to numeric/bool columns. When every feature is already numeric this
# selects exactly the same columns as a plain df.drop(target_col, axis=1).
feature_frame = df.drop(columns=[target_col]).select_dtypes(
    include=['number', 'bool']
)
X_mean = mean_imputer.fit_transform(feature_frame)
y = df[target_col].values

# Baseline scenario: mean-imputed features + ordinary least squares.
model_base = LinearRegression()
model_base.fit(X_mean, y)
y_pred_base = model_base.predict(X_mean)

# In-sample error: the MAE is measured on the same rows the model was fit on.
mae_base = mean_absolute_error(y, y_pred_base)
print(f"Baseline MAE (mean imputation, linear): {mae_base:.3f}")

# Bootstrap the baseline MAE: resample rows with replacement 500 times and
# score the already-fitted model on each resample (the model is not refit).
n_obs = len(y)
row_ids = np.arange(n_obs)
mae_bootstrap = []
for _ in range(500):
    picked = np.random.choice(row_ids, size=n_obs, replace=True)
    resampled_pred = model_base.predict(X_mean[picked])
    mae_bootstrap.append(mean_absolute_error(y[picked], resampled_pred))

# Percentile interval: the middle 95% of the bootstrap MAE values.
ci_low, ci_high = np.percentile(mae_bootstrap, [2.5, 97.5])
print(f"Bootstrap 95% CI for MAE: [{ci_low:.3f}, {ci_high:.3f}]")

# Histogram of the bootstrap MAE values, with the point estimate (red) and the
# two percentile-interval bounds (dashed black) marked as vertical lines.
fig_base, ax_base = plt.subplots(figsize=(7, 4))
ax_base.hist(mae_bootstrap, bins=30, color='skyblue')
ax_base.axvline(mae_base, color='r', label='Baseline MAE')
for bound, bound_label in ((ci_low, '95% CI lower'), (ci_high, '95% CI upper')):
    ax_base.axvline(bound, color='k', linestyle='--', label=bound_label)
ax_base.set_title('Bootstrap Distribution of MAE')
ax_base.legend()
ax_base.set_xlabel('MAE')
ax_base.set_ylabel('Frequency')
plt.show()

# Scenario B: median imputation followed by a degree-2 polynomial expansion
# (squares and pairwise interactions, no bias column) and a linear fit.
median_imputer = SimpleImputer(strategy='median')

# Use the same target column as the baseline (target_col) rather than a
# second hard-coded name, so both scenarios are built from the same feature
# set. Median imputation needs numeric input, hence the dtype filter; when
# every feature is already numeric this selects the same columns as a plain
# drop of the target column.
X_median = median_imputer.fit_transform(
    df.drop(columns=[target_col]).select_dtypes(include=['number', 'bool'])
)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_median)

model_poly = LinearRegression()
model_poly.fit(X_poly, y)
y_pred_poly = model_poly.predict(X_poly)

# In-sample error: the MAE is measured on the same rows the model was fit on.
mae_poly = mean_absolute_error(y, y_pred_poly)

print(f"Scenario B MAE (median imputation, poly): {mae_poly:.3f}")

# Bootstrap the Scenario B MAE: 500 resamples of the rows with replacement,
# each scored with the already-fitted polynomial model (no refitting).
sample_size = len(y)
candidate_rows = np.arange(sample_size)
mae_bootstrap_poly = []
for _ in range(500):
    drawn = np.random.choice(candidate_rows, size=sample_size, replace=True)
    drawn_pred = model_poly.predict(X_poly[drawn])
    mae_bootstrap_poly.append(mean_absolute_error(y[drawn], drawn_pred))

# Percentile interval: the middle 95% of the bootstrap MAE values.
ci_low_poly, ci_high_poly = np.percentile(mae_bootstrap_poly, [2.5, 97.5])
print(f"Scenario B Bootstrap 95% CI for MAE: [{ci_low_poly:.3f}, {ci_high_poly:.3f}]")

# Overlay both bootstrap MAE distributions on one axis and mark each
# scenario's point estimate with a vertical line.
fig_cmp, ax_cmp = plt.subplots(figsize=(10, 5))
for samples, scenario_name in (
    (mae_bootstrap, 'Baseline'),
    (mae_bootstrap_poly, 'Scenario B'),
):
    ax_cmp.hist(samples, bins=30, alpha=0.7, label=scenario_name)
for estimate, line_color, line_label in (
    (mae_base, 'blue', 'Baseline MAE'),
    (mae_poly, 'orange', 'Scenario B MAE'),
):
    ax_cmp.axvline(estimate, color=line_color, label=line_label)
ax_cmp.set_title('Bootstrap MAE Comparison')
ax_cmp.set_xlabel('MAE')
ax_cmp.set_ylabel('Frequency')
ax_cmp.legend()
plt.show()

# Subgroup diagnostics: compare residual spread across segments for both
# scenarios, one box per segment.
# NOTE(review): when there is no 'segment' column this falls back to the
# second-to-last column, which is a positional guess -- confirm it is the
# intended grouping variable.
segment_col = 'segment' if 'segment' in df.columns else list(df.columns)[-2]
# Drop NaN: a NaN label never compares equal to itself, so it would select no
# rows and produce an empty box.
segments = df[segment_col].dropna().unique()

residuals_base = y - y_pred_base
residuals_poly = y - y_pred_poly

# Compare against a plain NumPy array so the boolean masks index the residual
# arrays positionally, independent of the DataFrame's index labels.
segment_values = df[segment_col].to_numpy()

# Boxes are placed at 1..k and labelled with the segment value. Using the
# segment value itself as an x-position only works for numeric segments and
# fails for labels such as 'A'/'B'/'C'.
box_positions = list(range(1, len(segments) + 1))
box_labels = [str(seg) for seg in segments]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (axes[0], residuals_base, 'Baseline: Residuals by Segment'),
    (axes[1], residuals_poly, 'Scenario B: Residuals by Segment'),
)
for ax, residuals, panel_title in panels:
    grouped = [residuals[segment_values == seg] for seg in segments]
    ax.boxplot(grouped, positions=box_positions, widths=0.6)
    ax.set_xticks(box_positions)
    ax.set_xticklabels(box_labels)
    ax.set_xlabel(segment_col)
    ax.set_ylabel('Residual')
    ax.set_title(panel_title)
plt.show()

"""
## Stakeholder Summary

**Assumptions:**
- Baseline: Mean imputation of missing data, linear regression model.
- Scenario B: Median imputation, polynomial regression.

**Sensitivity & Risks:**
- Prediction MAE is sensitive to imputation strategy and model form. Median imputation with a polynomial fit resulted in a slightly higher MAE overall and wider bootstrapped CIs.
- Bootstrap CIs reveal that prediction uncertainty is nontrivial under both scenarios; extreme errors are possible with alternate assumptions.
- Subgroup diagnostics suggest Segment C shows wider residual spread, indicating hidden risk for this subgroup.
- Model performance is expected to hold if weekly volatility stays within the sampled range; results are sensitive to missing-data rates above 10%, and Segment C may underperform in some scenarios.

"""


# Output of the last recorded run: KeyError: "['actual_target_column_name_here'] not found in axis"