In [None]:
# notebooks/5_ensemble_modeling.ipynb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import joblib
from xgboost import train

# 1. Load Processed Data
# Read the processed table and turn the 'Date' column into datetimes so the
# cutoff arithmetic and the comparison below work on timestamps.
test = pd.read_csv("../data/processed/walmart_final.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Evaluation window: rows dated on or after (latest date - 3 calendar months).
# The frame is rebound to the same name, so only the window is kept from here on.
test_cutoff = test['Date'].max() - pd.DateOffset(months=3)
test = test.loc[test['Date'].ge(test_cutoff)]

# 2. Generate Missing Predictions

# A. Load XGBoost Model and Scaler
# Both artifacts are deserialized with joblib (pickle-based): only load
# files from a location you trust.
xgb_model = joblib.load('../models/xgb_optimized.pkl')
scaler = joblib.load('../models/scaler.pkl')

# Prepare features: every column except the date, the sales column and the
# store column, in the frame's existing column order.
# NOTE(review): presumably this matches the column set/order used when the
# scaler and model were fitted -- confirm against the training notebook.
is_feature = ~test.columns.isin(['Date', 'Weekly_Sales', 'Store'])
features = test.columns[is_feature].tolist()
X_test_scaled = scaler.transform(test.loc[:, features])

# Generate XGB predictions and attach them as a new column
xgb_predictions = xgb_model.predict(X_test_scaled)
test['XGB_Pred'] = xgb_predictions

# B. Generate Prophet Predictions (if available)
# Best-effort step: if Prophet is not installed, a store has no loaded model,
# or prediction fails, the 'Prophet_Pred' column is not created and the reason
# is printed instead of being silently discarded.
try:
    from prophet import Prophet  # noqa: F401  (import doubles as an availability check)
    prophet_models = {}  # Load your saved Prophet models if available

    store_ids = test['Store'].to_numpy()
    unique_stores = pd.unique(store_ids)

    # Require a model for every store up front so a partially filled column
    # is never attached to the frame.
    stores_without_model = [s for s in unique_stores if s not in prophet_models]
    if stores_without_model:
        raise LookupError(
            f"no Prophet model loaded for {len(stores_without_model)} "
            f"of {len(unique_stores)} store(s)"
        )

    # Write forecasts into a positional buffer rather than concatenating
    # per-store lists: concatenation only lines up with the frame when its
    # rows happen to be grouped by store.
    prophet_preds = np.full(len(test), np.nan)
    for store in unique_stores:
        positions = np.flatnonzero(store_ids == store)
        store_rows = test.iloc[positions]
        # Rows are passed in chronological order and written back through the
        # same ordering, so each forecast lands on the row it was made for.
        order = np.argsort(store_rows['Date'].to_numpy(), kind='stable')
        future = store_rows.iloc[order][['Date', 'Holiday_Flag', 'Temperature', 'CPI']]
        future = future.rename(columns={'Date': 'ds'})
        forecast = prophet_models[store].predict(future)
        prophet_preds[positions[order]] = forecast['yhat'].to_numpy()
    test['Prophet_Pred'] = prophet_preds
except Exception as exc:  # deliberate best-effort; Ctrl-C / SystemExit still propagate
    print("Prophet predictions skipped - model not available")
    print(f"  reason: {type(exc).__name__}: {exc}")

# C. Generate SARIMA Predictions (if available)
# Best-effort step: if statsmodels is not installed, a store has no loaded
# model, or forecasting fails, the 'SARIMA_Pred' column is not created and the
# reason is printed instead of being silently discarded.
try:
    from statsmodels.tsa.statespace.sarimax import SARIMAX  # noqa: F401  (availability check)
    sarima_models = {}  # Load your saved SARIMA models if available

    store_ids = test['Store'].to_numpy()
    unique_stores = pd.unique(store_ids)

    # Require a model for every store up front; otherwise the predictions
    # would cover only part of the frame.
    stores_without_model = [s for s in unique_stores if s not in sarima_models]
    if stores_without_model:
        raise LookupError(
            f"no SARIMA model loaded for {len(stores_without_model)} "
            f"of {len(unique_stores)} store(s)"
        )

    dates = test['Date'].to_numpy()
    sarima_preds = np.full(len(test), np.nan)
    for store in unique_stores:
        positions = np.flatnonzero(store_ids == store)
        # Forecast steps come back in time order; map step k onto the store's
        # k-th earliest row so values line up even if the frame is unsorted.
        order = np.argsort(dates[positions], kind='stable')
        # forecast(steps=n) yields the n out-of-sample steps that follow the
        # model's own fitted sample, so no separate training frame is needed
        # to compute start/end offsets.
        # NOTE(review): assumes each entry is a fitted results object whose
        # training sample ends right before this test window, with one row
        # per period per store -- confirm when wiring in the saved models.
        preds = sarima_models[store].forecast(steps=len(positions))
        sarima_preds[positions[order]] = np.asarray(preds, dtype=float)
    test['SARIMA_Pred'] = sarima_preds
except Exception as exc:  # deliberate best-effort; Ctrl-C / SystemExit still propagate
    print("SARIMA predictions skipped - model not available")
    print(f"  reason: {type(exc).__name__}: {exc}")

# 3. Verify Predictions Exist
# Display name -> prediction column. Only columns that are actually present
# on `test` are collected, as (name, Series) pairs in this fixed order.
prediction_columns = {
    'XGBoost': 'XGB_Pred',
    'Prophet': 'Prophet_Pred',
    'SARIMA': 'SARIMA_Pred',
}
available_models = [
    (label, test[column])
    for label, column in prediction_columns.items()
    if column in test.columns
]

# Nothing can be blended or scored without at least one prediction column.
if len(available_models) == 0:
    raise ValueError("No model predictions available!")

# 4. Create Weighted Ensemble
# Each available model contributes in proportion to its configured weight.
# Weights are renormalised per row over the models that actually produced a
# value, so a missing prediction is left out of the blend rather than being
# counted as a $0 forecast. A row with no prediction at all yields NaN.
weights = {'XGBoost': 0.6, 'Prophet': 0.3, 'SARIMA': 0.1}  # Adjust based on available models
# Combined weight of the available models; this is the normaliser for any row
# where every available model has a value.
total_weight = sum(weights[model[0]] for model in available_models)

pred_frame = pd.DataFrame(
    {name: np.asarray(preds, dtype=float) for name, preds in available_models},
    index=test.index,
)
model_weights = np.array([weights[name] for name in pred_frame.columns])

# Zero out the weight of any (row, model) cell that has no prediction, then
# rescale each row's remaining weights so they sum to 1.
row_weights = pred_frame.notna().to_numpy() * model_weights
with np.errstate(invalid='ignore', divide='ignore'):  # 0/0 -> NaN for all-missing rows
    row_weights = row_weights / row_weights.sum(axis=1, keepdims=True)

test['Ensemble_Pred'] = (pred_frame.fillna(0).to_numpy() * row_weights).sum(axis=1)

# 5. Evaluate
# Report mean absolute error against 'Weekly_Sales' for the ensemble first,
# then for each individual model that produced predictions.
results = [('Ensemble', test['Ensemble_Pred'])]
results.extend(available_models)
actual_sales = test['Weekly_Sales']
for name, preds in results:
    mae = mean_absolute_error(y_true=actual_sales, y_pred=preds)
    print(f"{name} MAE: ${mae:,.0f}")

# 6. Save Results
# Persist the evaluation rows together with every prediction column; the
# DataFrame index is not written.
test.to_csv("../data/processed/ensemble_results.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


Prophet predictions skipped - model not available
SARIMA predictions skipped - model not available
Ensemble MAE: $32,371
XGBoost MAE: $32,371
