In [None]:
# Set working directory (optional during development).
# Later cells import the local `models` package and read 'data/RestaurantData.csv'
# through relative paths, so the notebook has to run from the project root.
# The root can be overridden with the PROJECT_ROOT environment variable; the
# original development path stays as the default so existing setups behave as before.
import os

PROJECT_ROOT = os.environ.get('PROJECT_ROOT', '/Users/sudishmakarki/My_project2')

if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)  # only if needed
else:
    # Do not crash on machines where the development path does not exist;
    # keep whatever directory the kernel was started in.
    print(" Project root not found, keeping current directory:", PROJECT_ROOT)
print(" Working directory:", os.getcwd())

# Standard Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from prophet.diagnostics import cross_validation, performance_metrics
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import scipy.stats as stats

# Custom Functions
from models.data_interpolation import (
    load_data,
    preprocess_data,
    split_train_test,
    generate_time_series_splits
)

from models.model_sarimax import (
    prepare_sarimax_data,
    check_stationarity,
    plot_acf_pacf,
    fit_sarimax_model,
    analyze_residual_spike,
    ljung_box_test,
    forecast_sarimax_model,
    identify_peak_hours_sarimax,
    evaluate_sarimax_metrics,
    rolling_forecast_sarimax,
    generate_future_forecast_sarimax,
    group_forecast_by_hour,
    
)

from models.model_sarimax import (
    create_exogenous_variables,
    fit_sarimax_with_exog,
    ljung_box_test_refined_sarimax,
    analyze_largest_residual_sarimax_exog,
    forecast_with_exog,
    analyze_peak_hours_exog,
    evaluate_sarimax_exog_metrics,
    rolling_forecast_sarimax_exog,
    generate_future_forecast_sarimax_exog,
    group_forecast_by_hour_sarimax_exog
    
)

In [None]:
# Custom Functions
from models.data_interpolation import (
    load_data,
    preprocess_data,
    split_train_test,
    generate_time_series_splits
)

In [None]:
# Load the restaurant dataset from a path relative to the current working
# directory, then run it through the project's preprocessing helper.
# `df` is whatever load_data returns; `df_clean` is the preprocessed result
# that the train/test split in the next cell consumes.
df = load_data('data/RestaurantData.csv')
df_clean = preprocess_data(df)

In [None]:
# Split the cleaned data into train and test partitions around 2022-01-01.
# NOTE(review): which side of the boundary the split date itself lands on is
# decided inside split_train_test — confirm there.
restaurant_train, restaurant_test = split_train_test(df_clean, split_date='2022-01-01')
# Format for SARIMAX: train_series is the target passed to the model-fitting
# cells below, test_series is used as the hold-out actuals in the plots and metrics.
train_series, test_series = prepare_sarimax_data(restaurant_train, restaurant_test)

SARIMAX with Exogenous Variables: Model Refinement 2

In [None]:
# Build hourly-indexed copies of the train/test frames; the frames produced by
# the split are left untouched because each one is copied first.
def _as_hourly_frame(frame):
    """Return a copy of ``frame`` with a datetime index conformed to hourly frequency."""
    hourly = frame.copy()
    hourly.index = pd.to_datetime(hourly.index)
    # asfreq('h') reindexes onto a regular hourly grid.
    return hourly.asfreq('h')


restaurant_subset_train = _as_hourly_frame(restaurant_train)
restaurant_subset_test = _as_hourly_frame(restaurant_test)

In [None]:
# Extract exogenous variables (hour + holidays) for train and test.
# The features are built from the hourly-resampled frames created in the
# previous cell.
# NOTE(review): train_series/test_series come from prepare_sarimax_data, while
# these frames went through asfreq('h'); the fit and forecast cells need the two
# to share the same index — confirm they line up.
exog_train, exog_test = create_exogenous_variables(restaurant_subset_train, restaurant_subset_test)
# Final expression of the cell: previews the first rows of the training features.
exog_train.head()


In [None]:
# --- Quick Test Mode: SARIMAX with Exogenous Variables (Small Subset) ---
# Flip RUN_QUICK_TEST to True to smoke-test the fit/forecast pipeline on a small
# slice before running the full fit below. It is off by default, so a
# Restart & Run All skips the block and the cell produces no output.
RUN_QUICK_TEST = False

if RUN_QUICK_TEST:
    # Slice last 2000 rows for fast testing
    small_train_series = train_series.iloc[-2000:]
    small_exog_train = exog_train.iloc[-2000:]

    # Define SARIMAX parameters (same values as the full fit below)
    order = (1, 1, 1)
    seasonal_order = (1, 1, 1, 24)

    # Fit model on small subset
    results_exog_test = fit_sarimax_with_exog(
        small_train_series,
        small_exog_train,
        order=order,
        seasonal_order=seasonal_order
    )

    # Forecast 24 steps ahead using test exogenous features
    exog_forecast_input = exog_test.iloc[:24]
    forecast_test, forecast_ci_test = forecast_with_exog(
        results_exog_test,
        exog_forecast_input,
        exog_forecast_input.index
    )

    # Preview forecast (explicit display: a bare expression inside `if` is not echoed)
    display(forecast_test.head())

In [None]:
# Fit SARIMAX model with exogenous variables on the full training series.
# `order` and `seasonal_order` are reused by the rolling-forecast cell further down.
# NOTE(review): presumably these are forwarded as the (p, d, q) and (P, D, Q, s)
# arguments of a SARIMAX model, with s=24 meaning a daily cycle on hourly data —
# confirm in models.model_sarimax.
order = (1, 1, 1)
seasonal_order = (1, 1, 1, 24)

# The fitted result is reused below for residual diagnostics, the test-set
# forecast and the future forecast.
results_exog_full = fit_sarimax_with_exog(train_series, exog_train, order=order, seasonal_order=seasonal_order)


In [None]:
# Residual diagnostics for the exogenous model: the raw residual series first,
# then its ACF and PACF correlograms (40 lags each).
residuals_exog = results_exog_full.resid
residuals_without_nan = residuals_exog.dropna()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(residuals_exog)
ax.set_title("Residuals Over Time (Exogenous Model)")
ax.set_xlabel("Time")
ax.set_ylabel("Residuals")
fig.tight_layout()
plt.show()

# Each statsmodels plotter opens its own figure; the title is then applied to
# that figure's current axes before it is shown.
for draw_correlogram, heading in (
    (plot_acf, "ACF of Residuals"),
    (plot_pacf, "PACF of Residuals"),
):
    draw_correlogram(residuals_without_nan, lags=40)
    plt.title(heading)
    plt.show()

In [None]:
# Largest-residual check: hand the exogenous model's residuals, together with
# the hourly-resampled training frame, to the project helper for inspection.
# The call is the cell's final expression, so whatever it returns (if anything)
# is echoed as the cell output.
analyze_largest_residual_sarimax_exog(residuals_exog, restaurant_subset_train)

In [None]:
# Ljung-Box test on the exogenous model's residuals.
# NOTE(review): presumably checks for autocorrelation left in the residuals;
# the lags used and what is printed or returned are decided inside the helper.
# As the cell's final expression, any return value is echoed as output.
ljung_box_test_refined_sarimax(residuals_exog)

In [None]:
# Forecasting on the Test Set with Exogenous Variables.
# Uses the full fitted model with the test-period exogenous features and the
# hourly test index. The helper returns a pair:
#   forecast_mean_exog — the forecast series plotted as 'Forecast' below
#   forecast_ci_exog   — a two-column frame whose columns are used as the edges
#                        of the confidence band in the plot below
forecast_mean_exog, forecast_ci_exog = forecast_with_exog(
    results_exog_full,
    exog_test=exog_test,
    test_index=restaurant_subset_test.index
)

In [None]:
# Overlay the training history, the hold-out actuals and the exogenous-model
# forecast, with the forecast interval drawn as a shaded band.
fig, ax = plt.subplots(figsize=(12, 6))

for series, legend_label, line_colour in (
    (train_series, 'Training Data', 'steelblue'),
    (test_series, 'Actual Test Data', 'blue'),
    (forecast_mean_exog, 'Forecast', 'red'),
):
    ax.plot(series.index, series, label=legend_label, color=line_colour)

# The first two columns of the interval frame are the edges of the band.
band_edge_a = forecast_ci_exog.iloc[:, 0]
band_edge_b = forecast_ci_exog.iloc[:, 1]
ax.fill_between(
    forecast_ci_exog.index,
    band_edge_a,
    band_edge_b,
    color='pink',
    alpha=0.3,
    label='Confidence Interval',
)

ax.set_title("SARIMAX (Exogenous) Forecast vs Actual")
ax.set_xlabel("Time")
ax.set_ylabel("CustomerCount")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Identify peak hours from the exogenous forecast (threshold ratio 0.6) and
# unpack the five values returned by the helper.
peak_analysis_exog = analyze_peak_hours_exog(forecast_mean_exog, test_series, threshold_ratio=0.6)
(
    peak_hours_exog,
    threshold_exog,
    hourly_avg_exog,
    forecast_peak_exog,
    test_peak_exog,
) = peak_analysis_exog

# Optional: show the hourly average forecast as a gradient-shaded table.
avg_forecast_column = 'Avg Forecast (yhat)'
hourly_avg_table = hourly_avg_exog.reset_index(name=avg_forecast_column)
hourly_avg_styled = hourly_avg_table.style.set_caption("SARIMAX Exog: Hourly Avg Forecast")
hourly_avg_styled = hourly_avg_styled.background_gradient(cmap='Blues', subset=[avg_forecast_column])
display(hourly_avg_styled)

In [None]:
# Evaluate the test-set forecast: the helper receives the full actual/forecast
# pair and the peak-hour actual/forecast pair; its result is rendered as output.
exog_metrics_result = evaluate_sarimax_exog_metrics(
    test_series,
    forecast_mean_exog,
    test_peak_exog,
    forecast_peak_exog,
)
display(exog_metrics_result)

In [None]:
# Rolling forecast with exogenous inputs, reusing the orders from the full fit
# and the peak hours identified above. Only the first two of the six returned
# values (overall and peak-hour metrics) are kept.
rolling_results_exog = rolling_forecast_sarimax_exog(
    train_series=train_series,
    test_series=test_series,
    exog_train=exog_train,
    exog_test=exog_test,
    best_order=order,
    best_seasonal_order=seasonal_order,
    peak_hours=peak_hours_exog,
)
rolling_overall_metrics_exog, rolling_peak_metrics_exog, _, _, _, _ = rolling_results_exog

# One row per (metric, scope): the three overall rows first, then the three
# peak-hour rows.
rolling_metric_rows = [
    [metric_name, scope_label, scope_metrics[metric_name]]
    for scope_label, scope_metrics in (
        ("Overall", rolling_overall_metrics_exog),
        ("Peak Hours", rolling_peak_metrics_exog),
    )
    for metric_name in ("MAE", "RMSE", "MAPE")
]
rolling_metrics_df = pd.DataFrame(rolling_metric_rows, columns=["Metric", "Type", "Value"])

rolling_metrics_styled = (
    rolling_metrics_df.style
    .set_caption("Rolling Forecast Metrics (Exogenous SARIMAX)")
    .background_gradient(cmap='Blues', subset=["Value"])
)
display(rolling_metrics_styled)

In [None]:
 #Future Forecast (30 Days)
forecast_future_exog = generate_future_forecast_sarimax_exog(results_exog_full)

# Optional: line plot of the forecast values ('yhat') against their timestamps ('ds').
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(forecast_future_exog['ds'], forecast_future_exog['yhat'], label='Forecast')
ax.set_title("30-Day Future Forecast (SARIMAX with Exogenous)")
ax.set_xlabel("Date")
ax.set_ylabel("Forecasted CustomerCount")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Group the future forecast by hour (threshold ratio 0.6) and unpack the
# hourly table, the threshold and the peak hours returned by the helper.
future_grouping_exog = group_forecast_by_hour_sarimax_exog(forecast_future_exog, threshold_ratio=0.6)
future_hourly_df_exog, threshold_future_exog, future_peak_hours_exog = future_grouping_exog

# Render the hourly table with a caption and a blue gradient.
future_hourly_styled = future_hourly_df_exog.style.set_caption("Future Hourly Avg Forecast (Exog)")
future_hourly_styled = future_hourly_styled.background_gradient(cmap='Blues')
display(future_hourly_styled)