In [None]:
# Set working directory (optional during development)
import os

# The project root can be supplied through the PROJECT_ROOT environment variable so the
# notebook is not tied to one machine; the original developer path remains the fallback.
_project_root = os.environ.get('PROJECT_ROOT', '/Users/sudishmakarki/My_project2')
# Only change directory when the target exists; otherwise keep the current directory
# instead of aborting the whole notebook with FileNotFoundError.
if os.path.isdir(_project_root):
    os.chdir(_project_root)  # only if needed
print(" Working directory:", os.getcwd())

# Standard Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from prophet.diagnostics import cross_validation, performance_metrics


# Custom Functions
from models.data_interpolation import (
    load_data,
    preprocess_data,
    split_train_test,
    generate_time_series_splits
)

from models.model_prophet import (
    prepare_prophet_data,
    train_baseline_prophet,
    forecast_with_model,
    calculate_peak_hours,
    evaluate_metrics,
    cross_validate_baseline
)

from models.model_prophet import (
    tune_prophet_model,
    forecast_with_model_r1,
    select_peak_hours,
    evaluate_tuned_model_metrics,
    cross_validate_tuned_r1,
    forecast_future_with_model_r1
)

from models.model_prophet import (
    prepare_holiday_df,
    tune_prophet_model_r2,
    forecast_with_model_r2,
    select_peak_hours_r2,
    evaluate_metrics_r2,
    cross_validate_model_r2,
    forecast_future_with_model_r2
)

In [None]:
# Load and preprocess
# The CSV path is relative, so it is presumably resolved against the current working
# directory — confirm against load_data.
df = load_data('data/RestaurantData.csv')
# Keep the raw frame in `df` and the processed result in `df_clean` so both stay inspectable.
df_clean = preprocess_data(df)

In [None]:
# Split into train and test
# split_date is passed as '2022-01-01'; which side of the split that timestamp lands on
# is decided inside split_train_test — TODO confirm.
restaurant_train, restaurant_test = split_train_test(df_clean, split_date='2022-01-01')
# Format for Prophet
# Later cells read 'ds', 'y' and 'Hour' columns from the returned test frame.
restaurant_train_prophet, restaurant_test_prophet = prepare_prophet_data(restaurant_train, restaurant_test)
# Train the baseline Prophet model
# `m` is the baseline model reused by every baseline cell below.
m = train_baseline_prophet(restaurant_train_prophet)

In [None]:
# -- Test Set Forecasting --
# Predict on the test set and display the first few rows
# Later cells read 'ds', 'yhat' and 'Hour' columns from test_forecast_df.
test_forecast_df = forecast_with_model(m, restaurant_test_prophet)
print("Forecast on Test Set (first 5 rows):")
# Last expression of the cell, so the notebook renders it as a rich table.
test_forecast_df.head()


In [None]:
# Mean predicted value (yhat) for each hour-of-day bucket of the test forecast
hourly_avg = test_forecast_df['yhat'].groupby(test_forecast_df['Hour']).mean()
print("\nAverage Forecast by Hour:")
display(hourly_avg)

In [None]:
# Let the project helper pick the peak hours from the test forecast; it also returns
# the hourly averages and the threshold it applied.
peak_hours_dynamic, hourly_avg, threshold = calculate_peak_hours(test_forecast_df)

print("\nThreshold for Peak Hours:", threshold)
print("\nDynamically Identified Peak Hours:", peak_hours_dynamic)

# Restrict both the forecast and the actuals to rows whose Hour is one of the peak hours
forecast_peak_df = test_forecast_df.loc[test_forecast_df['Hour'].isin(peak_hours_dynamic)]
actual_peak_df = restaurant_test_prophet.loc[restaurant_test_prophet['Hour'].isin(peak_hours_dynamic)]

In [None]:
# ----- Bar chart: mean forecast per hour of day -----
plt.figure(figsize=(10, 5))
# Series.plot.bar draws onto the figure that was just created
hourly_avg.plot.bar(color='skyblue')
plt.title("Average Forecasted Customer Count by Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Average Forecast (yhat)")
# Keep the hour labels horizontal
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# ----- Compare Forecast to Actuals (All Hours) -----
# Plot the forecast with the actual test values
# The axes are created first so the actuals and Prophet's forecast plot share them.
f, ax = plt.subplots(figsize=(15, 5))
# Actuals are plotted against restaurant_test's index (assumed to hold the timestamps — TODO confirm).
ax.scatter(restaurant_test.index, restaurant_test['CustomerCount'], color='r', label='Actual')
# Prophet draws onto the supplied axes; its return value is kept in `fig`.
fig = m.plot(test_forecast_df, ax=ax)
ax.set_title("Prophet Forecast with Actuals")
ax.legend()
plt.show()

In [None]:
# ----- Zoom In: January 2022 and its first week (All Hours) -----
# Both zoomed views are the same plot with a different x-window, so they are drawn by
# one loop. Each entry is (window start, window end, axes title).
zoom_windows = [
    ('2022-01-01', '2022-02-01', 'January 2022 Forecast vs Actuals'),
    ('2022-01-01', '2022-01-08', 'First Week of January 2022 Forecast vs Actuals'),
]

for window_start, window_end, window_title in zoom_windows:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(restaurant_test.index, restaurant_test['CustomerCount'], color='r', label='Actual')
    fig = m.plot(test_forecast_df, ax=ax)
    # Restrict the x-axis to the window of interest
    lower_bound = pd.to_datetime(window_start)
    upper_bound = pd.to_datetime(window_end)
    ax.set_xbound(lower=lower_bound, upper=upper_bound)
    # Fixed y-range so the two zoom levels are directly comparable
    ax.set_ylim(0, 80)
    # Use the axes title for both views (previously one used a figure-level suptitle)
    ax.set_title(window_title)
    ax.legend()
    plt.show()

In [None]:
# ----- Peak hours only: baseline forecast against actual counts -----
plt.figure(figsize=(10, 5))

# Actual observations as semi-transparent red points
plt.scatter(actual_peak_df['ds'], actual_peak_df['y'], color='r', label='Actual Peak Hours', alpha=0.7)

# Forecast as a connected line with circle markers ('o-' = circle marker + solid line)
plt.plot(forecast_peak_df['ds'], forecast_peak_df['yhat'], 'o-', color='skyblue', label='Baseline Forecast')

plt.xlabel('Date')
plt.ylabel('Customer Count')
plt.title('Baseline Model Forecast for Dynamically Identified Peak Hours')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# ----- Plot Prophet Components for the Test Forecast -----
# plot_components is given the test-set forecast, so the panels cover the test period.
fig = m.plot_components(test_forecast_df)
plt.show()

In [None]:
# -- Future Forecasting (baseline) --
# Extend the model's timeline by 30 days of hourly timestamps. make_future_dataframe
# also returns every historical timestamp by default, so forecast_future spans the
# training history plus the 30-day horizon.
future = m.make_future_dataframe(periods=30*24, freq='h')
forecast_future = m.predict(future)

print("Future Forecast:")
display(forecast_future.head())

# Plot the future forecast (historical data in black, forecast in blue)
m.plot(forecast_future)
plt.title("Future Forecast (Hourly)")
plt.show()

# Hour-of-day bucket for every forecast row
forecast_future['Hour'] = forecast_future['ds'].dt.hour

# Average only the rows beyond the last training timestamp. Without this filter the
# "future" hourly profile is dominated by the in-sample history.
# NOTE(review): confirm forecast_future_with_model_r1 / _r2 apply the same future-only
# filter so the three models stay comparable.
forecast_future_only = forecast_future[forecast_future['ds'] > m.history_dates.max()]

future_hourly_avg = (
    forecast_future_only
    .groupby('Hour')['yhat']
    .mean()
    .round(2)
)

future_hourly_avg_df = future_hourly_avg.reset_index(name='Avg Forecast (yhat)')

display(
    future_hourly_avg_df
    .style
    .set_caption("📊 Baseline Model: Future Avg Forecast by Hour")
    .background_gradient(cmap='Blues')
    .hide(axis='index')
)

# Threshold-based peak hour selection: an hour is "peak" when its average forecast
# reaches 60% of the busiest hour's average (same ratio as threshold_ratio=0.6 used
# for the refined models).
threshold = 0.6 * future_hourly_avg.max()
print("\n Threshold for Peak Hours:", round(threshold, 2))

future_peak_hours = sorted(hour for hour, val in future_hourly_avg.items() if val >= threshold)
print("\n Dynamically Selected Peak Hours:", future_peak_hours)

In [None]:
# ----- Baseline error metrics over the whole test period -----
# Arguments are (actuals, predictions): actual counts from the raw test frame,
# predictions from the baseline test forecast.
mae_all = mean_absolute_error(restaurant_test['CustomerCount'], test_forecast_df['yhat'])
rmse_all = np.sqrt(
    mean_squared_error(restaurant_test['CustomerCount'], test_forecast_df['yhat'])
)
mape_all = mean_absolute_percentage_error(
    restaurant_test['CustomerCount'], test_forecast_df['yhat']
)

print("Overall Test Data Metrics:")
print("MAE:", mae_all)
print("RMSE:", rmse_all)
print("MAPE:", mape_all)

In [None]:
# ----- Evaluate Error Metrics for Peak Hours -----
# scikit-learn pairs y_true and y_pred by position and ignores the pandas index, so
# setting 'ds' as the index does not align anything by itself. Join actuals and
# forecasts on the timestamp first so each actual meets the forecast for the same hour.
peak_eval_df = (
    actual_peak_df[['ds', 'y']]
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
    .merge(
        forecast_peak_df[['ds', 'yhat']].assign(ds=lambda frame: pd.to_datetime(frame['ds'])),
        on='ds',
        how='inner',
    )
)
# Same names and shape as before: Series indexed by 'ds'
actual_peak = peak_eval_df.set_index('ds')['y']
predicted_peak = peak_eval_df.set_index('ds')['yhat']

mae_peak = mean_absolute_error(actual_peak, predicted_peak)
rmse_peak = np.sqrt(mean_squared_error(actual_peak, predicted_peak))
mape_peak = mean_absolute_percentage_error(actual_peak, predicted_peak)

print("\nPeak Hours Metrics:")
print("Baseline Peak Hours MAE:", mae_peak)
print("Baseline Peak Hours RMSE:", rmse_peak)
print("Baseline Peak Hours MAPE:", mape_peak)

In [None]:
# Use your baseline model (m) for cross-validation.
# Windows: 730 days of initial training data, a new cutoff every 180 days, and a
# 365-day forecast horizon after each cutoff.
df_cv_baseline = cross_validation(m, initial='730 days', period='180 days', horizon='365 days')
# Aggregate the per-row CV forecasts into per-horizon error metrics.
df_p_baseline = performance_metrics(df_cv_baseline)

print("Cross-Validation Performance Metrics for Baseline Model:")
df_p_baseline.head()

In [None]:
## Baseline cross-validation summary: mean of each metric column
print("Cross-Validation Performance Metrics (Average) for Baseline Model (Overall):")
available_metrics = ['rmse', 'mae', 'mape', 'smape']

for metric in available_metrics:
    # Skip any metric column that is absent from the metrics table
    if metric not in df_p_baseline.columns:
        continue
    print(f"{metric.upper()}: {df_p_baseline[metric].mean():.3f}")

In [None]:
#  Peak Hours CV Metrics for baseline
# Tag each CV row with its hour of day and keep only the dynamically selected peak hours
df_cv_baseline['Hour'] = df_cv_baseline['ds'].dt.hour
df_cv_baseline_peak = df_cv_baseline[df_cv_baseline['Hour'].isin(peak_hours_dynamic)]

# NOTE(review): these names are the same ones used for the test-set peak metrics in an
# earlier cell, so after this cell they hold the cross-validation values instead.
rmse_peak = np.sqrt(mean_squared_error(df_cv_baseline_peak['y'], df_cv_baseline_peak['yhat']))
mae_peak = mean_absolute_error(df_cv_baseline_peak['y'], df_cv_baseline_peak['yhat'])
mape_peak = mean_absolute_percentage_error(df_cv_baseline_peak['y'], df_cv_baseline_peak['yhat'])

# SMAPE custom calculation, reported as a fraction so it is on the same scale as the
# MAPE above and as the 'smape' column of Prophet's performance_metrics used for the
# refined models. Rows where both actual and forecast are 0 give 0/0; they count as
# zero error rather than being dropped from the mean.
smape_peak = (
    2 * np.abs(df_cv_baseline_peak['yhat'] - df_cv_baseline_peak['y']) /
    (np.abs(df_cv_baseline_peak['yhat']) + np.abs(df_cv_baseline_peak['y']))
).fillna(0).mean()

print("\nCross-Validation Performance Metrics (Average) for Baseline Model (Peak Hours Only):")
print(f"RMSE: {rmse_peak:.3f}")
print(f"MAE: {mae_peak:.3f}")
print(f"MAPE: {mape_peak:.3f}")
print(f"SMAPE: {smape_peak:.3f}")

In [None]:
# Model 1 (Refinement 1): hyperparameter search
# Candidate values per hyperparameter: 5 x 4 x 2 x 3 listed values.
param_grid = dict(
    changepoint_prior_scale=[0.001, 0.01, 0.1, 0.5, 1.0],
    seasonality_prior_scale=[0.01, 0.1, 1.0, 10.0],
    seasonality_mode=['additive', 'multiplicative'],
    changepoint_range=[0.8, 0.9, 1.0],
)

# The helper returns the selected model, its parameters and the tuning results
m_best_r1, best_params, tuning_results = tune_prophet_model(
    train_df=restaurant_train_prophet, test_df=restaurant_test_prophet, param_grid=param_grid
)

In [None]:
# -- Test Set Forecasting with the Tuned Model --
# Later cells read 'ds' and 'yhat' from this forecast frame.
restaurant_test_fcst_best_r1 = forecast_with_model_r1(m_best_r1, restaurant_test_prophet)
# Last expression of the cell, so the notebook renders the first rows as a table.
restaurant_test_fcst_best_r1.head()

In [None]:
# Tuned-model forecast: add the hour-of-day column, then average yhat per hour
restaurant_test_fcst_best_r1['Hour'] = restaurant_test_fcst_best_r1['ds'].dt.hour
hourly_avg_best_r1 = (
    restaurant_test_fcst_best_r1['yhat']
    .groupby(restaurant_test_fcst_best_r1['Hour'])
    .mean()
)
print("\nAverage Forecast by Hour (Tuned Model):")
print(hourly_avg_best_r1)


In [None]:
# --- Peak-hour selection for the tuned model (forecast + actual test data) ---
# The helper returns five values in this order: peak hours, threshold, peak-hour
# forecast rows, peak-hour actual rows, hourly averages.
_peak_selection_r1 = select_peak_hours(
    restaurant_test_fcst_best_r1, restaurant_test_prophet, threshold_ratio=0.6
)
(peak_hours_dynamic_best_r1,
 threshold_best_r1,
 tuned_peak_fcst_dynamic_best_r1,
 restaurant_test_prophet_peak_dynamic_best_r1,
 hourly_avg_best_r1) = _peak_selection_r1

# View results
print("Threshold for Peak Hours:", threshold_best_r1)
print("Dynamically Identified Peak Hours:", peak_hours_dynamic_best_r1)

In [None]:
# Tuned model (Refinement 1): error metrics over the whole test period
# Arguments are (actuals, predictions).
mae_all_best_r1 = mean_absolute_error(
    restaurant_test['CustomerCount'], restaurant_test_fcst_best_r1['yhat']
)
rmse_all_best_r1 = np.sqrt(
    mean_squared_error(restaurant_test['CustomerCount'], restaurant_test_fcst_best_r1['yhat'])
)
mape_all_best_r1 = mean_absolute_percentage_error(
    restaurant_test['CustomerCount'], restaurant_test_fcst_best_r1['yhat']
)
print("\nTuned Model Overall Test Data Metrics:")
print("MAE:", mae_all_best_r1)
print("RMSE:", rmse_all_best_r1)
print("MAPE:", mape_all_best_r1)


In [None]:
# Evaluate error metrics for the tuned model ( peak hours)
# scikit-learn pairs y_true and y_pred by position and ignores the pandas index, so
# join actuals and forecasts on the timestamp first to guarantee each actual is
# compared with the forecast for the same hour.
peak_eval_best_r1 = (
    restaurant_test_prophet_peak_dynamic_best_r1[['ds', 'y']]
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
    .merge(
        tuned_peak_fcst_dynamic_best_r1[['ds', 'yhat']]
        .assign(ds=lambda frame: pd.to_datetime(frame['ds'])),
        on='ds',
        how='inner',
    )
)
# Same names and shape as before: Series indexed by 'ds'
actual_peak_best_r1 = peak_eval_best_r1.set_index('ds')['y']
predicted_peak_best_r1 = peak_eval_best_r1.set_index('ds')['yhat']
mae_peak_best_r1 = mean_absolute_error(actual_peak_best_r1, predicted_peak_best_r1)
rmse_peak_best_r1 = np.sqrt(mean_squared_error(actual_peak_best_r1, predicted_peak_best_r1))
mape_peak_best_r1 = mean_absolute_percentage_error(actual_peak_best_r1, predicted_peak_best_r1)
print("\nTuned Model Peak Hours Metrics:")
print("MAE:", mae_peak_best_r1)
print("RMSE:", rmse_peak_best_r1)
print("MAPE:", mape_peak_best_r1)

In [None]:
#----Cross_validation----
# Same window settings as the baseline CV cell: 730-day initial window, cutoffs every
# 180 days, 365-day horizon.
df_cv_r1 = cross_validation(m_best_r1, initial='730 days', period='180 days', horizon='365 days')
df_p_r1 = performance_metrics(df_cv_r1)

print("\nCross-Validation Performance Metrics (First 5 rows) for Tuned Model (Refinement 1):")
df_p_r1.head()

In [None]:
# Refinement 1 cross-validation summary: mean of each metric column
available_metrics = ['rmse', 'mae', 'mape', 'smape']

for metric in available_metrics:
    # Skip any metric column that is absent from the metrics table
    if metric not in df_p_r1.columns:
        continue
    print(f"{metric.upper()}: {df_p_r1[metric].mean():.3f}")

In [None]:
# Refinement 1: cross-validation metrics restricted to the peak hours
# Tag every CV row with its hour, keep the peak-hour rows, then recompute the metrics
df_cv_r1['hour'] = df_cv_r1['ds'].dt.hour
df_cv_r1_peak = df_cv_r1.loc[df_cv_r1['hour'].isin(peak_hours_dynamic_best_r1)]
df_p_r1_peak = performance_metrics(df_cv_r1_peak)

# Mean of each available metric column
print("\nCross-Validation Performance Metrics (Average) for Peak Hours Only:")
for metric in ['rmse', 'mae', 'mape', 'smape']:
    if metric not in df_p_r1_peak.columns:
        continue
    print(f"{metric.upper()}: {df_p_r1_peak[metric].mean():.3f}")

In [None]:
# Generate 30-day future forecast using Refinement 1 model
# The helper returns the forecast frame, the per-hour average forecast, the threshold
# it applied and the selected peak hours.
forecast_future_r1, future_hourly_avg_r1, threshold_r1, future_peak_hours_r1 = forecast_future_with_model_r1(
    m_best_r1, days=30, freq='h', threshold_ratio=0.6)

# View first few rows of the forecast
print("Future Forecast (Refinement 1):")
display(forecast_future_r1.head())

#Future forecast
# Convert Series to clean DataFrame for display
future_hourly_avg_r1_df = future_hourly_avg_r1.reset_index(name='Avg Forecast (yhat)')

# Display with heatmap-style coloring and no index
# (caption emoji restored: it was stored as mis-decoded UTF-8 bytes)
display(
    future_hourly_avg_r1_df
    .style
    .set_caption("📊 Refinement 1: Future Avg Forecast by Hour")
    .hide(axis='index')  # Hides the index column
    .background_gradient(cmap='Blues')
)

In [None]:
#Model 2
# Holiday range: from the first year in the training index to the last year in the test
# index (both indexes must be datetime-like for .year to work).
start_year = restaurant_train.index.min().year
end_year = restaurant_test.index.max().year
holiday_df = prepare_holiday_df(start_year, end_year)

print("Holiday Data:")
display(holiday_df.head())

In [None]:
# Build the Refinement 2 training and test frames
# Each frame: move the index into a column, rename to Prophet's ds/y names (the index
# is expected to be called 'Timestamp'), and add a lowercase 'hour' column.
# NOTE(review): this rebinds restaurant_train_prophet / restaurant_test_prophet, which
# earlier cells read with an uppercase 'Hour' column; re-running those cells after this
# one may fail — verify.
restaurant_train_prophet = (
    restaurant_train
    .reset_index()
    .rename(columns={'Timestamp': 'ds', 'CustomerCount': 'y'})
    .assign(hour=lambda frame: pd.to_datetime(frame['ds']).dt.hour)
)
restaurant_test_prophet = (
    restaurant_test
    .reset_index()
    .rename(columns={'Timestamp': 'ds', 'CustomerCount': 'y'})
    .assign(hour=lambda frame: pd.to_datetime(frame['ds']).dt.hour)
)

print("Training Data Columns:", restaurant_train_prophet.columns)

In [None]:
#Define Hyperparameter Grid
# Smaller grid than Refinement 1: two values for each of the three hyperparameters.
param_grid_r2 = {
    'changepoint_prior_scale': [0.01, 0.1],
    'seasonality_prior_scale': [0.1, 1.0],
    'seasonality_mode': ['additive', 'multiplicative']
}

#Tune and Train Final R2 Model
# Same call shape as the Refinement 1 tuner, with the holiday frame passed as well.
m_best_r2, best_params_r2, tuning_results_r2 = tune_prophet_model_r2(
    train_df=restaurant_train_prophet,
    test_df=restaurant_test_prophet,
    holiday_df=holiday_df,
    param_grid=param_grid_r2
)

# "\n" (not "/n") so a blank line is printed instead of the literal characters "/n"
print("Best Hyperparameters (Refinement 2):\n")
print(best_params_r2)

In [None]:
# Test-set forecast from the Refinement 2 model; later cells read its 'yhat' column.
restaurant_test_fcst_best_r2 = forecast_with_model_r2(m_best_r2, restaurant_test_prophet)
display(restaurant_test_fcst_best_r2.head())

In [None]:
# Peak-hour selection for Refinement 2. The helper returns five values in this order:
# peak hours, threshold, peak-hour forecast rows, peak-hour actual rows, hourly averages.
_peak_selection_r2 = select_peak_hours_r2(
    restaurant_test_fcst_best_r2, restaurant_test_prophet, threshold_ratio=0.6
)
(peak_hours_dynamic_best_r2,
 threshold_best_r2,
 tuned_peak_fcst_dynamic_best_r2,
 restaurant_test_prophet_peak_dynamic_best_r2,
 hourly_avg_best_r2) = _peak_selection_r2

print("Threshold:", threshold_best_r2)
print("Peak Hours (Refinement 2):", peak_hours_dynamic_best_r2)


In [None]:
# Refinement 2: error metrics over the whole test period
overall_mae_r2 = mean_absolute_error(
    y_true=restaurant_test['CustomerCount'],
    y_pred=restaurant_test_fcst_best_r2['yhat'],
)
overall_rmse_r2 = np.sqrt(mean_squared_error(
    y_true=restaurant_test['CustomerCount'],
    y_pred=restaurant_test_fcst_best_r2['yhat'],
))
overall_mape_r2 = mean_absolute_percentage_error(
    y_true=restaurant_test['CustomerCount'],
    y_pred=restaurant_test_fcst_best_r2['yhat'],
)

print("\nTuned Model Overall Test Data Metrics (Refinement 2):")
print("MAE:", overall_mae_r2)
print("RMSE:", overall_rmse_r2)
print("MAPE:", overall_mape_r2)

In [None]:
# Sanity check: list the columns of the raw test frame (heading and value on separate lines)
print("Columns in restaurant_test:", restaurant_test.columns, sep="\n")

In [None]:
# Step 1: the actuals are passed with the target column named 'CustomerCount',
# so rename 'y' on a copy before the call
restaurant_test_prophet_renamed = restaurant_test_prophet.rename(columns={'y': 'CustomerCount'})

# Step 2: compute the Refinement 2 metrics from the forecast and the renamed actuals
metrics_r2 = evaluate_metrics_r2(
    forecast_df=restaurant_test_fcst_best_r2, actual_df=restaurant_test_prophet_renamed
)

# Step 3: pull the three peak-hour metrics out of the returned mapping
_peak_metrics_r2 = metrics_r2['peak_hours']
peak_mae_r2, peak_rmse_r2, peak_mape_r2 = (
    _peak_metrics_r2[key] for key in ('MAE', 'RMSE', 'MAPE')
)

print("\nTuned Model Peak Hour Test Data Metrics (Refinement 2):")
print("MAE:", peak_mae_r2)
print("RMSE:", peak_rmse_r2)
print("MAPE:", peak_mape_r2)

In [None]:
# Cross-validate the Refinement 2 model through the project helper, which returns the
# CV forecast frame and the metrics table. The CV window settings are chosen inside the
# helper — NOTE(review): confirm they match the 730/180/365-day settings used for the
# baseline and Refinement 1.
df_cv_r2, df_p_r2 = cross_validate_model_r2(m_best_r2)

print("Cross-Validation (First 5 rows):")
display(df_p_r2.head())

In [None]:
# Refinement 2 cross-validation summary: mean of each metric column
print("\nCross-Validation Summary (Refinement 2):")
for metric in ['rmse', 'mae', 'mape', 'smape']:
    # Skip any metric column that is absent from the metrics table
    if metric not in df_p_r2.columns:
        continue
    print(f"{metric.upper()}: {df_p_r2[metric].mean():.3f}")

In [None]:
# --- Refinement 2: cross-validation metrics restricted to the peak hours ---

# Tag every CV row with its hour and keep only the Refinement 2 peak hours
df_cv_r2['hour'] = df_cv_r2['ds'].dt.hour
df_cv_r2_peak = df_cv_r2.loc[df_cv_r2['hour'].isin(peak_hours_dynamic_best_r2)]

# Recompute the metrics on the peak-hour rows only
df_p_r2_peak = performance_metrics(df_cv_r2_peak)

# Mean of each available metric column
print("\nCross-Validation Performance Metrics (Average) for Peak Hours Only - Refinement 2:")
for metric in ['rmse', 'mae', 'mape', 'smape']:
    if metric not in df_p_r2_peak.columns:
        continue
    print(f"{metric.upper()}: {df_p_r2_peak[metric].mean():.3f}")

In [None]:
# NOTE(review): this repeats the Refinement 2 test forecast call from an earlier cell
# with the same arguments and rebinds the same variable; it looks redundant — confirm
# before removing.
restaurant_test_fcst_best_r2 = forecast_with_model_r2(m_best_r2, restaurant_test_prophet)
restaurant_test_fcst_best_r2.head()

In [None]:
# Generate 30-day future forecast using Refinement 2 model
# The helper returns four values: the forecast frame, the per-hour average forecast,
# the threshold and the selected peak hours.
forecast_future_r2, future_hourly_avg_r2, threshold_r2, future_peak_hours_r2 = forecast_future_with_model_r2(
    m_best_r2, days=30, freq='h', threshold_ratio=0.6)

# View first few rows of the forecast
print("Future Forecast (Refinement 2):")
display(forecast_future_r2.head())

# Plot forecast
m_best_r2.plot(forecast_future_r2)
plt.title("Future Forecast (Hourly) - Tuned + Holiday + Hour Model")
plt.show()


In [None]:
# Calculate the average forecast (yhat) per hour
# NOTE(review): this rebinds future_hourly_avg_r2 (previously the value returned by
# forecast_future_with_model_r2) to a display-oriented DataFrame.
forecast_future_r2['Hour'] = forecast_future_r2['ds'].dt.hour
future_hourly_avg_r2 = forecast_future_r2.groupby('Hour')['yhat'].mean().reset_index()

# Round the forecast values for clarity
future_hourly_avg_r2['yhat'] = future_hourly_avg_r2['yhat'].round(2)

# Rename columns for nicer display
future_hourly_avg_r2.columns = ['Hour of Day', 'Average Forecasted Customers']

# Display as a pretty table with hidden index
# (heading emoji restored: it was stored as mis-decoded UTF-8 bytes)
print("\n📊 Future Average Forecast by Hour (Refinement 2):")

display(
    future_hourly_avg_r2
    .style
    .set_caption("Average Hourly Forecast")
    .hide(axis='index')  # Hide index here
    .background_gradient(cmap='Blues')
)