In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Make the directory two levels above the working directory importable so
# that `from src...` resolves. The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.models import MultivariateForecaster

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data from silver layer.
# index_col=0 turns the first CSV column (the timestamps) into the index and
# parse_dates=True asks pandas to parse that index as dates, so the frame
# gets a DatetimeIndex instead of an index of plain strings.
data_path = '../../data/silver/forecast_data.csv'
df = pd.read_csv(data_path, index_col=0, parse_dates=True)

# Display the first few rows
display(df.head())

Unnamed: 0,Client_1,year,hour,is_business_hour,month,day_of_week,is_weekend,hour_sin,hour_cos,day_of_week_sin,...,lag_1,lag_24,diff_1,pct_change_1,diff_24,pct_change_24,rolling_mean_3,rolling_std_3,rolling_mean_7,rolling_std_7
2011-01-01 00:00:00,0.0,2011,0,0,1,5,1,0.0,1.0,-0.974928,...,,,,,,,0.0,,0.0,
2011-01-01 01:00:00,0.0,2011,1,0,1,5,1,0.258819,0.965926,-0.974928,...,0.0,,0.0,,,,0.0,0.0,0.0,0.0
2011-01-01 02:00:00,0.0,2011,2,0,1,5,1,0.5,0.866025,-0.974928,...,0.0,,0.0,,,,0.0,0.0,0.0,0.0
2011-01-01 03:00:00,0.0,2011,3,0,1,5,1,0.707107,0.707107,-0.974928,...,0.0,,0.0,,,,0.0,0.0,0.0,0.0
2011-01-01 04:00:00,0.0,2011,4,0,1,5,1,0.866025,0.5,-0.974928,...,0.0,,0.0,,,,0.0,0.0,0.0,0.0


In [3]:
# Keep only rows whose index is on or after 1 December 2011, and only the
# first 8 columns of the frame.
START_DATE = '2011-12-01'  # inclusive lower bound applied to the index
N_COLUMNS = 8              # number of leading columns to keep
df = df.loc[df.index >= START_DATE].iloc[:, :N_COLUMNS]

# Display the updated DataFrame info
print(f"Data shape after filtering: {df.shape}")
print(f"Date range after filtering: {df.index.min()} to {df.index.max()}")

Data shape after filtering: (27049, 8)
Date range after filtering: 2011-12-01 00:00:00 to 2015-01-01 00:00:00


In [4]:
# Ensure the frame is indexed by a DatetimeIndex.
# Case 1: the timestamps still live in a regular column ('date' is checked
# before 'timestamp'); parse that column and promote it to the index.
for time_col in ('date', 'timestamp'):
    if time_col in df.columns:
        df = df.assign(**{time_col: pd.to_datetime(df[time_col])}).set_index(time_col)
        break

# Case 2: the timestamps are already the index but may still be plain
# strings (e.g. when the CSV was read without date parsing); convert them
# so min/max and any later time-based operations work on real timestamps.
if not isinstance(df.index, pd.DatetimeIndex):
    df.index = pd.to_datetime(df.index)

# Display basic info about the data
print(f"Data shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Available columns: {df.columns.tolist()}")

Data shape: (27049, 8)
Date range: 2011-12-01 00:00:00 to 2015-01-01 00:00:00
Available columns: ['Client_1', 'year', 'hour', 'is_business_hour', 'month', 'day_of_week', 'is_weekend', 'hour_sin']


In [5]:
# ===============================
# 3. Initialize and Optimize Forecaster
# ===============================

# Set parameters for the forecaster.
# NOTE(review): how MultivariateForecaster interprets initial_window and
# step_length is defined in src.models, not here — presumably the size of
# the first training window and the stride between successive windows; confirm.
HOURS_PER_WEEK = 24 * 7                # 168 hourly observations per week
initial_window = 8760                  # 365 days × 24 hours of hourly data
step_length = HOURS_PER_WEEK * 30      # 30 weeks (5040 hours) — not a single week
forecast_horizon = np.arange(1, 25)    # steps 1..24 ahead, i.e. 24 values

# Initialize the forecaster
forecaster = MultivariateForecaster(
    y=df,
    config_path="../../config/small_mv_config.json",
    initial_window=initial_window,
    step_length=step_length,
    fh=forecast_horizon
)


🕒 PREPARING DATETIME INDEX
   Converting index to datetime...
✓ Successfully converted to DatetimeIndex


In [6]:
# Search for the best model; runtime grows with the trial budget below.
n_trials = 10  # number of optimization trials to run
n_jobs = -1    # forwarded to optimize(); -1 is meant to use all available cores

# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'NoneType' object has no attribute 'T'". The traceback is
# truncated, so the failing call is presumably inside optimize() — confirm
# in src.models before relying on the lines that follow.
print("Starting model optimization...")
forecast = forecaster.optimize(n_jobs=n_jobs, n_trials=n_trials)
print("Optimization complete!")

# Report the winning configuration, one "name: value" line per parameter
print("Best model parameters:")
best_params = forecaster.study.best_params
for name, chosen in best_params.items():
    print(f"  {name}: {chosen}")

# Best objective value found by the study, shown with four decimals
print(f"\nBest model performance (RMSE): {forecaster.study.best_value:.4f}")

[I 2025-03-29 19:59:00,553] A new study created in memory with name: no-name-0f804e02-b418-4c9e-8add-ab93e303ceff


Starting model optimization...


[I 2025-03-29 19:59:43,210] Trial 0 finished with value: 1.0859079918350234 and parameters: {'model': 'RandomForestRegressor', 'n_estimators': 59, 'max_depth': 6, 'min_samples_split': 3, 'bootstrap': False}. Best is trial 0 with value: 1.0859079918350234.
[I 2025-03-29 19:59:54,780] Trial 7 finished with value: 1.0050885550506952 and parameters: {'model': 'RandomForestRegressor', 'n_estimators': 78, 'max_depth': 9, 'min_samples_split': 5, 'bootstrap': False}. Best is trial 7 with value: 1.0050885550506952.
[I 2025-03-29 19:59:54,881] Trial 6 finished with value: 1.0072448893121067 and parameters: {'model': 'RandomForestRegressor', 'n_estimators': 77, 'max_depth': 7, 'min_samples_split': 4, 'bootstrap': False}. Best is trial 7 with value: 1.0050885550506952.
[I 2025-03-29 20:00:06,741] Trial 8 finished with value: 1.0860633988031392 and parameters: {'model': 'RandomForestRegressor', 'n_estimators': 98, 'max_depth': 6, 'min_samples_split': 2, 'bootstrap': False}. Best is trial 7 with val

AttributeError: 'NoneType' object has no attribute 'T'

In [None]:
# ===============================
# 4. Generate and Visualize Forecasts
# ===============================

# Ask the forecaster for its predictions and echo them as plain text
forecast = forecaster.forecast()
print("Forecasts:")
print(forecast)

# One shared axes: every column's history as a solid line, plus a dashed
# line for each column that also appears in the forecast frame
fig, ax = plt.subplots(figsize=(12, 6))
for series_name in df.columns:
    ax.plot(df.index, df[series_name], label=f'Historical: {series_name}')
    if series_name not in forecast.columns:
        continue
    ax.plot(forecast.index, forecast[series_name], label=f'Forecast: {series_name}', linestyle='--')

# Labelling and layout
ax.set_title('Multivariate Time Series Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# ===============================
# 5. Save the Results
# ===============================

# Write the forecast frame to a CSV file, creating the target folder first
# if it does not exist yet
output_path = '../../data/gold/multivariate_forecasting_results.csv'
target_dir = os.path.dirname(output_path)
os.makedirs(target_dir, exist_ok=True)
forecast.to_csv(output_path)
print(f"Forecasts saved to {output_path}")

# Echo the last 15 forecast rows
print(forecast.tail(15))