# Quick Start: Running Chronos Bolt on MultiTS-Eval Benchmark

This notebook shows how to run Chronos Bolt models on the MultiTS-Eval benchmark directly from Python, as an alternative to the `run_multieval.py` script.

Make sure you have downloaded the MultiTS-Eval benchmark data and that `BENCHMARK_PATH` in the Configuration section (the notebook equivalent of the script's `--benchmark-path` option) points to it before running this notebook.

We will use the MultiTS-Eval framework to load the data and run the Chronos Bolt model. This notebook demonstrates how to integrate Chronos Bolt with the MultiTS-Eval evaluation framework.


## Installation

Install required packages:

```bash
pip install chronos-forecasting
pip install multieval
```

Make sure you have PyTorch installed with CUDA support if you want to use GPU acceleration.


In [1]:
import os
import sys
import subprocess
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from typing import Optional
from abc import ABC, abstractmethod

# Add the src directory to the Python path.
# The entry is inserted at position 0, so modules under <current working directory>/src
# are searched before any other sys.path entry.
# NOTE(review): this presumably expects the notebook to be started from the repository
# root so that ./src contains the multieval package -- confirm against the repo layout.
sys.path.insert(0, str(Path.cwd() / 'src'))

# Import MultiTS-Eval components:
#   Benchmark               -- entry point used below to load the benchmark data
#   MAPE, MAE, RMSE, NMAE   -- metric callables, invoked below as metric(target, forecast)
from multieval.data import Benchmark
from multieval.metrics import MAPE, MAE, RMSE, NMAE

print("MultiTS-Eval components imported successfully!")


MultiTS-Eval components imported successfully!


In [2]:
# Self-contained ChronosForecast class for the notebook
class ChronosForecast:
    """
    Chronos forecasting model wrapper for MultiTS-Eval evaluation.

    Loads a Chronos pipeline once and exposes ``forecast``, which reduces the
    pipeline's probabilistic output (quantiles or sample paths) to a single
    1-D point forecast of exactly ``forecast_horizon`` values.
    This class is self-contained within the notebook.
    """

    def __init__(self, model_path: str = "amazon/chronos-bolt-base", device: str = "cuda:0", num_samples: int = 20):
        """
        Initialize Chronos forecast model.

        Args:
            model_path: Path to Chronos model (HuggingFace model ID or local path)
            device: Device to run the model on
            num_samples: Number of samples for probabilistic forecasting; only
                forwarded to pipelines whose forecast type is sample-based

        Raises:
            ImportError: If the chronos package cannot be imported.
            RuntimeError: If the model cannot be loaded.
        """
        self.model_path = model_path
        self.device = device
        self.num_samples = num_samples
        self.pipeline = None
        self._load_model()

    def _load_model(self):
        """Load the Chronos pipeline named by ``self.model_path`` onto ``self.device``.

        Raises:
            ImportError: If an import fails while loading (most commonly the
                chronos package itself is missing).
            RuntimeError: For any other loading failure. The original
                exception is chained as ``__cause__``.
        """
        try:
            from chronos import BaseChronosPipeline

            self.pipeline = BaseChronosPipeline.from_pretrained(
                self.model_path,
                device_map=self.device,
            )
            print(f"Loaded Chronos model: {self.model_path}")
        except ImportError as e:
            raise ImportError(
                "Chronos package not installed. Please install with: pip install chronos-forecasting"
            ) from e
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Failed to load Chronos model: {e}") from e

    def forecast(self, history: np.ndarray, covariates: Optional[np.ndarray] = None, forecast_horizon: Optional[int] = None) -> np.ndarray:
        """
        Generate a point forecast from historical data using Chronos.

        NaN entries are dropped from ``history`` before prediction. If nothing
        remains, zeros are returned; if a single value remains, that value is
        repeated. Otherwise the pipeline is queried and its output is reduced
        to one value per step (median quantile for quantile forecasts, mean
        over sample paths for sample forecasts).

        Args:
            history: Historical time series data
            covariates: Optional covariate data (ignored for Chronos)
            forecast_horizon: Number of future points to forecast (default: 1)

        Returns:
            1-D array of length ``forecast_horizon``.
        """
        if forecast_horizon is None:
            forecast_horizon = 1

        # np.asarray accepts both ndarrays and array-likes without branching.
        history_tensor = torch.tensor(np.asarray(history), dtype=torch.float32)

        # Remove NaN values (boolean masking also flattens the input to 1-D).
        history_clean = history_tensor[~torch.isnan(history_tensor)]

        if len(history_clean) == 0:
            # If no valid data, return zeros
            return np.zeros(forecast_horizon)

        if len(history_clean) < 2:
            # Exactly one valid observation: repeat it across the horizon.
            return np.full(forecast_horizon, float(history_clean[-1]))

        # Only sample-based pipelines accept ``num_samples``. ``is_samples``
        # stays None when the pipeline does not advertise its forecast type.
        predict_kwargs = {}
        is_samples = None
        if hasattr(self.pipeline, 'forecast_type'):
            from chronos import ForecastType
            is_samples = self.pipeline.forecast_type == ForecastType.SAMPLES
            if is_samples:
                predict_kwargs = {"num_samples": self.num_samples}

        forecast_output = self.pipeline.predict(
            [history_clean],
            prediction_length=forecast_horizon,
            **predict_kwargs
        )

        # Convert to numpy; detach/cpu make this safe for tensors that live on
        # an accelerator or are attached to an autograd graph.
        if isinstance(forecast_output, torch.Tensor):
            forecast_np = forecast_output.detach().cpu().numpy()
        else:
            forecast_np = np.array(forecast_output)

        forecast_np = self._to_point_forecast(forecast_np, is_samples)
        return self._fit_to_horizon(forecast_np, forecast_horizon)

    def _to_point_forecast(self, forecast_np: np.ndarray, is_samples: Optional[bool]) -> np.ndarray:
        """Collapse raw pipeline output to a 1-D point forecast.

        Assumed layouts (TODO confirm against the installed chronos version):
        ``(batch, k, horizon)`` where ``k`` is the number of quantiles or of
        sample paths, ``(k, horizon)``, or ``(horizon,)``.

        Args:
            forecast_np: Raw pipeline output as a numpy array.
            is_samples: True for sample paths, False for quantiles, None when
                the pipeline did not expose a forecast type.
        """
        if forecast_np.ndim == 3:
            # A single series is submitted per call, so only the first batch
            # entry is meaningful.
            per_series = forecast_np[0]
            if is_samples is None:
                # Fallback heuristic kept from the original implementation:
                # nine rows are treated as the quantiles 0.1 ... 0.9.
                is_samples = per_series.shape[0] != 9
            if is_samples:
                # Average over the sample axis, never over the time axis.
                return per_series.mean(axis=0)
            return per_series[self._median_quantile_index(per_series.shape[0])]

        if forecast_np.ndim == 2:
            # Multiple rows are averaged; a single row is unwrapped.
            if forecast_np.shape[0] > 1:
                return np.mean(forecast_np, axis=0)
            return forecast_np[0]

        return np.atleast_1d(forecast_np)

    def _median_quantile_index(self, num_quantiles: int) -> int:
        """Return the row index of the quantile level closest to 0.5.

        Uses the pipeline's ``quantiles`` attribute when it is present and
        matches the output size; otherwise falls back to the middle row
        (index 4 for the nine-quantile layout).
        """
        levels = getattr(self.pipeline, "quantiles", None)
        if levels is not None and len(levels) == num_quantiles:
            distances = np.abs(np.asarray(levels, dtype=float) - 0.5)
            return int(np.argmin(distances))
        return num_quantiles // 2

    @staticmethod
    def _fit_to_horizon(forecast_np: np.ndarray, forecast_horizon: int) -> np.ndarray:
        """Truncate or pad a 1-D forecast so it has exactly ``forecast_horizon`` values."""
        forecast_np = np.atleast_1d(forecast_np)
        current_length = len(forecast_np)

        if current_length == forecast_horizon:
            return forecast_np
        if current_length > forecast_horizon:
            return forecast_np[:forecast_horizon]
        if current_length == 0:
            # If no valid forecast, fill with zeros
            return np.zeros(forecast_horizon)

        # Pad with the last value; concatenation avoids broadcasting issues.
        padding = np.full(forecast_horizon - current_length, float(forecast_np[-1]))
        return np.concatenate([forecast_np, padding])

print("ChronosForecast class defined successfully!")


ChronosForecast class defined successfully!


## Configuration

Set up the benchmark path and model parameters. Adjust these according to your setup.

In [3]:
# Configuration
BENCHMARK_PATH = "../../multits-eval-nested/"  # Adjust this path to your MultiTS-Eval data
MODEL_PATH = "amazon/chronos-bolt-base"  # Chronos Bolt model
# Use the first CUDA GPU when one is available and fall back to CPU otherwise,
# so the notebook runs unmodified on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Number of samples for probabilistic forecasting. ChronosForecast only forwards
# this to pipelines whose forecast type is sample-based.
NUM_SAMPLES = 20
# Limit windows per dataset for faster testing.
# NOTE(review): not referenced by the cells visible in this notebook -- the
# evaluation call passes its own max_windows_per_dataset; confirm intended use.
MAX_WINDOWS = 50
OUTPUT_DIR = "../../multieval_runs/chronos_bolt"
# Windowing parameters handed to Benchmark(...) when the benchmark is loaded.
HISTORY_LENGTH = 512
FORECAST_HORIZON = 128
STRIDE = 256

# Create output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Benchmark path: {BENCHMARK_PATH}")
print(f"Model: {MODEL_PATH}")
print(f"Device: {DEVICE}")
print(f"Output directory: {OUTPUT_DIR}")


Benchmark path: ../../multits-eval-nested/
Model: amazon/chronos-bolt-base
Device: cuda:0
Output directory: ../../multieval_runs/chronos_bolt


## Initialize Chronos Model

Create a ChronosForecast instance that integrates with the MultiTS-Eval framework.


In [4]:
# Initialize Chronos model
try:
    chronos_model = ChronosForecast(
        model_path=MODEL_PATH,
        device=DEVICE,
        num_samples=NUM_SAMPLES,
    )
    print("Chronos model initialized successfully!")
except Exception as e:
    print(f"Error initializing Chronos model: {e}")
    print("Make sure you have installed chronos-forecasting and have the required dependencies.")
    # Re-raise after printing the hint: without a model, `chronos_model` stays
    # undefined and the evaluation cell would fail later with an unrelated
    # NameError. This also matches the benchmark-loading cell, which re-raises.
    raise


  from .autonotebook import tqdm as notebook_tqdm


Loaded Chronos model: amazon/chronos-bolt-base
Chronos model initialized successfully!


## Run Chronos Model Directly in Notebook

Instead of using `run_multieval.py`, we can run Chronos directly in the notebook for more control and immediate results.


In [5]:
# Load the MultiTS-Eval benchmark using the windowing parameters from the configuration cell
print("Loading MultiTS-Eval benchmark...")
try:
    benchmark = Benchmark(
        BENCHMARK_PATH,
        history_length=HISTORY_LENGTH,
        forecast_horizon=FORECAST_HORIZON,
        stride=STRIDE,
        load_cached_counts=True,
    )
    print("Benchmark loaded successfully!")
    print(f"Number of categories: {len(benchmark)}")

    # Walk the category -> domain hierarchy, printing one summary line per level
    # and tallying len(domain), which the messages below report as a dataset count.
    total_datasets = 0
    for category in benchmark:
        print(f"Category: {category.category} ({len(category)} domains)")
        for domain in category:
            print(f"  Domain: {domain.domain_name} ({len(domain)} datasets)")
            total_datasets += len(domain)

    print(f"Total datasets in benchmark: {total_datasets}")

except Exception as exc:
    # Print a hint about the most likely cause, then propagate the error so
    # that later cells do not run against a missing benchmark.
    print(f"Error loading benchmark: {exc}")
    print(f"Make sure the benchmark path '{BENCHMARK_PATH}' is correct and contains MultiTS-Eval data.")
    raise


Loading MultiTS-Eval benchmark...
Loading KITTI data from ../../multits-eval-nested/sequential/KITTI
Found 6114 parquet files
Successfully loaded 6114 valid files
Domain ALL_DATASETS not found in file hierarchy
loading window counts from ../../multits-eval-nested/sequential_window_counts_h512_f128_s256.json
loaded 8 cached window counts for category sequential
['hopper_csv_out', 'spriteworld', 'ant_csv_out', 'cheetah_csv_out', 'walker2d_csv_out', 'KITTI', 'openwebtext_timeseries_csvs', 'cifar100_timeseries_csvs']
successfully counted windows from cached JSON files
Dataset aus_electricity not found in data_hierarchy.json
Loading ECL data from ../../multits-eval-nested/traditional/ecl
Found 1 parquet files
Successfully loaded 206 valid chunks
Dataset aus_electricity_nsw not found in file hierarchy
Dataset aus_electricity_qld not found in file hierarchy
Dataset cursor-tabs not found in file hierarchy
Domain ALL_DATASETS not found in file hierarchy
loading window counts from ../../multits-

## Direct Evaluation

Now let's run Chronos directly on the benchmark data for immediate results and full control.


In [6]:
# Direct evaluation function
def evaluate_chronos_directly(benchmark, model, max_datasets=5, max_windows_per_dataset=10):
    """
    Evaluate a forecasting model on the first datasets of a benchmark.

    Iterates category -> domain -> dataset -> window, asks ``model.forecast``
    for a prediction per window, scores it with MAPE, MAE, RMSE and NMAE, and
    averages each metric per dataset.

    Args:
        benchmark: Iterable of categories; each category iterates over
            domains, each domain over datasets, each dataset over windows.
        model: Object with ``forecast(history=..., forecast_horizon=..., covariates=...)``.
        max_datasets: Stop after this many datasets have been visited. A
            dataset counts even if it yielded no windows.
        max_windows_per_dataset: Maximum number of windows scored per dataset.

    Returns:
        List with one dict per dataset that produced at least one window,
        holding the dataset/category/domain names, ``windows_processed`` and
        the four averaged metrics.
    """
    # Insertion order fixes both the order of metric calls and of result keys.
    metric_fns = {'MAPE': MAPE, 'MAE': MAE, 'RMSE': RMSE, 'NMAE': NMAE}
    results = []
    visited = 0

    print(f"Starting direct evaluation on up to {max_datasets} datasets...")

    for category in benchmark:
        if visited >= max_datasets:
            break

        for domain in category:
            if visited >= max_datasets:
                break

            for dataset in domain:
                if visited >= max_datasets:
                    break

                print(f"\nEvaluating dataset: {dataset.dataset_name} ({visited + 1}/{max_datasets})")
                print(f"Category: {category.category}, Domain: {domain.domain_name}")
                print(f"Dataset size: {len(dataset)} windows")

                # Per-window scores, one list per metric.
                scores = {metric_name: [] for metric_name in metric_fns}
                n_windows = 0

                for window in dataset:
                    # Cap the work per dataset for faster evaluation.
                    if n_windows >= max_windows_per_dataset:
                        break

                    past = window.history()
                    actual = window.target()
                    extra = window.covariates()

                    predicted = model.forecast(
                        history=past,
                        forecast_horizon=len(actual),
                        covariates=extra,
                    )

                    for metric_name, metric_fn in metric_fns.items():
                        scores[metric_name].append(metric_fn(actual, predicted))

                    n_windows += 1

                # Summarise only datasets that actually yielded windows.
                if n_windows > 0:
                    averages = {
                        metric_name: (np.mean(values) if values else np.nan)
                        for metric_name, values in scores.items()
                    }

                    results.append({
                        'dataset': dataset.dataset_name,
                        'category': category.category,
                        'domain': domain.domain_name,
                        'windows_processed': n_windows,
                        **averages,
                    })

                    print(f"Processed {n_windows} windows")
                    for metric_name in ('MAPE', 'MAE', 'RMSE', 'NMAE'):
                        print(f"Average {metric_name}: {averages[metric_name]:.4f}")

                visited += 1

    return results

# Run direct evaluation
# The limits passed here (3 datasets, 5 windows each) override the function's
# defaults; presumably chosen to keep this demonstration run short.
print("Starting Chronos evaluation...")
direct_results = evaluate_chronos_directly(benchmark, chronos_model, max_datasets=3, max_windows_per_dataset=5)
# direct_results holds one summary dict per dataset that produced at least one window.
print(f"\nDirect evaluation completed on {len(direct_results)} datasets")


Starting Chronos evaluation...
Starting direct evaluation on up to 3 datasets...

Evaluating dataset: hopper_csv_out (1/3)
Category: sequential, Domain: Scientific
Dataset size: 33117 windows
Processed 5 windows
Average MAPE: 234.0065
Average MAE: 6.4502
Average RMSE: 7.9760
Average NMAE: 1.0689

Evaluating dataset: spriteworld (2/3)
Category: sequential, Domain: Scientific
Dataset size: 156272 windows
Processed 5 windows
Average MAPE: 106.8181
Average MAE: 0.4516
Average RMSE: 0.7205
Average NMAE: 0.9895

Evaluating dataset: ant_csv_out (3/3)
Category: sequential, Domain: Scientific
Dataset size: 19659 windows
Processed 5 windows
Average MAPE: 232.4465
Average MAE: 0.4740
Average RMSE: 1.2097
Average NMAE: 0.4060

Direct evaluation completed on 3 datasets
