### Imports

In [1]:
import os
from time import time
import warnings

In [2]:
from tqdm.autonotebook import tqdm

from statsforecast.models import (
    HistoricAverage,
    Naive,
    RandomWalkWithDrift,
    SeasonalNaive,
    WindowAverage,
    SeasonalWindowAverage,
    SimpleExponentialSmoothing,
    Holt,
    ADIDA,
    CrostonClassic,
    CrostonSBA,
    IMAPA,
    TSB,
    MSTL,
    Theta,
    ARCH,
    ARIMA,
    AutoRegressive,
    AutoARIMA,
)

  from tqdm.autonotebook import tqdm


In [3]:
import pandas as pd
from statsforecast import StatsForecast
from fugue import transform
from datasetsforecast.losses import mse, mae, rmse, mape, smape

In [4]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning emitted by later cells is hidden as well.
warnings.filterwarnings('ignore')

### Constants

In [5]:
# Folder and file name of the input dataset; they are joined into a path in experiments_config.
DATA_FOLDER = "data/"
M5_FILE = "m5-dataset.parquet.gzip"
# FOZZY_FILE = "fozzy-dataset.parquet.gzip"

# Forecast horizon: passed to cross-validation as both `h` and `step_size` (frequency there is 'D').
HORIZON = 14
# Number of cross-validation windows (`n_windows`).
N_WINDOWS = 3

In [6]:
# Output folders: raw cross-validation frames, per-series evaluation, and aggregated metrics.
FULL_RESULTS_PATH = "results/full_cv/"
EVALUATION_RESULTS_PATH = "results/evaluation/"
METRICS_RESULTS_PATH = "results/metrics/"

# One dict per experiment: a label, the parquet file to load, and the model instances to fit.
# The order of this list matters: a later cell selects an experiment by position.
experiments_config = [

    # Baseline Models
    {
        "experiment_name": "m5-baseline",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            HistoricAverage(),
            Naive(),
            RandomWalkWithDrift(),
            SeasonalNaive(season_length=HORIZON),
            WindowAverage(window_size=HORIZON),
            SeasonalWindowAverage(season_length=HORIZON, window_size=HORIZON),
        ],
    },

    # Exponential Smoothing Models
    {
        "experiment_name": "m5-exponential-smoothing",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            SimpleExponentialSmoothing(alpha=0.3),
            Holt(),
        ],
    },

    # Sparse or Intermittent Models
    {
        "experiment_name": "m5-sparse-intermittent",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            ADIDA(),
            CrostonClassic(),
            CrostonSBA(),
            IMAPA(),
            TSB(alpha_d=0.2, alpha_p=0.2)
        ],
    },

    # Other Models
    {
        "experiment_name": "m5-other",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            MSTL(season_length=HORIZON),
            Theta(),
            ARCH()
        ],
    },

    # AR and ARIMA
    {
        "experiment_name": "m5-ar-arima",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            ARIMA(),
            AutoRegressive(lags=HORIZON)
        ],
    },

    # AutoARIMA (kept as its own experiment)
    {
        "experiment_name": "m5-AutoARIMA",
        "data": os.path.join(DATA_FOLDER, M5_FILE),
        "models": [
            AutoARIMA()
        ],
    },
]




### Code base for Experiments

In [7]:
def convert_classes_to_string(class_list):
    """Build a dash-separated label from the class names of the given objects.

    Args:
        class_list: Iterable of objects (here: model instances).

    Returns:
        The class name of each object, in order, joined with '-'.
        An empty iterable yields an empty string.
    """
    names = []
    for item in class_list:
        names.append(item.__class__.__name__)
    return '-'.join(names)


def check_folders(folders=None):
    """Make sure the output folders exist, creating them (and parents) if needed.

    Args:
        folders: Optional iterable of directory paths to create. When omitted,
            the three result folders configured in this notebook are used
            (FULL_RESULTS_PATH, EVALUATION_RESULTS_PATH, METRICS_RESULTS_PATH).

    Raises:
        FileExistsError: If one of the paths already exists but is not a directory.
    """
    if folders is None:
        folders = [
            FULL_RESULTS_PATH,
            EVALUATION_RESULTS_PATH,
            METRICS_RESULTS_PATH,
        ]
    for path in folders:
        # exist_ok=True replaces the separate exists() check, so there is no
        # window between "check" and "create" in which the call could fail.
        os.makedirs(path, exist_ok=True)

In [8]:
class ClassicModel:
    """Cross-validate a list of StatsForecast models on one dataset and score them.

    The workflow (see `run`) is: cross-validation -> save raw forecasts ->
    per-series / per-cutoff metric evaluation -> save evaluation -> save the
    metric means over all series and cutoffs.

    Attributes are set in `__init__`:
        df: Input frame handed to `StatsForecast.cross_validation`.
        models: Model instances to fit.
        metrics: Loss functions applied as `metric(y_true, y_pred)`.
    """

    # Columns of a cross-validation frame that never hold a model forecast.
    _NON_MODEL_COLUMNS = frozenset({"unique_id", "ds", "cutoff", "y"})
    # Markers of prediction-interval columns, which are named "<model>-lo-<level>"
    # and "<model>-hi-<level>".
    _INTERVAL_MARKERS = ("-lo-", "-hi-")

    def __init__(self, df: pd.DataFrame, models: list) -> None:
        self.df = df
        self.models = models
        self.metrics = [mse, mae, rmse, mape, smape]

    def modeling(self) -> pd.DataFrame:
        """Run cross-validation for all configured models.

        Returns:
            The frame produced by `StatsForecast.cross_validation`, using
            HORIZON as both horizon and step size, N_WINDOWS windows and a
            90% prediction interval.
        """
        sf = StatsForecast(
            models=self.models,
            freq='D',
            n_jobs=-1,
        )
        result = sf.cross_validation(
            df=self.df,
            h=HORIZON,
            n_windows=N_WINDOWS,
            step_size=HORIZON,
            level=[90]
        )
        return result

    @staticmethod
    def _is_interval_column(column) -> bool:
        """Return True when `column` is a prediction-interval bound column."""
        name = str(column)
        return any(marker in name for marker in ClassicModel._INTERVAL_MARKERS)

    @staticmethod
    def _model_columns(columns) -> list:
        """Return the point-forecast (model) columns among `columns`, in order.

        Columns are matched by exact name / interval marker rather than by
        loose substring, so a model whose name happens to contain "y", "ds",
        "lo" or "hi" is not dropped by accident.
        """
        return [
            column
            for column in columns
            if column not in ClassicModel._NON_MODEL_COLUMNS
            and not ClassicModel._is_interval_column(column)
        ]

    @staticmethod
    def _with_unique_id_column(cv_results: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `cv_results` with `unique_id` as a column and a fresh index.

        When `unique_id` is only available as the index it is copied into a
        column; when it already is a column it is left untouched. The input
        frame is never modified.
        """
        if "unique_id" in cv_results.columns:
            return cv_results.reset_index(drop=True)
        return cv_results.assign(unique_id=cv_results.index).reset_index(drop=True)

    def evaludate_cross_validation(self, cv_results: pd.DataFrame) -> pd.DataFrame:
        """Compute every metric for every model, per series and per cutoff.

        Args:
            cv_results: Cross-validation frame (as returned by `modeling`).

        Returns:
            A frame with columns `unique_id`, `cutoff`, `metric` and one float
            column per model.
        """
        cv_results = self._with_unique_id_column(cv_results)

        model_columns = self._model_columns(cv_results.columns)
        str_models = ','.join([f"{model}:float" for model in model_columns])

        # Interval bounds are not scored, so they are dropped before partitioning.
        kept_columns = [
            column for column in cv_results.columns
            if not self._is_interval_column(column)
        ]

        evaluation_df = transform(
            cv_results.loc[:, kept_columns],
            self.evaluate,
            params={'metrics': self.metrics},
            schema=f"unique_id:str,cutoff:str,metric:str, {str_models}",
            as_local=True,
            partition={'by': ['unique_id', 'cutoff']}
        )
        return evaluation_df

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    evaluate_cross_validation = evaludate_cross_validation

    def run(self):
        """Execute the full experiment and write three CSV files.

        The file name is derived from the model class names; the same name is
        used in FULL_RESULTS_PATH, EVALUATION_RESULTS_PATH and
        METRICS_RESULTS_PATH. Printed durations are in minutes.
        """
        file_name = convert_classes_to_string(self.models) + ".csv"
        check_folders()

        print("Starting cross validation...")
        init = time()
        result = self.modeling()
        end = time()
        print(f"Cross Validation Finished In: {(end - init) / 60}")

        # Materialise the series id as a column so it survives `index=False` below.
        result = self._with_unique_id_column(result)

        print(f"Saving CV results as {file_name}...")
        results_path = os.path.join(FULL_RESULTS_PATH, file_name)
        result.to_csv(results_path, index=False)

        print("Starting Evaluation...")
        init = time()
        evaluation = self.evaludate_cross_validation(result)
        end = time()
        print(f"Evaluation Finished In: {(end - init) / 60}")

        print(f"Saving evaluation as {file_name}...")
        evaluation.to_csv(os.path.join(EVALUATION_RESULTS_PATH, file_name), index=False)

        print("Saving metrics...")
        metrics = evaluation.groupby(['metric']).mean(numeric_only=True)
        # The metric names live in the index after the groupby, so the index
        # must be written or the rows of the CSV cannot be told apart.
        metrics.to_csv(os.path.join(METRICS_RESULTS_PATH, file_name))
        print(metrics)

        print("Done!")

    @staticmethod
    def evaluate(df: pd.DataFrame, metrics: list) -> pd.DataFrame:
        """Score all model columns of one (unique_id, cutoff) partition.

        Args:
            df: Partition holding `unique_id`, `cutoff`, `y` and one column per
                model; interval columns, if present, are ignored.
            metrics: Callables invoked as `metric(df['y'], df[model])`; their
                `__name__` becomes the value of the `metric` column.

        Returns:
            One row per metric with columns `unique_id`, `cutoff`, `metric`
            followed by one column per model.
        """
        models = ClassicModel._model_columns(df.columns)
        eval_ = {
            model: {metric.__name__: metric(df['y'], df[model]) for metric in metrics}
            for model in models
        }
        eval_df = pd.DataFrame(eval_).rename_axis('metric').reset_index()
        # Every row of a partition shares the same id and cutoff, so the first is used.
        eval_df.insert(0, 'cutoff', df['cutoff'].iloc[0])
        eval_df.insert(0, 'unique_id', df['unique_id'].iloc[0])
        return eval_df

### Run Experiments - M5

In [None]:
# Run every configured experiment in order: load its dataset, then cross-validate, evaluate and save.
for config in experiments_config:
    print(config, "\n")
    
    # The parquet file is read again on each iteration, even when configs point to the same path.
    df = pd.read_parquet(config["data"])

    inst = ClassicModel(df, config["models"])
    inst.run()

{'experiment_name': 'm5-baseline', 'data': 'data/m5-dataset.parquet.gzip', 'models': [HistoricAverage, Naive, RWD, SeasonalNaive, WindowAverage, SeasWA]} 

Starting cross validation...
Cross Validation Finished In: 0.4957402507464091
Saving CV results as HistoricAverage-Naive-RandomWalkWithDrift-SeasonalNaive-WindowAverage-SeasonalWindowAverage.csv...
Starting Evaluation...
Evaluation Finished In: 18.154188819726308
Saving evaluation as HistoricAverage-Naive-RandomWalkWithDrift-SeasonalNaive-WindowAverage-SeasonalWindowAverage.csv...
Saving metrics...
        HistoricAverage      Naive         RWD  SeasonalNaive  WindowAverage  \
metric                                                                         
mae            1.150872   1.345297    1.353526       1.250565       1.031242   
mape          27.010265  40.528393   40.771969      38.057278      26.771885   
mse            6.036355   8.400986    8.446992       7.541581       4.536812   
rmse           1.435070   1.720832    1.72

In [9]:
# Select one experiment by position: index 5 is the last entry of experiments_config ("m5-AutoARIMA").
config = experiments_config[5]
config

{'experiment_name': 'm5-AutoARIMA',
 'data': 'data/m5-dataset.parquet.gzip',
 'models': [AutoARIMA]}

In [None]:
# Same steps as one iteration of the experiment loop, applied to the single `config` chosen above.
df = pd.read_parquet(config["data"])

inst = ClassicModel(df, config["models"])
inst.run()

Starting cross validation...
