### Imports

In [1]:
import os
import traceback
import warnings
from time import time

In [2]:
from tqdm.autonotebook import tqdm

from statsforecast.models import (
    HistoricAverage,
    Naive,
    RandomWalkWithDrift,
    SeasonalNaive,
    WindowAverage,
    SeasonalWindowAverage,
    SimpleExponentialSmoothing,
    Holt,
    ADIDA,
    CrostonClassic,
    CrostonSBA,
    IMAPA,
    TSB,
    MSTL,
    Theta,
    ARCH,
    ARIMA,
    AutoRegressive,
    AutoARIMA,
)

  from tqdm.autonotebook import tqdm


In [3]:
import pandas as pd
from statsforecast import StatsForecast
from fugue import transform
from datasetsforecast.losses import mse, mae, rmse, mape, smape

In [4]:
# Install a blanket "ignore" filter: no category/module is given, so it applies to
# every warning. NOTE(review): this can also hide numerical warnings raised while
# models are fitted -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Constants

In [5]:
# Folder holding the input datasets; joined with the file names below.
DATA_FOLDER = "data/"
# Dataset file names. Only FAV_FILE is referenced in the visible part of this notebook.
M5_FILE = "m5-dataset.parquet.gzip"
FOZZY_FILE = "fozzy-dataset.parquet.gzip"
FAV_FILE = "favorita-grocery.parquet.gzip"

# Forecast horizon: passed as `h` and `step_size` to cross-validation, and reused as
# season length / window size / lag setting by several model configurations.
HORIZON = 14
# Number of cross-validation windows (passed as `n_windows`).
N_WINDOWS = 1

In [7]:
# Load the Favorita dataset, then summarise the target restricted to non-negative values.
df = pd.read_parquet(os.path.join(DATA_FOLDER, FAV_FILE))
df.loc[df["y"] >= 0, "y"].describe().round(2)

count    1.245182e+08
mean     8.580000e+00
std      2.192000e+01
min      0.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      9.000000e+00
max      4.414200e+04
Name: y, dtype: float64

In [11]:
# Largest non-negative target value in the loaded dataset.
df.loc[df["y"] >= 0, "y"].max()

44142.0

### Code base for Experiments

In [6]:
def convert_classes_to_string(class_list):
    """Return the class names of the given objects joined with '-'.

    Args:
        class_list: iterable of objects (here: model instances).

    Returns:
        A string such as ``"Naive-Theta"``; an empty string for an empty input.
    """
    names = []
    for instance in class_list:
        names.append(instance.__class__.__name__)
    return '-'.join(names)


def check_folders(folders=None):
    """Make sure the output folders exist, creating missing ones (with parents).

    Args:
        folders: optional iterable of directory paths. When omitted, the
            module-level FULL_RESULTS_PATH, EVALUATION_RESULTS_PATH and
            METRICS_RESULTS_PATH are used, which is what the original
            zero-argument call did.

    Raises:
        OSError: if a directory cannot be created (e.g. the path exists but
            is a regular file).
    """
    if folders is None:
        folders = [
            FULL_RESULTS_PATH,
            EVALUATION_RESULTS_PATH,
            METRICS_RESULTS_PATH,
        ]
    for path in folders:
        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(path, exist_ok=True)

In [7]:
class ClassicModel:
    """Cross-validate a list of StatsForecast models on one dataset and score them.

    ``run`` is the entry point. It cross-validates ``models`` on ``df``, scores
    every (series, cutoff) window with each metric and writes three CSV files,
    named after the model classes, into ``FULL_RESULTS_PATH``,
    ``EVALUATION_RESULTS_PATH`` and ``METRICS_RESULTS_PATH``.
    """

    # Columns of a cross-validation frame that never hold a model forecast.
    RESERVED_COLUMNS = ('unique_id', 'ds', 'cutoff', 'y')
    # Markers of prediction-interval columns. StatsForecast is expected to name
    # them "<model>-lo-<level>" / "<model>-hi-<level>" -- see its documentation.
    INTERVAL_MARKERS = ('-lo-', '-hi-')

    def __init__(self, df: pd.DataFrame, models: list) -> None:
        """Store the input frame, the models and the metric functions to apply.

        Args:
            df: frame handed unchanged to ``StatsForecast.cross_validation``.
            models: list of instantiated StatsForecast models.
        """
        self.df = df
        self.models = models
        self.metrics = [mse, mae, rmse, mape, smape]

    def modeling(self) -> pd.DataFrame:
        """Run cross-validation with ``freq='D'``, all cores and ``level=[90]``.

        Returns:
            The frame returned by ``StatsForecast.cross_validation`` for
            ``N_WINDOWS`` windows of ``HORIZON`` steps each.
        """
        sf = StatsForecast(
            models=self.models,
            freq='D',
            n_jobs=-1,
        )
        result = sf.cross_validation(
            df=self.df,
            h=HORIZON,
            n_windows=N_WINDOWS,
            step_size=HORIZON,
            level=[90]
        )
        return result

    @staticmethod
    def _with_unique_id_column(frame: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``frame`` with ``unique_id`` as a column and a fresh index.

        If ``unique_id`` is already a column it is kept as is; otherwise the
        index is taken to hold the series ids. The input is never modified.
        """
        if 'unique_id' in frame.columns:
            return frame.reset_index(drop=True)
        return frame.rename_axis('unique_id').reset_index()

    @staticmethod
    def _is_interval_column(name) -> bool:
        """Tell whether ``name`` looks like a prediction-interval column."""
        return any(marker in str(name) for marker in ClassicModel.INTERVAL_MARKERS)

    @staticmethod
    def _model_columns(columns) -> list:
        """Return the columns that hold point forecasts, in their original order.

        Reserved columns are matched by exact name (not by substring), so a
        model whose name merely contains ``y``, ``ds``, ``lo`` or ``hi`` is
        still recognised as a model.
        """
        return [
            name for name in columns
            if name not in ClassicModel.RESERVED_COLUMNS
            and not ClassicModel._is_interval_column(name)
        ]

    def evaluate_cross_validation(self, cv_results: pd.DataFrame) -> pd.DataFrame:
        """Score every (unique_id, cutoff) partition of a cross-validation frame.

        Args:
            cv_results: cross-validation output, with ``unique_id`` either in
                the index or as a column. It is not modified.

        Returns:
            One row per (unique_id, cutoff, metric) with one float column per model.
        """
        cv_frame = self._with_unique_id_column(cv_results)
        # Prediction-interval bounds are not scored; drop them before partitioning.
        kept = [name for name in cv_frame.columns if not self._is_interval_column(name)]
        point_forecasts = cv_frame.loc[:, kept]

        str_models = ','.join(
            f"{model}:float" for model in self._model_columns(point_forecasts.columns)
        )

        evaluation_df = transform(
            point_forecasts,
            self.evaluate,
            params={'metrics': self.metrics},
            schema=f"unique_id:str,cutoff:str,metric:str, {str_models}",
            as_local=True,
            partition={'by': ['unique_id', 'cutoff']}
        )
        return evaluation_df

    def evaludate_cross_validation(self, cv_results: pd.DataFrame) -> pd.DataFrame:
        """Alias of ``evaluate_cross_validation`` kept under the original (misspelled) name."""
        return self.evaluate_cross_validation(cv_results)

    def run(self):
        """Cross-validate, evaluate and persist the results of this experiment.

        Writes ``<ModelA-ModelB-...>.csv`` into each of the three result
        folders and prints progress. The printed durations are in minutes
        (elapsed seconds divided by 60).
        """
        file_name = convert_classes_to_string(self.models) + ".csv"
        check_folders()

        print("Starting cross validation...")
        init = time()
        result = self.modeling()
        end = time()
        print(f"Cross Validation Finished In: {(end - init) / 60}")

        # The series ids may live in the index; move them into a column so that
        # writing with index=False does not drop them from the saved file.
        result = self._with_unique_id_column(result)

        print(f"Saving CV results as {file_name}...")
        results_path = os.path.join(FULL_RESULTS_PATH, file_name)
        result.to_csv(results_path, index=False)

        print("Starting Evaluation...")
        init = time()
        evaluation = self.evaluate_cross_validation(result)
        end = time()
        print(f"Evaluation Finished In: {(end - init) / 60}")

        print(f"Saving evaluation as {file_name}...")
        evaluation.to_csv(os.path.join(EVALUATION_RESULTS_PATH, file_name), index=False)

        print("Saving metrics...")
        metrics = evaluation.groupby(['metric']).mean(numeric_only=True)
        # The metric names are the groupby index; keep the index or the rows are unlabeled.
        metrics.to_csv(os.path.join(METRICS_RESULTS_PATH, file_name), index=True)
        print(metrics)

        print("Done!")

    @staticmethod
    def evaluate(df: pd.DataFrame, metrics: list) -> pd.DataFrame:
        """Apply every metric to every model column of one partition.

        Args:
            df: rows of a single (unique_id, cutoff) partition; must contain
                ``unique_id``, ``cutoff``, ``y`` and one column per model.
            metrics: callables invoked as ``metric(y_true, y_pred)``; their
                ``__name__`` labels the output rows.

        Returns:
            Frame with columns ``unique_id``, ``cutoff``, ``metric`` followed
            by one column per model; one row per metric.
        """
        eval_ = {}
        for model in ClassicModel._model_columns(df.columns):
            eval_[model] = {}
            for metric in metrics:
                eval_[model][metric.__name__] = metric(df['y'], df[model])
        eval_df = pd.DataFrame(eval_).rename_axis('metric').reset_index()
        # All rows of the partition share these keys, so the first row is representative.
        eval_df.insert(0, 'cutoff', df['cutoff'].iloc[0])
        eval_df.insert(0, 'unique_id', df['unique_id'].iloc[0])
        return eval_df

# Run Experiments - Favorita

In [8]:
# Output folders for the Favorita experiments (read by check_folders and ClassicModel.run).
FULL_RESULTS_PATH = "results/favorita-full_cv/"
EVALUATION_RESULTS_PATH = "results/favorita-evaluation/"
METRICS_RESULTS_PATH = "results/favorita-metrics/"


# (experiment name, model) pairs, one experiment per model, in execution order.
_FAVORITA_EXPERIMENTS = [
    ("favorita-HistoricAverage", HistoricAverage()),

    # Baselines
    ("favorita-baseline", Naive()),
    ("favorita-baseline", RandomWalkWithDrift()),
    ("favorita-baseline", SeasonalNaive(season_length=HORIZON)),
    ("favorita-baseline", WindowAverage(window_size=HORIZON)),
    ("favorita-baseline", SeasonalWindowAverage(season_length=HORIZON, window_size=HORIZON)),

    # Exponential smoothing
    ("favorita-exponential-smoothing", SimpleExponentialSmoothing(alpha=0.3)),
    ("favorita-exponential-smoothing", Holt()),

    # Sparse / intermittent demand
    ("favorita-sparse-intermittent", ADIDA()),
    ("favorita-sparse-intermittent", CrostonClassic()),
    ("favorita-sparse-intermittent", CrostonSBA()),
    ("favorita-sparse-intermittent", IMAPA()),
    ("favorita-sparse-intermittent", TSB(alpha_d=0.2, alpha_p=0.2)),

    # Other
    ("favorita-other", MSTL(season_length=HORIZON)),
    ("favorita-other", Theta()),
    ("favorita-other", ARCH()),

    # ARIMA family
    ("favorita-arima", ARIMA()),
    ("favorita-ar", AutoRegressive(lags=HORIZON)),
    ("favorita-AutoARIMA", AutoARIMA()),
]

# Expand every pair into the dict layout consumed by the experiment loop.
experiments_config = [
    {
        "experiment_name": experiment_name,
        "data": os.path.join(DATA_FOLDER, FAV_FILE),
        "models": [model],
    }
    for experiment_name, model in _FAVORITA_EXPERIMENTS
]

In [10]:
# Preview the experiments from position 17 onwards (the ones the next cell runs).
pending_experiments = experiments_config[17:]
for config in pending_experiments:
    print(config)

{'experiment_name': 'favorita-ar', 'data': 'data/favorita-grocery.parquet.gzip', 'models': [AutoRegressive]}
{'experiment_name': 'favorita-AutoARIMA', 'data': 'data/favorita-grocery.parquet.gzip', 'models': [AutoARIMA]}


In [11]:
# Run the experiments from position 17 onwards; earlier entries are skipped by the slice.
# Only the most recently loaded dataset is kept, so consecutive experiments that share
# a data file do not re-read it from disk.
# NOTE(review): assumes ClassicModel.run does not modify `df` in place -- confirm.
loaded_path = None
for config in experiments_config[17:]:
    print(config, "\n")

    if config["data"] != loaded_path:
        df = pd.read_parquet(config["data"])
        loaded_path = config["data"]

    try:
        inst = ClassicModel(df, config["models"])
        inst.run()
    except Exception:
        # Best effort: report the failure with its full traceback and move on
        # to the next experiment instead of aborting the whole batch.
        print(f"Error with {config['experiment_name']}")
        traceback.print_exc()

{'experiment_name': 'favorita-ar', 'data': 'data/favorita-grocery.parquet.gzip', 'models': [AutoRegressive]} 

Starting cross validation...


  x -= np.dot(xreg, par[narma + np.arange(ncxreg)])
  x -= np.dot(xreg, par[narma + np.arange(ncxreg)])


Cross Validation Finished In: 420.8960395296415
Saving CV results as AutoRegressive.csv...
Starting Evaluation...
Evaluation Finished In: 9.75882534980774
Saving evaluation as AutoRegressive.csv...
Saving metrics...
        AutoRegressive
metric                
mae       1.103118e+02
mape      1.751388e+03
mse       4.286200e+09
rmse      1.724310e+02
smape     5.388262e+01
Done!
{'experiment_name': 'favorita-AutoARIMA', 'data': 'data/favorita-grocery.parquet.gzip', 'models': [AutoARIMA]} 

Starting cross validation...


  fit["sigma2"] = np.nansum(fit["residuals"] ** 2) / (nstar - npar + 1)


Cross Validation Finished In: 198.28100754817328
Saving CV results as AutoARIMA.csv...
Starting Evaluation...
Evaluation Finished In: 9.761488183339436
Saving evaluation as AutoARIMA.csv...
Saving metrics...
         AutoARIMA
metric            
mae       3.713717
mape     88.944397
mse     376.815491
rmse      4.805626
smape    55.093552
Done!
