### Imports

In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

import plotly.graph_objects as go
from plotly.colors import n_colors

### Constants

In [2]:
M5_RES = "results/evaluation/"
FOZZY_RES = "results/fozzy-evaluation/"
FAV_RES = "results/favorita-evaluation/"

### Functions

In [3]:
def read_data_to_single_df(folder: str) -> pd.DataFrame:
    result_df = None
    key_columns = ["unique_id", "cutoff", "metric"]
    
    for path in os.listdir(folder):
        print(f"Reading {path}...")

        data = pd.read_csv(os.path.join(folder, path))

        if result_df is None:
            result_df = data
        else:
            result_df = pd.merge(result_df, data, on=key_columns, how="inner")
    return result_df

In [4]:
def prepare_data(df: pd.DataFrame, metric: str) -> pd.DataFrame:
    # remove unused columns
    df = df[np.setdiff1d(df.columns , ['unique_id', 'cutoff'])]
    
    # select metric for analysis
    data = df.loc[(df["metric"] == metric)]
    data = data.drop(columns=["metric"])
    data = data.melt(var_name='model', value_name='value')
    return data

In [5]:
def build_ridgeline_plot(df: pd.DataFrame, metric: str):
    # prepare data
    data = prepare_data(df, metric)
    
    # filter outliers
    data = data.loc[data["value"] < data["value"].quantile(0.95)]
    
    # sort models
    # Calculate the standard deviation for each group
    group_std = data.groupby('model')['value'].mean()

    # Sort the groups based on their standard deviation values
    sorted_groups = group_std.sort_values(ascending=False).index.tolist()
    
    # plot
    colors = n_colors( 
        'rgb(200, 10, 10)', 
        'rgb(5, 200, 200)',
        data["model"].nunique(), 
        colortype='rgb'
    )

    fig = go.Figure()
    for model, color in zip(sorted_groups, colors):
        x_data = data["value"].loc[data["model"] == model].to_numpy()
        fig.add_trace(go.Violin(x=x_data, line_color=color, name=model))

    fig.update_traces(orientation='h', side='positive', width=3, points=False)
    fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
    fig.update_layout(
        width=1000, 
        height=800,
        title=f"Violin Plot of {metric.upper()}"
    )
    fig.show()

### Read data

In [10]:
df = read_data_to_single_df(folder=FAV_RES)
df.head()

Reading SimpleExponentialSmoothing.csv...
Reading SeasonalWindowAverage.csv...
Reading WindowAverage.csv...
Reading RandomWalkWithDrift.csv...
Reading ARCH.csv...
Reading HistoricAverage.csv...
Reading ARIMA.csv...
Reading AutoARIMA.csv...
Reading SeasonalNaive.csv...
Reading AutoRegressive.csv...
Reading Theta.csv...
Reading Holt.csv...
Reading IMAPA.csv...
Reading CrostonClassic.csv...
Reading TSB.csv...
Reading Naive.csv...
Reading MSTL.csv...
Reading ADIDA.csv...
Reading CrostonSBA.csv...


Unnamed: 0,unique_id,cutoff,metric,SES,SeasWA,WindowAverage,RWD,ARCH(1),HistoricAverage,ARIMA,...,AutoRegressive,Theta,Holt,IMAPA,CrostonClassic,TSB,Naive,MSTL,ADIDA,CrostonSBA
0,1-1000866,2017-07-24,mae,0.571553,0.30102,0.244898,1.922645,1.284681,0.298137,0.298137,...,0.364569,0.32145,0.36614,0.317234,0.318125,0.431174,1.857143,0.297433,0.317234,0.273647
1,1-1000866,2017-07-24,mape,54.29942,25.000004,18.367348,184.80972,111.03274,24.223598,24.223598,...,31.230125,26.779688,31.689024,26.324282,26.422316,38.85767,178.57143,25.130203,26.324282,21.529772
2,1-1000866,2017-07-24,mse,0.331588,0.147595,0.122449,3.826491,2.335644,0.128004,0.128004,...,0.166036,0.134106,0.151491,0.132705,0.132959,0.190458,3.571429,0.181009,0.132705,0.124069
3,1-1000866,2017-07-24,rmse,0.575837,0.384181,0.349927,1.956142,1.528282,0.357777,0.357777,...,0.407475,0.366204,0.389218,0.364286,0.364635,0.436415,1.889822,0.425452,0.364286,0.352234
4,1-1000866,2017-07-24,smape,42.742355,23.700941,19.220785,93.07597,146.37563,23.756523,23.756523,...,28.634504,25.642168,29.100391,25.305906,25.377256,33.79398,91.42857,23.220362,25.305906,21.710829


### Analytics

In [11]:
df.head()

Unnamed: 0,unique_id,cutoff,metric,SES,SeasWA,WindowAverage,RWD,ARCH(1),HistoricAverage,ARIMA,...,AutoRegressive,Theta,Holt,IMAPA,CrostonClassic,TSB,Naive,MSTL,ADIDA,CrostonSBA
0,1-1000866,2017-07-24,mae,0.571553,0.30102,0.244898,1.922645,1.284681,0.298137,0.298137,...,0.364569,0.32145,0.36614,0.317234,0.318125,0.431174,1.857143,0.297433,0.317234,0.273647
1,1-1000866,2017-07-24,mape,54.29942,25.000004,18.367348,184.80972,111.03274,24.223598,24.223598,...,31.230125,26.779688,31.689024,26.324282,26.422316,38.85767,178.57143,25.130203,26.324282,21.529772
2,1-1000866,2017-07-24,mse,0.331588,0.147595,0.122449,3.826491,2.335644,0.128004,0.128004,...,0.166036,0.134106,0.151491,0.132705,0.132959,0.190458,3.571429,0.181009,0.132705,0.124069
3,1-1000866,2017-07-24,rmse,0.575837,0.384181,0.349927,1.956142,1.528282,0.357777,0.357777,...,0.407475,0.366204,0.389218,0.364286,0.364635,0.436415,1.889822,0.425452,0.364286,0.352234
4,1-1000866,2017-07-24,smape,42.742355,23.700941,19.220785,93.07597,146.37563,23.756523,23.756523,...,28.634504,25.642168,29.100391,25.305906,25.377256,33.79398,91.42857,23.220362,25.305906,21.710829


In [16]:
res = prepare_data(df, "smape")
res = res["value"].describe().round(2)
print(pd.DataFrame(res).reset_index().to_latex(index=False, escape=False))

\begin{tabular}{lr}
\toprule
index &      value \\
\midrule
count & 2910458.00 \\
 mean &      59.62 \\
  std &      30.23 \\
  min &       0.00 \\
  25% &      42.29 \\
  50% &      52.85 \\
  75% &      66.79 \\
  max &     200.00 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(res).reset_index().to_latex(index=False, escape=False))


### Visualizations

In [75]:
# build_ridgeline_plot(df, "mape")

### Experimenting

In [14]:
M5_METRICS = "results/metrics/"
FOZZY_METRICS = "results/fozzy-metrics/"
FAV_METRICS = "results/favorita-metrics/"

In [25]:
path = M5_METRICS
metric_names = ["mae", "mape", "mse", "rmse", "smape"]
data = pd.DataFrame()

for file in os.listdir(path):
    
    df = pd.read_csv(os.path.join(path, file))
    df["metrics"] = metric_names
    
    if data.empty:
        data = df
    else:
        data = data.merge(df, how='inner', on='metrics')

data = pd.melt(data, id_vars=['metrics'], var_name='model_name', value_name='metrics_value')
data

Unnamed: 0,metrics,model_name,metrics_value
0,mae,SES,1.087095
1,mape,SES,29.250298
2,mse,SES,4.988080
3,rmse,SES,1.374435
4,smape,SES,142.377790
...,...,...,...
90,mae,SeasWA,1.072436
91,mape,SeasWA,27.349192
92,mse,SeasWA,5.015110
93,rmse,SeasWA,1.376366


# Summary Statistics

In [8]:
M5_RES = "results/evaluation/"
FOZZY_RES = "results/fozzy-evaluation/"
FAV_RES = "results/favorita-evaluation/"

In [47]:
df = read_data_to_single_df(folder=M5_RES)
df.head()

Reading SimpleExponentialSmoothing-Holt.csv...
Reading ADIDA-CrostonClassic-CrostonSBA-IMAPA-TSB.csv...
Reading MSTL-Theta-ARCH.csv...
Reading AutoARIMA.csv...
Reading ARIMA-AutoRegressive.csv...
Reading HistoricAverage-Naive-RandomWalkWithDrift-SeasonalNaive-WindowAverage-SeasonalWindowAverage.csv...


Unnamed: 0,unique_id,cutoff,metric,SES,Holt,ADIDA,CrostonClassic,CrostonSBA,IMAPA,TSB,...,ARCH(1),AutoARIMA,ARIMA,AutoRegressive,HistoricAverage,Naive,RWD,SeasonalNaive,WindowAverage,SeasWA
0,FOODS_1_001_CA_1,2016-04-10,mse,1.193014,1.145749,1.152618,1.152664,1.1448,1.169181,1.203514,...,6.001423,1.153829,1.189459,1.428748,1.189471,2.142857,2.134372,2.214286,1.270408,1.392128
1,FOODS_1_001_CA_1,2016-04-10,mae,0.842261,0.744192,0.770742,0.770873,0.739472,0.806998,0.85502,...,1.942738,0.774141,0.775964,0.999062,0.775972,1.285714,1.282892,0.928571,0.918367,0.887755
2,FOODS_1_001_CA_1,2016-04-10,rmse,1.092252,1.070396,1.0736,1.073622,1.069953,1.081287,1.097048,...,2.44978,1.074164,1.090623,1.195302,1.090629,1.46385,1.460949,1.488048,1.127124,1.179885
3,FOODS_1_001_CA_1,2016-04-10,mape,20.09854,14.279613,15.852062,15.859879,13.995455,18.004808,20.856115,...,104.52639,16.053923,23.679249,27.65133,23.680695,46.42857,46.25734,35.714287,24.617348,31.632656
4,FOODS_1_001_CA_1,2016-04-10,smape,94.529144,90.83461,91.90121,91.9064,90.62074,93.282524,94.958435,...,177.68495,92.035286,103.88027,99.00221,103.882324,104.76191,104.70302,92.85714,96.93432,116.06454


In [48]:
df["metric"].unique()

array(['mse', 'mae', 'rmse', 'mape', 'smape'], dtype=object)

In [49]:
# m5: (457350, 22)
# fozzy: (460140, 21)
# favorita: (765910, 22)

df.shape

(457350, 22)

In [50]:
r_mape = df.loc[df["metric"] == "mape"].describe().T
r_mape = r_mape.round(2)
r_mape.sort_values(by='mean', ascending=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AutoRegressive,91470.0,25.98,23.27,0.0,12.24,21.25,33.1,622.8
CrostonSBA,91470.0,26.18,22.46,0.0,12.58,21.8,33.33,580.25
CrostonClassic,91470.0,26.48,23.87,0.0,12.32,21.32,33.31,612.29
ADIDA,91470.0,26.77,25.24,0.0,12.44,21.39,33.18,826.81
WindowAverage,91470.0,26.77,24.35,0.0,12.24,21.43,34.25,771.94
IMAPA,91470.0,26.85,25.46,0.0,12.37,21.36,33.37,826.81
Holt,91470.0,26.87,25.65,0.0,12.44,21.37,33.29,829.04
HistoricAverage,91470.0,27.01,25.68,0.0,12.23,22.11,34.7,654.45
Theta,91470.0,27.01,25.97,0.0,12.23,21.37,33.8,842.8
ARIMA,91470.0,27.01,25.68,0.0,12.23,22.11,34.7,654.47


In [62]:
r_smape = df.loc[df["metric"] == "smape"].describe().T
r_smape = r_smape.round(2)
r_smape.sort_values(by='mean', ascending=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Naive,92028.0,17.02,40.1,0.0,0.0,0.0,14.29,200.0
SeasonalNaive,92028.0,17.84,31.2,0.0,0.0,0.0,28.57,192.86
WindowAverage,92028.0,55.11,83.34,0.0,0.0,0.0,158.61,200.0
SeasWA,92028.0,58.19,68.67,0.0,0.0,14.29,114.29,200.0
SES,92028.0,179.75,54.99,0.0,199.29,200.0,200.0,200.0
IMAPA,92028.0,184.5,46.61,0.0,197.95,200.0,200.0,200.0
ADIDA,92028.0,184.5,46.59,0.0,197.83,200.0,200.0,200.0
Holt,92028.0,188.98,37.28,0.0,198.98,200.0,200.0,200.0
CrostonClassic,92028.0,194.1,19.06,0.0,197.75,200.0,200.0,200.0
CrostonSBA,92028.0,194.23,18.85,0.0,197.85,200.0,200.0,200.0


In [46]:
# .to_latex(index=False)

# ANOVA

In [48]:
import pandas as pd
from scipy.stats import f_oneway

In [49]:
M5_RES = "results/evaluation/"
FOZZY_RES = "results/fozzy-evaluation/"
FAV_RES = "results/favorita-evaluation/"

In [50]:
df = read_data_to_single_df(folder=M5_RES)
df.head()

Reading SimpleExponentialSmoothing-Holt.csv...
Reading ADIDA-CrostonClassic-CrostonSBA-IMAPA-TSB.csv...
Reading MSTL-Theta-ARCH.csv...
Reading AutoARIMA.csv...
Reading ARIMA-AutoRegressive.csv...
Reading HistoricAverage-Naive-RandomWalkWithDrift-SeasonalNaive-WindowAverage-SeasonalWindowAverage.csv...


Unnamed: 0,unique_id,cutoff,metric,SES,Holt,ADIDA,CrostonClassic,CrostonSBA,IMAPA,TSB,...,ARCH(1),AutoARIMA,ARIMA,AutoRegressive,HistoricAverage,Naive,RWD,SeasonalNaive,WindowAverage,SeasWA
0,FOODS_1_001_CA_1,2016-04-10,mse,1.193014,1.145749,1.152618,1.152664,1.1448,1.169181,1.203514,...,6.001423,1.153829,1.189459,1.428748,1.189471,2.142857,2.134372,2.214286,1.270408,1.392128
1,FOODS_1_001_CA_1,2016-04-10,mae,0.842261,0.744192,0.770742,0.770873,0.739472,0.806998,0.85502,...,1.942738,0.774141,0.775964,0.999062,0.775972,1.285714,1.282892,0.928571,0.918367,0.887755
2,FOODS_1_001_CA_1,2016-04-10,rmse,1.092252,1.070396,1.0736,1.073622,1.069953,1.081287,1.097048,...,2.44978,1.074164,1.090623,1.195302,1.090629,1.46385,1.460949,1.488048,1.127124,1.179885
3,FOODS_1_001_CA_1,2016-04-10,mape,20.09854,14.279613,15.852062,15.859879,13.995455,18.004808,20.856115,...,104.52639,16.053923,23.679249,27.65133,23.680695,46.42857,46.25734,35.714287,24.617348,31.632656
4,FOODS_1_001_CA_1,2016-04-10,smape,94.529144,90.83461,91.90121,91.9064,90.62074,93.282524,94.958435,...,177.68495,92.035286,103.88027,99.00221,103.882324,104.76191,104.70302,92.85714,96.93432,116.06454


In [51]:
df = df.drop(columns="cutoff")

In [52]:
df.dtypes

unique_id           object
metric              object
SES                float64
Holt               float64
ADIDA              float64
CrostonClassic     float64
CrostonSBA         float64
IMAPA              float64
TSB                float64
MSTL               float64
Theta              float64
ARCH(1)            float64
AutoARIMA          float64
ARIMA              float64
AutoRegressive     float64
HistoricAverage    float64
Naive              float64
RWD                float64
SeasonalNaive      float64
WindowAverage      float64
SeasWA             float64
dtype: object

In [53]:
models = [
    'SES', 'Holt', 'ADIDA', 'CrostonClassic',
       'CrostonSBA', 'IMAPA', 'TSB', 'MSTL', 'Theta', 'ARCH(1)', 'AutoARIMA',
       'ARIMA', 'AutoRegressive', 'HistoricAverage', 'Naive', 'RWD',
       'SeasonalNaive', 'WindowAverage', 'SeasWA'
]

print("Shape before", df.shape)
for col in models:
    df = df.loc[df[col] != 0]
    
df.shape

Shape before (457350, 21)


(420762, 21)

In [54]:
df.describe()

Unnamed: 0,SES,Holt,ADIDA,CrostonClassic,CrostonSBA,IMAPA,TSB,MSTL,Theta,ARCH(1),AutoARIMA,ARIMA,AutoRegressive,HistoricAverage,Naive,RWD,SeasonalNaive,WindowAverage,SeasWA
count,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420762.0,420204.0
mean,35.801365,34.755012,34.699906,34.619771,34.7034,34.714691,34.979839,37.999677,34.813433,75.26298,34.44595,35.399269,34.400714,35.399342,29.491424,42.404477,28.035761,33.206887,33.94741
std,60.958851,59.974898,59.638798,59.290922,59.477084,59.649196,59.741154,62.968459,60.024265,5378.113,59.03988,60.948835,58.722854,60.948923,53.752495,72.374276,45.059526,57.272029,57.09666
min,0.004864,9e-06,0.00038,0.000243,0.000219,0.000721,0.001843,0.00594,0.009746,0.011393,3.091789e-19,0.000643,0.000781,0.000643,0.071429,0.071522,0.071429,0.005102,8.51495e-07
25%,0.745171,0.692152,0.691031,0.69443,0.688433,0.695253,0.716285,0.82715,0.70395,1.60181,0.6967192,0.706762,0.706273,0.70676,0.886405,0.886405,1.0,0.707107,0.7101923
50%,2.275407,2.107412,2.10234,2.113351,2.096468,2.108404,2.156013,2.516106,2.119705,5.999426,2.124794,2.234501,2.097758,2.23456,3.214286,3.223191,3.142857,2.116505,2.199708
75%,39.776062,35.911821,35.714287,35.925228,35.740129,35.904807,37.329921,45.659102,36.316314,116.6179,36.43812,37.868915,35.51968,37.870436,42.857143,54.233922,44.642857,34.82143,38.42651
max,3167.0725,2765.889,2136.19,2562.978,2599.7092,2019.6334,1847.338,4804.0503,2730.0828,3325161.0,2365.549,2659.8025,1775.5791,2659.8025,4564.5713,4601.3853,3733.2856,2047.6787,3229.355


In [55]:
# df = df[["unique_id", "metric", "ARIMA", "AutoARIMA"]]

In [56]:
# List of metrics (replace with your actual metric names)
metric_names = ['mae', 'mape', 'mse', 'rmse', 'smape']

# Loop through each metric
for metric_name in metric_names:
    metric_data = []

    # Loop through each model column (excluding 'unique_id' and 'metric')
    for column in df.columns:
        if column not in ['unique_id', 'metric']:
            model_metric_data = df[df['metric'] == metric_name][column]
            metric_data.append(model_metric_data)

    # Perform ANOVA
    f_statistic, p_value = f_oneway(*metric_data)

    print(f"Metric: {metric_name}")
    print("ANOVA results:")
    print("F-statistic:", f_statistic)
    print("P-value:", p_value)

    if p_value < 0.05:
        print("Reject null hypothesis: There is a significant difference among at least one group.\n")
    else:
        print("Fail to reject null hypothesis: No significant difference.\n")


Metric: mae
ANOVA results:
F-statistic: nan
P-value: nan
Fail to reject null hypothesis: No significant difference.

Metric: mape
ANOVA results:
F-statistic: 4478.199551920638
P-value: 0.0
Reject null hypothesis: There is a significant difference among at least one group.

Metric: mse
ANOVA results:
F-statistic: nan
P-value: nan
Fail to reject null hypothesis: No significant difference.

Metric: rmse
ANOVA results:
F-statistic: nan
P-value: nan
Fail to reject null hypothesis: No significant difference.

Metric: smape
ANOVA results:
F-statistic: 13952.621034078395
P-value: 0.0
Reject null hypothesis: There is a significant difference among at least one group.

