In [1]:
import pandas as pd
from metaflow import Flow, Run, Step

## Collect results of experiments from Metaflow

In [2]:
def get_results(run_id: int) -> pd.DataFrame:
    """Load the evaluation results of one ``ForecastEvaluation`` run.

    The ``results_full`` artifact of the run is returned when it can be read.
    Otherwise the ``results`` artifacts of the individual ``evaluate_dataset``
    tasks are concatenated, skipping tasks whose results cannot be read yet.

    Parameters
    ----------
    run_id
        ID of the Metaflow run of the ``ForecastEvaluation`` flow.

    Returns
    -------
    pd.DataFrame
        Results of the run (complete, or partial if some tasks have no results).

    Raises
    ------
    ValueError
        If neither the run nor any of its ``evaluate_dataset`` tasks has results.
    """
    try:
        # Try loading the final results if run has finished
        return Run(f"ForecastEvaluation/{run_id}").data.results_full
    except (KeyError, AttributeError):
        # Fall through to collecting per-task results below.
        pass

    # Manually collecting results for run if some jobs are still in progress
    results_list = []
    for t in Step(f"ForecastEvaluation/{run_id}/evaluate_dataset").tasks():
        try:
            results_list.append(t.data.results)
        except (KeyError, AttributeError):
            # Same exceptions as for the run-level lookup above: a missing
            # artifact may surface as either one (presumably depending on the
            # Metaflow version and on whether `data` is available at all).
            continue
    if not results_list:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No results are available yet for run {run_id}")
    return pd.concat(results_list)

### Option 1: Provide IDs of runs for each model

In [3]:
# When True, Options 1 and 2 below also write the combined results to
# "results/results_all.csv" (the file that Option 3 reads back).
save_to_disk = False

In [4]:
# Make sure to replace these with your run IDs!
# One Metaflow run per model; the results are concatenated in the order listed here.
run_ids = {
    "SeasonalNaive": 1712079941097970,
    "StatisticalEnsemble": 1712079795572065,
    "chronos_mini": 1712080010851589,
    "chronos_large": 1712081461874960,
}
results_all = pd.concat([get_results(run_id) for run_id in run_ids.values()])
if save_to_disk:
    results_all.to_csv("results/results_all.csv", index=False)

### Option 2: Collect results from the most recent runs

In [5]:
# Take the first five runs yielded by the Metaflow client (presumably the most
# recent ones - confirm the ordering of `Flow.runs()`), then drop rows without a
# value and keep the first row seen for each (dataset, model, metric) combination.
recent_runs = list(Flow("ForecastEvaluation").runs())[:5]
results_all = (
    pd.concat([get_results(run.id) for run in recent_runs])
    .dropna(subset="value")
    .drop_duplicates(["dataset", "model", "metric"])
)
if save_to_disk:
    results_all.to_csv("results/results_all.csv", index=False)

### Option 3: Load results from disk

In [6]:
# Reads the file that Options 1 and 2 write when `save_to_disk` is True.
# Running this cell replaces any `results_all` built by the cells above.
results_all = pd.read_csv("results/results_all.csv")

## Combine the results into a table

In [7]:
# Pivot the long-format results: datasets stay on the rows, while metric and
# model become column levels. The outermost column level (the name of the
# remaining data column) is dropped after rounding to 3 decimals.
table = (
    results_all
    .set_index(["dataset", "metric", "model"])
    .unstack(level="metric")
    .unstack(level="model")
    .round(3)
    .droplevel(level=0, axis="columns")
)

In [8]:
# StatisticalEnsemble takes >24 hours to forecast on `ett_small_15min`, so the result is missing for this dataset.
# Row order used for the table; datasets listed here but absent from `table` show up as all-NaN rows.
dataset_order = [
    "australian_electricity_demand",
    "car_parts_without_missing",
    "cif_2016",
    "covid_deaths",
    "dominick",
    "ercot",
    "ett_small_15min",
    "ett_small_1h",
    "exchange_rate",
    "fred_md",
    "hospital",
    "m5",
    "nn5_daily_without_missing",
    "nn5_weekly",
    "traffic",
    "weather",
    "m1_monthly",
    "m1_quarterly",
    "m1_yearly",
    "m3_monthly",
    "m3_other",
    "m3_quarterly",
    "m3_yearly",
    "m4_quarterly",
    "m4_yearly",
    "tourism_monthly",
    "tourism_quarterly",
    "tourism_yearly",
]
table = table.reindex(index=dataset_order)
table

metric,mase,mase,mase,mase,scaled_crps,scaled_crps,scaled_crps,scaled_crps,smape,smape,smape,smape,time,time,time,time
model,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
australian_electricity_demand,1.34,1.215,1.184,0.882,0.098,0.058,0.054,0.042,0.059,0.055,0.051,0.04,0.333,12602.872,4.703,2.116
car_parts_without_missing,1.12,1.05,0.807,0.803,2.225,1.132,1.059,1.022,0.31,0.897,0.947,0.957,0.944,110.575,61.375,5.296
cif_2016,1.289,0.918,0.986,1.025,0.056,0.021,0.015,0.019,0.094,0.058,0.074,0.076,0.672,83.17,3.744,1.532
covid_deaths,7.762,5.246,6.54,6.555,0.116,0.025,0.05,0.072,0.093,0.054,0.205,0.204,0.883,86.333,32.049,3.871
dominick,0.828,0.848,0.786,0.782,2.21,0.529,0.414,0.399,0.16,0.782,0.809,0.817,11.452,981.563,8661.922,653.726
ercot,0.761,1.979,0.578,0.585,0.039,0.05,0.017,0.016,0.016,0.04,0.012,0.012,0.375,5119.783,3.666,1.979
ett_small_15min,0.768,,0.714,0.739,0.143,,0.083,0.088,0.095,,0.11,0.116,0.422,,4.933,2.05
ett_small_1h,0.932,1.003,0.737,0.805,0.153,0.123,0.083,0.085,0.103,0.131,0.091,0.1,0.183,4206.25,4.607,1.688
exchange_rate,1.524,1.429,1.882,2.118,0.016,0.007,0.011,0.01,0.005,0.004,0.006,0.007,0.075,259.485,3.613,1.554
fred_md,1.101,0.489,0.571,0.564,0.082,0.035,0.029,0.029,0.073,0.052,0.052,0.052,0.872,235.237,15.487,2.359


### Compute average performance using geometric mean
1. For each dataset and each model, we compute the **relative score** by dividing the model score by the score of the baseline (SeasonalNaive). This makes the scores comparable across datasets.
2. We aggregate the relative scores of each model across all datasets by taking the **geometric mean** (as recommended by [Fleming & Wallace](https://dl.acm.org/doi/10.1145/5666.5673)).

In [9]:
from scipy.stats import gmean


def _gmean_of_relative_scores(metric: str) -> pd.Series:
    """Geometric mean, per model, of the scores divided by the SeasonalNaive score.

    Missing relative scores are filled with 1.0 before aggregating, i.e. they
    count as equal to the baseline. The result is keyed by (metric, model).
    """
    scores = table[metric]
    relative = scores.divide(scores["SeasonalNaive"], axis=0).fillna(1.0)
    return pd.concat({metric: relative.apply(gmean)})


results = [_gmean_of_relative_scores(m) for m in ["mase", "scaled_crps", "smape"]]
pd.concat(results).round(3).to_frame().T

Unnamed: 0_level_0,mase,mase,mase,mase,scaled_crps,scaled_crps,scaled_crps,scaled_crps,smape,smape,smape,smape
model,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini,SeasonalNaive,StatisticalEnsemble,amazon/chronos-t5-large,amazon/chronos-t5-mini
0,1.0,0.836,0.81,0.845,1.0,0.505,0.472,0.485,1.0,0.987,1.034,1.085


## Create LaTeX table

In [10]:
# Build a table of formatted strings with one column group per metric. Within
# each (dataset, metric) row the smallest value is wrapped in \textbf{} and the
# second smallest in \underline{}.
metric_tables = []
for metric in ["mase", "scaled_crps", "smape", "time"]:
    tab = table[metric][["StatisticalEnsemble", "amazon/chronos-t5-large", "amazon/chronos-t5-mini", "SeasonalNaive"]]
    if metric == "time":
        tab = tab.round(1)
    tab = tab.rename(columns={"amazon/chronos-t5-large": "Chronos (large)", "amazon/chronos-t5-mini": "Chronos (mini)"})
    # Number of decimals shown: 1 for runtimes, 3 for accuracy metrics.
    d = 1 if metric == "time" else 3
    formatted_rows = []
    for dataset, row in tab.iterrows():
        # nsmallest() ignores NaN, so a row with missing results may yield fewer
        # than two labels (none at all for an all-NaN row). Highlight only what
        # exists instead of failing on tuple unpacking.
        best = row.nsmallest(2).index
        row = row.apply(lambda x: f"{x:.{d}f}")
        for label, macro in zip(best, (r"\textbf{", r"\underline{")):
            row.loc[label] = macro + row.loc[label] + "}"
        formatted_rows.append(row.to_frame().T)
    formatted_df = pd.concat(formatted_rows)
    formatted_df.columns = pd.MultiIndex.from_product([[metric], formatted_df.columns])
    metric_tables.append(formatted_df)
full_df = pd.concat(metric_tables, axis=1)
# Dataset names use hyphens instead of underscores in the LaTeX table.
full_df.index = [x.replace("_", "-") for x in full_df.index]

In [11]:
# Replace missing entries cell by cell (exact match on the string "nan") rather
# than on the rendered LaTeX text, so that a "nan" substring inside a dataset or
# model name is never rewritten.
print(full_df.replace("nan", "N/A").style.to_latex())

\begin{tabular}{lllllllllllllllll}
 & \multicolumn{4}{r}{mase} & \multicolumn{4}{r}{scaled_crps} & \multicolumn{4}{r}{smape} & \multicolumn{4}{r}{time} \\
model & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive & StatisticalEnsemble & Chronos (large) & Chronos (mini) & SeasonalNaive \\
australian-electricity-demand & 1.215 & \underline{1.184} & \textbf{0.882} & 1.340 & 0.058 & \underline{0.054} & \textbf{0.042} & 0.098 & 0.055 & \underline{0.051} & \textbf{0.040} & 0.059 & 12602.9 & 4.7 & \underline{2.1} & \textbf{0.3} \\
car-parts-without-missing & 1.050 & \underline{0.807} & \textbf{0.803} & 1.120 & 1.132 & \underline{1.059} & \textbf{1.022} & 2.225 & \underline{0.897} & 0.947 & 0.957 & \textbf{0.310} & 110.6 & 61.4 & \underline{5.3} & \textbf{0.9} \\
cif-2016 & \textbf{0.918} & \underline{0.986} & 1.025 & 1.289 & 0.0