# VN1 Competition

## Introduction

The VN1 Forecasting Accuracy Challenge tasked participants with forecasting future sales using historical sales and pricing data. The goal was to develop robust predictive models capable of anticipating sales trends for various products across different clients and warehouses. Submissions were evaluated based on their accuracy and bias against actual sales figures.

The competition was structured into two phases:

- **Phase 1** (September 12 - October 3, 2024): Participants used the provided Phase 0 sales data to predict sales for Phase 1. This phase lasted three weeks and featured live leaderboard updates to track participant progress.
- **Phase 2** (October 3 - October 17, 2024): Participants utilized both Phase 0 and Phase 1 data to predict sales for Phase 2. Unlike Phase 1, there were no leaderboard updates during this phase until the competition concluded.

In the following notebook, we'll showcase how to create forecasts with the ETS, ARIMA, CES and Theta models from `statsforecast`, as well as how to use an ensemble of these models and a simple hierarchical reconciliation.

In [None]:
import pandas as pd 
import numpy as np
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, AutoETS, AutoCES, AutoTheta, Naive
from utilsforecast.preprocessing import fill_gaps

## 1. Load data

In [None]:
def read_and_prepare_data(file_path: str, value_name: str = "y") -> pd.DataFrame:
    """Load a wide-format CSV and reshape it to long format.

    The CSV must contain the key columns `Client`, `Warehouse` and `Product`;
    every remaining column header is parsed as a timestamp.

    Args:
        file_path: Location of the CSV file to read.
        value_name: Name given to the melted value column.

    Returns:
        A frame with columns `unique_id` (the three keys joined by "-"),
        `ds` (parsed timestamps) and `value_name`, ordered by `unique_id`
        and then `ds`.
    """
    key_columns = ["Client", "Warehouse", "Product"]
    wide = pd.read_csv(file_path)
    # One identifier per series: "<Client>-<Warehouse>-<Product>".
    wide["unique_id"] = wide[key_columns].astype(str).agg("-".join, axis=1)
    long = wide.drop(columns=key_columns).melt(
        id_vars=["unique_id"],
        var_name="ds",
        value_name=value_name,
    )
    long["ds"] = pd.to_datetime(long["ds"])
    return long.sort_values(by=["unique_id", "ds"])

In [None]:
# Training history: Phase 0 followed by Phase 1 sales, ordered per series and date.
df0 = read_and_prepare_data("../data/phase_0_sales.csv")
df1 = read_and_prepare_data("../data/phase_1_sales.csv")
df = pd.concat([df0, df1], ignore_index=True).sort_values(by=["unique_id", "ds"])

# Phase 2 sales, loaded separately as the test set.
test_df = read_and_prepare_data("../data/phase_2_sales.csv")

In [None]:
# Overlay the Phase 2 values (relabelled "actual") on the training history.
phase_2_actuals = test_df.rename(columns={"y": "actual"})
StatsForecast.plot(df, phase_2_actuals, engine='matplotlib')

## 2. Load Top 5 Solutions

In [None]:
def get_competition_forecasts() -> pd.DataFrame:
    """Load the top-5 competition solutions into a single long-format frame.

    Each solution file is read with `read_and_prepare_data`, using the place
    label ("1st" ... "5th") as the name of its value column.

    Returns:
        A frame with columns `unique_id`, `ds` and one column per place.
        The first place defines the rows; later places are left-joined on
        (`unique_id`, `ds`).
    """
    join_keys = ["unique_id", "ds"]
    combined = None
    for rank in ["1st", "2nd", "3rd", "4th", "5th"]:
        current = read_and_prepare_data(f"../data/solution_{rank}_place.csv", rank)
        if combined is None:
            # The first solution seeds the result.
            combined = current
            continue
        combined = combined.merge(current, on=join_keys, how="left")
    return combined

In [None]:
# Long-format frame with one forecast column per leaderboard place ("1st" ... "5th").
solutions = get_competition_forecasts()

In [None]:
def vn1_competition_evaluation(forecasts: pd.DataFrame) -> pd.DataFrame:
    """Score every forecast column against the Phase 2 actuals.

    For each model column the score is
    (sum of absolute errors + |sum of errors|) / sum of actuals,
    rounded to 4 decimals. NaN errors are ignored by the sums.

    Args:
        forecasts: Long-format frame with `unique_id`, `ds` and one column
            per model.

    Returns:
        A frame with columns `model` and `score`, sorted by ascending score.

    Raises:
        AssertionError: If the set of `unique_id` values in `forecasts`
            differs from the one in the actuals.
    """
    key_cols = ["unique_id", "ds"]
    truth = read_and_prepare_data("../data/phase_2_sales.csv")
    # Left join: the actuals define the rows that are scored.
    joined = truth[key_cols + ["y"]].merge(forecasts, on=key_cols, how="left")

    forecast_ids = set(forecasts["unique_id"].unique())
    joined_ids = set(joined["unique_id"].unique())
    assert forecast_ids == joined_ids, "Some unique_ids are missing"

    def _score(column_name):
        diff = joined[column_name] - joined["y"]
        combined = np.nansum(np.abs(diff)) + abs(np.nansum(diff))
        return round(combined / joined["y"].sum(), 4)

    model_cols = [name for name in forecasts.columns if name not in key_cols]
    scores = {name: _score(name) for name in model_cols}
    score_df = pd.DataFrame(list(scores.items()), columns=["model", "score"])
    return score_df.sort_values(by="score")

In [None]:
# Score table for the five reference solutions, sorted by ascending score.
scores = vn1_competition_evaluation(solutions)
scores 

## 3. Data Processing
### 3.1. Remove leading zeros 

In [None]:
# 3. Remove leading zeros from each series.
def _remove_leading_zeros(group):
    """Drop the rows of `group` that precede its first non-zero `y`.

    If `y` contains no non-zero value, `idxmax` falls back to the first
    label, so the whole group is returned unchanged.
    """
    is_nonzero = group["y"] != 0
    # idxmax on a boolean series gives the label of the first True.
    start_label = is_nonzero.idxmax()
    return group.loc[start_label:]

# `df_complete` is the training history on a regular weekly (W-MON) grid:
# `fill_gaps` adds a row for any timestamp missing from a series.
# NOTE(review): rows added this way would carry a missing `y`; the melted
# wide-format input is expected to have no gaps — confirm.
df_complete = fill_gaps(df, freq="W-MON")

# Trim each series so it starts at its first non-zero observation.
df_clean = df_complete.groupby("unique_id").apply(_remove_leading_zeros).reset_index(drop=True)

df_clean.shape, df_complete.shape

### 3.2. Identify obsolete series 

In [None]:
# 4. Identify obsolete series
def _is_obsolete(group, days_obsoletes):
    """Tell whether a series had only zero sales in its most recent window.

    The window covers the last `days_obsoletes` days of the group, counted
    back from its latest `ds` (both ends inclusive).

    Returns:
        True when every `y` inside that window equals 0.
    """
    window_start = group["ds"].max() - pd.Timedelta(days=days_obsoletes)
    in_window = group["ds"] >= window_start
    return group.loc[in_window, "y"].eq(0).all()

days_obsoletes = 180  # context-dependent

# One boolean per series: True when `_is_obsolete` finds only zeros in the window.
series_groups = df_clean.groupby("unique_id")
obsolete_series = series_groups.apply(_is_obsolete, days_obsoletes=days_obsoletes)
obsolete_ids = obsolete_series.loc[obsolete_series].index.tolist()

## 4. Model fitting

In [None]:
# All four candidate models are built with the same season length.
season_length = 52
model_classes = (AutoARIMA, AutoETS, AutoCES, AutoTheta)
models = [model_cls(season_length=season_length) for model_cls in model_classes]

# Series-level forecaster on W-MON data, with `Naive` registered as the fallback model.
sf = StatsForecast(models=models, freq='W-MON', n_jobs=-1, fallback_model=Naive())

In [None]:
# Forecast the next 13 periods for every series in `df_clean`
# (presumably the 13 weeks of Phase 2 — confirm against the test data).
fc = sf.forecast(
    df=df_clean, 
    h=13
)

### 4.1 Model ensembling

In [None]:
# Ensemble = row-wise median of the four model forecasts.
member_cols = ['AutoARIMA', 'AutoETS', 'CES', 'AutoTheta']
fc['Ensemble'] = fc[member_cols].median(axis=1)

# Zero out negligible forecasts (<= 0.1) and every series flagged as obsolete.
is_negligible = fc['Ensemble'] <= 1e-1
is_obsolete_series = fc["unique_id"].isin(obsolete_ids)
fc.loc[is_negligible | is_obsolete_series, 'Ensemble'] = 0

### 4.2 Evaluate results

In [None]:
# Evaluate results 
# Inner-join the reference solutions with our forecasts so that every
# forecast column is scored by the same evaluation call.
forecasts = solutions.merge(fc, on=["unique_id", "ds"], how="inner")
vn1_competition_evaluation(forecasts)

## 5. Hierarchical approach

In [None]:
# Work on a copy so the series-level training frame `df_clean` keeps only its
# original columns (extra columns there would be passed on to `sf.forecast`
# if the series-level forecasting cell is run again).
df_client = df_clean.copy()
# Recover the three key columns from "Client-Warehouse-Product".
df_client[['Client', 'Warehouse', 'Product']] = df_client['unique_id'].str.split('-', expand=True)
# Aggregate to one series per client: total `y` per client and date.
df_client = df_client.groupby(['Client', 'ds'])['y'].sum().reset_index()
print('There are ', df_client['Client'].nunique(), 'clients in the dataset.')

In [None]:
# Flag obsolete clients, reusing the series-level rule and the same window length.
client_groups = df_client.groupby("Client")
client_obsolete_series = client_groups.apply(_is_obsolete, days_obsoletes=days_obsoletes)
client_obsolete_ids = client_obsolete_series.loc[client_obsolete_series].index.tolist()

In [None]:
# Client-level forecaster, configured like the series-level one.
sf_client = StatsForecast(models=models, freq='W-MON', n_jobs=-1, fallback_model=Naive())
fc_client = sf_client.forecast(df=df_client, h=13, id_col="Client")

# Ensemble = row-wise median of the four model forecasts.
client_member_cols = ['AutoARIMA', 'AutoETS', 'CES', 'AutoTheta']
fc_client['Ensemble'] = fc_client[client_member_cols].median(axis=1)

# Clients flagged as obsolete get a zero ensemble forecast.
client_is_obsolete = fc_client["Client"].isin(client_obsolete_ids)
fc_client.loc[client_is_obsolete, "Ensemble"] = 0

In [None]:
# Plot the client-level history together with the client-level forecasts.
StatsForecast.plot(df_client, fc_client, id_col="Client", engine='matplotlib')

In [None]:
# Split `unique_id` ("Client-Warehouse-Product") back into its three key columns.
fc[['Client', 'Warehouse', 'Product']] = fc['unique_id'].str.split('-', expand=True)

In [None]:
# hierarchical reconciliation
# Top-down step: each client-level ensemble forecast is distributed over that
# client's series in proportion to the series-level ensemble forecasts.

# Sum of the series-level ensemble forecasts per client and date.
total = (
    fc.groupby(['Client', 'ds'])['Ensemble']
    .sum()
    .reset_index()
    .rename(columns={'Ensemble': 'total_forecasted'})
)
# True where the series-level forecasts sum to zero, i.e. no proportions can be derived.
total['zero_base_fc'] = total['total_forecasted'] == 0

fc = fc.merge(total, on=['Client', 'ds'], how='left')

# Share of the client total carried by each series; 0/0 gives NaN, replaced by 0.
fc['fc_proportions'] = (fc['Ensemble'] / fc['total_forecasted']).fillna(0)

fc_client.rename(columns={'Ensemble': 'Ensemble_client', 'unique_id': 'Client'}, inplace=True)
fc = fc.merge(fc_client[['Client', 'ds', 'Ensemble_client']], on=['Client', 'ds'], how='left')

# NOTE(review): this counts distinct products, not series. If one product is
# forecast for several warehouses of the same client, the even split below
# allocates more than the client total — confirm this is intended.
products_per_client = fc.groupby(['Client', 'ds'])['Product'].nunique().reset_index(name='products_per_client')
fc = fc.merge(products_per_client, on=['Client', 'ds'], how='left')

# Proportional split where a base forecast exists, otherwise an even split.
fc['Ensemble-hierar'] = np.where(
    fc['zero_base_fc'],
    fc['Ensemble_client'] / fc['products_per_client'],
    fc['fc_proportions'] * fc['Ensemble_client'],
)

# Explicit copy: the thresholding below must edit an independent frame rather
# than a column slice of `fc` (avoids pandas' SettingWithCopyWarning).
fc_hierar = fc[['unique_id', 'ds', 'Ensemble-hierar']].copy()
fc_hierar.loc[fc_hierar['Ensemble-hierar'] <= 1e-1, 'Ensemble-hierar'] = 0

In [None]:
# Attach the reconciled forecast as one more column to be scored.
forecasts = forecasts.merge(fc_hierar, on=["unique_id", "ds"], how="left")

In [None]:
# Score every forecast column again, now including `Ensemble-hierar`.
vn1_competition_evaluation(forecasts)