## Notebook Configuration and Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell runs and no kernel restart is needed.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os

import numpy as np
import pandas as pd

from matplotlib.ticker import ScalarFormatter
import matplotlib.pyplot as plt
import seaborn as sns

from storesales.baseline.sales_predictor import SalesPredictor
from storesales.baseline.utils import (
    run_study,
    load_baseline_data,
    load_submission,
    make_time_series_split,
)
from storesales.baseline.model_wrappers import (
    DailyMeanModelWrapper,
    ProphetWrapper,
    DayOfWeekMeanModelWrapper,
    WeightedDayMeanModelWrapper,
)
from storesales.baseline.param_suggestions import (
    IntSuggestions,
    FloatSuggestions,
    CategoricalSuggestions,
)
from storesales.baseline.constants import FAMILY_GROUPS, STORES
from storesales.constants import SUBMISSIONS_PATH

## Load and Prepare Data

In [3]:
# Load the three frames used throughout the notebook: training data, test data
# and the holidays table (later handed to Prophet as `holidays`).
# NOTE(review): `use_light_gbm_preprocessing=True` presumably reuses the LightGBM
# pipeline's preprocessing — confirm against `load_baseline_data`.
train_df, test_df, holidays_df = load_baseline_data(use_light_gbm_preprocessing=True)

## Prepare SalesPredictor

In [4]:
# Shared by the Prophet wrapper, the baseline SalesPredictor and both final
# `fit` calls below.
# NOTE(review): presumably the length of the initial training window — confirm
# how SalesPredictor / ProphetWrapper interpret it.
initial = "760 days"  # train period

### DailyMeanModelWrapper

In [5]:
# Search space for the daily-mean baseline: one integer parameter `window`
# with bounds 3 and 60.
# NOTE(review): bounds are assumed to be inclusive low/high — confirm in IntSuggestions.
daily_mean_int_suggestions = [IntSuggestions("window", 3, 60)]

In [6]:
# Wrapper pairing the daily-mean model with its search space.
# Its entry in `model_wrappers` below is currently commented out.
daily_wrapper = DailyMeanModelWrapper(int_suggestions=daily_mean_int_suggestions)

### DayOfWeekMeanModelWrapper

In [7]:
# Search space for the day-of-week mean baseline: two integer window
# parameters, one for weekdays (bounds 3 and 50) and one for weekends
# (bounds 1 and 10).
# NOTE(review): the unit of these windows is not visible here — confirm in
# DayOfWeekMeanModelWrapper.
day_of_week_mean_int_suggestions = [
    IntSuggestions("weekdays_window", 3, 50),
    IntSuggestions("weekends_window", 1, 10),
]

In [8]:
# Wrapper pairing the day-of-week mean model with its search space.
# Its entry in `model_wrappers` below is currently commented out.
day_of_week_wrapper = DayOfWeekMeanModelWrapper(
    int_suggestions=day_of_week_mean_int_suggestions
)

### WeightedDayMeanModelWrapper

In [9]:
# Search space for the weighted-day-mean baseline.
# Integer parameters: three look-back windows, named for weeks, months and years.
weighted_day_mean_int_suggestions = [
    IntSuggestions("weeks_window", 1, 7),
    IntSuggestions("months_window", 0, 12),
    IntSuggestions("years_window", 0, 4),
]
# Float parameters: one weight per window, all sharing the bounds 0.27 and 0.39.
# NOTE(review): three values from this range sum to between 0.81 and 1.17, so
# they are not guaranteed to add up to 1 — confirm whether the wrapper
# normalises the weights.
weighted_day_mean_float_suggestions = [
    FloatSuggestions("week_weight", 0.27, 0.39),
    FloatSuggestions("month_weight", 0.27, 0.39),
    FloatSuggestions("year_weight", 0.27, 0.39),
]

In [10]:
# Wrapper pairing the weighted-day-mean model with its integer and float
# search spaces. This is the only wrapper enabled in `model_wrappers` below.
weighted_day_mean_wrapper = WeightedDayMeanModelWrapper(
    int_suggestions=weighted_day_mean_int_suggestions,
    float_suggestions=weighted_day_mean_float_suggestions,
)

### ProphetWrapper

In [11]:
# Search space for Prophet, split by parameter type.
prophet_int_suggestions = [
    IntSuggestions("n_changepoints", 20, 50),
]
prophet_float_suggestions = [
    FloatSuggestions("changepoint_prior_scale", 0.01, 0.5),
    FloatSuggestions("holidays_prior_scale", 5, 80),
    FloatSuggestions("seasonality_prior_scale", 5, 80),
]
prophet_categorical_suggestions = [
    CategoricalSuggestions("seasonality_mode", ["additive", "multiplicative"]),
]

# Parameters passed to the wrapper as `model_base_params`.
# NOTE(review): presumably applied unchanged to every trial — confirm in ProphetWrapper.
prophet_base_params = {
    "daily_seasonality": False,
    "weekly_seasonality": True,
    "yearly_seasonality": True,
    # NOTE(review): a falsy value presumably disables uncertainty sampling — confirm against Prophet docs.
    "uncertainty_samples": False,
    # Holidays table returned by `load_baseline_data`.
    "holidays": holidays_df,
}

In [12]:
# Prophet wrapper combining the search space with the base parameters.
# `dcoilwtico` is registered as an extra regressor.
# NOTE(review): presumably the oil-price column of the prepared data — confirm
# it is present in both the train and test frames.
prophet_wrapper = ProphetWrapper(
    initial=initial,
    extra_regressors=["dcoilwtico"],
    int_suggestions=prophet_int_suggestions,
    float_suggestions=prophet_float_suggestions,
    categorical_suggestions=prophet_categorical_suggestions,
    model_base_params=prophet_base_params,
)

### SalesPredictor

In [13]:
# Only the weighted-day-mean baseline is registered for this study; the other
# wrappers defined above can be re-enabled by uncommenting their entries.
model_wrappers = {
    # DailyMeanModelWrapper.__name__: daily_wrapper,
    # DayOfWeekMeanModelWrapper.__name__: day_of_week_wrapper,
    WeightedDayMeanModelWrapper.__name__: weighted_day_mean_wrapper,
    # ProphetWrapper.__name__: prophet_wrapper,  # disabled due to long training time
}

# Passed to SalesPredictor as `optuna_optimize_kwargs`.
# NOTE(review): presumably forwarded to Optuna's `study.optimize`, where
# `timeout` is in seconds — confirm in SalesPredictor.
optuna_optimize_kwargs = {
    "n_trials": 50,
    "show_progress_bar": True,
    "timeout": 45,
}

# Outer cutoff dates; the commented ones are available but switched off.
outer_cutoffs_dates = [
    # "2016-08-16",
    # "2016-09-10",
    # "2016-10-01",
    # "2016-10-25",
    # "2016-11-18",
    # "2016-12-10",
    # "2017-01-01",
    # "2017-01-23",
    # "2017-02-20",
    # "2017-03-12",
    "2017-04-04",
    # "2017-05-15",
    "2017-07-31",
]
outer_cutoffs = list(map(pd.Timestamp, outer_cutoffs_dates))

# Every family group is mapped to the same full store list.
family_group_to_stores = {family_group: STORES for family_group in FAMILY_GROUPS}

predictor = SalesPredictor(
    model_wrappers=model_wrappers,
    family_groups=FAMILY_GROUPS,
    family_group_to_stores=family_group_to_stores,
    outer_cutoffs=outer_cutoffs,
    inner_cutoffs=[-365, -180, -49, -17],
    initial=initial,
    optuna_optimize_kwargs=optuna_optimize_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
)

## Run Study

In [14]:
# Split the training frame at the predictor's outer cutoffs.
# NOTE(review): 16 is presumably the forecast horizon in days — confirm in
# `make_time_series_split`.
dataset = make_time_series_split(train_df, predictor.outer_cutoffs, 16)

100%|██████████| 1729/1729 [00:03<00:00, 461.50it/s]


In [None]:
# Run the tuning study on the prepared splits. `run_study` returns a predictor,
# which replaces the previous binding of `predictor`.
predictor = run_study(dataset, predictor)

## Analyze Loss Results and Select Top Loss Pairs

In [17]:
# Flatten the per-pair loss storage into three parallel lists:
# store, family, and the mean of that pair's recorded losses.
loss_items = list(predictor.store_family_loss_storage.items())

stores = [pair[0] for pair, _ in loss_items]
families = [pair[1] for pair, _ in loss_items]
mean_loss = [np.mean(pair_losses) for _, pair_losses in loss_items]

In [None]:
# One row per (store, family) pair with its mean loss.
loss_df = pd.DataFrame({"store": stores, "family": families, "loss": mean_loss})

# Keep the 100 pairs with the highest mean loss, worst first.
top_loss_df = loss_df.sort_values(by="loss", ascending=False).head(100)
top_loss_df

In [None]:
# Map each single-family group key `(family,)` to the list of stores for which
# that family appears among the top-loss pairs.
#
# Built with a comprehension on purpose: a plain `for family, stores in ...`
# loop would rebind the module-level `stores` list that `loss_df` is built
# from, so re-running the `loss_df` cell afterwards would use the wrong data.
top_loss_family_to_store = {
    (family,): family_stores
    for family, family_stores in (
        top_loss_df.groupby("family")["store"].apply(list).items()
    )
}

top_loss_family_to_store

## Fit Prophet For Top Loss Pairs

In [None]:
# top_loss_family_to_store.pop(('SCHOOL AND OFFICE SUPPLIES',))  # to speed up

# Family-group keys (one-element tuples) that Prophet will be tuned on.
top_loss_family_groups = list(top_loss_family_to_store)

In [None]:
# Configuration for the second study: Prophet only, restricted to the
# top-loss (family, store) pairs.
#
# Every name here carries a `top_loss_` prefix so the baseline study's
# configuration is not overwritten. In particular `outer_cutoffs_dates` must
# keep the baseline dates, because `make_plot` reads it to label the folds of
# `predictor`.
top_loss_model_wrappers = {ProphetWrapper.__name__: prophet_wrapper}

top_loss_optuna_optimize_kwargs = dict(
    n_trials=100,
    show_progress_bar=True,
    timeout=85,
)

top_loss_outer_cutoffs_dates = [
    # "2016-08-16",
    "2017-05-15",
    "2017-07-31",
]
top_loss_outer_cutoffs = [
    pd.Timestamp(date) for date in top_loss_outer_cutoffs_dates
]

# NOTE(review): unlike the baseline `predictor`, `initial` is not passed here —
# confirm that SalesPredictor's default is what is intended.
top_loss_predictor = SalesPredictor(
    model_wrappers=top_loss_model_wrappers,
    family_groups=top_loss_family_groups,
    outer_cutoffs=top_loss_outer_cutoffs,
    inner_cutoffs=[-365, -180, -100, -49, -33, -17],
    optuna_optimize_kwargs=top_loss_optuna_optimize_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=4,
    family_group_to_stores=top_loss_family_to_store,
)

In [None]:
# Build the splits for the top-loss study from its own outer cutoffs.
# NOTE(review): a copy of `train_df` is passed here, while the baseline split
# passes the frame itself — confirm whether `make_time_series_split` mutates
# its input.
top_loss_dataset = make_time_series_split(
    train_df.copy(), top_loss_predictor.outer_cutoffs, 16
)

In [None]:
# Raise the `cmdstanpy` logger threshold to ERROR so lower-severity messages
# are not emitted during the study.
logger = logging.getLogger("cmdstanpy")
logger.setLevel(logging.ERROR)

# Run the tuning study for the top-loss pairs; the returned predictor replaces
# the previous binding.
top_loss_predictor = run_study(top_loss_dataset, top_loss_predictor)

## Analyze Loss Results

In [None]:
def make_plot(data_dict, loss_sample_name, title, cutoff_dates=None, group_names=None):
    """Draw one loss box plot per entry of ``data_dict``, grouped by outer fold.

    For each value in ``data_dict`` a figure is created with one box per outer
    fold and a red marker at each fold's mean loss. The y axis is logarithmic
    with plain (non-scientific) tick labels.

    Parameters
    ----------
    data_dict : dict
        Each value must support ``value[loss_sample_name]`` and yield a mapping
        of fold key to an iterable of losses. Fold keys are used to index
        ``cutoff_dates``.
    loss_sample_name : str
        Key selecting which loss mapping to plot from every value.
    title : str
        Prefix for each figure title.
    cutoff_dates : sequence, optional
        Labels for the x axis, indexed by fold key. Defaults to the
        module-level ``outer_cutoffs_dates``. Pass this explicitly when
        plotting a predictor built from a different list of cutoffs.
    group_names : sequence, optional
        Names used in the figure titles, indexed by position of the entry in
        ``data_dict``. Defaults to the module-level ``FAMILY_GROUPS``.
    """
    if cutoff_dates is None:
        cutoff_dates = outer_cutoffs_dates
    if group_names is None:
        group_names = FAMILY_GROUPS

    x_name = "outer cv fold start"
    y_name = "loss"

    for i_group, samples in enumerate(data_dict.values()):
        plot_data = [
            [cutoff_dates[sample], loss]
            for sample, losses in samples[loss_sample_name].items()
            for loss in losses
        ]

        df = pd.DataFrame(plot_data, columns=[x_name, y_name])

        # Both layers get the same explicit category order (order of first
        # appearance), so every mean marker sits on its own box.
        fold_order = list(dict.fromkeys(df[x_name]))

        fig, ax = plt.subplots(figsize=(16, 8))
        sns.boxplot(
            x=x_name, y=y_name, data=df, order=fold_order, showmeans=False, ax=ax
        )

        means = df.groupby(x_name, sort=False)[y_name].mean().reset_index()
        sns.pointplot(
            x=x_name,
            y=y_name,
            data=means,
            order=fold_order,
            color="red",
            linestyle="none",
            markers="o",
            estimator=np.mean,
            errorbar=None,
            ax=ax,
        )

        # Log scale, but with plain numbers instead of powers of ten.
        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ScalarFormatter())
        ax.yaxis.set_minor_formatter(ScalarFormatter())
        ax.ticklabel_format(style="plain", axis="y")

        ax.set_title(f"{title} - Group {group_names[i_group]}")
        ax.tick_params(axis="x", labelrotation=90)
        fig.tight_layout()

        plt.show()
        # One figure is created per group; release it once it has been shown.
        plt.close(fig)

In [None]:
# Plot the `fold_losses` recorded in the baseline predictor's tuning storage,
# one figure per entry of that storage.
plot_title = "Loss Distribution per Outer Fold Grouped by Family Pairs"
make_plot(predictor.tune_loss_storage, "fold_losses", plot_title)

## Make Submission
### Train Predictor

In [None]:
# Fit the Prophet-only top-loss predictor on the training frame.
top_loss_predictor.fit(train_df, initial)

In [20]:
# Fit the baseline predictor on the training frame.
predictor.fit(train_df, initial)

100%|██████████| 1782/1782 [02:08<00:00, 13.84it/s]


In [None]:
# combine estimators
predictor.combine_with_predictor(top_loss_predictor)

### Predict and Save Submission

In [21]:
# Load the submission object that `predictor.predict` receives below.
# NOTE(review): presumably the competition's sample-submission frame — confirm
# in `load_submission`.
submission = load_submission()

In [22]:
# Predict for the test frame; the result is what gets written to disk below.
tuned_submission = predictor.predict(test_df, submission)

100%|██████████| 1782/1782 [01:12<00:00, 24.61it/s]


In [24]:
# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing (a no-op when it is already there).
os.makedirs(SUBMISSIONS_PATH, exist_ok=True)

submission_file_path = os.path.join(SUBMISSIONS_PATH, "test_WeightedDayMeanModel.csv")
tuned_submission.to_csv(submission_file_path)