## Notebook Configuration & Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd

from storesales.baseline.sales_predictor import SalesPredictor
from storesales.baseline.utils import (
    run_study,
    load_baseline_data,
    make_time_series_dataset,
    evaluate,
)
from storesales.baseline.model_wrappers import (
    DailyMeanModelWrapper,
    DayOfWeekMeanModelWrapper,
    WeightedDayMeanModelWrapper,
)
from storesales.baseline.param_suggestions import (
    IntSuggestions,
    FloatSuggestions,
    CategoricalSuggestions,
)
from storesales.baseline.constants import STORES
from storesales.constants import (
    TRAIN_TEST_SPLIT_DATE,
    HORIZON_STR,
    LOSSES_DATA_PATH,
    MODELS_PATH,
)

## Load Data & Create Dataset

In [3]:
# Unpack the three objects returned by the project's baseline loader (bound
# here as train, test and holidays frames). The loader's LightGBM
# preprocessing flag is switched off for the baseline models.
train_df, test_df, holidays_df = load_baseline_data(use_light_gbm_preprocessing=False)

In [4]:
# Boundary between the training and test periods, parsed from the project constant.
train_test_split_date = pd.Timestamp(TRAIN_TEST_SPLIT_DATE)

# Outer evaluation cutoffs, written as ISO date strings in chronological order.
outer_cutoffs_dates = ["2016-08-16", "2016-11-18", "2017-02-20", "2017-04-04"]
outer_cutoffs = list(map(pd.Timestamp, outer_cutoffs_dates))

# Sanity check: a cutoff whose forecast window (cutoff + horizon) reaches the
# split date would overlap the test period, so abort on the first offender.
for cutoff in outer_cutoffs:
    if train_test_split_date <= cutoff + pd.Timedelta(HORIZON_STR):
        raise ValueError(f"cutoff {cutoff} falls into test period!")

In [5]:
# Use every product family present in the training data. For a quicker
# experiment, swap in a small subset instead, e.g.
#   families = ["FROZEN FOODS", "EGGS"]
#   families = ["BREAD/BAKERY", "CELEBRATION"]
families = train_df["family"].unique()

# Each family forms its own single-element group (a 1-tuple).
family_groups = [(name,) for name in families]

# Every group is mapped to the same shared STORES object.
family_group_to_stores = {group: STORES for group in family_groups}

In [6]:
# Keep only the training rows whose "family" value is one of the selected families.
families_data = train_df.loc[train_df["family"].isin(families)]

In [7]:
# Build the time-series dataset from the selected rows and the outer cutoffs.
# NOTE(review): the literal 16 is presumably the forecast horizon in days and
# should agree with HORIZON_STR — confirm against make_time_series_dataset.
train_dataset = make_time_series_dataset(families_data, outer_cutoffs, 16)

100%|██████████| 1782/1782 [00:07<00:00, 224.24it/s]


## SalesPredictor
### DailyMeanModel

In [8]:
# Search space for the daily-mean model: one integer hyperparameter, "window",
# suggested with bounds 3 and 60.
daily_wrapper = DailyMeanModelWrapper(
    int_suggestions=[IntSuggestions("window", 3, 60)],
)

# Wrappers are registered under their class name.
daily_wrapper_dict = {DailyMeanModelWrapper.__name__: daily_wrapper}

# Settings handed to the predictor as `optuna_optimize_kwargs`
# (presumably forwarded to Optuna's optimize call — confirm in SalesPredictor).
daily_optuna_kwargs = {
    "n_trials": 100,
    "show_progress_bar": False,
    "timeout": 45,
}

# Predictor that tunes the daily-mean wrapper for every single-family group.
daily_predictor = SalesPredictor(
    model_wrappers=daily_wrapper_dict,
    optuna_optimize_kwargs=daily_optuna_kwargs,
    family_groups=family_groups,
    family_group_to_stores=family_group_to_stores,
    inner_cutoffs=[-365, -180, -49, -17],
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    initial=None,
)

In [None]:
# Run the study for the daily predictor on the training dataset (presumably
# the Optuna search configured above — confirm in run_study). The returned
# predictor is rebound to the same name; the tqdm progress bar stays enabled.
daily_predictor = run_study(train_dataset, daily_predictor, disable_tqdm=False)

In [10]:
# Compute the evaluation losses of the daily predictor on the selected
# families' data, with the tqdm progress bar enabled.
daily_eval_loss = evaluate(families_data, daily_predictor, disable_tqdm=False)

100%|██████████| 82/82 [01:07<00:00,  1.21it/s]


In [11]:
# Collapse the loss table to one value per row (mean across its columns) and
# label the resulting Series "daily_rmsle".
daily_store_loss = (
    daily_eval_loss
    .mean(axis="columns")
    .rename("daily_rmsle")
)

In [12]:
# Average the per-row losses within each value of the "family" index level;
# as the cell's last expression, the result is displayed.
daily_store_loss.groupby(level="family").mean()

family
AUTOMOTIVE                    0.549853
BABY CARE                     0.194055
BEAUTY                        0.574898
BEVERAGES                     0.275564
BOOKS                         0.078480
BREAD/BAKERY                  0.236766
CELEBRATION                   0.595359
CLEANING                      0.320998
DAIRY                         0.246879
DELI                          0.254749
EGGS                          0.386635
FROZEN FOODS                  0.399494
GROCERY I                     0.241929
GROCERY II                    0.569867
HARDWARE                      0.546267
HOME AND KITCHEN I            0.543460
HOME AND KITCHEN II           0.481282
HOME APPLIANCES               0.418675
HOME CARE                     0.291221
LADIESWEAR                    0.501788
LAWN AND GARDEN               0.445791
LINGERIE                      0.683757
LIQUOR,WINE,BEER              0.708146
MAGAZINES                     0.538552
MEATS                         0.299842
PERSONAL CARE     

### Save Model

In [13]:
# Fit the predictor on the full selected training data before it is saved.
daily_predictor.fit(families_data)

100%|██████████| 1782/1782 [00:00<00:00, 1952.67it/s]


In [14]:
# Persist the evaluation losses and record the file location on the predictor.
# The target directory is created first: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(LOSSES_DATA_PATH, exist_ok=True)
loss_file = os.path.join(LOSSES_DATA_PATH, "test_daily_eval_loss.csv")
daily_eval_loss.to_csv(loss_file, index=True)  # keep the index in the CSV
daily_predictor.eval_loss_csv = loss_file

In [15]:
# Save the fitted predictor under the models directory. The directory is
# created up front so the save cannot fail merely because the folder is
# missing (harmless if `save` already handles this).
os.makedirs(MODELS_PATH, exist_ok=True)
model_file = os.path.join(MODELS_PATH, "test_daily_predictor.pkl")
daily_predictor.save(model_file)

### WeightedDayMeanModel

In [None]:
# Search space for the weighted-day-mean model:
#   * one integer "<unit>s_window" per time unit, each with its own bounds;
#   * one float "<unit>_weight" per time unit, all bounded by 0.88 and 1.12;
#   * a float "bias" bounded by -100.0 and 100.0.
weighted_day_mean_wrapper = WeightedDayMeanModelWrapper(
    int_suggestions=[
        IntSuggestions(f"{unit}s_window", low, high)
        for unit, low, high in (
            ("day", 1, 20),
            ("week", 1, 10),
            ("month", 0, 12),
            ("year", 0, 4),
        )
    ],
    float_suggestions=[
        *(
            FloatSuggestions(f"{unit}_weight", 0.88, 1.12)
            for unit in ("day", "week", "month", "year")
        ),
        FloatSuggestions("bias", -100.0, 100.0),
    ],
)

# Wrappers are registered under their class name.
weighted_day_mean_wrapper_dict = {
    WeightedDayMeanModelWrapper.__name__: weighted_day_mean_wrapper,
}

# Settings handed to the predictor as `optuna_optimize_kwargs`
# (presumably forwarded to Optuna's optimize call — confirm in SalesPredictor).
weighted_day_mean_optuna_kwargs = {
    "n_trials": 100,
    "show_progress_bar": True,
    "timeout": 45,
    "n_jobs": 6,
}

# Predictor that tunes the weighted-day-mean wrapper for every single-family group.
weighted_day_mean_predictor = SalesPredictor(
    model_wrappers=weighted_day_mean_wrapper_dict,
    optuna_optimize_kwargs=weighted_day_mean_optuna_kwargs,
    family_groups=family_groups,
    family_group_to_stores=family_group_to_stores,
    inner_cutoffs=[-365, -180, -49, -17],
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    initial=None,
)

In [None]:
# Run the study for the weighted-day-mean predictor on the same training
# dataset; the returned predictor is rebound to the same name.
weighted_day_mean_predictor = run_study(
    train_dataset, weighted_day_mean_predictor, disable_tqdm=False
)

In [None]:
# Compute the evaluation losses of the weighted-day-mean predictor on the
# same data used for the daily predictor, so the two can be compared.
weighted_day_loss = evaluate(
    families_data, weighted_day_mean_predictor, disable_tqdm=False
)

In [None]:
# Collapse the loss table to one value per row (mean across its columns) and
# label the resulting Series "weighted_day_rmsle".
weighted_day_store_loss = (
    weighted_day_loss
    .mean(axis="columns")
    .rename("weighted_day_rmsle")
)

In [None]:
# Put both loss Series side by side as columns of one frame; the columns take
# the Series names ("daily_rmsle" and "weighted_day_rmsle").
losses = pd.concat([daily_store_loss, weighted_day_store_loss], axis=1)

In [None]:
# Mean loss of each model within every "family" index level, displayed as the
# cell output for a side-by-side comparison.
losses.groupby(level="family").mean()

In [None]:
# Rows where the daily-mean model has the lower loss. The mask is built from
# the columns of `losses` itself so it always shares the frame's index;
# comparing the two source Series directly raises if their labels differ.
losses[losses["daily_rmsle"] < losses["weighted_day_rmsle"]]

In [None]:
# Display the predictor's `family_to_model_params_storage` attribute
# (presumably the tuned parameters kept per family — confirm in SalesPredictor).
weighted_day_mean_predictor.family_to_model_params_storage