## Notebook Configuration & Imports

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported project code take effect without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from storesales.baseline.sales_predictor import SalesPredictor
from storesales.baseline.utils import (
    run_study,
    load_baseline_data,
    make_time_series_dataset,
    evaluate,
)
from storesales.baseline.model_wrappers import (
    DailyMeanModelWrapper,
    DayOfWeekMeanModelWrapper,
    WeightedDayMeanModelWrapper,
)
from storesales.baseline.param_suggestions import (
    IntSuggestions,
    FloatSuggestions,
    CategoricalSuggestions,
)
from storesales.baseline.constants import STORES
from storesales.constants import TRAIN_TEST_SPLIT_DATE, HORIZON_STR

## Load Data & Create Dataset

In [3]:
# Load the three baseline frames with LightGBM-specific preprocessing turned off.
(
    train_df,
    test_df,
    holidays_df,
) = load_baseline_data(use_light_gbm_preprocessing=False)

In [4]:
# Boundary timestamp between the training and the test period.
train_test_split_date = pd.Timestamp(TRAIN_TEST_SPLIT_DATE)

# Outer cutoff dates, kept as strings for readability and parsed below.
outer_cutoffs_dates = ["2016-08-16", "2016-11-18", "2017-02-20", "2017-04-04"]
outer_cutoffs = list(map(pd.Timestamp, outer_cutoffs_dates))

# Guard against leakage: a cutoff plus the horizon must end strictly before
# the train/test split date.
for cutoff in outer_cutoffs:
    forecast_end = cutoff + pd.Timedelta(HORIZON_STR)
    if forecast_end >= train_test_split_date:
        raise ValueError(f"cutoff {cutoff} falls into test period!")

In [5]:
# Product families this notebook tunes baselines for.
# Alternative selection: ["FROZEN FOODS", "EGGS", "CELEBRATION"]
families = ["BREAD/BAKERY"]

# Each family becomes its own single-element group (a 1-tuple, hence hashable
# and usable as a dict key below).
family_groups = [(family,) for family in families]

# Every family group is mapped to the same STORES object.
family_group_to_stores = {family_group: STORES for family_group in family_groups}

In [6]:
# Keep only the training rows whose family is one of the selected families.
is_selected_family = train_df["family"].isin(families)
families_data = train_df.loc[is_selected_family]

In [7]:
# Build the training dataset from the selected families and the outer cutoffs.
# NOTE(review): the literal 16 presumably is the forecast horizon in days
# (cf. HORIZON_STR) — confirm and consider a named constant.
train_dataset = make_time_series_dataset(
    families_data,
    outer_cutoffs,
    16,
)

100%|██████████| 54/54 [00:00<00:00, 228.03it/s]


## SalesPredictor
### DailyMeanModel

In [8]:
# Optimisation settings; presumably forwarded to Optuna's `study.optimize`
# (trial budget, wall-clock timeout, no per-study progress bar) — confirm.
daily_optuna_kwargs = {
    "n_trials": 100,
    "show_progress_bar": False,
    "timeout": 45,
}

# Single tunable integer parameter named "window", suggested with bounds 3 and 60.
daily_wrapper = DailyMeanModelWrapper(
    int_suggestions=[IntSuggestions("window", 3, 60)],
)

# Wrappers are registered under their class name.
daily_wrapper_dict = {DailyMeanModelWrapper.__name__: daily_wrapper}

# Predictor restricted to the DailyMean model for the selected family groups.
daily_predictor = SalesPredictor(
    model_wrappers=daily_wrapper_dict,
    family_groups=family_groups,
    family_group_to_stores=family_group_to_stores,
    inner_cutoffs=[-365, -180, -49, -17],
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    optuna_optimize_kwargs=daily_optuna_kwargs,
    initial=None,
)

In [9]:
# Run the study on the training dataset; `daily_predictor` is rebound to
# whatever `run_study` returns.
daily_predictor = run_study(
    train_dataset,
    daily_predictor,
    disable_tqdm=False,
)

Family Group: ('BREAD/BAKERY',):


100%|██████████| 3/3 [00:11<00:00,  3.90s/it]

RMSLE: 0.29958993816286955





In [10]:
# Evaluate the tuned daily predictor on the selected families' data.
daily_eval_loss = evaluate(
    families_data,
    daily_predictor,
    disable_tqdm=False,
)

100%|██████████| 82/82 [00:04<00:00, 18.80it/s]


In [11]:
# Average across columns so each (family, store_nbr) row keeps a single value,
# and label the resulting Series for later side-by-side comparison.
daily_store_loss = (
    daily_eval_loss
    .mean(axis="columns")
    .rename("daily_rmsle")
)

In [12]:
# Preview the first five per-store losses.
daily_store_loss.head(n=5)

family        store_nbr
BREAD/BAKERY  1            0.335604
              2            0.183433
              3            0.165592
              4            0.221086
              5            0.165789
Name: daily_rmsle, dtype: float64

### WeightedDayMeanModel

In [21]:
# Search space for the weighted model: four integer window sizes, four float
# weights and a `bias` term. Each suggestion is (name, bound, bound);
# presumably (name, low, high) — confirm against the suggestion classes.
weighted_day_mean_wrapper = WeightedDayMeanModelWrapper(
    int_suggestions=[
        IntSuggestions("days_window", 1, 20),
        IntSuggestions("weeks_window", 1, 10),
        IntSuggestions("months_window", 0, 12),
        IntSuggestions("years_window", 0, 4),
    ],
    float_suggestions=[
        FloatSuggestions("day_weight", 0.88, 1.12),
        FloatSuggestions("week_weight", 0.88, 1.12),
        FloatSuggestions("month_weight", 0.88, 1.12),
        FloatSuggestions("year_weight", 0.88, 1.12),
        FloatSuggestions("bias", -100.0, 100.0),
    ],
)

# Wrappers are registered under their class name.
weighted_day_mean_wrapper_dict = {
    WeightedDayMeanModelWrapper.__name__: weighted_day_mean_wrapper
}

# Optimisation settings; presumably forwarded to Optuna's `study.optimize` — confirm.
weighted_day_mean_optuna_kwargs = dict(
    n_trials=100,
    # Per-study progress bars are disabled, as in the DailyMeanModel setup:
    # with n_jobs > 1 every study prints its own bar underneath run_study's
    # tqdm bar, which floods the cell output without adding information.
    show_progress_bar=False,
    timeout=45,
    n_jobs=6,
)

# Predictor restricted to the WeightedDayMean model; all remaining settings
# mirror the DailyMeanModel predictor so the two losses are comparable.
weighted_day_mean_predictor = SalesPredictor(
    model_wrappers=weighted_day_mean_wrapper_dict,
    family_groups=family_groups,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=weighted_day_mean_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=None,
)

In [22]:
# Run the study on the training dataset; `weighted_day_mean_predictor` is
# rebound to whatever `run_study` returns.
weighted_day_mean_predictor = run_study(
    train_dataset,
    weighted_day_mean_predictor,
    disable_tqdm=False,
)

Family Group: ('BREAD/BAKERY',):


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [01:16<02:32, 76.49s/it]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [02:31<01:15, 75.49s/it]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 3/3 [03:47<00:00, 75.90s/it]

RMSLE: 0.28483781691672705





In [23]:
# Evaluate the tuned weighted-day predictor on the selected families' data.
weighted_day_loss = evaluate(
    families_data,
    weighted_day_mean_predictor,
    disable_tqdm=False,
)

100%|██████████| 82/82 [00:29<00:00,  2.78it/s]


In [24]:
# Average across columns so each row keeps a single value, and label the
# resulting Series for the side-by-side comparison.
weighted_day_store_loss = (
    weighted_day_loss
    .mean(axis="columns")
    .rename("weighted_day_rmsle")
)

In [25]:
# Place both per-store loss Series next to each other, one column per model.
losses = pd.concat(
    [daily_store_loss, weighted_day_store_loss],
    axis="columns",
)

In [26]:
# Mean loss of each model per family (averaged over the family's stores).
losses.groupby(level="family").agg("mean")

Unnamed: 0_level_0,daily_rmsle,weighted_day_rmsle
family,Unnamed: 1_level_1,Unnamed: 2_level_1
BREAD/BAKERY,0.236654,0.202178


In [27]:
# Rows where the daily-mean model beats the weighted-day model.
# The mask is derived from `losses`' own columns, so it is always aligned with
# the frame being filtered, even if the source Series were recomputed since
# `losses` was built.
losses.loc[losses["daily_rmsle"] < losses["weighted_day_rmsle"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,daily_rmsle,weighted_day_rmsle
family,store_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1
BREAD/BAKERY,7,0.14901,0.149389
BREAD/BAKERY,43,0.211763,0.231231
BREAD/BAKERY,52,0.239691,0.594062


In [28]:
# Display what the predictor stored per family after the study: the selected
# parameters, the stores they apply to and the associated loss.
weighted_day_mean_predictor.family_to_model_params_storage

{'BREAD/BAKERY': {'params': {'model': 'WeightedDayMeanModelWrapper',
   'days_window': 10,
   'weeks_window': 6,
   'months_window': 6,
   'years_window': 1,
   'day_weight': 1.0043105602931213,
   'week_weight': 1.046386251939234,
   'month_weight': 0.994065674117155,
   'year_weight': 0.9807991102525296,
   'bias': -9.848525056679737},
  'stores': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
         35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
         52, 53, 54]),
  'loss': 0.28483781691672705}}