## Notebook Configuration && Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os

import numpy as np
import pandas as pd

from matplotlib.ticker import ScalarFormatter
import matplotlib.pyplot as plt
import seaborn as sns

from storesales.baseline.sales_predictor import SalesPredictor
from storesales.baseline.utils import (
    run_study,
    load_baseline_data,
    load_submission,
    make_time_series_dataset,
    evaluate,
)
from storesales.baseline.model_wrappers import (
    DailyMeanModelWrapper,
    ProphetWrapper,
    DayOfWeekMeanModelWrapper,
    WeightedDayMeanModelWrapper,
)
from storesales.baseline.param_suggestions import (
    IntSuggestions,
    FloatSuggestions,
    CategoricalSuggestions,
)
from storesales.baseline.constants import FAMILY_GROUPS, STORES
from storesales.constants import SUBMISSIONS_PATH, TRAIN_TEST_SPLIT_DATE, HORIZON_STR

## Load Data && Create Dataset

In [3]:
train_df, test_df, holidays_df = load_baseline_data(use_light_gbm_preprocessing=False)

In [4]:
train_test_split_date = pd.Timestamp(TRAIN_TEST_SPLIT_DATE)

outer_cutoffs_dates = [
    "2016-08-16",
    "2016-11-18",
    "2017-02-20",
    "2017-04-04",
]
outer_cutoffs = [pd.Timestamp(date) for date in outer_cutoffs_dates]

for cutoff in outer_cutoffs:
    if cutoff + pd.Timedelta(HORIZON_STR) >= train_test_split_date:
        raise ValueError(f"cutoff {cutoff} falls into test period!")

In [2]:
train_dataset = make_time_series_dataset(train_df, outer_cutoffs, 16)

NameError: name 'train_df' is not defined

## Prepare SalesPredictor

In [6]:
initial = "760 days"  # train period

In [7]:
family_group_to_stores = dict(
    [[family_group, STORES] for family_group in FAMILY_GROUPS]
)

### DailyMeanModel
#### Wrapper

In [8]:
daily_mean_int_suggestions = [IntSuggestions("window", 3, 60)]

In [9]:
daily_wrapper = DailyMeanModelWrapper(int_suggestions=daily_mean_int_suggestions)

In [10]:
daily_wrapper_dict = {DailyMeanModelWrapper.__name__: daily_wrapper}

#### Predictor

In [11]:
daily_optuna_kwargs = dict(
    n_trials=10,
    show_progress_bar=False,
    timeout=45,
)

In [12]:
daily_predictor = SalesPredictor(
    model_wrappers=daily_wrapper_dict,
    family_groups=FAMILY_GROUPS,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=daily_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=None,
)

#### Fit && Evaluate

In [None]:
daily_predictor = run_study(train_dataset, daily_predictor)

In [14]:
print("Train RMSLE:", daily_predictor.calc_total_fit_loss())

Train RMSLE: 0.4358343460817977


In [4]:
daily_eval_loss = evaluate(train_df, daily_predictor)

NameError: name 'evaluate' is not defined

In [32]:
daily_eval_loss.mean(axis=1)

family
AUTOMOTIVE                    0.563069
BABY CARE                     0.296582
BEAUTY                        0.585663
BEVERAGES                     0.288135
BOOKS                         0.137782
BREAD/BAKERY                  0.244431
CELEBRATION                   0.614983
CLEANING                      0.365463
DAIRY                         0.248597
DELI                          0.259397
EGGS                          0.412773
FROZEN FOODS                  0.415936
GROCERY I                     0.248053
GROCERY II                    0.632126
HARDWARE                      0.561582
HOME AND KITCHEN I            0.546128
HOME AND KITCHEN II           0.508673
HOME APPLIANCES               0.421929
HOME CARE                     0.300631
LADIESWEAR                    0.594897
LAWN AND GARDEN               0.546760
LINGERIE                      0.693747
LIQUOR,WINE,BEER              0.753127
MAGAZINES                     0.546264
MEATS                         0.313916
PERSONAL CARE     

In [36]:
print(f"Eval RMSLE: {daily_eval_loss.values.mean()}")

Eval RMSLE: 0.4521408725108731


### DayOfWeekMeanModel
#### Wrapper

In [18]:
day_of_week_mean_int_suggestions = [
    IntSuggestions("weekdays_window", 3, 50),
    IntSuggestions("weekends_window", 1, 10),
]

In [19]:
day_of_week_wrapper = DayOfWeekMeanModelWrapper(
    int_suggestions=day_of_week_mean_int_suggestions
)

In [20]:
day_of_week_wrapper_dict = {DayOfWeekMeanModelWrapper.__name__: day_of_week_wrapper}

#### Predictor

In [21]:
day_of_week_optuna_kwargs = dict(
    n_trials=100,
    show_progress_bar=False,
    timeout=45,
)

In [22]:
day_of_week_predictor = SalesPredictor(
    model_wrappers=day_of_week_wrapper_dict,
    family_groups=FAMILY_GROUPS,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=day_of_week_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=None,
)

#### Fit && Evaluate

In [None]:
day_of_week_predictor = run_study(train_dataset, day_of_week_predictor)

In [24]:
print("Train RMSLE:", day_of_week_predictor.calc_total_fit_loss())

Train RMSLE: 0.42853936792092817


In [25]:
day_of_week_eval_loss = evaluate(train_df, day_of_week_predictor)

100%|██████████| 98/98 [01:59<00:00,  1.22s/it]


In [26]:
print(f"Eval RMSLE: {day_of_week_eval_loss}")

Eval RMSLE: 0.4672134931128895


### WeightedDayMeanModelWrapper
#### Wrapper

In [33]:
weighted_day_mean_int_suggestions = [
    IntSuggestions("days_window", 1, 14),
    IntSuggestions("weeks_window", 1, 7),
    IntSuggestions("months_window", 0, 12),
    IntSuggestions("years_window", 0, 4),
]
weighted_day_mean_float_suggestions = [
    FloatSuggestions("day_weight", 0.8, 1.2),
    FloatSuggestions("week_weight", 0.8, 1.2),
    FloatSuggestions("month_weight", 0.8, 1.2),
    FloatSuggestions("year_weight", 0.8, 1.2),
]

In [34]:
weighted_day_mean_wrapper = WeightedDayMeanModelWrapper(
    int_suggestions=weighted_day_mean_int_suggestions,
    float_suggestions=weighted_day_mean_float_suggestions,
)

In [35]:
weighted_day_mean_wrapper_dict = {
    WeightedDayMeanModelWrapper.__name__: weighted_day_mean_wrapper
}

#### Predictor

In [36]:
weighted_day_mean_optuna_kwargs = dict(
    n_trials=10,
    show_progress_bar=False,
    timeout=45,
)

In [37]:
weighted_day_mean_predictor = SalesPredictor(
    model_wrappers=weighted_day_mean_wrapper_dict,
    family_groups=FAMILY_GROUPS,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=weighted_day_mean_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=None,
)

#### Fit && Evaluate

In [None]:
weighted_day_mean_predictor = run_study(train_dataset, weighted_day_mean_predictor)

In [39]:
print("Train RMSLE:", weighted_day_mean_predictor.calc_total_fit_loss())

Train RMSLE: 0.42354329389170525


In [40]:
weighted_day_eval_loss = evaluate(train_df, weighted_day_mean_predictor)

100%|██████████| 98/98 [15:29<00:00,  9.49s/it]


In [41]:
print(f"Eval RMSLE: {weighted_day_eval_loss}")

Eval RMSLE: 0.4701158008427704


### Combined Stat Predictor

In [46]:
stat_wrappers_dict = {
    DailyMeanModelWrapper.__name__: daily_wrapper,
    DayOfWeekMeanModelWrapper.__name__: day_of_week_wrapper,
    WeightedDayMeanModelWrapper.__name__: weighted_day_mean_wrapper,
}

stat_optuna_kwargs = dict(
    n_trials=250,
    show_progress_bar=False,
    timeout=45,
)

In [48]:
combined_stat_predictor = SalesPredictor(
    model_wrappers=stat_wrappers_dict,
    family_groups=FAMILY_GROUPS,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=stat_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=None,
)

#### Fit && Evaluate

In [None]:
combined_stat_predictor = run_study(train_dataset, combined_stat_predictor)

In [50]:
print("Train RMSLE:", combined_stat_predictor.calc_total_fit_loss())

Train RMSLE: 0.41812641510037996


In [51]:
combined_stat_eval_loss = evaluate(train_df, combined_stat_predictor)

100%|██████████| 98/98 [05:05<00:00,  3.11s/it]


In [52]:
print(f"Eval RMSLE: {combined_stat_eval_loss}")

Eval RMSLE: 0.4653801165379105


### ProphetWrapper

In [11]:
prophet_int_suggestions = [
    IntSuggestions("n_changepoints", 20, 50),
]
prophet_float_suggestions = [
    FloatSuggestions("changepoint_prior_scale", 0.01, 0.5),
    FloatSuggestions("holidays_prior_scale", 5, 80),
    FloatSuggestions("seasonality_prior_scale", 5, 80),
]
prophet_categorical_suggestions = [
    CategoricalSuggestions("seasonality_mode", ["additive", "multiplicative"]),
]

prophet_base_params = {
    "daily_seasonality": False,
    "weekly_seasonality": True,
    "yearly_seasonality": True,
    "uncertainty_samples": False,
    "holidays": holidays_df,
}

In [12]:
prophet_wrapper = ProphetWrapper(
    initial=initial,
    extra_regressors=["dcoilwtico"],
    int_suggestions=prophet_int_suggestions,
    float_suggestions=prophet_float_suggestions,
    categorical_suggestions=prophet_categorical_suggestions,
    model_base_params=prophet_base_params,
)

In [None]:
prophet_wrapper_dict = {ProphetWrapper.__name__: prophet_wrapper}

In [None]:
prophet_optuna_kwargs = dict(
    n_trials=50,
    show_progress_bar=False,
    timeout=45,
)

In [None]:
prophet_predictor = SalesPredictor(
    model_wrappers=prophet_wrapper_dict,
    family_groups=FAMILY_GROUPS,
    inner_cutoffs=[-365, -180, -49, -17],
    optuna_optimize_kwargs=prophet_optuna_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=3,
    family_group_to_stores=family_group_to_stores,
    initial=initial,
)

## Analyze Loss Results && Select Top Loss Pairs

In [17]:
families = []
stores = []
mean_loss = []

for (store, family), losses in predictor.store_family_loss_storage.items():
    mean_loss.append(np.mean(losses))
    stores.append(store)
    families.append(family)

In [None]:
loss_df = pd.DataFrame({"store": stores, "family": families, "loss": mean_loss})

top_loss_df = loss_df.sort_values("loss", ascending=False)[:100]
top_loss_df

In [None]:
top_loss_family_to_store = {}

for family, stores in top_loss_df.groupby("family")["store"].apply(list).items():
    top_loss_family_to_store[(family,)] = stores

top_loss_family_to_store

## Fit Prophet For Top Loss Pairs

In [None]:
# top_loss_family_to_store.pop(('SCHOOL AND OFFICE SUPPLIES',))  # to speed up

top_loss_family_groups = [family for family in top_loss_family_to_store.keys()]

In [None]:
model_wrappers = {ProphetWrapper.__name__: prophet_wrapper}

optuna_optimize_kwargs = dict(
    n_trials=100,
    show_progress_bar=True,
    timeout=85,
)

outer_cutoffs_dates = [
    # "2016-08-16",
    "2017-05-15",
    "2017-07-31",
]
outer_cutoffs = [pd.Timestamp(date) for date in outer_cutoffs_dates]

top_loss_predictor = SalesPredictor(
    model_wrappers=model_wrappers,
    family_groups=top_loss_family_groups,
    inner_cutoffs=[-365, -180, -100, -49, -33, -17],
    optuna_optimize_kwargs=optuna_optimize_kwargs,
    n_group_store_family_choices=4,
    n_single_store_family_choices=4,
    family_group_to_stores=top_loss_family_to_store,
)

In [None]:
top_loss_dataset = make_time_series_dataset(
    train_df.copy(), top_loss_predictor.outer_cutoffs, 16
)

In [None]:
logger = logging.getLogger("cmdstanpy")
logger.setLevel(logging.ERROR)

top_loss_predictor = run_study(top_loss_dataset, top_loss_predictor)

## Analyze Loss Results

In [None]:
def make_plot(data_dict, loss_sample_name, title):
    x_name = "outer cv fold start"
    y_name = "loss"

    for i_group, samples in enumerate(data_dict.values()):
        plot_data = [
            [outer_cutoffs_dates[sample], loss]
            for sample, losses in samples[loss_sample_name].items()
            for loss in losses
        ]

        df = pd.DataFrame(plot_data, columns=[x_name, y_name])

        fig, ax = plt.subplots(figsize=(16, 8))
        sns.boxplot(x=x_name, y=y_name, data=df, showmeans=False, ax=ax)

        means = df.groupby(x_name)[y_name].mean().reset_index()
        sns.pointplot(
            x=x_name,
            y=y_name,
            data=means,
            color="red",
            linestyle="none",
            markers="o",
            estimator=np.mean,
            errorbar=None,
            ax=ax,
        )

        ax.set_yscale("log")
        ax.yaxis.set_major_formatter(ScalarFormatter())
        ax.yaxis.set_minor_formatter(ScalarFormatter())
        ax.ticklabel_format(style="plain", axis="y")

        ax.set_title(f"{title} - Group {FAMILY_GROUPS[i_group]}")
        plt.xticks(rotation=90)
        plt.tight_layout()

        plt.show()

In [None]:
plot_title = "Loss Distribution per Outer Fold Grouped by Family Pairs"
make_plot(predictor.tune_loss_storage, "fold_losses", plot_title)

## Make Submission
### Train Predictor

In [None]:
top_loss_predictor.fit(train_df, initial)

In [17]:
daily_predictor.fit(train_df)



  0%|          | 0/1782 [00:00<?, ?it/s][A[A

  0%|          | 1/1782 [00:00<11:40,  2.54it/s][A[A

 18%|█▊        | 326/1782 [00:00<00:01, 867.49it/s][A[A

 32%|███▏      | 572/1782 [00:00<00:00, 1306.38it/s][A[A

 51%|█████     | 909/1782 [00:00<00:00, 1888.81it/s][A[A

 71%|███████   | 1263/1782 [00:00<00:00, 2362.75it/s][A[A

100%|██████████| 1782/1782 [00:00<00:00, 1882.62it/s][A[A


In [None]:
# combine estimators
predictor.combine_with_predictor(top_loss_predictor)

### Predict && Save Submission

In [54]:
submission = load_submission()

In [18]:
predictions = daily_predictor.predict(test_df)

predictions.set_index("id", inplace=True)
submission["sales"] = predictions["yhat"]



  0%|          | 0/1782 [00:00<?, ?it/s][A[A

123
             ds  store_nbr      family       id  dcoilwtico      yhat
0    2017-08-16          1  AUTOMOTIVE  3000888       46.80  4.909091
1782 2017-08-17          1  AUTOMOTIVE  3002670       47.07  4.909091
3564 2017-08-18          1  AUTOMOTIVE  3004452       48.59  4.909091
5346 2017-08-19          1  AUTOMOTIVE  3006234       48.59  4.909091
7128 2017-08-20          1  AUTOMOTIVE  3008016       48.59  4.909091





AttributeError: 'NoneType' object has no attribute 'set_index'

In [56]:
submission_file_path = os.path.join(SUBMISSIONS_PATH, "combined_stat_submission.csv")
submission.to_csv(submission_file_path)