## Notebook Configuration & Imports

In [1]:
import prophet.diagnostics
%load_ext autoreload
%autoreload 2

In [168]:
import os
import itertools
import logging
from tqdm import tqdm

import pandas as pd
from prophet import Prophet

from storesales.constants import (
    SUBMISSIONS_PATH,
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_SAMPLE_SUBMISSION_PATH,
    EXTERNAL_TEST_PATH,
    EXTERNAL_OIL_PATH
)

## Load & Prepare Data

In [170]:
# Training data, ordered by date, then store, then product family.
original_train_df = pd.read_csv(EXTERNAL_TRAIN_PATH, parse_dates=["date"]).sort_values(
    by=["date", "store_nbr", "family"]
)

original_test_df = pd.read_csv(EXTERNAL_TEST_PATH, parse_dates=["date"])

sample_submission_df = pd.read_csv(EXTERNAL_SAMPLE_SUBMISSION_PATH, index_col="id")

# Oil prices: reindex to a daily frequency, carry the last known price
# forward over the inserted gaps, then drop rows that still have no value
# (e.g. leading rows with nothing to forward-fill from).
oil_df = (
    pd.read_csv(EXTERNAL_OIL_PATH, parse_dates=["date"])
    .set_index("date")
    .asfreq("D")
)
oil_df = oil_df.assign(dcoilwtico=oil_df["dcoilwtico"].ffill()).dropna()

In [4]:
# The training frame is loaded with a default integer index, so its index holds
# row labels rather than dates. Build the training period from the "date"
# column instead; wrapping in DatetimeIndex keeps an Index-like result.
train_period = pd.DatetimeIndex(original_train_df["date"].unique())

## Facebook Prophet

In [6]:
# Prophet expects the timestamp column to be called "ds" and the target "y".
train_df = original_train_df.loc[:, ["date", "store_nbr", "family", "sales"]].rename(
    columns={"date": "ds", "sales": "y"}
)

test_df = original_test_df.rename(columns={"date": "ds"})

# Work on a copy so the sample submission template stays untouched.
prophet_submission_df = sample_submission_df.copy()

In [7]:
# One group per (store, family) pair, built the same way for train and test.
series_keys = ["store_nbr", "family"]
train_groups = train_df.groupby(series_keys)
test_groups = test_df.groupby(series_keys)

In [8]:
# Distinct store numbers and product families, in order of first appearance.
stores = pd.unique(train_df["store_nbr"])
families = pd.unique(train_df["family"])

In [9]:
# Every (store, family) combination. Materialised as a list so the cell that
# iterates over it can be re-run: a bare itertools.product iterator is
# exhausted after one pass and would silently yield nothing the second time.
groups = list(itertools.product(stores, families))

# Exact number of series to fit; the previous "- 1" under-counted by one and
# made the progress bar total wrong.
total = len(groups)

In [None]:
# Only surface errors from the fitting libraries' loggers.
for noisy_logger_name in ("prophet", "cmdstanpy"):
    logging.getLogger(noisy_logger_name).setLevel(logging.ERROR)

In [12]:
# Fit one default Prophet model per (store, family) series and write its
# forecast into the submission frame.
for store_family_key in tqdm(groups, total=total):
    train_group = train_groups.get_group(store_family_key)
    # Reset the index so the forecast rows (0..n-1) line up positionally
    # with the test rows when "yhat" is assigned below.
    test_group = test_groups.get_group(store_family_key).reset_index(drop=True)

    m = Prophet()
    m.fit(train_group)

    forecast = m.predict(test_group)

    # Prophet's forecast is unbounded and can go negative; sales cannot, and a
    # negative prediction is invalid under a log-based metric, so floor at 0.
    test_group["yhat"] = forecast["yhat"].clip(lower=0)

    # Submission rows are keyed by "id".
    test_group.set_index("id", inplace=True)
    prophet_submission_df.loc[test_group.index, "sales"] = test_group["yhat"]

 99%|█████████▉| 1771/1782 [06:59<00:02,  4.22it/s]


In [13]:
# Write the Prophet forecasts into the submissions directory.
prophet_submission_file = "prophet_submission.csv"
prophet_submission_path = os.path.join(SUBMISSIONS_PATH, prophet_submission_file)
prophet_submission_df.to_csv(prophet_submission_path)

## Store nbr '1' & Family 'GROCERY I'

In [165]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import optuna
from prophet.diagnostics import cross_validation

In [126]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Predictions are floored at 0 before the logarithm is taken: ``log1p`` is
    undefined below -1, so negative forecasts would otherwise turn the whole
    score into NaN.

    Args:
        y_true: Array-like of observed, non-negative values.
        y_pred: Array-like of predicted values with the same length as
            ``y_true``; negative entries are treated as 0.

    Returns:
        The RMSLE as a float.
    """
    # Compare positionally on plain arrays, regardless of the input container.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.clip(np.asarray(y_pred, dtype=float), 0.0, None)

    log_error = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.square(log_error)))

In [99]:
# Row mask for the single series studied below: store 1, family "GROCERY I".
con = original_train_df["store_nbr"].eq(1) & original_train_df["family"].eq("GROCERY I")

# Keep only date and target, renamed to the "ds"/"y" columns Prophet expects.
train_df = original_train_df.loc[con, ["date", "sales"]].rename(
    columns={"date": "ds", "sales": "y"}
)

In [171]:
# Attach the oil price by date; the left join keeps every training row, with
# NaN wherever oil_df has no entry for that day.
train_df = pd.merge(train_df, oil_df, how="left", left_on="ds", right_index=True)

In [174]:
# Discard rows with any missing value (e.g. days with no oil price).
train_df = train_df.dropna()

In [176]:
# Chronological order is required for the time-series splits used below.
train_df = train_df.sort_values(by="ds")

In [177]:
# Set the cmdstanpy logger to WARNING for the tuning runs below.
cmdstanpy_logger = logging.getLogger("cmdstanpy")
cmdstanpy_logger.setLevel(logging.WARNING)

In [181]:
def inner_objective(trial: optuna.Trial, train):
    """Optuna objective: mean RMSLE of one Prophet configuration.

    Samples a set of Prophet hyperparameters from ``trial``, then evaluates
    them with a 3-fold ``TimeSeriesSplit`` (16 rows per validation fold) on
    ``train``. Every fold fits a fresh model with ``dcoilwtico`` as an extra
    regressor.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        train: Frame with ``ds``, ``y`` and ``dcoilwtico`` columns; rows are
            split by position, so it is expected to be in time order.

    Returns:
        The RMSLE averaged over the validation folds.
    """
    # Sampled in a fixed order; the keys double as Prophet keyword arguments.
    prophet_kwargs = {
        "yearly_seasonality": trial.suggest_categorical("yearly_seasonality", [True, False]),
        "weekly_seasonality": trial.suggest_categorical("weekly_seasonality", [True, False]),
        "daily_seasonality": trial.suggest_categorical("daily_seasonality", [True, False]),
        "changepoint_prior_scale": trial.suggest_float("changepoint_prior_scale", 0.001, 0.5),
        "seasonality_prior_scale": trial.suggest_int("seasonality_prior_scale", 1, 20),
        "seasonality_mode": trial.suggest_categorical(
            "seasonality_mode", ["additive", "multiplicative"]
        ),
    }

    splitter = TimeSeriesSplit(n_splits=3, test_size=16)
    fold_losses = []

    for fit_rows, holdout_rows in splitter.split(train):
        fit_part = train.iloc[fit_rows]
        holdout_part = train.iloc[holdout_rows]

        candidate = Prophet(**prophet_kwargs)
        candidate.add_regressor("dcoilwtico")
        candidate.fit(fit_part)

        # Predict the held-out dates, supplying the regressor values for them.
        predicted = candidate.predict(holdout_part[["ds", "dcoilwtico"]])

        fold_losses.append(rmsle(holdout_part["y"].values, predicted["yhat"].values))

    return np.mean(fold_losses)

In [182]:
# Nested time-series cross-validation: the outer folds estimate how well the
# tuned model generalises, while inner_objective tunes the hyperparameters
# using the outer-training rows only.
outer_tscv = TimeSeriesSplit(n_splits=5, test_size=16)
outer_results = []

for outer_train_index, outer_test_index in outer_tscv.split(train_df):
    outer_train, outer_test = train_df.iloc[outer_train_index], train_df.iloc[outer_test_index]

    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: inner_objective(trial, outer_train), n_trials=20, show_progress_bar=True)

    final_model = Prophet(**study.best_params)
    # The hyperparameters were tuned with the oil price as an extra regressor,
    # so the refit model must include it as well; otherwise the outer score
    # measures a different model from the one that was selected.
    final_model.add_regressor("dcoilwtico")
    final_model.fit(outer_train)

    future = outer_test[["ds", "dcoilwtico"]]
    forecast = final_model.predict(future)
    y_pred = forecast["yhat"].values
    y_true = outer_test["y"].values

    outer_rmsle = rmsle(y_true, y_pred)
    outer_results.append(outer_rmsle)

# Average the held-out scores across the outer folds.
final_outer_rmsle = np.mean(outer_results)
print(f'Final Outer RMSLE: {final_outer_rmsle}')

[I 2024-09-29 15:23:34,896] A new study created in memory with name: no-name-6e7ce9e8-e1e9-4725-bfc5-56f702588d9b


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-29 15:23:36,173] Trial 0 finished with value: 0.3465777019089246 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.3592928355122128, 'seasonality_prior_scale': 10, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.3465777019089246.
[I 2024-09-29 15:23:36,982] Trial 1 finished with value: 0.3381838247532267 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.17316238474220141, 'seasonality_prior_scale': 13, 'seasonality_mode': 'additive'}. Best is trial 1 with value: 0.3381838247532267.
[I 2024-09-29 15:23:37,465] Trial 2 finished with value: 0.32827562065065136 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.014184986440862704, 'seasonality_prior_scale': 13, 'seasonality_mode': 'additive'}. Best is trial 2 with value: 0.3282756

[I 2024-09-29 15:23:50,944] A new study created in memory with name: no-name-c1ca36c8-fe88-4688-a77f-db1c1b375b49


[I 2024-09-29 15:23:50,784] Trial 19 finished with value: 0.350255307146728 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': False, 'changepoint_prior_scale': 0.47016162926290417, 'seasonality_prior_scale': 8, 'seasonality_mode': 'additive'}. Best is trial 11 with value: 0.3255670146792222.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-29 15:23:52,571] Trial 0 finished with value: 0.240119054583418 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.47795937265636457, 'seasonality_prior_scale': 13, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.240119054583418.
[I 2024-09-29 15:23:53,295] Trial 1 finished with value: 0.25281575297159503 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.177315589877538, 'seasonality_prior_scale': 17, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.240119054583418.
[I 2024-09-29 15:23:53,839] Trial 2 finished with value: 0.4162004381149062 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': False, 'daily_seasonality': True, 'changepoint_prior_scale': 0.1606514656117522, 'seasonality_prior_scale': 1, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.2401190545834

[I 2024-09-29 15:24:08,906] A new study created in memory with name: no-name-ff1c25d3-6a60-4623-bed5-5b60bf56b826


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-29 15:24:10,078] Trial 0 finished with value: 0.15949147845035622 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': False, 'changepoint_prior_scale': 0.46710960630653275, 'seasonality_prior_scale': 10, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 0.15949147845035622.
[I 2024-09-29 15:24:11,265] Trial 1 finished with value: 0.17479183473395973 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.36257239239181566, 'seasonality_prior_scale': 6, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.15949147845035622.
[I 2024-09-29 15:24:11,872] Trial 2 finished with value: 0.19243696584152933 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': False, 'changepoint_prior_scale': 0.20497962999563912, 'seasonality_prior_scale': 16, 'seasonality_mode': 'additive'}. Best is trial 0 with value:

[I 2024-09-29 15:24:26,396] A new study created in memory with name: no-name-4cc3ffcf-4fec-4415-8360-ecd5e44c5a3b


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-29 15:24:28,415] Trial 0 finished with value: 0.11512718132470162 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.17806298215005506, 'seasonality_prior_scale': 20, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 0.11512718132470162.
[I 2024-09-29 15:24:29,117] Trial 1 finished with value: 0.3385463477857294 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': False, 'daily_seasonality': True, 'changepoint_prior_scale': 0.44300670603279046, 'seasonality_prior_scale': 7, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.11512718132470162.
[I 2024-09-29 15:24:29,576] Trial 2 finished with value: 0.3365813555113702 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.259727482138521, 'seasonality_prior_scale': 3, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0

[I 2024-09-29 15:24:47,178] A new study created in memory with name: no-name-42aa8ff3-daa9-492e-80ab-cea94177a7c2


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-29 15:24:49,797] Trial 0 finished with value: 0.11149145432437009 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.4181388883460627, 'seasonality_prior_scale': 12, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 0.11149145432437009.
[I 2024-09-29 15:24:50,714] Trial 1 finished with value: 0.13246287351758915 and parameters: {'yearly_seasonality': True, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.14484029442390622, 'seasonality_prior_scale': 6, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.11149145432437009.
[I 2024-09-29 15:24:52,612] Trial 2 finished with value: 0.13996643078685078 and parameters: {'yearly_seasonality': False, 'weekly_seasonality': True, 'daily_seasonality': True, 'changepoint_prior_scale': 0.42434129544046373, 'seasonality_prior_scale': 19, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with val