## Notebook Configuration && Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm

from darts import TimeSeries
from darts.models import LightGBMModel

from storesales.baseline.sales_predictor import SalesPredictor
from storesales.light_gbm.feature_engineering.ts_fresh_features import (
    make_roll,
    make_roll_features,
)
from storesales.light_gbm.data_series.target_series import get_series_and_id_dicts
from storesales.light_gbm.data_series.covariates import get_covariates_dicts
from storesales.light_gbm.preprocessing import preprocess
from storesales.light_gbm.utils import save_submission
from storesales.light_gbm.loss import clipped_rmsle
from storesales.constants import (
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_TEST_PATH,
    TRAIN_TEST_SPLIT_DATE,
    MODELS_PATH,
)

## Load && Preprocess Data

In [3]:
# Load the raw train and test tables, parsing the `date` column as datetimes.
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=["date"])
    for csv_path in (EXTERNAL_TRAIN_PATH, EXTERNAL_TEST_PATH)
)

In [4]:
preprocessed_df = preprocess(train_df)

In [5]:
# Replace every remaining NaN in the preprocessed frame with 0 (in place).
# NOTE(review): this zero-fills all columns, not only `sales` — confirm that is intended.
preprocessed_df.fillna(0, inplace=True)

## Prepare Data Before Making Series

In [50]:
# Latest date in the raw training data; `make_predictions` starts forecasting the day after it.
train_end = train_df["date"].max()

In [7]:
# Evaluation origins: one per day, from the train/test split date up to 16 days
# before the last training date (the evaluation forecasts use a 16-step horizon).
END_TEST_RANGE = train_df["date"].max() - pd.Timedelta(16, unit="D")
SERIES_TEST_RANGE = pd.date_range(
    start=TRAIN_TEST_SPLIT_DATE,
    end=END_TEST_RANGE,
    freq="D",
)

In [8]:
# Families modelled with LightGBM in this notebook.
families = [
    "FROZEN FOODS",
    "EGGS",
    "LIQUOR,WINE,BEER",
    "SCHOOL AND OFFICE SUPPLIES",
]
# Columns needed to build the target series and covariates.
train_columns = ["date", "sales", "onpromotion", "store_nbr", "family"]

# Stack the test rows under the preprocessed training rows, then keep only the
# selected families and columns.
train_test_df = pd.concat([preprocessed_df, test_df], ignore_index=True)
train_data = train_test_df.loc[
    train_test_df["family"].isin(families), train_columns
].copy()

### Ensure that I have enough data to get lags

In [9]:
# Store-family pairs whose first observation is on or before this date are modelled with
# LightGBM; pairs that start later are handled by the baseline model.
threshold_date = pd.Timestamp("2017-04-01")

In [10]:
# First observed date of every (family, store_nbr) pair.
min_dates = (
    train_data.groupby(["family", "store_nbr"])["date"]
    .min()
    .reset_index()
)
# Pairs that start early enough to be modelled with LightGBM.
lgb_groups = min_dates.loc[min_dates["date"].le(threshold_date)]

In [11]:
# Keep only the rows that belong to the LightGBM-eligible (family, store_nbr) pairs.
lgb_train_data = train_data.merge(
    lgb_groups[["family", "store_nbr"]],
    how="inner",
    on=["family", "store_nbr"],
)

In [12]:
lgb_train_data.head()

Unnamed: 0,date,sales,onpromotion,store_nbr,family
0,2013-01-02,246.0,0.0,1,EGGS
1,2013-01-03,203.0,0.0,1,EGGS
2,2013-01-04,171.0,0.0,1,EGGS
3,2013-01-05,177.0,0.0,1,EGGS
4,2013-01-06,85.0,0.0,1,EGGS


## Baseline Model
Some store-family pairs do not have enough history to build lag features, so the baseline model is used for those pairs.

### Prepare Data

In [13]:
# Pairs whose history starts after the threshold date: these fall back to the baseline.
baseline_groups = min_dates.loc[min_dates["date"].gt(threshold_date)]

In [14]:
# The baseline is fitted on, and predicts for, every store-family pair rather than only
# `baseline_groups`; the LightGBM forecasts later overwrite the rows they cover.
# `date`/`sales` are renamed to `ds`/`y` before the frames are handed to the predictor.
baseline_train_data = train_df.rename(columns={"date": "ds", "sales": "y"})
baseline_test_data = test_df.rename(columns={"date": "ds"})

In [15]:
baseline_groups

Unnamed: 0,family,store_nbr,date
51,EGGS,52,2017-04-20
105,FROZEN FOODS,52,2017-04-20
159,"LIQUOR,WINE,BEER",52,2017-04-20
213,SCHOOL AND OFFICE SUPPLIES,52,2017-04-20


### Load Model && Fit && Predict

In [16]:
# Load the previously saved baseline predictor from the models directory.
# NOTE(review): the file is a `.pkl`; if `SalesPredictor.load` unpickles it, only load
# files from a trusted source — confirm against the implementation.
model_file = os.path.join(MODELS_PATH, "daily_predictor.pkl")
baseline_predictor = SalesPredictor.load(model_file)

In [17]:
baseline_predictor.fit(baseline_train_data)

100%|██████████| 1782/1782 [00:00<00:00, 1950.28it/s]


In [18]:
baseline_prediction = baseline_predictor.predict(baseline_test_data)

100%|██████████| 1782/1782 [00:00<00:00, 3351.74it/s]


### Load Baseline Evaluation Loss

In [19]:
# Evaluation losses of the baseline, read from the CSV path stored on the predictor and
# indexed by (family, store_nbr).
predictor_eval_loss_df = pd.read_csv(
    baseline_predictor.eval_loss_csv, index_col=["family", "store_nbr"]
)

# Keep only the loss rows of the pairs that fall back to the baseline model.
baseline_loss_ids = pd.MultiIndex.from_frame(baseline_groups[["family", "store_nbr"]])
baseline_loss_df = predictor_eval_loss_df.loc[baseline_loss_ids]

In [20]:
# mean_sales_df = (
#     baseline_train_data.sort_values(by="date")
#     .groupby(["store_nbr", "family"])["sales"]
#     .apply(lambda x: x.tail(14).mean())
#     .reset_index(name="sales")
# )

In [21]:
# def evaluate_baseline(family, date):
#     family_store = not_valid_groups[not_valid_groups["family"] == family]["store_nbr"]
#     families_losses = []
#     for store_nbr in family_store.values:
#         family_store_data = baseline_train_data[
#             (baseline_train_data["family"] == family)
#             & (baseline_train_data["store_nbr"] == store_nbr)
#         ].copy()
#         family_store_data.rename(columns={"sales": "y"}, inplace=True)
#
#         baseline_train = family_store_data[family_store_data["date"] < date]
#         baseline_test_data = family_store_data[
#             (family_store_data["date"] >= date)
#             & (family_store_data["date"] <= date + pd.Timedelta(days=16))
#         ]
#
#         stat_model = DailyMeanModel(window=14)
#         stat_model.fit(baseline_train)
#         forecast = stat_model.predict(baseline_test_data)
#         families_losses.append(clipped_rmsle(forecast["y"], forecast["yhat"]))
#
#     return families_losses

In [22]:
# evaluate_baseline("AUTOMOTIVE", pd.Timestamp("2017-06-10"))

## Rolling Features
### Make Rolls

In [23]:
rolls_threshold_date = pd.Timestamp("2015-03-01")

In [24]:
# Restrict rolling-window construction to rows on or after `rolls_threshold_date`.
lgb_rolls_data = lgb_train_data.loc[
    lgb_train_data["date"].ge(rolls_threshold_date)
].copy()

In [25]:
all_train_rolls = make_roll(lgb_rolls_data)

Rolling: 100%|██████████| 30/30 [00:41<00:00,  1.38s/it]


In [26]:
all_train_rolls.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,sales,onpromotion,id
store_family,date_roll_id,date_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-EGGS,2015-03-31,2015-03-01,2015-03-01,99.0,0.0,"(1-EGGS, 2015-03-31 00:00:00)"
1-EGGS,2015-03-31,2015-03-02,2015-03-02,154.0,0.0,"(1-EGGS, 2015-03-31 00:00:00)"
1-EGGS,2015-03-31,2015-03-03,2015-03-03,164.0,0.0,"(1-EGGS, 2015-03-31 00:00:00)"
1-EGGS,2015-03-31,2015-03-04,2015-03-04,157.0,0.0,"(1-EGGS, 2015-03-31 00:00:00)"
1-EGGS,2015-03-31,2015-03-05,2015-03-05,134.0,0.0,"(1-EGGS, 2015-03-31 00:00:00)"


In [27]:
# Zero-fill NaNs in the rolled windows (in place) before feature extraction.
# NOTE(review): likely answer to the TODO — the test rows were concatenated under the training
# rows earlier and carry no `sales` values, so windows reaching into the test period contain
# NaN sales. Confirm before relying on the sales-based features.
all_train_rolls.fillna(0, inplace=True)  # TODO: why are NaNs here?

### Make Features

In [28]:
train_featured = make_roll_features(all_train_rolls)

Feature Extraction: 100%|██████████| 30/30 [02:06<00:00,  4.20s/it]


In [29]:
train_featured.head()

Unnamed: 0,date,sales__sum_values,sales__median,sales__mean,sales__length,sales__standard_deviation,sales__variance,sales__root_mean_square,sales__maximum,sales__absolute_maximum,...,onpromotion__mean,onpromotion__length,onpromotion__standard_deviation,onpromotion__variance,onpromotion__root_mean_square,onpromotion__maximum,onpromotion__absolute_maximum,onpromotion__minimum,store_nbr,family
0,2015-03-31,4525.0,152.0,145.967742,31.0,32.855975,1079.515088,149.619841,209.0,209.0,...,3.387097,31.0,8.804974,77.527575,9.433981,27.0,27.0,0.0,1,EGGS
1,2015-04-01,4598.0,153.0,148.322581,31.0,32.010469,1024.670135,151.737464,209.0,209.0,...,3.387097,31.0,8.804974,77.527575,9.433981,27.0,27.0,0.0,1,EGGS
2,2015-04-02,4574.0,152.0,147.548387,31.0,32.153702,1033.860562,151.011215,209.0,209.0,...,3.387097,31.0,8.804974,77.527575,9.433981,27.0,27.0,0.0,1,EGGS
3,2015-04-03,4478.0,152.0,144.451613,31.0,34.923731,1219.667014,148.613376,209.0,209.0,...,4.16129,31.0,9.500753,90.264308,10.372109,27.0,27.0,0.0,1,EGGS
4,2015-04-04,4465.0,150.0,144.032258,31.0,34.848505,1214.418314,148.188089,209.0,209.0,...,4.16129,31.0,9.500753,90.264308,10.372109,27.0,27.0,0.0,1,EGGS


## Prepare Series

In [30]:
# Split the extracted feature columns by the raw signal named in the column label.
sales_cols = list(filter(lambda name: "sales" in name, train_featured.columns))
onpromotion_cols = list(
    filter(lambda name: "onpromotion" in name, train_featured.columns)
)

In [31]:
def train_test_split(series: dict[str, list[TimeSeries]], split_date: pd.Timestamp):
    """Truncate every series of every family at ``split_date``.

    Args:
        series: Mapping from family name to the list of that family's series.
        split_date: Split point passed to ``drop_after`` on each series.

    Returns:
        A new dict with the same keys, whose values are lists of the truncated
        series. Only the training part is returned; no test part is produced.
    """
    return {
        family: [ts.drop_after(split_date) for ts in family_series]
        for family, family_series in series.items()
    }

In [32]:
static_columns = []

data_series_dict, data_series_id_dict = get_series_and_id_dicts(
    lgb_train_data, static_columns
)

In [33]:
df = data_series_dict["EGGS"][0].pd_dataframe()

In [34]:
df.index.max()

Timestamp('2017-08-31 00:00:00')

In [35]:
# onpromotion-derived feature columns are used as future covariates and sales-derived
# ones as past covariates. The raw "onpromotion" column is currently not included.
future_cols = onpromotion_cols
past_cols = list(sales_cols)

future_covariates, past_covariates = get_covariates_dicts(
    train_featured,
    future_cols,
    past_cols,
)

In [36]:
# Truncate every target series at the train/test split date.
# NOTE(review): `train_series_dict` is not referenced by the cells visible below —
# `fit_light_gb_models` truncates `data_series_dict` itself with its own date.
# Confirm whether this variable is still needed.
train_series_dict = train_test_split(
    data_series_dict, pd.Timestamp(TRAIN_TEST_SPLIT_DATE)
)

In [37]:
def fit_light_gb_models(train_end_date="2017-07-10"):
    """Fit one LightGBM forecasting model per product family.

    Every family in ``data_series_dict`` gets its own ``LightGBMModel`` trained
    on all of that family's store series at once, with the family's future
    covariates. Past covariates are currently not used.

    Args:
        train_end_date: Training cut-off, anything ``pd.Timestamp`` accepts.
            It is passed to ``drop_after`` on every target series before
            fitting. Defaults to the previously hard-coded "2017-07-10".

    Returns:
        dict mapping family name to its fitted ``LightGBMModel``.
    """
    # NOTE(review): if this cut-off is later than the first evaluation date in
    # SERIES_TEST_RANGE, evaluation windows overlap the training data — confirm.
    cutoff = pd.Timestamp(train_end_date)
    light_gb_models = {}

    for family, series in data_series_dict.items():
        model = LightGBMModel(
            lags=24,
            lags_future_covariates=(14, 1),
            force_col_wise=True,
        )
        model.fit(
            series=[s.drop_after(cutoff) for s in series],
            future_covariates=future_covariates[family],
        )
        light_gb_models[family] = model

    return light_gb_models

In [38]:
models = fit_light_gb_models()

[LightGBM] [Info] Total Bins 26199
[LightGBM] [Info] Number of data points in the train set: 42334, number of used features: 160
[LightGBM] [Info] Start training from score 191.717252
[LightGBM] [Info] Total Bins 26619
[LightGBM] [Info] Number of data points in the train set: 42334, number of used features: 160
[LightGBM] [Info] Start training from score 182.013495
[LightGBM] [Info] Total Bins 26244
[LightGBM] [Info] Number of data points in the train set: 42334, number of used features: 160
[LightGBM] [Info] Start training from score 97.718863
[LightGBM] [Info] Total Bins 25734
[LightGBM] [Info] Number of data points in the train set: 39243, number of used features: 160
[LightGBM] [Info] Start training from score 5.534289


In [39]:
from joblib import Parallel, delayed


def evaluate():
    """Backtest the fitted per-family models over ``SERIES_TEST_RANGE``.

    For every family and every evaluation date, each store series is truncated
    at that date, a 16-step forecast is produced, and the forecast is scored
    against the overlapping actual values with ``clipped_rmsle``.

    Returns:
        DataFrame indexed by (family, store_nbr) with one column per
        evaluation date (named "%Y.%m.%d"), holding the per-series loss.
    """

    def evaluate_family(family):
        """Compute the loss table (stores x evaluation dates) for one family."""
        family_series = data_series_dict[family]
        store_ids = [meta["store_nbr"] for meta in data_series_id_dict[family]]
        loss_index = pd.MultiIndex.from_product(
            [[family], store_ids], names=["family", "store_nbr"]
        )

        per_date_losses = []
        for origin in SERIES_TEST_RANGE:
            history = [ts.drop_after(origin) for ts in family_series]
            forecasts = models[family].predict(
                n=16,
                show_warnings=False,
                series=history,
                future_covariates=future_covariates[family],
            )

            scores = []
            for forecast, full_series in zip(forecasts, family_series):
                # Actual values restricted to the forecast's time span.
                actual = full_series.slice_intersect(forecast)
                scores.append(clipped_rmsle(actual.values(), forecast.values()))

            per_date_losses.append(
                pd.Series(scores, index=loss_index, name=origin.strftime("%Y.%m.%d"))
            )

        return pd.concat(per_date_losses, axis=1)

    # One joblib task per family. The progress bar wraps the task generator, so
    # it advances as tasks are dispatched rather than as they finish.
    family_frames = Parallel(n_jobs=-1)(
        delayed(evaluate_family)(family) for family in tqdm(data_series_dict.keys())
    )

    return pd.concat(family_frames)

In [40]:
family_losses = evaluate()

100%|██████████| 4/4 [00:00<00:00, 82.66it/s]


In [41]:
# Stack the LightGBM losses with the baseline losses of the pairs that fell back to the
# baseline, then order the rows by (family, store_nbr).
total_lgb_loss = pd.concat([family_losses, baseline_loss_df]).sort_index(
    level=["family", "store_nbr"]
)

In [42]:
# Combined LightGBM + fallback loss: average within each family, then across columns.
# NOTE(review): assumes the baseline loss CSV uses the same column labels as
# `family_losses`; otherwise the concat above pads with NaN, which `mean` skips — confirm.
grouped_loss = total_lgb_loss.groupby("family").mean()
lgb_series_loss = grouped_loss.mean(axis=1).rename("lgb_series_loss")

# The same aggregation for the baseline alone, over all families it was evaluated on.
predictor_grouped_loss = predictor_eval_loss_df.groupby("family").mean()
baseline_series_loss = predictor_grouped_loss.mean(axis=1).rename("baseline_loss")

# Side-by-side table; families without a LightGBM model get NaN in `lgb_series_loss`.
loss_df = pd.concat([lgb_series_loss, baseline_series_loss], axis=1)

In [43]:
loss_df

Unnamed: 0_level_0,lgb_series_loss,baseline_loss
family,Unnamed: 1_level_1,Unnamed: 2_level_1
EGGS,0.283927,0.386635
FROZEN FOODS,0.324788,0.399494
"LIQUOR,WINE,BEER",0.523027,0.708146
SCHOOL AND OFFICE SUPPLIES,0.553213,0.644878
AUTOMOTIVE,,0.549853
BABY CARE,,0.194055
BEAUTY,,0.574898
BEVERAGES,,0.275564
BOOKS,,0.07848
BREAD/BAKERY,,0.236766


In [44]:
total_prediction_eval_loss = loss_df["lgb_series_loss"].fillna(loss_df["baseline_loss"])
total_prediction_eval_loss.mean()

0.4039702702709875

In [45]:
loss_df["baseline_loss"].mean()

0.4177338467167706

## Submission

In [46]:
test_df_copy = baseline_prediction.copy()

In [47]:
# Index the baseline predictions by (ds, family, store_nbr) so that LightGBM forecasts
# can be written into the matching rows. The frame is rebuilt from `baseline_prediction`
# instead of being re-indexed in place, so this cell can be re-run without a KeyError.
test_df_copy = baseline_prediction.set_index(["ds", "family", "store_nbr"])

In [51]:
def make_predictions():
    """Combine baseline predictions with LightGBM forecasts for the submission.

    Starts from a copy of ``test_df_copy`` (baseline predictions indexed by
    (ds, family, store_nbr)) and, for every family with a fitted model,
    overwrites ``yhat`` with that model's 16-step forecast per store.

    Returns:
        A new DataFrame with the same index and columns as ``test_df_copy``.
        ``test_df_copy`` itself is left unmodified, so the function can be
        called repeatedly.

    Raises:
        KeyError: if a forecast row has no matching (ds, family, store_nbr)
            entry in the baseline frame.
    """
    # `drop_after` removes the split point itself (per the darts docs), so
    # shift by one day to keep the series up to and including `train_end`.
    sub_date = train_end + pd.Timedelta(days=1)

    # Work on a copy: mutating the module-level frame made re-runs fail once a
    # later cell had re-indexed the returned object.
    combined = test_df_copy.copy()

    for family, series in tqdm(data_series_dict.items()):
        forecasts = models[family].predict(
            n=16,
            series=[s.drop_after(sub_date) for s in series],
            future_covariates=future_covariates[family],
        )

        for i, forecast in enumerate(forecasts):
            store_nbr = int(data_series_id_dict[family][i]["store_nbr"])

            pred_df = forecast.pd_dataframe()
            pred_df[["family", "store_nbr"]] = [family, store_nbr]
            pred_df.set_index(["family", "store_nbr"], append=True, inplace=True)
            pred_df.index.names = ["ds", "family", "store_nbr"]

            # Regression output is unbounded, but sales cannot be negative and
            # a log-based metric is undefined there, so floor forecasts at 0.
            combined.loc[pred_df.index, "yhat"] = pred_df["sales"].clip(lower=0)

    return combined

In [52]:
prediction = make_predictions()

prediction.isna().sum()

100%|██████████| 4/4 [00:01<00:00,  3.13it/s]


id             0
onpromotion    0
yhat           0
dtype: int64

## Save Submission

In [234]:
prediction.set_index("id", inplace=True)

In [235]:
save_submission(prediction, "test_lgb_combined_with_baseline_test_range_fix.csv")

sales    0
dtype: int64


In [236]:
prediction.head()

Unnamed: 0_level_0,onpromotion,yhat
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3000888,0,4.8125
3002670,0,4.8125
3004452,0,4.8125
3006234,0,4.8125
3008016,0,4.8125
