## Notebook Configuration && Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from darts import TimeSeries
from darts.models import LightGBMModel

from storesales.light_gbm.preprocessing import preprocess
from storesales.light_gbm.utils import save_submission
from storesales.light_gbm.loss import clipped_rmsle
from storesales.constants import (
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_TEST_PATH,
    TRAIN_TEST_SPLIT_DATE
)

## Load Data

In [3]:
train_df = pd.read_csv(EXTERNAL_TRAIN_PATH, parse_dates=["date"])
test_df = pd.read_csv(EXTERNAL_TEST_PATH, parse_dates=["date"])

In [4]:
train_end = train_df["date"].max()

In [5]:
preprocessed_df = preprocess(train_df)

In [6]:
preprocessed_df.fillna(0, inplace=True)

## Prepare Data Before Making Series

In [7]:
train_columns = ["date", "sales", "onpromotion", "store_nbr", "family"]

train_test_df = pd.concat([preprocessed_df, test_df], axis=0, ignore_index=True)
train_data = train_test_df[train_columns].copy()

In [8]:
# ensure that I have enough data to get lags

threshold_date = pd.Timestamp("2017-04-01")

min_dates = train_data.groupby(["family", "store_nbr"])["date"].min().reset_index()
valid_groups = min_dates[min_dates["date"] <= threshold_date]

lgb_train_data = pd.merge(
    train_data,
    valid_groups[["family", "store_nbr"]],
    on=["family", "store_nbr"],
    how="inner",
)

## Baseline Model
Some store-family pairs do not have enough data to get lags. I will use the mean sales as the prediction for these pairs.

In [9]:
not_valid_groups = min_dates[min_dates["date"] > threshold_date]

baseline_train_data = pd.merge(
    train_df,
    not_valid_groups[["family", "store_nbr"]],
    on=["family", "store_nbr"],
    how="inner",
)

In [33]:
mean_sales_df = (
    baseline_train_data.sort_values(by="date") 
      .groupby(["store_nbr", "family"])["sales"]
      .apply(lambda x: x.tail(14).mean()) 
      .reset_index(name="sales")  
)

In [35]:
mean_sales_df.head()

Unnamed: 0,store_nbr,family,sales
0,6,BABY CARE,0.0
1,22,LADIESWEAR,0.214286
2,52,AUTOMOTIVE,12.5
3,52,BEAUTY,9.785714
4,52,BEVERAGES,4425.214286


## Prepare Series

In [12]:
def get_series_and_id_dicts(df: pd.DataFrame):
    series_dict = {}
    series_id_dict = {}

    for family in df["family"].unique():
        series = TimeSeries.from_group_dataframe(
            df=df[df["family"] == family],
            time_col="date",
            value_cols="sales",
            group_cols="store_nbr",
            static_cols=None,
        )
        series_id = [
            {"store_nbr": s.static_covariates.store_nbr.iloc[0], "family": family}
            for s in series
        ]
        series_id_dict[family] = series_id

        series = [s.with_static_covariates(None) for s in series]

        series_dict[family] = series

    return series_dict, series_id_dict


def get_future_covariates_dict(df: pd.DataFrame):
    future_dict = {}

    for family in df["family"].unique():
        future_covariates = TimeSeries.from_group_dataframe(
            df=df[df["family"] == family],
            time_col="date",
            value_cols="onpromotion",
            group_cols="store_nbr",
        )
        future_dict[family] = [f.with_static_covariates(None) for f in future_covariates]

    return future_dict


def train_test_split(series: dict[str, list[TimeSeries]], split_date: pd.Timestamp):
    train_series = {}

    for family, series_list in series.items():
        train_series[family] = [s.drop_after(split_date) for s in series_list]

    return train_series

In [13]:
data_series_dict, data_series_id_dict = get_series_and_id_dicts(lgb_train_data)
future_covariates_dict = get_future_covariates_dict(lgb_train_data)

In [14]:
train_series_dict = train_test_split(
    data_series_dict, pd.Timestamp(TRAIN_TEST_SPLIT_DATE)
)

In [16]:
def fit_light_gb_models():
    light_gb_models = {}

    for family, series in data_series_dict.items():
        inputs = {
            "series": [s.drop_after(pd.Timestamp("2017-07-10")) for s in series],
            "future_covariates": future_covariates_dict[family],
        }
        light_gb_models[family] = LightGBMModel(
            lags=24, lags_future_covariates=(14, 1), force_col_wise=True
        )

        light_gb_models[family].fit(**inputs)

    return light_gb_models

In [17]:
models = fit_light_gb_models()

[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 81055, number of used features: 39
[LightGBM] [Info] Start training from score 6.600093
[LightGBM] [Info] Total Bins 6150
[LightGBM] [Info] Number of data points in the train set: 47003, number of used features: 39
[LightGBM] [Info] Start training from score 0.724151
[LightGBM] [Info] Total Bins 3281
[LightGBM] [Info] Number of data points in the train set: 81047, number of used features: 39
[LightGBM] [Info] Start training from score 4.016515
[LightGBM] [Info] Total Bins 7931
[LightGBM] [Info] Number of data points in the train set: 81057, number of used features: 39
[LightGBM] [Info] Start training from score 2574.898948
[LightGBM] [Info] Total Bins 5625
[LightGBM] [Info] Number of data points in the train set: 51940, number of used features: 24
[LightGBM] [Info] Start training from score 0.141593
[LightGBM] [Info] Total Bins 7545
[LightGBM] [Info] Number of data points in the train set: 81057

In [18]:
def evaluate():
    end_test = train_end - pd.DateOffset(days=15)
    test_period = pd.date_range(start=TRAIN_TEST_SPLIT_DATE, end=end_test, freq="D")

    losses = []
    for family, series in data_series_dict.items():
        family_losses = []
        for test_date in tqdm(test_period):
            inputs = {
                "series": [s.drop_after(test_date) for s in series],
                "future_covariates": future_covariates_dict[family],
            }

            preds = models[family].predict(n=16, **inputs)

            true_values = [s.slice_intersect(p) for p, s in zip(preds, series)]

            loss = np.mean(
                [
                    clipped_rmsle(t.values(), p.values())
                    for t, p in zip(true_values, preds)
                ]
            )

            family_losses.append(loss)

        family_loss = np.mean(family_losses)
        print(f"Family: {family}, Loss: {family_loss}")
        losses.append(family_loss)

    return np.mean(losses)

In [None]:
evaluate()

## Submission

In [36]:
test_df_copy = test_df.copy()
test_df_copy.sort_values(by=["date"], inplace=True)

In [37]:
test_df_copy = test_df_copy.merge(mean_sales_df, on=["store_nbr", "family"], how="left")

In [38]:
def make_predictions():
    sub_date = pd.Timestamp(train_end)
    
    for family, series in tqdm(data_series_dict.items()):
        inputs = {
            "series": [s.drop_after(sub_date) for s in series],
            "future_covariates": future_covariates_dict[family],
        }
        pred = models[family].predict(n=16, **inputs)
    
        for i, values in enumerate(pred):
            store_nbr = data_series_id_dict[family][i]["store_nbr"]
            con = (test_df_copy["store_nbr"] == store_nbr) & (
                test_df_copy["family"] == family
            )
            test_df_copy.loc[con, "sales"] = values.values()
    
    return test_df_copy

In [39]:
prediction = make_predictions()

prediction.isna().sum()

100%|██████████| 33/33 [00:08<00:00,  3.96it/s]


id             0
date           0
store_nbr      0
family         0
onpromotion    0
sales          0
dtype: int64

## Save Submission

In [40]:
save_submission(prediction, "light_gbm_with_14_days_mean_test.csv")

In [41]:
prediction.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion,sales
0,3000888,2017-08-16,1,AUTOMOTIVE,0,4.98563
1,3002082,2017-08-16,42,CELEBRATION,0,8.27521
2,3002081,2017-08-16,42,BREAD/BAKERY,12,466.089294
3,3002080,2017-08-16,42,BOOKS,0,0.971789
4,3002079,2017-08-16,42,BEVERAGES,19,1988.141171
