## Notebook Configuration && Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [93]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from darts import TimeSeries
from darts.models import LightGBMModel

from storesales.light_gbm.feature_engineering.ts_fresh_features import make_roll, make_roll_features
from storesales.light_gbm.data_series.target_series import get_series_and_id_dicts
from storesales.light_gbm.data_series.covariates import get_covariates_dicts
from storesales.baseline.stat_models import DailyMeanModel
from storesales.light_gbm.preprocessing import preprocess
from storesales.light_gbm.utils import save_submission
from storesales.light_gbm.loss import clipped_rmsle
from storesales.constants import (
    EXTERNAL_TRAIN_PATH,
    EXTERNAL_TEST_PATH,
    TRAIN_TEST_SPLIT_DATE,
)

## Load Data

In [3]:
train_df = pd.read_csv(EXTERNAL_TRAIN_PATH, parse_dates=["date"])
test_df = pd.read_csv(EXTERNAL_TEST_PATH, parse_dates=["date"])

In [4]:
train_end = train_df["date"].max()

In [5]:
preprocessed_df = preprocess(train_df)

In [6]:
preprocessed_df.fillna(0, inplace=True)

## Prepare Data Before Making Series

In [7]:
families = ["FROZEN FOODS", "EGGS"]

train_columns = ["date", "sales", "onpromotion", "store_nbr", "family"]

train_test_df = pd.concat([preprocessed_df, test_df], axis=0, ignore_index=True)
train_data = train_test_df[train_test_df["family"].isin(families)][train_columns].copy()

In [8]:
# ensure that I have enough data to get lags

threshold_date = pd.Timestamp("2017-04-01")

min_dates = train_data.groupby(["family", "store_nbr"])["date"].min().reset_index()
valid_groups = min_dates[min_dates["date"] <= threshold_date]

lgb_train_data = pd.merge(
    train_data,
    valid_groups[["family", "store_nbr"]],
    on=["family", "store_nbr"],
    how="inner",
)

In [29]:
lgb_train_data.head()

Unnamed: 0,date,sales,onpromotion,store_nbr,family
0,2013-01-02,246.0,0.0,1,EGGS
1,2013-01-03,203.0,0.0,1,EGGS
2,2013-01-04,171.0,0.0,1,EGGS
3,2013-01-05,177.0,0.0,1,EGGS
4,2013-01-06,85.0,0.0,1,EGGS


In [9]:
# train_data[(train_data["store_nbr"] == 52) & (train_data["family"] == "BEVERAGES")][
#     "date"
# ].min()

NaT

## Baseline Model
Some store-family pairs do not have enough data to get lags. I will use the mean sales as the prediction for these pairs.

In [10]:
not_valid_groups = min_dates[min_dates["date"] > threshold_date]

baseline_train_data = pd.merge(
    train_df,
    not_valid_groups[["family", "store_nbr"]],
    on=["family", "store_nbr"],
    how="inner",
)

In [11]:
not_valid_groups

Unnamed: 0,family,store_nbr,date
51,EGGS,52,2017-04-20
105,FROZEN FOODS,52,2017-04-20


In [12]:
mean_sales_df = (
    baseline_train_data.sort_values(by="date")
    .groupby(["store_nbr", "family"])["sales"]
    .apply(lambda x: x.tail(14).mean())
    .reset_index(name="sales")
)

In [13]:
end_test_range = train_df["date"].max() - pd.Timedelta(days=16)
series_test_range = pd.date_range(TRAIN_TEST_SPLIT_DATE, end_test_range, freq="D")

In [14]:
def evaluate_baseline(family, date):
    family_store = not_valid_groups[not_valid_groups["family"] == family]["store_nbr"]
    families_losses = []
    for store_nbr in family_store.values:
        family_store_data = baseline_train_data[
            (baseline_train_data["family"] == family)
            & (baseline_train_data["store_nbr"] == store_nbr)
        ].copy()
        family_store_data.rename(columns={"sales": "y"}, inplace=True)

        baseline_train = family_store_data[family_store_data["date"] < date]
        baseline_test_data = family_store_data[
            (family_store_data["date"] >= date)
            & (family_store_data["date"] <= date + pd.Timedelta(days=16))
        ]

        stat_model = DailyMeanModel(window=14)
        stat_model.fit(baseline_train)
        forecast = stat_model.predict(baseline_test_data)
        families_losses.append(clipped_rmsle(forecast["y"], forecast["yhat"]))

    return families_losses

In [96]:
evaluate_baseline("AUTOMOTIVE", pd.Timestamp("2017-06-10"))

[0.5880354954259592]

## Rolling Features

In [15]:
train_columns = [
    "date", 
    "sales",
    "onpromotion", 
    "store_nbr", 
    "family"
]

In [34]:
lgb_train_data.head()

Unnamed: 0,date,sales,onpromotion,store_nbr,family
0,2013-01-02,246.0,0.0,1,EGGS
1,2013-01-03,203.0,0.0,1,EGGS
2,2013-01-04,171.0,0.0,1,EGGS
3,2013-01-05,177.0,0.0,1,EGGS
4,2013-01-06,85.0,0.0,1,EGGS


In [21]:
train_data_for_roll = lgb_train_data[train_columns].copy()

all_train_rolls = make_roll(train_data_for_roll)

Rolling: 100%|██████████| 30/30 [00:38<00:00,  1.28s/it]


In [22]:
all_train_rolls.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,sales,onpromotion,id
store_family,date_roll_id,date_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-EGGS,2013-02-01,2013-01-02,2013-01-02,246.0,0.0,"(1-EGGS, 2013-02-01 00:00:00)"
1-EGGS,2013-02-01,2013-01-03,2013-01-03,203.0,0.0,"(1-EGGS, 2013-02-01 00:00:00)"
1-EGGS,2013-02-01,2013-01-04,2013-01-04,171.0,0.0,"(1-EGGS, 2013-02-01 00:00:00)"
1-EGGS,2013-02-01,2013-01-05,2013-01-05,177.0,0.0,"(1-EGGS, 2013-02-01 00:00:00)"
1-EGGS,2013-02-01,2013-01-06,2013-01-06,85.0,0.0,"(1-EGGS, 2013-02-01 00:00:00)"


In [23]:
min_data = pd.Timestamp("2015-05-01")

In [24]:
all_train_rolls = all_train_rolls[
    all_train_rolls.index.get_level_values("date_id") >= min_data
]

In [25]:
all_train_rolls.isna().sum()

date               0
sales          14416
onpromotion        0
id                 0
dtype: int64

In [26]:
all_train_rolls.fillna(0, inplace=True)

In [110]:
train_featured = make_roll_features(all_train_rolls)

Feature Extraction: 100%|██████████| 30/30 [01:03<00:00,  2.13s/it]


In [111]:
train_featured.head()

Unnamed: 0,date,sales__sum_values,sales__median,sales__mean,sales__length,sales__standard_deviation,sales__variance,sales__root_mean_square,sales__maximum,sales__absolute_maximum,...,onpromotion__mean,onpromotion__length,onpromotion__standard_deviation,onpromotion__variance,onpromotion__root_mean_square,onpromotion__maximum,onpromotion__absolute_maximum,onpromotion__minimum,store_nbr,family
0,2015-05-01,75.0,75.0,75.0,1.0,0.0,0.0,75.0,75.0,75.0,...,24.0,1.0,0.0,0.0,24.0,24.0,24.0,24.0,1,EGGS
1,2015-05-02,251.0,125.5,125.5,2.0,50.5,2550.25,135.279341,176.0,176.0,...,12.0,2.0,12.0,144.0,16.970563,24.0,24.0,0.0,1,EGGS
2,2015-05-03,337.0,86.0,112.333333,3.0,45.242556,2046.888889,121.101885,176.0,176.0,...,8.0,3.0,11.313708,128.0,13.856406,24.0,24.0,0.0,1,EGGS
3,2015-05-04,500.0,124.5,125.0,4.0,44.905456,2016.5,132.821309,176.0,176.0,...,6.0,4.0,10.392305,108.0,12.0,24.0,24.0,0.0,1,EGGS
4,2015-05-05,655.0,155.0,131.0,5.0,41.918969,1757.2,137.543448,176.0,176.0,...,4.8,5.0,9.6,92.16,10.733126,24.0,24.0,0.0,1,EGGS


In [112]:
sales_cols = [col for col in train_featured.columns if "sales" in col]
onpromotion_cols = [col for col in train_featured.columns if "onpromotion" in col]

In [113]:
sales_cols

['sales__sum_values',
 'sales__median',
 'sales__mean',
 'sales__length',
 'sales__standard_deviation',
 'sales__variance',
 'sales__root_mean_square',
 'sales__maximum',
 'sales__absolute_maximum',
 'sales__minimum']

## Prepare Series

In [114]:
def train_test_split(series: dict[str, list[TimeSeries]], split_date: pd.Timestamp):
    train_series = {}

    for family, series_list in series.items():
        train_series[family] = [s.drop_after(split_date) for s in series_list]

    return train_series

In [115]:
static_columns = []

data_series_dict, data_series_id_dict = get_series_and_id_dicts(lgb_train_data, static_columns)

In [116]:
# future_cols = ["onpromotion"] + onpromotion_cols
future_cols = onpromotion_cols
past_cols = [] + sales_cols

future_covariates, past_covariates = get_covariates_dicts(train_featured, future_cols, past_cols)

In [117]:
train_series_dict = train_test_split(
    data_series_dict, pd.Timestamp(TRAIN_TEST_SPLIT_DATE)
)

In [166]:
def fit_light_gb_models():
    light_gb_models = {}

    for family, series in data_series_dict.items():
        inputs = {
            "series": [s.drop_after(pd.Timestamp("2017-07-10")) for s in series],
            "future_covariates": future_covariates[family],
            # "past_covariates": past_covariates[family],
        }
        light_gb_models[family] = LightGBMModel(
            lags=24, 
            lags_future_covariates=(14, 1), 
            # lags_past_covariates=list(range(-16, -20, -1)), 
            force_col_wise=True
        )

        light_gb_models[family].fit(**inputs)

    return light_gb_models

In [167]:
models = fit_light_gb_models()

[LightGBM] [Info] Total Bins 26688
[LightGBM] [Info] Number of data points in the train set: 40877, number of used features: 175
[LightGBM] [Info] Start training from score 191.374768
[LightGBM] [Info] Total Bins 27068
[LightGBM] [Info] Number of data points in the train set: 40877, number of used features: 175
[LightGBM] [Info] Start training from score 183.462539


In [168]:
from joblib import Parallel, delayed


def evaluate():
    def evaluate_family(family):
        series = data_series_dict[family]
        family_losses = []

        for test_date in series_test_range:
            inputs = {
                "series": [s.drop_after(test_date) for s in series],
                "future_covariates": future_covariates[family],
                # "past_covariates": past_covariates[family],
            }

            preds = models[family].predict(n=16, show_warnings=False, **inputs)
            true_values = [s.slice_intersect(p) for p, s in zip(preds, series)]

            loss = [
                clipped_rmsle(t.values(), p.values())
                for t, p in zip(true_values, preds)
            ] + evaluate_baseline(family, test_date)

            family_losses.append(np.mean(loss))

        return {family: family_losses}

    losses = Parallel(n_jobs=-1)(
        delayed(evaluate_family)(family) for family in tqdm(data_series_dict.keys())
    )

    return losses

In [169]:
family_losses = evaluate()

100%|██████████| 2/2 [00:00<00:00, 2414.68it/s]


In [164]:
np.mean(family_losses[0]["EGGS"])

0.2820981357846141

In [170]:
np.mean(family_losses[1]["FROZEN FOODS"])

0.3261847704584309

## Submission

In [72]:
test_df_copy = test_df.copy()
test_df_copy.sort_values(by=["date"], inplace=True)

In [73]:
test_df_copy = test_df_copy.merge(mean_sales_df, on=["store_nbr", "family"], how="left")

In [74]:
test_df_copy.set_index("id", inplace=True)

In [75]:
def make_predictions():
    sub_date = pd.Timestamp(train_end)

    for family, series in tqdm(data_series_dict.items()):
        inputs = {
            "series": [s.drop_after(sub_date) for s in series],
            "future_covariates": future_covariates_dict[family],
        }
        pred = models[family].predict(n=16, **inputs)

        for i, values in enumerate(pred):
            store_nbr = data_series_id_dict[family][i]["store_nbr"]
            con = (test_df_copy["store_nbr"] == store_nbr) & (
                test_df_copy["family"] == family
            )
            test_df_copy.loc[con, "sales"] = values.values()

    return test_df_copy

In [76]:
prediction = make_predictions()

prediction.isna().sum()

100%|██████████| 33/33 [00:08<00:00,  4.06it/s]


date           0
store_nbr      0
family         0
onpromotion    0
sales          0
dtype: int64

## Save Submission

In [79]:
save_submission(prediction, "test_fix.csv")

sales    0
dtype: int64


In [78]:
prediction.head()

Unnamed: 0_level_0,date,store_nbr,family,onpromotion,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0,4.98563
3002082,2017-08-16,42,CELEBRATION,0,8.27521
3002081,2017-08-16,42,BREAD/BAKERY,12,466.089294
3002080,2017-08-16,42,BOOKS,0,0.971789
3002079,2017-08-16,42,BEVERAGES,19,1988.141171
