In [1]:
%load_ext autoreload
%autoreload 2

In [73]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from darts import TimeSeries
from darts.models import LightGBMModel

from storesales.light_gbm.preprocessing import preprocess
from storesales.constants import EXTERNAL_TRAIN_PATH, EXTERNAL_TEST_PATH, EXTERNAL_SAMPLE_SUBMISSION_PATH

In [3]:
def clipped_rmsle(y_true, y_pred):
    """Root mean squared logarithmic error with negative predictions floored at 0.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values; anything below 0 is treated
            as 0 before the logarithm is taken.

    Returns:
        sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)) as a scalar.
    """
    floored_pred = np.clip(y_pred, 0, None)
    log_error = np.log1p(floored_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_error)))

In [40]:
# Load the raw train/test CSVs, parsing the "date" column as datetimes so it can
# be compared against timestamps and used as the time axis later on.
train_df = pd.read_csv(EXTERNAL_TRAIN_PATH, parse_dates=["date"])
test_df = pd.read_csv(EXTERNAL_TEST_PATH, parse_dates=["date"])

In [55]:
# Last date present in the raw training data.
train_end = train_df["date"].max()
# First forecast origin of the rolling evaluation; also the cut-off handed to
# train_test_split() further down.
train_test_split_date = "2017-05-10"

In [42]:
# Project-specific preprocessing of the raw training frame (see
# storesales.light_gbm.preprocessing.preprocess for the details).
preprocessed_df = preprocess(train_df)

In [60]:
# Replace every NaN left in the preprocessed frame with 0 (all columns, in place).
preprocessed_df.fillna(0, inplace=True)

In [61]:
# test_beverages_data_df = test_df[test_df["family"] == "BEVERAGES"]

In [62]:
# beverages_data_df["store_to_family"] = (
#     beverages_data_df[["store_nbr", "family"]].astype(str).agg("-".join, axis=1)
# )

# Columns kept for modelling: time axis, target ("sales"), the "onpromotion"
# covariate and the two group keys.
train_columns = ["date", "sales", "onpromotion", "store_nbr", "family"]

# Stack the test rows under the training rows so the series built from this frame
# (and their "onpromotion" values) also cover the test dates.
# NOTE(review): presumably test_df has no "sales" column, so its rows carry NaN
# sales after the concat — confirm.
train_test_df = pd.concat([preprocessed_df, test_df], axis=0, ignore_index=True)
train_data = train_test_df[train_columns].copy()

# train_data.rename(columns={"store_to_family": "id", "date": "time"}, inplace=True)

In [63]:
# Keep only the (family, store) groups whose first row is dated on or before the
# threshold; groups that start later are dropped entirely.
threshold_date = pd.Timestamp("2017-04-01")
group_keys = ["family", "store_nbr"]

min_dates = train_data.groupby(group_keys, as_index=False)["date"].min()
valid_groups = min_dates.loc[min_dates["date"] <= threshold_date]

train_data = train_data.merge(valid_groups[group_keys], on=group_keys, how="inner")

In [74]:
def get_series_and_id_dicts(df: pd.DataFrame):
    """Build the per-family target series together with their identifiers.

    For every distinct value of ``df["family"]`` the matching rows are turned
    into one "sales" series per ``store_nbr``.

    Args:
        df: Frame with "date", "sales", "store_nbr" and "family" columns.

    Returns:
        A ``(series_dict, series_id_dict)`` tuple, both keyed by family:
        ``series_dict[family]`` is the list of series with their static
        covariates removed, and ``series_id_dict[family]`` is a list, in the
        same order, of ``{"store_nbr": ..., "family": ...}`` dicts.
    """
    series_dict, series_id_dict = {}, {}

    for family in df["family"].unique():
        family_rows = df.loc[df["family"] == family]
        grouped = TimeSeries.from_group_dataframe(
            df=family_rows,
            time_col="date",
            value_cols="sales",
            group_cols="store_nbr",
            static_cols=None,
        )

        ids, stripped = [], []
        for store_series in grouped:
            # The group key is carried as a static covariate: read it for the
            # id record, then drop it from the series itself.
            store_nbr = store_series.static_covariates.store_nbr.iloc[0]
            ids.append({"store_nbr": store_nbr, "family": family})
            stripped.append(store_series.with_static_covariates(None))

        series_id_dict[family] = ids
        series_dict[family] = stripped

    return series_dict, series_id_dict

def get_future_covariates_dict(df: pd.DataFrame):
    """Build the per-family "onpromotion" covariate series.

    Args:
        df: Frame with "date", "onpromotion", "store_nbr" and "family" columns.

    Returns:
        Dict mapping each distinct family to the result of
        ``TimeSeries.from_group_dataframe`` on that family's rows, grouped by
        ``store_nbr`` with "onpromotion" as the value column.
    """
    return {
        family: TimeSeries.from_group_dataframe(
            df=df[df["family"] == family],
            time_col="date",
            value_cols="onpromotion",
            group_cols="store_nbr",
        )
        for family in df["family"].unique()
    }

def train_test_split(series: dict[str, list[TimeSeries]], split_date: pd.Timestamp):
    """Truncate every series at ``split_date`` to obtain the training part.

    Args:
        series: Dict mapping a family to its list of series.
        split_date: Cut-off handed to ``drop_after`` for every series.

    Returns:
        Dict with the same keys whose lists hold the truncated series. Only
        the training part is built; no held-out counterpart is returned.
    """
    return {
        family: [family_series.drop_after(split_date) for family_series in series_list]
        for family, series_list in series.items()
    }
        
    
# target = list(map(lambda x: x.drop_after(pd.Timestamp(train_test_split_date)), series))

# target_id = [{"store_nbr": t.static_covariates.store_nbr.iloc[0], "family": "BEVERAGES"} for t in target]
# target =  [t.with_static_covariates(None) for t in target]
# target_dict = {"BEVERAGES": [t.astype(np.float32) for t in target]}
# id_dict = {"BEVERAGES": target_id}

In [75]:
# Build, per family, the per-store "sales" series (plus the dicts identifying each
# series) and the matching "onpromotion" covariate series.
data_series_dict, data_series_id_dict = get_series_and_id_dicts(train_data)
future_covariates_dict = get_future_covariates_dict(train_data)

In [66]:
# Truncate every target series at the train/test split date.
# NOTE(review): no later cell shown here reads train_series_dict — the fitting cell
# builds its own truncated series from data_series_dict instead.
train_series_dict = train_test_split(data_series_dict, pd.Timestamp(train_test_split_date))

In [67]:
# future_covariates = TimeSeries.from_group_dataframe(
#     df=train_data,
#     time_col="date",
#     value_cols="onpromotion",
#     group_cols="store_nbr",
# )
# 
# future_covs = [f.with_static_covariates(None) for f in future_covs]
# future_dict = {"BEVERAGES": [f.astype(np.float32) for f in future_covs]}

In [77]:
# Fit one LightGBM forecaster per product family on all of that family's stores.
#
# NOTE(review): the fit cut-off below is later than train_test_split_date
# ("2017-05-10"), the first forecast origin used by evaluate(). Evaluation windows
# that start before this cut-off are therefore scored on data the models were
# fitted on — confirm which cut-off is intended.
fit_cutoff = pd.Timestamp("2017-07-10")

models = {}

for family, family_series in data_series_dict.items():
    fit_series = [s.drop_after(fit_cutoff) for s in family_series]

    models[family] = LightGBMModel(lags=24, lags_future_covariates=(14, 1), force_col_wise=True)
    models[family].fit(
        series=fit_series,
        future_covariates=future_covariates_dict[family],
    )

[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 81055, number of used features: 39
[LightGBM] [Info] Start training from score 6.600093
[LightGBM] [Info] Total Bins 6150
[LightGBM] [Info] Number of data points in the train set: 47003, number of used features: 39
[LightGBM] [Info] Start training from score 0.724151
[LightGBM] [Info] Total Bins 3281
[LightGBM] [Info] Number of data points in the train set: 81047, number of used features: 39
[LightGBM] [Info] Start training from score 4.016515
[LightGBM] [Info] Total Bins 7931
[LightGBM] [Info] Number of data points in the train set: 81057, number of used features: 39
[LightGBM] [Info] Start training from score 2574.898948
[LightGBM] [Info] Total Bins 5625
[LightGBM] [Info] Number of data points in the train set: 51940, number of used features: 24
[LightGBM] [Info] Start training from score 0.141593
[LightGBM] [Info] Total Bins 7545
[LightGBM] [Info] Number of data points in the train set: 81057

In [71]:
def evaluate(horizon: int = 16):
    """Rolling-origin evaluation of the fitted per-family models.

    For every family and every daily forecast origin from
    ``train_test_split_date`` onwards, each store series is cut at the origin,
    ``horizon`` steps are forecast, and the forecasts are scored against the
    observed values with ``clipped_rmsle``. Losses are averaged over stores,
    then over origins, then over families.

    Reads the notebook-level ``train_end``, ``train_test_split_date``,
    ``data_series_dict``, ``future_covariates_dict`` and ``models``.

    NOTE(review): the models are used exactly as found in ``models``; any
    origin that precedes the date they were fitted up to is scored in-sample.

    Args:
        horizon: Number of steps forecast from each origin. The default of 16
            reproduces the original hard-coded setup.

    Returns:
        Mean of the per-family losses.

    Raises:
        ValueError: If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError("horizon must be at least 1")

    # drop_after() removes the cut-off date itself, so a forecast made at
    # `test_date` covers test_date .. test_date + horizon - 1. The last usable
    # origin is therefore the one whose window ends exactly on `train_end`.
    end_test = train_end - pd.DateOffset(days=horizon - 1)
    test_period = pd.date_range(start=train_test_split_date, end=end_test, freq="D")

    losses = []
    for family, series in data_series_dict.items():
        # Per-family lookups do not change across origins; resolve them once.
        family_model = models[family]
        family_covariates = future_covariates_dict[family]

        family_losses = []
        for test_date in tqdm(test_period):
            preds = family_model.predict(
                n=horizon,
                series=[s.drop_after(test_date) for s in series],
                future_covariates=family_covariates,
            )

            # Observed values restricted to each forecast's time span.
            true_values = [s.slice_intersect(p) for p, s in zip(preds, series)]

            loss = np.mean([clipped_rmsle(t.values(), p.values()) for t, p in zip(true_values, preds)])

            family_losses.append(loss)

        family_loss = np.mean(family_losses)
        print(f"Family: {family}, Loss: {family_loss}")
        losses.append(family_loss)

    return np.mean(losses)


In [72]:
# Run the rolling evaluation; prints one loss per family and returns their mean.
evaluate()

100%|██████████| 83/83 [00:18<00:00,  4.58it/s]


Family: AUTOMOTIVE, Loss: 0.5281642619634838


100%|██████████| 83/83 [00:17<00:00,  4.77it/s]


Family: BABY CARE, Loss: 0.3606325200724416


100%|██████████| 83/83 [00:17<00:00,  4.74it/s]


Family: BEAUTY, Loss: 0.4727290957916864


100%|██████████| 83/83 [00:18<00:00,  4.55it/s]


Family: BEVERAGES, Loss: 0.19436634473106135


100%|██████████| 83/83 [00:18<00:00,  4.57it/s]


Family: BOOKS, Loss: 0.3669966828328123


100%|██████████| 83/83 [00:18<00:00,  4.60it/s]


Family: BREAD/BAKERY, Loss: 0.15671234733508216


100%|██████████| 83/83 [00:17<00:00,  4.84it/s]


Family: CELEBRATION, Loss: 0.568761454951625


 37%|███▋      | 31/83 [00:06<00:11,  4.50it/s]

KeyboardInterrupt



## Submission

In [86]:
# Date-sorted copy of the test frame; sort_values returns a new frame, so the
# original test_df is left untouched and the row labels are preserved.
test_df_copy = test_df.sort_values(by=["date"])

In [88]:
# Forecast the 16 days that follow the training data and write the values into
# the "sales" column of the test frame.
sub_date = pd.Timestamp(train_end)
# drop_after() removes the split point itself. Cutting at `sub_date` would end the
# history one day early, so the 16 forecasts would start ON the last training day
# and be written to test rows dated one day later. Cut at the following day instead
# (the series are daily) so the history includes `train_end`.
forecast_start = sub_date + pd.Timedelta(days=1)
sub_prediction = {}

for family, series in tqdm(data_series_dict.items()):
    inputs = {
        "series": [s.drop_after(forecast_start) for s in series],
        "future_covariates": future_covariates_dict[family]
    }
    prediction = models[family].predict(n=16, **inputs)

    for i, values in enumerate(prediction):
        store_nbr = data_series_id_dict[family][i]["store_nbr"]
        con = (test_df_copy["store_nbr"] == store_nbr) & (test_df_copy["family"] == family)
        # Flatten the (n, 1) forecast and floor it at 0, mirroring clipped_rmsle:
        # negative sales are meaningless.
        forecast = pd.Series(np.clip(values.values().ravel(), 0, None), index=values.time_index)
        # Align by date rather than by row order, so a forecast can never be
        # written to the wrong day; dates without a forecast stay NaN.
        test_df_copy.loc[con, "sales"] = test_df_copy.loc[con, "date"].map(forecast).to_numpy()

100%|██████████| 33/33 [00:08<00:00,  3.92it/s]


In [90]:
# Rows that received no forecast (and any other missing value in the frame) become 0.
test_df_copy = test_df_copy.fillna(0)

In [91]:
import os
from storesales.constants import SUBMISSIONS_PATH

In [92]:
# Load the sample submission to use as the template for the output file.
submission_df = pd.read_csv(EXTERNAL_SAMPLE_SUBMISSION_PATH)

In [93]:
# Column assignment aligns on index labels, not on position: test_df_copy was sorted
# by date but kept its original row labels, so each value lands on the row carrying
# the same label in submission_df.
# NOTE(review): this assumes the sample submission lists its rows in the same order
# as the test CSV — confirm, or join on an explicit key instead.
submission_df["sales"] = test_df_copy["sales"]

In [96]:
# Write the submission (without the index column) into the submissions directory.
file_path = os.path.join(SUBMISSIONS_PATH, "darts_light_gbm_test_submission.csv")
submission_df.to_csv(file_path, index=False)