In [None]:
import gc
import sys

In [None]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd

In [None]:
# Location of the project's source tree, relative to the kernel's working
# directory; it is appended to `sys.path` so that `package` can be imported.
src_dir = "../../src"

In [None]:
# Make the project sources importable. The membership check keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto `sys.path`.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
from package.constants import *
from package.datasets import *
from package.metrics import *
from package.model_selection import *
from package.utils import *

In [None]:
# Lift pandas' row-display limit so long outputs are shown in full.
pd.set_option("display.max_rows", None)

In [None]:
# Ensure the output directory (and any missing parents) exists before a model
# is written; an already-existing directory is not an error.
models_dir_path.mkdir(exist_ok=True, parents=True)

In [None]:
%%time
# Load the data frame used for training through the project helper.
# NOTE(review): `overwrite=True` presumably rebuilds the processed data instead
# of reusing a cached copy — confirm against the helper's definition.
train = load_processed(overwrite=True)

In [None]:
# Boolean row mask: True where `date` lies in the closed interval
# [train_start_date, validation_end_date].
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/135896
is_train = train["date"].between(train_start_date, validation_end_date)

In [None]:
# Sanity check: number of missing values per column among the selected rows.
train.loc[is_train].isna().sum()

In [None]:
%%time
# Build the dataset that the cross-validation loop subsets and the final model
# is trained on. The `sell_price` of the `is_train` rows is passed as `weight`.
# See https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/149754
# NOTE(review): `create_dataset` is a project helper; it presumably returns an
# `lgb.Dataset` (later cells call `.subset` on the result and hand it to
# `lgb.train`) — confirm against its definition.
dataset = create_dataset(
    train,
    is_train,
    features,
    transformed_target,
    categorical_feature=categorical_features,
    free_raw_data=False,
    weight=train.loc[is_train, "sell_price"],
)

In [None]:
# Passed to every fold's evaluator as `weight_start_d`.
weight_start_d = train_days + 1

# Fold splitter built from the dates of the selected rows; iterating it yields
# (train_index, valid_index) pairs.
n_folds = 3
folds = Folds(train.loc[is_train, "date"], n_folds=n_folds)

In [None]:
# The frame and the row mask are not referenced again; drop both names so the
# memory can be reclaimed.
del train, is_train

In [None]:
# Run a garbage-collection pass right after `train` and `is_train` were deleted.
gc.collect()

In [None]:
%%time
# Cross-validate over the date-based folds to choose the number of boosting
# rounds for the final model and to estimate its WRMSSE.


def _early_stopping_kwargs(stopping_rounds, log_period):
    """Return the `lgb.train` keyword arguments for early stopping and logging.

    `early_stopping_rounds=` and `verbose_eval=` were deprecated in LightGBM 3.3
    and removed in 4.0, where passing them raises `TypeError`. When the
    installed LightGBM provides `log_evaluation` (3.3+), the equivalent
    callbacks are returned; otherwise the original arguments are used so older
    installations behave exactly as before.

    Fresh callbacks are created on every call because an early-stopping
    callback keeps per-run state and should not be shared between trainings.

    Parameters
    ----------
    stopping_rounds : int
        Rounds without improvement after which training stops.
    log_period : int
        Print the evaluation results every `log_period` rounds.

    Returns
    -------
    dict
        Keyword arguments to splat into `lgb.train`.
    """
    if hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(
                    stopping_rounds,
                    # `lgb.train` used to forward this parameter itself when it
                    # built the callback from `early_stopping_rounds`.
                    first_metric_only=lgb_params.get("first_metric_only", False),
                ),
                lgb.log_evaluation(log_period),
            ]
        }
    return {"early_stopping_rounds": stopping_rounds, "verbose_eval": log_period}


best_iterations = []
best_scores = []

for i, (train_index, valid_index) in enumerate(folds):
    dtrain = dataset.subset(train_index)
    dvalid = dataset.subset(valid_index)
    # The evaluator's first argument advances by `evaluation_days` per fold and
    # equals `train_days + 1` on the last fold.
    evaluator = WRMSSEEvaluator(
        train_days - (n_folds - i - 1) * evaluation_days + 1,
        weight_start_d=weight_start_d,
        target_transform=True,
    )

    booster = lgb.train(
        lgb_params,
        dtrain,
        categorical_feature=categorical_features,
        feval=evaluator.feval,
        num_boost_round=10_000,
        valid_sets=[dvalid],
        **_early_stopping_kwargs(500, 100),
    )

    best_iterations.append(booster.best_iteration)
    best_scores.append(booster.best_score["valid_0"]["wrmsse"])

# `int()` truncates the mean number of rounds; it becomes `num_boost_round`
# for the final model.
best_iteration = int(np.mean(best_iterations))
best_score = np.mean(best_scores)

In [None]:
# Truncated mean of the per-fold best iterations; used as `num_boost_round`
# when the final model is trained.
best_iteration

In [None]:
# Mean of the per-fold best validation "wrmsse" scores.
best_score

In [None]:
%%time
# Fit the final model on the complete dataset (no validation set), using the
# number of rounds chosen by cross-validation.
model = lgb.train(
    params=lgb_params,
    train_set=dataset,
    num_boost_round=best_iteration,
)

In [None]:
# Persist the trained booster to `lgbm_reg_path`.
joblib.dump(value=model, filename=lgbm_reg_path)