# LightGBM Starter Model

In [None]:
import random
import warnings

import lightgbm as lgb
import numpy as np
import polars as pl
import utilsforecast.processing as ufp
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, RollingStd
from mlforecast.lgb_cv import LightGBMCV
from mlforecast.target_transforms import LocalRobustScaler
from utilsforecast.plotting import plot_series
from src.data import load_full_data, process_wide_df

In [None]:
# Silence a few known, noisy warnings that show up while running mlforecast.
# Each pattern is a regular expression checked against the warning message;
# the filters are registered in the order listed.
for _ignored_message in (
    r".*Found null values.*",
    r"The following series are too short.*",
    r"Could not find the number of physical cores*",
):
    warnings.filterwarnings("ignore", message=_ignored_message)

def comp_loss(y_true, y_pred, ids, dates):
    """Return the evaluation score used to compare forecasts (lower is better).

    The score is ``(|sum(error)| + sum(|error|)) / sum(y_true)`` with
    ``error = y_pred - y_true``, i.e. absolute bias plus total absolute
    error, normalised by the total of the actual values.

    Args:
        y_true: Actual values; must support ``.sum()`` and element-wise
            subtraction (e.g. a NumPy array or pandas Series).
        y_pred: Predicted values, aligned with ``y_true``.
        ids: Unused. Kept because this function is passed as ``metric`` to
            ``LightGBMCV.fit``.
        dates: Unused, kept for the same reason as ``ids``.

    Returns:
        The normalised score, or ``0.0`` when the actual values sum to zero
        (the ratio is undefined in that case).
    """
    actual_total = y_true.sum()
    residuals = y_pred - y_true
    bias = abs(residuals.sum())
    total_abs_error = np.abs(residuals).sum()

    # Guard against dividing by zero when there is no actual volume at all.
    if actual_total == 0:
        return 0.0
    return (bias + total_abs_error) / actual_total

def get_zero_var_ids(df: pl.DataFrame, n: int = 2) -> pl.DataFrame:
    """Find the series whose last ``n`` rows of ``y`` have zero standard deviation.

    Args:
        df: Frame with a ``unique_id`` column and a ``y`` column. The trailing
            rows are taken in the frame's existing row order, so ``df`` is
            presumably expected to be sorted by time within each id.
        n: Number of trailing rows per ``unique_id`` to inspect.

    Returns:
        One row per matching ``unique_id`` with two extra columns: ``std``
        (the standard deviation of the trailing ``y`` values, equal to 0 for
        every returned row) and ``fill`` (the last of those ``y`` values).
    """
    # Keep only the trailing n rows of every series.
    trailing = df.group_by('unique_id').tail(n)

    # Summarise each series' trailing window.
    per_series = trailing.group_by('unique_id').agg(
        pl.col('y').std().alias('std'),
        pl.col('y').last().alias('fill'),
    )

    # Series whose trailing values are all identical have a std of exactly 0.
    return per_series.filter(pl.col('std') == 0)

In [None]:
# Load the raw data, then run it through the project's preprocessing helper.
# Later cells read the `unique_id`, `ds` and `y` columns of `sales`.
raw_sales = load_full_data()
sales = process_wide_df(raw_sales)

# pandas copy of the same frame for the pandas-based steps below.
sales_pd = sales.to_pandas()

In [None]:
# Plot at most 10 series; a fresh random seed makes every run show a different sample.
sample_seed = random.randint(0, 2000)
plot_series(sales, max_ids=10, seed=sample_seed)

In [None]:
# Total `y` across all series per week (weeks anchored on Monday), drawn as a line.
weekly_bins = sales_pd.set_index("ds").resample("W-MON")
weekly_total = weekly_bins["y"].sum()
weekly_total.plot()

# Remove Leading Zeros

In [None]:
# Example series that is inspected before and after leading zeros are removed.
_id = "24-226-1737"
is_example_series = pl.col("unique_id").eq(_id)
plot_series(sales.filter(is_example_series))

In [None]:
# Order rows by series and time so the cumulative flag below runs forward in time.
sales = sales.sort('unique_id', 'ds')

# Per series, the flag turns True at the first row with y > 0 and stays True
# afterwards (cumulative max of a 0/1 indicator).
has_started = (
    pl.col('y')
    .gt(0)
    .cast(pl.Int64)
    .cum_max()
    .over('unique_id')
    .cast(pl.Boolean)
)
without_leading_zeros = sales.select(has_started).get_column('y')

# Keep only the rows from each series' first positive value onwards.
sales = sales.filter(without_leading_zeros)

trimmed_seed = random.randint(0, 2000)
plot_series(sales, max_ids=10, seed=trimmed_seed)

In [None]:
# Same example series as before, now taken from the trimmed `sales` frame.
example_series = sales.filter(pl.col("unique_id").eq(_id))
plot_series(example_series)

# LightGBM Cross-Validation

In [None]:
%%time

h = 13  # number of weeks to predict
rmean4 = RollingMean(window_size=4, min_samples=1)
rstd4 = RollingStd(window_size=4, min_samples=2)

lgb_cv = LightGBMCV(
    freq='7d',
    lags=[13, 52],
    lag_transforms={
        1 : [
            ExpandingMean(),
            rmean4,
            rstd4,
            RollingMean(window_size=13, min_samples=1),
            RollingMean(window_size=26, min_samples=1),
            RollingMean(window_size=52, min_samples=1),
        ],
        **{
            k: [rmean4, rstd4, RollingMean(window_size=52-k, min_samples=1)]
            for k in [4, 8, 13]
        }
    },
    target_transforms=[LocalRobustScaler(scale='iqr')],
    date_features=['year', 'month'],
    num_threads=4,
)
cv_hist = lgb_cv.fit(
    sales_pd,
    n_windows=4,
    h=h,
    dropna=False,
    metric=comp_loss,
    num_iterations=10_000,
    params={'verbosity': -1, 'learning_rate': 0.2, 'num_leaves': 128},
    early_stopping_evals=5,
)

In [None]:
# Build the final forecaster from the cross-validation object and train it
# on the full (trimmed) history.
mlf = MLForecast.from_cv(lgb_cv)
mlf.fit(sales, dropna=False)

# Forecast h steps ahead and give the model column a neutral name.
raw_preds = mlf.predict(h)
preds = raw_preds.rename({'LGBMRegressor': 'preds'})

In [None]:
# History and forecast for a random handful of series (at most 4).
forecast_seed = random.randint(0, 234972353)
plot_series(sales, preds, seed=forecast_seed, max_ids=4)

# Manual Overrides

Replace the forecast of every time series whose last 2 weeks have zero variance (i.e. identical values) with a naive forecast that repeats the last observed value.

In [None]:
# Series whose last 2 observations are identical, with the value to repeat (`fill`).
df_replace = get_zero_var_ids(sales, n=2)

# Attach `std`/`fill` to the forecasts; series without an override get nulls.
preds_with_fill = preds.join(df_replace, on=['unique_id'], how='left')

# Prefer the naive fill value where one exists, otherwise keep the model forecast.
overridden = pl.coalesce(pl.col('fill'), pl.col('preds'))
preds = preds_with_fill.with_columns(preds=overridden).drop('std', 'fill')

In [None]:
# History and (possibly overridden) forecast for up to 12 random series.
override_seed = random.randint(0, 234972353)
plot_series(sales, preds, seed=override_seed, max_ids=12)

In [None]:
preds_pd = preds.to_pandas()

# `unique_id` is split on '-' into its three parts: Client, Warehouse, Product.
id_parts = preds_pd['unique_id'].str.split('-', expand=True)
preds_pd[['Client', 'Warehouse', 'Product']] = id_parts

# Forecasts are floored at zero.
preds_pd['preds'] = preds_pd['preds'].clip(lower=0)
preds_pd

In [None]:
key_cols = ['Client', 'Warehouse', 'Product']

# One row per (Client, Warehouse, Product) and one column per forecast date.
subm = preds_pd.drop(columns=['unique_id']).pivot(index=key_cols, columns=['ds'])
subm = subm.reset_index()

# Flatten the two-level column index left behind by the pivot.
subm.columns = subm.columns.droplevel()
subm.columns.name = None

# Key columns keep their names; date columns become 'YYYY-MM-DD' strings.
date_labels = [stamp.strftime('%Y-%m-%d') for stamp in subm.columns[3:]]
subm.columns = key_cols + date_labels

# The keys were split out of a string id, so cast them to integers before sorting.
subm = subm.astype({col: 'int64' for col in key_cols}).sort_values(key_cols)
subm

In [None]:
from datetime import date, timedelta

# Hold out every observation on or after `val_date` for validation.
val_date = date(2023, 10, 1)
train = sales.filter(pl.col("ds") < val_date)
val = sales.filter(pl.col("ds") >= val_date)

# Forecast horizon = number of weekly steps from the first to the last
# validation timestamp, inclusive. Measuring from `val_date` itself instead
# counts the gaps rather than the timestamps and drops the final validation
# week whenever `val_date` is close to the first validation timestamp.
val_span = val["ds"].max() - val["ds"].min()
val_weeks = round(val_span / timedelta(weeks=1)) + 1

# Refit the cross-validated configuration on the training part only.
mlf = MLForecast.from_cv(lgb_cv)
mlf.fit(train, dropna=False)

preds_val = mlf.predict(val_weeks).rename({'LGBMRegressor': 'val'})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def _sum_by_date(frame, value_col):
    """Sum `value_col` over all series per `ds` and return a pandas frame."""
    return frame.group_by("ds").agg(pl.col(value_col).sum()).to_pandas()


f, ax = plt.subplots(1, 1)

# Aggregate totals per date for the forecast, the validation and the training data.
dfpred = _sum_by_date(preds_val, "val")
dfval = _sum_by_date(val, "y")
dftrn = _sum_by_date(train, "y")

# Draw the three curves on the same axes: prediction first, then train, then validation.
for curve, value_col, legend_label in (
    (dfpred, "val", "Prediction"),
    (dftrn, "y", "Train"),
    (dfval, "y", "Validation"),
):
    sns.lineplot(x=curve["ds"], y=curve[value_col], label=legend_label, ax=ax)

# Leave the axes as the cell's final value, as the last lineplot call did.
ax