In [None]:
#|default_exp lgb_cv

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

# LightGBMCV

In [None]:
#|export
import copy
import os
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import clone

from mlforecast import Forecast, TimeSeries
from mlforecast.utils import backtest_splits

In [None]:
#| exporti
class EarlyStopException(BaseException):
    """Signal raised to leave the training loop once the early stopping criterion is met.

    It derives from `BaseException` rather than `Exception`, so handlers written as
    `except Exception` do not catch it.
    """

def _update(bst, n):
    for _ in range(n):
        bst.update()

def _predict(ts, bst, valid, h, time_col, target_col, dynamic_dfs, predict_fn, **predict_fn_kwargs):
    preds = ts.predict(bst, h, dynamic_dfs, predict_fn, **predict_fn_kwargs).set_index(time_col, append=True)
    preds = preds.join(valid)
    preds['sq_err'] = (preds['Booster'] - preds[target_col]) ** 2
    rmse = preds.groupby(level=0, observed=True)['sq_err'].mean().pow(0.5).mean()
    return rmse

def _update_and_predict(ts, bst, valid, n, h, time_col, target_col, dynamic_dfs, predict_fn, **predict_fn_kwargs):
    """Train `bst` for `n` more steps with `_update`, then return the score `_predict` gives it on `valid`."""
    _update(bst, n)
    score = _predict(
        ts,
        bst,
        valid,
        h,
        time_col,
        target_col,
        dynamic_dfs,
        predict_fn,
        **predict_fn_kwargs,
    )
    return score

In [None]:
#| export
class LightGBMCV:
    """Cross validation of LightGBM boosters over backtest windows.

    One booster is trained per window. Every `eval_every` iterations each booster
    forecasts its complete validation window and the per-window RMSEs are combined
    with `weights`; training stops early when that score stops improving.
    """

    def __init__(
        self,
        freq: Optional[str] = None,  # pandas offset alias, e.g. D, W, M
        lags: List[int] = [],  # list of lags to use as features
        lag_transforms: Dict[int, List[Tuple]] = {},  # list of transformations to apply to each lag
        date_features: List[str] = [],  # list of names of pandas date attributes to use as features, e.g. dayofweek
        num_threads: int = 1,  # number of threads to use when computing the predictions of each window.
    ):
        self.num_threads = num_threads
        # NOTE(review): the trailing 1 is passed positionally; presumably TimeSeries' own num_threads — confirm.
        self.ts = TimeSeries(freq, lags, lag_transforms, date_features, 1)

    def _should_stop(self, hist, early_stopping_evals, early_stopping_pct):
        """Tell whether the last score improved too little over the one `early_stopping_evals` evaluations back.

        `hist` holds `(rounds, score)` tuples. Returns False while there are fewer than
        `early_stopping_evals + 1` entries.
        """
        if len(hist) < early_stopping_evals + 1:
            return False
        baseline = hist[-(early_stopping_evals + 1)][1]
        if baseline == 0:
            # An error of zero can't be improved on; also avoids dividing by zero below.
            return True
        improvement_pct = 1 - hist[-1][1] / baseline
        return improvement_pct < early_stopping_pct

    def _booster_threads(self) -> int:
        """Threads given to each booster: the available cores split across `num_threads`, never less than 1."""
        # os.cpu_count() may return None, and the floor division yields 0 when
        # num_threads exceeds the core count; both cases fall back to 1.
        available = os.cpu_count() or 1
        return max(1, available // self.num_threads)

    def _infer_freq(self, data: pd.DataFrame, time_col: str):
        """Return 1 when `time_col` holds integers, otherwise the frequency stored in `self.ts`."""
        if np.issubdtype(data[time_col].dtype.type, np.integer):
            return 1
        return self.ts.freq

    def fit(
        self,
        data: pd.DataFrame,  # time series
        n_windows: int,  # number of windows to evaluate
        window_size: int,  # test size in each window
        params: Dict[str, Any] = {},  # lightgbm parameters
        id_col: str = 'index',  # column that identifies each serie, can also be the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each serie for computing the updates
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        weights: Optional[Sequence[float]] = None,  # weight for each window
        eval_every: int = 10,  # number of iterations to train before evaluating the full window
        fit_on_all: bool = True,  # return model fitted on all data
        verbose_eval: bool = True,  # print evaluation metrics
        early_stopping_evals: int = 2,  # stop if the score doesn't improve in these many evaluations
        early_stopping_pct: float = 0.01,  # score must improve at least in this percentage to keep training
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn        
    ):
        """Train one booster per backtest window and track the weighted RMSE over the full horizon.

        Returns the evaluation history as a list of `(rounds, weighted_rmse)` tuples.
        The boosters are stored in `self.cv_models_`; when `fit_on_all` is True a
        `Forecast` holding an `LGBMRegressor(**params)` is fitted on `data` and stored
        in `self.fcst`.

        Raises `ValueError` if `eval_every <= 0` or if `weights` doesn't have
        `n_windows` elements.

        Notes:
        - Boosters are always trained in steps of `eval_every`, so when
          `num_iterations` isn't a multiple of it the last step goes past it.
        - The final model is built from `params` as given; the round at which the
          cross validation stopped is not forwarded to it.
        """
        if eval_every <= 0:
            raise ValueError(
                "eval_every should be > 0. If you don't want to evaluate the complete horizon use "
                "Forecast.cross_validation instead."
            )
        if weights is None:
            # equal weight for every window
            weights = np.full(n_windows, 1 / n_windows)
        elif len(weights) != n_windows:
            raise ValueError('Must specify as many weights as the number of windows')
        else:
            weights = np.asarray(weights)

        if id_col != 'index':
            data = data.set_index(id_col)

        # Look at the column the caller named instead of assuming it's called 'ds'.
        freq = self._infer_freq(data, time_col)
        items = []
        bst_threads = self._booster_threads()
        # NOTE(review): backtest_splits isn't given time_col; confirm it doesn't assume a 'ds' column.
        for _, train, valid in backtest_splits(data, n_windows, window_size, freq):
            # each window gets its own copy so that self.ts isn't fitted on partial data
            ts = copy.deepcopy(self.ts)
            # NOTE(review): `data` may already be indexed by `id_col` here, yet `id_col` is
            # still forwarded; confirm fit_transform handles that.
            prep = ts.fit_transform(train, id_col, time_col, target_col, static_features, dropna, keep_last_n)
            ds = lgb.Dataset(prep.drop(columns=[time_col, target_col]), prep[target_col]).construct()
            bst = lgb.Booster({**params, 'num_threads': bst_threads}, ds)
            # pin the thread count used when this booster predicts as well
            bst.predict = partial(bst.predict, num_threads=bst_threads)
            valid = valid.set_index(time_col, append=True)
            items.append((ts, bst, valid))

        hist = []
        n_iter = lgb.basic._choose_param_value('num_iterations', params, 100)['num_iterations']
        rmses = np.empty(n_windows)

        if self.num_threads == 1:
            try:
                for i in range(0, n_iter, eval_every):
                    for j, (ts, bst, valid) in enumerate(items):
                        rmses[j] = _update_and_predict(
                            ts,
                            bst,
                            valid,
                            eval_every,
                            window_size,
                            time_col,
                            target_col,
                            dynamic_dfs,
                            predict_fn,
                            **predict_fn_kwargs
                        )
                    rmse = rmses @ weights
                    rounds = eval_every + i
                    hist.append((rounds, rmse))
                    if verbose_eval:
                        print(f'[{rounds:,d}] RMSE: {rmse:,f}')
                    if self._should_stop(hist, early_stopping_evals, early_stopping_pct):
                        raise EarlyStopException
            except EarlyStopException:
                print(f'Early stopping at round {rounds:,}')
        else:
            try:
                with ThreadPoolExecutor(self.num_threads) as executor:
                    for i in range(0, n_iter, eval_every):
                        futures = []
                        for ts, bst, valid in items:
                            # training runs here, in the calling thread; only the
                            # predictions of each window are sent to the pool
                            _update(bst, eval_every)
                            future = executor.submit(
                                _predict,
                                ts,
                                bst,
                                valid,
                                window_size,
                                time_col,
                                target_col,
                                dynamic_dfs,
                                predict_fn,
                                **predict_fn_kwargs
                            )
                            futures.append(future)
                        rmses[:] = [f.result() for f in futures]
                        rmse = rmses @ weights
                        rounds = eval_every + i
                        hist.append((rounds, rmse))
                        if verbose_eval:
                            print(f'[{rounds:,d}] RMSE: {rmse:,f}')
                        if self._should_stop(hist, early_stopping_evals, early_stopping_pct):
                            raise EarlyStopException
            except EarlyStopException:
                print(f'Early stopping at round {rounds:,}.')

        self.cv_models_ = [item[1] for item in items]

        if fit_on_all:
            # The constructor arguments are placeholders: ts and models are replaced right after.
            self.fcst = Forecast([], lags=[1])
            self.fcst.ts = self.ts
            self.fcst.models = [lgb.LGBMRegressor(**params)]
            self.fcst.fit(
                data,
                id_col,
                time_col,
                target_col,
                static_features,
                dropna,
                keep_last_n,
            )
        return hist

    def predict(
        self,
        horizon: int,  # number of periods to predict in the future
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ):
        """Forecast `horizon` periods with the model fitted on all the data.

        Relies on `self.fcst`, which `fit` only creates when `fit_on_all=True`.
        """
        return self.fcst.predict(horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs)

    def cv_predict(self, horizon):
        """Forecast `horizon` periods with every booster in `self.cv_models_`, through `self.ts`.

        NOTE(review): within `fit`, `self.ts` is only handed to a fit call when
        `fit_on_all=True`; confirm this works otherwise.
        """
        return self.ts.predict(self.cv_models_, horizon)

In [None]:
from mlforecast.utils import generate_daily_series
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, seasonal_rolling_mean

In [None]:
# Synthetic data for the demo: 1_000 series (ids id_000 to id_999 in the outputs below);
# min_length / max_length presumably bound the number of observations of each serie.
data = generate_daily_series(1_000, min_length=500, max_length=1_000)

In [None]:
cv = LightGBMCV(
    freq='D',
    lags=[i + 1 for i in range(4)],
    lag_transforms={
        7 : [expanding_mean] + [(rolling_mean, 7)],
        14: [expanding_mean] + [(rolling_mean, 14)],
    },
    num_threads=4,
)
%time cv.fit(data, n_windows=4, window_size=14, params={'verbosity': -1})

[10] RMSE: 1.380531
[20] RMSE: 0.335082
[30] RMSE: 0.166009
[40] RMSE: 0.146650
[50] RMSE: 0.144134
[60] RMSE: 0.143820
[70] RMSE: 0.143783
Early stopping at round 70.
CPU times: user 32.1 s, sys: 229 ms, total: 32.3 s
Wall time: 11.5 s


[(10, 1.3805311266473483),
 (20, 0.3350815367875217),
 (30, 0.16600853805617782),
 (40, 0.14664959228063223),
 (50, 0.144134480339),
 (60, 0.14382003487619435),
 (70, 0.1437826285951646)]

In [None]:
# Forecast 4 periods ahead with the model that `fit` trained on all the data (fit_on_all defaults to True).
cv.predict(4)

Unnamed: 0_level_0,ds,LGBMRegressor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_000,2001-11-03,0.247302
id_000,2001-11-04,1.249704
id_000,2001-11-05,2.251363
id_000,2001-11-06,3.249986
id_001,2001-07-01,1.250885
...,...,...
id_998,2002-05-08,3.250413
id_999,2002-07-30,3.249795
id_999,2002-07-31,4.249041
id_999,2002-08-01,5.247625


In [None]:
# Forecast 4 periods ahead with every booster kept from the cross validation (cv.cv_models_).
cv.cv_predict(4)

Unnamed: 0_level_0,ds,Booster,Booster2,Booster3,Booster4
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_000,2001-11-03,0.252332,0.252214,0.251296,0.252412
id_000,2001-11-04,1.251639,1.251870,1.252035,1.251550
id_000,2001-11-05,2.250045,2.251298,2.251576,2.251924
id_000,2001-11-06,3.249621,3.249849,3.249896,3.249992
id_001,2001-07-01,1.251639,1.251870,1.252035,1.251550
...,...,...,...,...,...
id_998,2002-05-08,3.250212,3.250324,3.250011,3.250453
id_999,2002-07-30,3.250212,3.250324,3.240011,3.250284
id_999,2002-07-31,4.242646,4.229230,4.239973,4.247728
id_999,2002-08-01,5.247403,5.247179,5.248409,5.249136
