In [None]:
#|default_exp lgb_cv

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

# LightGBMCV

In [None]:
#|export
import copy
import os
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

import lightgbm as lgb
import numpy as np
import pandas as pd

from mlforecast import Forecast, TimeSeries
from mlforecast.utils import backtest_splits

In [None]:
#| exporti
def _mape(y_true, y_pred):
    abs_pct_err = abs(y_true - y_pred) / y_true
    return abs_pct_err.groupby(y_true.index.get_level_values(0), observed=True).mean().mean()

def _rmse(y_true, y_pred):
    sq_err = (y_true - y_pred) ** 2
    return sq_err.groupby(y_true.index.get_level_values(0), observed=True).mean().pow(0.5).mean()

# Built-in evaluation metrics, looked up by the name given in the `metric` argument.
_metric2fn = dict(mape=_mape, rmse=_rmse)

def _update(bst, n):
    for _ in range(n):
        bst.update()

def _predict(ts, bst, valid, h, time_col, dynamic_dfs, predict_fn, **predict_fn_kwargs):
    preds = ts.predict(bst, h, dynamic_dfs, predict_fn, **predict_fn_kwargs).set_index(time_col, append=True)
    return valid.join(preds)

def _update_and_predict(ts, bst, valid, n, h, time_col, dynamic_dfs, predict_fn, **predict_fn_kwargs):
    """Train `bst` for `n` more iterations, then return its forecasts joined to `valid`."""
    _update(bst, n)
    joined = _predict(
        ts,
        bst,
        valid,
        h,
        time_col,
        dynamic_dfs,
        predict_fn,
        **predict_fn_kwargs,
    )
    return joined

In [None]:
#| export
class LightGBMCV:
    """Cross-validation of LightGBM boosters over several backtest windows.

    One booster is created per window and all of them are trained in lockstep,
    so the weighted metric can be evaluated every few iterations and training
    can stop early once it no longer improves.
    """

    def __init__(
        self,
        freq: Optional[str] = None,  # pandas offset alias, e.g. D, W, M
        lags: List[int] = [],  # list of lags to use as features
        lag_transforms: Dict[int, List[Tuple]] = {},  # list of transformations to apply to each lag
        date_features: List[str] = [],  # list of names of pandas date attributes to use as features, e.g. dayofweek
        differences: Optional[List[int]] = None,  # differences to apply to the series before fitting
        num_threads: int = 1,  # number of threads to use when computing the predictions of each window.
    ):
        """Store the threading setup and build the `TimeSeries` used to compute the features.

        Raises `ValueError` if `num_threads` is not positive.
        """
        if num_threads < 1:
            raise ValueError('num_threads must be a positive integer')
        self.num_threads = num_threads
        num_cpus = os.cpu_count() or 1
        # split the cores among the windows that predict concurrently; never go
        # below one thread per booster when there are more windows than cores
        self.bst_threads = max(1, num_cpus // num_threads)
        self.ts = TimeSeries(freq, lags, lag_transforms, date_features, differences, self.bst_threads)

    def __repr__(self):
        return (
            f'{self.__class__.__name__}('
            f'ts={self.ts}, '
            f'num_threads={self.num_threads}, '
            f'bst_threads={self.bst_threads})'
        )

    def _should_stop(self, hist, early_stopping_evals, early_stopping_pct):
        """Whether the metric improved less than `early_stopping_pct` over the last `early_stopping_evals` evaluations.

        `hist` holds (rounds, metric value) tuples; the latest value is compared
        against the one recorded `early_stopping_evals` evaluations earlier.
        """
        if len(hist) < early_stopping_evals + 1:
            return False
        improvement_pct = 1 - hist[-1][1] / hist[-(early_stopping_evals + 1)][1]
        return improvement_pct < early_stopping_pct

    def setup(
        self,
        data: pd.DataFrame,  # time series
        n_windows: int,  # number of windows to evaluate
        window_size: int,  # test size in each window
        params: Dict[str, Any] = {},  # lightgbm parameters
        id_col: str = 'index',  # column that identifies each serie, can also be the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each serie for computing the updates
        weights: Optional[Sequence[float]] = None,  # weight for each window
        metric: Union[str, Callable] = 'mape',  # evaluation metric
    ):
        """Prepare one booster per backtest window without training it.

        For every window this stores a fitted copy of the feature pipeline, an
        untrained booster built on the window's training set and the validation
        frame, so that `partial_fit` can train them incrementally.

        Raises `ValueError` if the number of weights differs from `n_windows`
        or if `metric` is a string that isn't one of the implemented metrics.
        """
        if weights is None:
            # equal weight for every window
            self.weights = np.full(n_windows, 1 / n_windows)
        elif len(weights) != n_windows:
            raise ValueError('Must specify as many weights as the number of windows')
        else:
            self.weights = np.asarray(weights)
        if callable(metric):
            self.metric_fn = metric
            self.metric_name = 'custom_metric'
        else:
            if metric not in _metric2fn:
                raise ValueError(f'{metric} is not one of the implemented metrics: ({", ".join(_metric2fn.keys())})')
            self.metric_fn = _metric2fn[metric]
            self.metric_name = metric

        if id_col != 'index':
            data = data.set_index(id_col)

        # integer timestamps advance by 1, otherwise use the configured frequency
        if np.issubdtype(data[time_col].dtype.type, np.integer):
            freq = 1
        else:
            freq = self.ts.freq
        self.items = []
        self.window_size = window_size
        self.time_col = time_col
        self.target_col = target_col
        for _, train, valid in backtest_splits(data, n_windows, window_size, freq):
            # each window gets its own pipeline, since fitting stores the window's training data
            ts = copy.deepcopy(self.ts)
            prep = ts.fit_transform(train, id_col, time_col, target_col, static_features, dropna, keep_last_n)
            ds = lgb.Dataset(prep.drop(columns=[time_col, target_col]), prep[target_col]).construct()
            bst = lgb.Booster({**params, 'num_threads': self.bst_threads}, ds)
            # make every prediction of this booster use the per-booster thread budget
            bst.predict = partial(bst.predict, num_threads=self.bst_threads)
            valid = valid.set_index(time_col, append=True)
            self.items.append((ts, bst, valid))
        return self

    def _single_threaded_partial_fit(
        self,
        metric_values,
        n_iter,
        dynamic_dfs,
        predict_fn,
        **predict_fn_kwargs,
    ):
        """Train and evaluate the windows one after another, writing each score into `metric_values`."""
        for j, (ts, bst, valid) in enumerate(self.items):
            preds = _update_and_predict(
                ts,
                bst,
                valid,
                n_iter,
                self.window_size,
                self.time_col,
                dynamic_dfs,
                predict_fn,
                **predict_fn_kwargs
            )
            metric_values[j] = self.metric_fn(preds[self.target_col], preds['Booster'])

    def _multithreaded_partial_fit(
        self,
        metric_values,
        n_iter,
        dynamic_dfs,
        predict_fn,
        **predict_fn_kwargs,
    ):
        """Train the boosters in the calling thread and compute their predictions in a thread pool.

        The scores are written into `metric_values` in window order.
        """
        with ThreadPoolExecutor(self.num_threads) as executor:
            futures = []
            for ts, bst, valid in self.items:
                # the update happens here, only the prediction is handed to the pool
                _update(bst, n_iter)
                future = executor.submit(
                    _predict,
                    ts,
                    bst,
                    valid,
                    self.window_size,
                    self.time_col,
                    dynamic_dfs,
                    predict_fn,
                    **predict_fn_kwargs
                )
                futures.append(future)
            cv_preds = [f.result() for f in futures]
        metric_values[:] = [self.metric_fn(preds[self.target_col], preds['Booster']) for preds in cv_preds]

    def partial_fit(
        self,
        n_iter: int, # number of boosting iterations to run
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ):
        """Train every window's booster for `n_iter` more iterations and return the weighted metric.

        Requires `setup` to have been called before.
        """
        metric_values = np.empty(len(self.items))
        if self.num_threads == 1:
            self._single_threaded_partial_fit(metric_values, n_iter, dynamic_dfs, predict_fn, **predict_fn_kwargs)
        else:
            self._multithreaded_partial_fit(metric_values, n_iter, dynamic_dfs, predict_fn, **predict_fn_kwargs)
        return metric_values @ self.weights

    def fit(
        self,
        data: pd.DataFrame,  # time series
        n_windows: int,  # number of windows to evaluate
        window_size: int,  # test size in each window
        params: Dict[str, Any] = {},  # lightgbm parameters
        id_col: str = 'index',  # column that identifies each serie, can also be the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each serie for computing the updates
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        weights: Optional[Sequence[float]] = None,  # weight for each window
        eval_every: int = 10,  # number of iterations to train before evaluating the full window
        fit_on_all: bool = False,  # return model fitted on all data
        compute_cv_preds: bool = False,  # compute predictions on all folds using final models
        verbose_eval: bool = True,  # print evaluation metrics
        metric: Union[str, Callable] = 'mape',  # evaluation metric
        early_stopping_evals: int = 2,  # stop if the score doesn't improve in these many evaluations
        early_stopping_pct: float = 0.01,  # score must improve at least in this percentage to keep training
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ):
        """Train the CV boosters, evaluating every `eval_every` iterations until early stopping triggers.

        Returns the evaluation history as a list of (rounds, metric value)
        tuples. The trained boosters are stored in `cv_models_`; with
        `compute_cv_preds` their validation predictions are stored in
        `cv_preds_`, and with `fit_on_all` a final model is trained on all of
        the data for the number of rounds reached. `params` is left unmodified.
        """
        self.setup(
            data,
            n_windows,
            window_size,
            params,
            id_col,
            time_col,
            target_col,
            static_features,
            dropna,
            keep_last_n,
            weights,
            metric,
        )
        hist = []
        n_iter = lgb.basic._choose_param_value('num_iterations', params, 100)['num_iterations']
        for i in range(0, n_iter, eval_every):
            metric_value = self.partial_fit(eval_every, dynamic_dfs, predict_fn, **predict_fn_kwargs)
            rounds = eval_every + i
            hist.append((rounds, metric_value))
            if verbose_eval:
                print(f'[{rounds:,d}] {self.metric_name}: {metric_value:,f}')
            if self._should_stop(hist, early_stopping_evals, early_stopping_pct):
                print(f"Early stopping at round {rounds:,}")
                break

        self.cv_models_ = [item[1] for item in self.items]
        if compute_cv_preds:
            with ThreadPoolExecutor(self.num_threads) as executor:
                futures = []
                for ts, bst, valid in self.items:
                    future = executor.submit(
                        _predict,
                        ts,
                        bst,
                        valid,
                        window_size,
                        time_col,
                        dynamic_dfs,
                        predict_fn,
                        **predict_fn_kwargs
                    )
                    futures.append(future)
                self.cv_preds_ = [f.result() for f in futures]

        if fit_on_all:
            # Build a new dict instead of writing into `params`: the caller's dict
            # (or the shared default) would otherwise carry this run's
            # `n_estimators` into any later use.
            # NOTE(review): if `params` sets the number of iterations through another
            # alias (e.g. num_iterations) it may take precedence over n_estimators -- confirm.
            final_params = {**params, 'n_estimators': rounds}
            self.fcst = Forecast([])
            self.fcst.ts = self.ts
            self.fcst.models = [lgb.LGBMRegressor(**final_params)]
            self.fcst.fit(
                data,
                id_col,
                time_col,
                target_col,
                static_features,
                dropna,
                keep_last_n,
            )
        else:
            self.ts._fit(data, id_col, time_col, target_col, static_features, keep_last_n)
        return hist

    def predict(
        self,
        horizon: int,  # number of periods to predict in the future
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ) -> pd.DataFrame:
        """Computes the predictions of the final model trained using all of the data."""
        if not hasattr(self, 'fcst'):
            raise ValueError('Must call fit with fit_on_all=True before. Did you mean cv_predict?')
        return self.fcst.predict(horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs)

    def cv_predict(
        self,
        horizon: int,  # number of periods to predict in the future
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ) -> pd.DataFrame:
        """Computes the predictions of the models fitted during the CV step."""
        # forward the prediction options as well, they were previously dropped
        return self.ts.predict(self.cv_models_, horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs)

In [None]:
from fastcore.test import test_fail
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, seasonal_rolling_mean

from mlforecast.utils import generate_daily_series

In [None]:
# Synthetic daily series used by the examples below, with lengths between 500 and 1,000.
data = generate_daily_series(1_000, min_length=500, max_length=1_000)

In [None]:
# Settings shared by the examples below.
n_windows = 2  # number of backtest windows
window_size = 14  # test size of each window
params = {'verbosity': -1}  # lightgbm parameters
# Constructor arguments of LightGBMCV.
config = dict(
    freq='D',
    lags=[7],
    # transformations applied to lags 7 and 14
    # (presumably a rolling mean over a window of 7 -- confirm against window_ops)
    lag_transforms={
        7 : [(rolling_mean, 7)],
        14: [(rolling_mean, 7)],
    },
    num_threads=4,  # threads used to compute the predictions of the windows
)

In [None]:
# Cross-validate with the default metric (mape).
# `fit` returns the evaluation history as (rounds, metric value) tuples.
cv = LightGBMCV(**config)
cv.fit(data, n_windows, window_size, params)

[10] mape: 4.927520
[20] mape: 2.953376
[30] mape: 1.026298
[40] mape: 0.798347
[50] mape: 0.723009
[60] mape: 0.697055
[70] mape: 0.688403
[80] mape: 0.685681
[90] mape: 0.684835
Early stopping at round 90


[(10, 4.927519638482391),
 (20, 2.953376099241727),
 (30, 1.026298126222729),
 (40, 0.7983465295648418),
 (50, 0.7230094339375586),
 (60, 0.6970552320257407),
 (70, 0.6884026039892563),
 (80, 0.6856806008289287),
 (90, 0.6848354338163578)]

In [None]:
# Forecast 14 periods ahead with the models fitted during the CV step.
cv.cv_predict(14)

Unnamed: 0_level_0,ds,Booster,Booster2
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_000,2001-11-03,0.251411,0.252886
id_000,2001-11-04,1.249231,1.249613
id_000,2001-11-05,2.250441,2.249819
id_000,2001-11-06,3.249947,3.250130
id_000,2001-11-07,4.252896,4.252406
...,...,...,...
id_999,2002-08-08,5.249912,5.249056
id_999,2002-08-09,6.249346,6.249097
id_999,2002-08-10,0.252552,0.249360
id_999,2002-08-11,1.248313,1.250127


In [None]:
# `predict` needs the final model, which is only trained when `fit` is called with fit_on_all=True.
test_fail(lambda: cv.predict(1), contains='Must call fit with fit_on_all=True')

In [None]:
# Evaluate with rmse and also train a final model on all of the data.
cv2 = LightGBMCV(**config)
cv2.fit(data, n_windows, window_size, params, metric='rmse', fit_on_all=True)

[10] rmse: 0.936530
[20] rmse: 0.442142
[30] rmse: 0.171922
[40] rmse: 0.152935
[50] rmse: 0.150601
[60] rmse: 0.150273
[70] rmse: 0.150235
Early stopping at round 70


[(10, 0.9365296382142947),
 (20, 0.4421418925471978),
 (30, 0.1719218176729842),
 (40, 0.152935170306639),
 (50, 0.15060126754993408),
 (60, 0.15027318065520012),
 (70, 0.150234674236362)]

In [None]:
# Forecast 14 periods ahead with the final model trained on all of the data.
cv2.predict(14)

Unnamed: 0_level_0,ds,LGBMRegressor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_000,2001-11-03,0.253787
id_000,2001-11-04,1.250694
id_000,2001-11-05,2.249748
id_000,2001-11-06,3.250048
id_000,2001-11-07,4.253456
...,...,...
id_999,2002-08-08,5.248980
id_999,2002-08-09,6.247837
id_999,2002-08-10,0.251838
id_999,2002-08-11,1.250694


In [None]:
# Manual workflow: `setup` builds the boosters of each window and `partial_fit`
# trains them for the given number of iterations, returning the weighted metric.
cv3 = LightGBMCV(**config)
cv3.setup(data, n_windows, window_size, params, metric='rmse')
cv3.partial_fit(10)

0.9365296382142947

In [None]:
# Calling it again keeps training the same boosters for 10 more iterations.
cv3.partial_fit(10)

0.4421418925471978