In [None]:
#default_exp api

# API

> High-level functions for easy interaction

This module defines the building blocks for the CLI. These functions can be leveraged to define other custom workflows more easily.

In [None]:
#export
import importlib
import inspect
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Tuple, Union

try:
    import dask.dataframe as dd
    from dask.dataframe import DataFrame as dd_Frame
    from dask.distributed import Client, LocalCluster
except ImportError:
    class dd: pass  # type: ignore
    dd_Frame = type(None)
    class Client: pass  # type: ignore
    class LocalCluster: pass  # type: ignore
try:
    from s3path import S3Path
except ImportError:
    class S3Path: pass  # type: ignore
try:
    from mlforecast.distributed.forecast import DistributedForecast
except ImportError:
    class DistributedForecast: pass  # type: ignore

import pandas as pd
import yaml
from fastcore.script import Param, call_parse
from pandas.api.types import is_datetime64_dtype

from mlforecast.core import predictions_flow
from mlforecast.data_model import (ClusterConfig, DataConfig, DataFormat, 
                         DistributedModelConfig, DistributedModelName,
                         FeaturesConfig, FlowConfig, ModelConfig,
                         _available_tfms)
from mlforecast.forecast import Forecast

In [None]:
import shutil
import tempfile

import numpy as np
from fastcore.test import test_eq, test_fail
from window_ops.rolling import *
from window_ops.expanding import *
from window_ops.ewm import *

from mlforecast.utils import generate_daily_series

In [None]:
#exporti
# A dataframe handled by this module: either a pandas one or a dask one.
Frame = Union[pd.DataFrame, dd_Frame]

# For every available transform, the names of its parameters in signature order,
# skipping the first one (presumably the array the transform is applied to).
_available_tfms_kwargs = {
    tfm_name: [*inspect.signature(tfm_func).parameters][1:]
    for tfm_name, tfm_func in _available_tfms.items()
}

In [None]:
#export
def validate_data_format(data: Frame) -> Frame:
    """Checks whether data is in the correct format and tries to fix it if possible.

    The expected layout is a pandas or dask dataframe indexed by `unique_id`,
    with a datetime `ds` column and a `y` column. If `unique_id` is a regular column
    it is moved to the index, and a non-datetime `ds` column is parsed.
    The input object is never modified; fixes are applied to the returned frame.

    Raises `ValueError` if `data` isn't a pandas or dask dataframe or if any of
    `unique_id`, `ds` or `y` can't be found.
    """
    # `dd_Frame` falls back to `type(None)` when dask isn't installed, so `None`
    # has to be rejected explicitly or it would get through the isinstance check.
    if data is None or not isinstance(data, (pd.DataFrame, dd_Frame)):
        raise ValueError('data must be either pandas or dask dataframe.')
    if data.index.name != 'unique_id':
        if 'unique_id' not in data:
            raise ValueError('unique_id not found in data.')
        data = data.set_index('unique_id')
    if 'ds' not in data:
        raise ValueError('ds column not found in data.')
    if not is_datetime64_dtype(data['ds']):
        to_datetime = pd.to_datetime if isinstance(data, pd.DataFrame) else dd.to_datetime
        # `assign` builds a new frame, so the caller's dataframe keeps its original column
        data = data.assign(ds=to_datetime(data['ds']))
    if 'y' not in data:
        raise ValueError('y column not found in data.')
    return data

In [None]:
# inputs that can't be repaired must fail with a descriptive message
not_pandas = np.array([1])
no_uid = pd.DataFrame({'x': [1]})
no_ds = pd.DataFrame({'unique_id': [1]})
no_y = pd.DataFrame({'unique_id': [1], 'ds': pd.to_datetime(['2020-01-01'])})
for bad_input, expected_msg in [
    (not_pandas, 'data must be either pandas'),
    (no_uid, 'unique_id not found'),
    (no_ds, 'ds column not found'),
    (no_y, 'y column not found'),
]:
    test_fail(lambda: validate_data_format(bad_input), contains=expected_msg)

# a unique_id column is moved to the index
uid_in_col = pd.DataFrame({'unique_id': [1], 'ds': pd.to_datetime(['2020-01-01']), 'y': [1.]})
assert validate_data_format(uid_in_col).equals(uid_in_col.set_index('unique_id'))

# string timestamps are parsed, both for pandas and (when available) dask
ds_not_datetime = pd.DataFrame({'unique_id': [1], 'ds': ['2020-01-01'], 'y': [1.]})
assert is_datetime64_dtype(validate_data_format(ds_not_datetime)['ds'])
if hasattr(dd, 'to_datetime'):
    ds_not_datetime_dask = dd.from_pandas(ds_not_datetime, npartitions=1)
    validated_dask = validate_data_format(ds_not_datetime_dask).compute()
    assert is_datetime64_dtype(validated_dask['ds'])

In [None]:
#exporti
def _is_s3_path(path: str) -> bool:
    return path.startswith('s3://')

In [None]:
#hide
# only paths carrying the s3 scheme are recognized
for candidate, expected in [('s3://bucket/file', True), ('bucket/file', False)]:
    assert _is_s3_path(candidate) is expected

In [None]:
#exporti
def _path_as_str(path: Union[Path, S3Path]) -> str:
    if isinstance(path, S3Path):
        return path.as_uri()
    return str(path)

In [None]:
#hide
# local paths go through `str`, S3 paths are rendered as URIs
local_path, s3_path = Path('data'), S3Path('/bucket/file')
test_eq(_path_as_str(local_path), 'data')
test_eq(_path_as_str(s3_path), 's3://bucket/file')

In [None]:
#export
def read_data(config: DataConfig, is_distributed: bool) -> Frame:
    """Read data from `config.prefix/config.input`.

    If we're in distributed mode dask is used for IO, else pandas.
    The reader is chosen from `config.format` (`read_csv`, `read_parquet`, ...) and
    the result is passed through `validate_data_format` before being returned.
    """
    prefix = config.prefix
    path = S3Path.from_uri(prefix) if _is_s3_path(prefix) else Path(prefix)
    read_path = _path_as_str(path/config.input)
    # Take the enum's value explicitly: since Python 3.11 a mixed-in Enum member
    # formatted inside an f-string renders as 'Class.member' instead of its value.
    format_name = getattr(config.format, 'value', config.format)
    io_module = dd if is_distributed else pd
    reader = getattr(io_module, f'read_{format_name}')
    if io_module is dd and config.format is DataFormat.csv:
        # in distributed mode the csv input is a directory, so every file inside it is read
        read_path += '/*'
    data = reader(read_path)
    if (
        io_module is dd
        and config.format is DataFormat.parquet
        and data.index.name == 'unique_id'
        # replaces the deprecated `pd.api.types.is_categorical_dtype`
        and isinstance(data.index.dtype, pd.CategoricalDtype)
    ):
        # make the categories of the index known and ordered
        data.index = data.index.cat.as_known().as_ordered()
    return validate_data_format(data)

In [None]:
series = generate_daily_series(20, 100, 200)
series_ddf = dd.from_pandas(series, npartitions=2)

# round trip: write the series with pandas/dask in each format and read it back
for data_format in ('csv', 'parquet'):
    for df in (series, series_ddf):
        is_distributed = df is series_ddf
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            writer = getattr(df, f'to_{data_format}')
            writer(tmpdir/'train')
            data_cfg = DataConfig(prefix=str(tmpdir), input='train',
                                  output='output', format=data_format)
            read_df = read_data(data_cfg, is_distributed)
            if is_distributed:
                read_df, df = read_df.compute(), df.compute()
            # `drop` no longer accepts the axis positionally (pandas >= 2.0), so name the columns
            assert read_df.drop(columns='y').equals(df.drop(columns='y'))
            # the target is compared with a tolerance
            np.testing.assert_allclose(read_df.y, df.y)

In [None]:
#exporti
def _instantiate_transforms(config: FeaturesConfig) -> Dict:
    """Turn the function names into the actual functions and make sure their positional arguments are in order."""
    if config.lag_transforms is None:
        return {}
    lag_tfms = defaultdict(list)
    for lag, tfms in config.lag_transforms.items():
        for tfm in tfms:
            if isinstance(tfm, dict):
                [(tfm_name, tfm_kwargs)] = tfm.items()
            else:
                tfm_name, tfm_kwargs = tfm, ()
            if tfm_name not in _available_tfms:
                raise NotImplementedError(tfm_name)
            tfm_func = _available_tfms[tfm_name]
            tfm_args: Tuple[Any, ...] = ()
            for kwarg in _available_tfms_kwargs[tfm_name]:
                if kwarg in tfm_kwargs:
                    tfm_args += (tfm_kwargs[kwarg], )
            lag_tfms[lag].append((tfm_func, *tfm_args))
    return lag_tfms

In [None]:
#hide
# the arguments must come out in signature order regardless of the order in the config
features_cfg = FeaturesConfig(
    freq='D',
    lags=[1, 2],
    lag_transforms={
        1: ['expanding_mean', {'rolling_mean': {'window_size': 7}}],
        2: [{'rolling_mean': {'min_samples': 2, 'window_size': 3}}],
    },
)
expected_tfms = {
    1: [(expanding_mean,), (rolling_mean, 7)],
    2: [(rolling_mean, 3, 2)],
}
test_eq(_instantiate_transforms(features_cfg), expected_tfms)

In [None]:
#exporti
def _fcst_from_local(model_config: ModelConfig,
                     flow_config: Dict) -> Forecast:
    module_name, model_cls = model_config.name.rsplit('.', maxsplit=1)
    module = importlib.import_module(module_name)
    model = getattr(module, model_cls)(**(model_config.params or {}))
    return Forecast(model, flow_config)


def _fcst_from_distributed(model_config: DistributedModelConfig,
                           flow_config: Dict) -> DistributedForecast:
    """Build a `DistributedForecast` using `LGBMForecast` when the configured name is
    `DistributedModelName.LightGBM` and `XGBForecast` for any other name."""
    # only the module of the selected model is imported
    if model_config.name is DistributedModelName.LightGBM:
        from mlforecast.distributed.models.lgb import LGBMForecast as model_cls
    else:
        from mlforecast.distributed.models.xgb import XGBForecast as model_cls  # type: ignore
    hyperparams = model_config.params or {}
    return DistributedForecast(model_cls(**hyperparams), flow_config)

In [None]:
#export
def fcst_from_config(config: FlowConfig) -> Union[Forecast, DistributedForecast]:
    """Instantiate Forecast class from config."""
    # the features section becomes the flow configuration, with the transform
    # names replaced by the functions they refer to
    flow_config = {
        **config.features.dict(),
        'lag_transforms': _instantiate_transforms(config.features),
    }
    if config.local is None:
        # because of the config validation, either local or distributed will be not None
        return _fcst_from_distributed(config.distributed.model, flow_config)  # type: ignore
    return _fcst_from_local(config.local.model, flow_config)

In [None]:
with open('../sample_configs/local.yaml', 'rt') as f:
    cfg = FlowConfig(**yaml.safe_load(f))

# the forecast must wrap the configured model class with the configured parameters
fcst = fcst_from_config(cfg)
local_model_cfg = cfg.local.model
expected_cls_name = local_model_cfg.name.split('.')[-1]
test_eq(fcst.model.__class__.__name__, expected_cls_name)
model_params = fcst.model.get_params()
for param_name, param_value in local_model_cfg.params.items():
    test_eq(model_params[param_name], param_value)

In [None]:
# NOTE(review): unlike the other cells that start a `Client`, this one has no `#distributed` flag - confirm that's intended
with Client(n_workers=2) as client:
    with open('../sample_configs/distributed.yaml', 'rt') as f:
        cfg = FlowConfig(**yaml.safe_load(f))

    # the forecast must wrap the configured distributed model with the configured parameters
    fcst = fcst_from_config(cfg)
    distributed_model_cfg = cfg.distributed.model
    test_eq(fcst.model.__class__.__name__, distributed_model_cfg.name)
    model_params = fcst.model.get_params()
    for param_name, param_value in distributed_model_cfg.params.items():
        test_eq(model_params[param_name], param_value)

In [None]:
#export
def perform_backtest(fcst: Union[Forecast, DistributedForecast],
                     data: Frame,
                     config: FlowConfig,
                     output_path: Union[Path, S3Path]):
    """Performs backtesting of `fcst` using `data` and the strategy defined in `config`.
    Writes the results to `output_path`.

    Nothing is done if `config.backtest` is None. Otherwise each validation split is
    written to `output_path/valid_{i}` (with the format as extension for pandas data)
    using the format in `config.data.format`, and its MSE (mean of the per-series MSE)
    is printed."""
    if config.backtest is None:
        return
    data_is_dask = isinstance(data, dd_Frame)
    # Take the enum's value explicitly: since Python 3.11 a mixed-in Enum member
    # formatted inside an f-string renders as 'Class.member' instead of its value.
    format_name = getattr(config.data.format, 'value', config.data.format)
    results = fcst.backtest(data,
                            config.backtest.n_windows,
                            config.backtest.window_size,
                            predictions_flow)
    for i, result in enumerate(results):
        # missing values are written (and scored) as zeros
        result = result.fillna(0)
        split_path = _path_as_str(output_path/f'valid_{i}')
        if not data_is_dask:
            # pandas writes a single file, so it gets the format as extension
            split_path += f'.{format_name}'
        writer = getattr(result, f'to_{format_name}')
        writer(split_path)
        # the error column is added after writing, so it isn't part of the saved output
        result['sq_err'] = (result['y'] - result['y_pred'])**2
        mse = result.groupby("unique_id")["sq_err"].mean().mean()
        if data_is_dask:
            mse = mse.compute()
        # file names are 0-based while the printed split number is 1-based
        print(f'Split {i+1} MSE: {mse:.4f}')

In [None]:
# run the backtest on the pandas series, writing the splits to a throwaway directory
with open('../sample_configs/local.yaml', 'rt') as f:
    cfg = FlowConfig(**yaml.safe_load(f))
fcst = fcst_from_config(cfg)
with tempfile.TemporaryDirectory() as tmpdir:
    perform_backtest(fcst, series, cfg, Path(tmpdir))

In [None]:
#distributed
# same backtest as the local one, on the dask series with a local cluster
with Client(n_workers=2) as client:
    with open('../sample_configs/distributed.yaml', 'rt') as f:
        cfg = FlowConfig(**yaml.safe_load(f))
    fcst = fcst_from_config(cfg)
    with tempfile.TemporaryDirectory() as tmpdir:
        perform_backtest(fcst, series_ddf, cfg, Path(tmpdir))

In [None]:
#export
def parse_config(config_file: str) -> FlowConfig:
    """Create a `FlowConfig` object using the contents of `config_file`"""
    with open(config_file, 'r') as stream:
        raw_config = yaml.safe_load(stream)
    # the top level keys of the yaml document are the fields of the config
    return FlowConfig(**raw_config)

def setup_client(config: ClusterConfig) -> Client:
    """Spins up a cluster with the specifications defined in `config` and returns a client connected to it.

    The cluster class is imported from the dotted path `config.class_name` and
    instantiated with `config.class_kwargs`. If those include a positive `n_workers`
    the function blocks until that many workers are available."""
    module_name, cluster_cls = config.class_name.rsplit('.', maxsplit=1)
    module = importlib.import_module(module_name)
    cluster = getattr(module, cluster_cls)(**config.class_kwargs)
    client = Client(cluster)
    n_workers = config.class_kwargs.get('n_workers', 0)
    # Only wait when a worker count was requested: there's nothing to wait for otherwise,
    # and newer versions of distributed reject non-positive counts in `wait_for_workers`.
    if n_workers:
        client.wait_for_workers(n_workers)
    return client

In [None]:
#distributed
# the client must be connected to a local cluster with the configured number of workers
cluster_cfg = cfg.distributed.cluster
client = setup_client(cluster_cfg)
assert isinstance(client.cluster, LocalCluster)
n_connected = len(client.scheduler_info()['workers'])
assert n_connected == cluster_cfg.class_kwargs['n_workers']
client.cluster.close()
client.close()

In [None]:
#export
@call_parse
def run_forecast(config_file: Param('Configuration file', str)):  # type: ignore # NOQA
    """Run the forecasting pipeline using the configuration defined in `config_file`."""
    # Steps: parse the config, start a cluster if a distributed section is present,
    # read the data, run the backtest and/or forecast sections that are configured
    # and write their results under `config.data.prefix/config.data.output`.
    config = parse_config(config_file)
    is_distributed = config.distributed is not None
    # Take the enum's value explicitly: since Python 3.11 a mixed-in Enum member
    # formatted inside an f-string renders as 'Class.member' instead of its value.
    format_name = getattr(config.data.format, 'value', config.data.format)
    client = None
    if config.distributed is not None:  # mypy
        client = setup_client(config.distributed.cluster)
    try:
        data = read_data(config.data, is_distributed)
        prefix = config.data.prefix
        path = S3Path.from_uri(prefix) if _is_s3_path(prefix) else Path(prefix)
        output_path = path/config.data.output
        output_path.mkdir(exist_ok=True)

        fcst = fcst_from_config(config)
        if config.backtest is not None:
            perform_backtest(fcst, data, config, output_path)
        if config.forecast is not None:
            fcst.fit(data)
            preds = fcst.predict(config.forecast.horizon)
            writer = getattr(preds, f'to_{format_name}')
            write_path = _path_as_str(output_path/'forecast')
            if isinstance(data, pd.DataFrame):
                # pandas writes a single file, so it gets the format as extension
                write_path += f'.{format_name}'
            writer(write_path)
    finally:
        # errors propagate unchanged; the cluster is shut down whether or not the run succeeded
        if client is not None:
            client.cluster.close()
            client.close()

In [None]:
# end to end run of the local pipeline for each supported format
config_name = 'local.yaml'
for data_format in ('csv', 'parquet'):
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        train_path = f'train.{data_format}'
        config_path = tmpdir/config_name
        getattr(series, f'to_{data_format}')(tmpdir/train_path)

        # point the sample config to the temporary directory and save it there
        with open(f'../sample_configs/{config_name}', 'rt') as f:
            cfg = yaml.safe_load(f)
        cfg['data'].update(prefix=str(tmpdir), input=train_path, format=data_format)
        with open(config_path, 'wt') as f:
            yaml.dump(cfg, f)
        run_forecast(config_path)

In [None]:
#distributed
# end to end run of the distributed pipeline for each supported format
config_name = 'distributed.yaml'
for data_format in ('csv', 'parquet'):
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # dask writes a directory, so the input has no extension
        train_path = 'train'
        config_path = tmpdir/config_name
        getattr(series_ddf, f'to_{data_format}')(tmpdir/train_path)

        # point the sample config to the temporary directory and save it there
        with open(f'../sample_configs/{config_name}', 'rt') as f:
            cfg = yaml.safe_load(f)
        cfg['data'].update(prefix=str(tmpdir), input=train_path, format=data_format)
        with open(config_path, 'wt') as f:
            yaml.dump(cfg, f)
        run_forecast(config_path)

In [None]:
data_path = Path('data')
data_path.mkdir()
series.to_parquet(data_path/'train')
!mlforecast ../sample_configs/local.yaml
assert 'forecast.parquet' in [file.name for file in (data_path/'outputs').iterdir()]
shutil.rmtree(data_path)