In [None]:
#default_exp api

# API

> High level functions for easy interaction

This module defines the building blocks for the CLI. These functions can be leveraged to define other custom workflows more easily.

In [None]:
#export
import importlib
import inspect
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
    
import pandas as pd
import yaml
from pandas.api.types import is_categorical_dtype, is_datetime64_dtype

from mlforecast.compat import Client, DistributedForecast, Frame, S3Path, dd, dd_Frame
from mlforecast.core import TimeSeries
from mlforecast.data_model import (
    ClusterConfig,
    DataConfig,
    DataFormat, 
    DistributedModelConfig,
    DistributedModelName,
    FeaturesConfig,
    FlowConfig,
    ModelConfig,
    _available_tfms,
)
from mlforecast.forecast import Forecast

In [None]:
import shutil
import tempfile
import warnings

import numpy as np
from dask.distributed import LocalCluster
from fastcore.test import test_eq, test_fail
from window_ops.rolling import *
from window_ops.expanding import *
from window_ops.ewm import *

from mlforecast.compat import Frame
from mlforecast.utils import generate_daily_series, generate_prices_for_series

warnings.filterwarnings('ignore')

In [None]:
#exporti
# For every available transform, the names of the parameters that follow its first one.
# NOTE(review): the first parameter is skipped, presumably because it is the array the
# transform is applied to -- confirm against `_available_tfms`.
_available_tfms_kwargs = dict(
    (tfm_name, [*inspect.signature(tfm_fn).parameters][1:])
    for tfm_name, tfm_fn in _available_tfms.items()
)

In [None]:
#export
def validate_data_format(data: Frame) -> Frame:
    """Checks whether data is in the correct format and tries to fix it if possible.

    The expected layout is a pandas or dask dataframe indexed by `unique_id` that has
    a datetime `ds` column and a `y` column. Two problems are fixed automatically:
    a `unique_id` column is moved to the index and a non-datetime `ds` is converted
    with `to_datetime`.

    The input is never modified in place; every fix is applied to the returned frame.

    Raises `ValueError` if `data` isn't a pandas or dask dataframe or if
    `unique_id`, `ds` or `y` can't be found."""
    if not isinstance(data, (pd.DataFrame, dd_Frame)):
        raise ValueError('data must be either pandas or dask dataframe.')
    if data.index.name != 'unique_id':
        if 'unique_id' not in data:
            raise ValueError('unique_id not found in data.')
        data = data.set_index('unique_id')
    if 'ds' not in data:
        raise ValueError('ds column not found in data.')
    if not is_datetime64_dtype(data['ds']):
        # `dd.to_datetime` is only looked up when the data is actually a dask dataframe
        to_datetime = pd.to_datetime if isinstance(data, pd.DataFrame) else dd.to_datetime
        # `assign` builds a new frame, so the caller's object keeps its original `ds`
        data = data.assign(ds=to_datetime(data['ds']))
    if 'y' not in data:
        raise ValueError('y column not found in data.')
    return data


In [None]:
# Anything that isn't a pandas or dask dataframe is rejected
not_pandas = np.array([1])
test_fail(lambda: validate_data_format(not_pandas), contains='data must be either pandas')

# `unique_id` has to be either the index or a column
no_uid = pd.DataFrame({'x': [1]})
test_fail(lambda: validate_data_format(no_uid), contains='unique_id not found')

# A `unique_id` column gets moved to the index
uid_in_col = pd.DataFrame({'unique_id': [1], 'ds': pd.to_datetime(['2020-01-01']), 'y': [1.]})
assert validate_data_format(uid_in_col).equals(uid_in_col.set_index('unique_id'))

# `ds` is required
no_ds = pd.DataFrame({'unique_id': [1]})
test_fail(lambda: validate_data_format(no_ds), contains='ds column not found')

# A non-datetime `ds` is converted for pandas...
ds_not_datetime = pd.DataFrame({'unique_id': [1], 'ds': ['2020-01-01'], 'y': [1.]})
assert is_datetime64_dtype(validate_data_format(ds_not_datetime)['ds'])

# ...and for dask, which is only checked when `dd` provides `to_datetime`
if hasattr(dd, 'to_datetime'):
    ds_not_datetime_dask = dd.from_pandas(ds_not_datetime, npartitions=1)
    assert is_datetime64_dtype(validate_data_format(ds_not_datetime_dask).compute()['ds'])
    
# `y` is required
no_y = pd.DataFrame({'unique_id': [1], 'ds': pd.to_datetime(['2020-01-01'])})
test_fail(lambda: validate_data_format(no_y), contains='y column not found')

In [None]:
#exporti
def _is_s3_path(path: str) -> bool:
    return path.startswith('s3://')


In [None]:
#hide
# Only strings that start with the `s3://` scheme count as S3 paths
assert _is_s3_path('s3://bucket/file')
assert not _is_s3_path('bucket/file')

In [None]:
#exporti
def _path_as_str(path: Union[Path, S3Path]) -> str:
    """Render `path` as a string: `path.as_uri()` for an `S3Path`, `str(path)` otherwise."""
    return path.as_uri() if isinstance(path, S3Path) else str(path)


def _prefix_as_path(prefix: str) -> Union[Path, S3Path]:
    """Build a path object from `prefix`: an `S3Path` for `s3://` prefixes, a local `Path` otherwise."""
    if _is_s3_path(prefix):
        return S3Path.from_uri(prefix)
    return Path(prefix)


In [None]:
#hide
# Local paths are rendered with `str`, S3 paths as `s3://` URIs
test_eq(_path_as_str(Path('data')), 'data')
test_eq(_path_as_str(S3Path('/bucket/file')), 's3://bucket/file')

# The kind of path object depends on whether the prefix has the `s3://` scheme
test_eq(_prefix_as_path('s3://bucket/'), S3Path('/bucket/'))
test_eq(_prefix_as_path('/home/'), Path('/home/'))

In [None]:
#export
def read_data(config: DataConfig, is_distributed: bool) -> Frame:
    """Read data from `config.prefix/config.input`.
    
    If we're in distributed mode dask is used for IO, else pandas.
    The reader is `read_<config.format>` from the chosen library and the result
    is passed through `validate_data_format` before being returned."""
    path = _prefix_as_path(config.prefix)
    input_path = path / config.input
    io_module = dd if is_distributed else pd
    use_dask = io_module is dd
    reader = getattr(io_module, f'read_{config.format}')
    read_path = _path_as_str(input_path)
    if use_dask and config.format is DataFormat.csv:
        # with dask the csv input is a directory, so every file inside it is globbed
        read_path += '/*'
    data = reader(read_path)
    if (
        use_dask
        and config.format is DataFormat.parquet
        and data.index.name == 'unique_id'
        # `is_categorical_dtype` is deprecated since pandas 2.1 in favor of this check
        and isinstance(data.index.dtype, pd.CategoricalDtype)
    ):
        # make the categories of the index (also marked as ordered) and of
        # every categorical column known to dask
        data.index = data.index.cat.as_known().as_ordered()
        for col in data.select_dtypes(include='category'):
            data[col] = data[col].cat.as_known()

    return validate_data_format(data)


In [None]:
series = generate_daily_series(20, 100, 200)
series_ddf = dd.from_pandas(series, npartitions=2)

# Write the series in every supported format with both pandas and dask
# and check that `read_data` gives back the same data.
for data_format in ('csv', 'parquet'):
    for df in (series, series_ddf):
        is_distributed = df is series_ddf
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            writer = getattr(df, f'to_{data_format}')
            writer(tmpdir/'train')
            data_cfg = DataConfig(prefix=str(tmpdir), input='train',
                                  output='output', format=data_format)
            read_df = read_data(data_cfg, is_distributed)
            if is_distributed:
                read_df, df = read_df.compute(), df.compute()
            # `columns=` instead of a positional axis, which pandas 2.0 no longer accepts
            assert read_df.drop(columns='y').equals(df.drop(columns='y'))
            # `y` is compared with a tolerance instead of exact equality
            np.testing.assert_allclose(read_df.y, df.y)

In [None]:
#exporti
def _read_dynamic(config: DataConfig) -> Optional[List[pd.DataFrame]]:
    """Load with pandas every file named in `config.dynamic`, looked up under `config.prefix`.

    Returns `None` when `config.dynamic` is `None`. For the csv format the `ds`
    column is parsed as dates while reading."""
    if config.dynamic is None:
        return None
    read = getattr(pd, f'read_{config.format}')
    prefix = _prefix_as_path(config.prefix)
    read_kwargs = {'parse_dates': ['ds']} if config.format is DataFormat.csv else {}
    return [
        read(_path_as_str(prefix / fname), **read_kwargs)
        for fname in config.dynamic
    ]


def _paste_dynamic(
    data: Frame, dynamic_dfs: Optional[List[pd.DataFrame]], is_distributed: bool
) -> pd.DataFrame:
    if dynamic_dfs is None:
        return data
    data = data.reset_index()
    for df in dynamic_dfs:
        data = data.merge(df, how='left')
    kwargs = {}
    if is_distributed:
        kwargs['sorted'] = True
    data = data.set_index('unique_id', **kwargs)
    return data


In [None]:
#hide
# A dynamic file written in each format is read back as a one-element list
# holding a dataframe equal to the one that was saved
for data_format in ('csv', 'parquet'):
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        series = generate_daily_series(20, n_static_features=2, equal_ends=True)
        series = series.rename(columns={'static_1': 'product_id'})
        prices = generate_prices_for_series(series)
        series = series.reset_index().merge(prices, how='left')
        getattr(series, f'to_{data_format}')(tmp / 'train', index=False)
        getattr(prices, f'to_{data_format}')(tmp / 'prices', index=False)
        data_cfg = DataConfig(
            prefix=tmpdir,
            input='train',
            output='',
            format=data_format,
            dynamic=['prices'],
        )
        dynamic_dfs = _read_dynamic(data_cfg)
        assert isinstance(dynamic_dfs, list)
        test_eq(len(dynamic_dfs), 1)
        pd.testing.assert_frame_equal(dynamic_dfs[0], prices)
        
# Without `dynamic` in the config there's nothing to read
data_cfg = DataConfig(prefix='', input='', output='', format='csv')
assert _read_dynamic(data_cfg) is None

In [None]:
#exporti
def _instantiate_transforms(config: FeaturesConfig) -> Dict:
    """Turn the function names into the actual functions and make sure their positional arguments are in order.

    Every transform in `config.lag_transforms` is either a name or a single-item dict
    `{name: kwargs}`. The result maps each lag to a list of tuples `(function, *args)`
    where `args` follow the order of the function's signature.

    Trailing arguments that weren't provided are left out. An argument that was skipped
    before a provided one is filled in with its default, so that the provided values
    can't shift into the wrong position.

    Raises `ValueError` if such a skipped argument has no default."""
    if config.lag_transforms is None:
        return {}
    lag_tfms = defaultdict(list)
    for lag, tfms in config.lag_transforms.items():
        for tfm in tfms:
            if isinstance(tfm, dict):
                [(tfm_name, tfm_kwargs)] = tfm.items()
            else:
                tfm_name, tfm_kwargs = tfm, {}
            tfm_func = _available_tfms[tfm_name]
            arg_names = _available_tfms_kwargs[tfm_name]
            # position of the last argument that was provided, -1 if none was
            last_provided = max(
                (i for i, name in enumerate(arg_names) if name in tfm_kwargs),
                default=-1,
            )
            tfm_params = inspect.signature(tfm_func).parameters
            tfm_args: Tuple[Any, ...] = ()
            for name in arg_names[:last_provided + 1]:
                if name in tfm_kwargs:
                    tfm_args += (tfm_kwargs[name],)
                    continue
                # gap before a provided argument: keep the positions aligned
                default = tfm_params[name].default
                if default is inspect.Parameter.empty:
                    raise ValueError(
                        f"Transform '{tfm_name}' requires the argument '{name}'."
                    )
                tfm_args += (default,)
            lag_tfms[lag].append((tfm_func, *tfm_args))
    return lag_tfms


In [None]:
#hide
# Transforms can be given by name or as {name: kwargs}; the kwargs of the second lag
# are deliberately out of order
features_cfg = FeaturesConfig(freq='D',
                              lags=[1, 2],
                              lag_transforms={
                                  1: ['expanding_mean', {'rolling_mean': {'window_size': 7}}],
                                  2: [{'rolling_mean': {'min_samples': 2, 'window_size': 3}}]
                              })

# Names become functions and the arguments are positional, in the expected order
test_eq(_instantiate_transforms(features_cfg),
        {
            1: [(expanding_mean,), (rolling_mean, 7)],
            2: [(rolling_mean, 3, 2)]
        })
# No lag transforms in the config gives an empty dict
test_eq(_instantiate_transforms(FeaturesConfig(freq='D')), {})
# A transform name that isn't available fails with a message listing the permitted values
test_fail(
    lambda: _instantiate_transforms(
        FeaturesConfig(freq='D', lag_transforms={1: [{'exp_mean': {}}]})
    ),
    contains='unexpected value; permitted:'
)

In [None]:
#exporti
def _fcst_from_local(model_config: ModelConfig, flow_config: Dict) -> Forecast:
    """Build a `Forecast` from a model given by its dotted import path.

    `model_config.name` is split at its last dot into module and class. The class is
    instantiated with `model_config.params` (no arguments if there are none) and
    combined with a `TimeSeries` created from `flow_config`."""
    module_name, class_name = model_config.name.rsplit('.', maxsplit=1)
    model_factory = getattr(importlib.import_module(module_name), class_name)
    model_kwargs = model_config.params or {}
    model = model_factory(**model_kwargs)
    return Forecast(model, TimeSeries(**flow_config))


def _fcst_from_distributed(
    model_config: DistributedModelConfig, flow_config: Dict
) -> DistributedForecast:
    """Build a `DistributedForecast` using the model selected by `model_config.name`.

    `LGBMForecast` is used when the name is `DistributedModelName.LGBMForecast` and
    `XGBForecast` for any other name. The model receives `model_config.params`
    (no arguments if there are none)."""
    model_kwargs = model_config.params or {}
    # the model classes are imported here, so only the selected one gets imported
    if model_config.name is DistributedModelName.LGBMForecast:
        from mlforecast.distributed.models.lgb import LGBMForecast as model_factory
    else:
        from mlforecast.distributed.models.xgb import XGBForecast as model_factory
    model = model_factory(**model_kwargs)
    return DistributedForecast(model, TimeSeries(**flow_config))


In [None]:
#export
def fcst_from_config(config: FlowConfig) -> Union[Forecast, DistributedForecast]:
    """Instantiate Forecast class from config.

    The features config is turned into the keyword arguments of `TimeSeries`: the
    lag transforms are replaced by their instantiated versions and the
    `static_features` and `keep_last_n` entries are dropped."""
    ts_kwargs = dict(
        config.features.dict(),
        lag_transforms=_instantiate_transforms(config.features),
    )
    for dropped_key in ('static_features', 'keep_last_n'):
        ts_kwargs.pop(dropped_key, None)

    if config.local is None:
        # the config validation guarantees that one of local and distributed is set;
        # mypy can't see that, hence the assert
        assert config.distributed is not None
        return _fcst_from_distributed(config.distributed.model, ts_kwargs)
    return _fcst_from_local(config.local.model, ts_kwargs)


In [None]:
# Build the forecast described in the local sample config
with open('../sample_configs/local.yaml', 'rt') as f:
    cfg = FlowConfig(**yaml.safe_load(f))

fcst = fcst_from_config(cfg)
# The model class is the last component of the dotted name in the config...
test_eq(fcst.model.__class__.__name__, cfg.local.model.name.split('.')[-1])
# ...and it was created with the configured parameters
model_params = fcst.model.get_params()
for param_name, param_value in cfg.local.model.params.items():
    test_eq(model_params[param_name], param_value)

In [None]:
# Same checks for the distributed sample config, inside a two-worker client
with Client(n_workers=2) as client:
    with open('../sample_configs/distributed.yaml', 'rt') as f:
        cfg = FlowConfig(**yaml.safe_load(f))
    fcst = fcst_from_config(cfg)
    # here the configured name is the class name itself
    test_eq(fcst.model.__class__.__name__, cfg.distributed.model.name)
    model_params = fcst.model.get_params()
    for param_name, param_value in cfg.distributed.model.params.items():
        test_eq(model_params[param_name], param_value)

In [None]:
#hide
# Repeat the distributed check after forcing the model name to LGBMForecast
with Client(n_workers=2) as client:
    with open('../sample_configs/distributed.yaml', 'rt') as f:
        cfg = FlowConfig(**yaml.safe_load(f))
    # override whatever model the sample config selects
    cfg.distributed.model.name = DistributedModelName('LGBMForecast')
    fcst = fcst_from_config(cfg)
    test_eq(fcst.model.__class__.__name__, cfg.distributed.model.name)
    model_params = fcst.model.get_params()
    for param_name, param_value in cfg.distributed.model.params.items():
        test_eq(model_params[param_name], param_value)

In [None]:
#export
def perform_backtest(
    fcst: Union[Forecast, DistributedForecast],
    data: Frame,
    config: FlowConfig,
    output_path: Union[Path, S3Path],
    dynamic_dfs: Optional[List[pd.DataFrame]] = None,
) -> None:
    """Performs backtesting of `fcst` using `data` and the strategy defined in `config`. 
    Writes the results to `output_path`.

    Nothing is done when `config.backtest` is `None`. Otherwise each split is saved
    as `valid_<i>` under `output_path` in the format given by `config.data.format`
    and its MSE (average of the per-series mean squared errors) is printed."""
    backtest_cfg = config.backtest
    if backtest_cfg is None:
        return
    is_dask = isinstance(data, dd_Frame)
    windows = fcst.backtest(
        data,
        backtest_cfg.n_windows,
        backtest_cfg.window_size,
        static_features=config.features.static_features,
        dynamic_dfs=dynamic_dfs,
    )
    for split, window in enumerate(windows):
        window = window.fillna(0)
        destination = _path_as_str(output_path / f'valid_{split}')
        if not is_dask:
            # for non-dask data the format is added as the file extension
            destination = destination + f'.{config.data.format}'
        save = getattr(window, f'to_{config.data.format}')
        save(destination)
        # the error column is added after saving, so it isn't part of the written output
        window['sq_err'] = (window['y'] - window['y_pred']).pow(2)
        per_series_mse = window.groupby("unique_id")["sq_err"].mean()
        mse = per_series_mse.mean()
        if is_dask:
            mse = mse.compute()
        print(f'Split {split+1} MSE: {mse:.4f}')


In [None]:
# Backtest the flow of the local sample config on synthetic daily series,
# writing the splits to a temporary directory
with open('../sample_configs/local.yaml', 'rt') as f:
    cfg = FlowConfig(**yaml.safe_load(f))
fcst = fcst_from_config(cfg)
series = generate_daily_series(20, 100, 200)
with tempfile.TemporaryDirectory() as tmpdir:
    perform_backtest(fcst, series, cfg, Path(tmpdir))

In [None]:
#hide
# Without a backtest section in the config `perform_backtest` returns right away
with open('../sample_configs/local.yaml', 'rt') as f:
    cfg = FlowConfig(**yaml.safe_load(f))
cfg.backtest = None
fcst = fcst_from_config(cfg)
with tempfile.TemporaryDirectory() as tmpdir:
    perform_backtest(fcst, series, cfg, Path(tmpdir))

In [None]:
#distributed
# Backtest the flow of the distributed sample config on the dask series
with Client(n_workers=2) as client:
    with open('../sample_configs/distributed.yaml', 'rt') as f:
        cfg = FlowConfig(**yaml.safe_load(f))
    fcst = fcst_from_config(cfg)
    with tempfile.TemporaryDirectory() as tmpdir:
        perform_backtest(fcst, series_ddf, cfg, Path(tmpdir))

In [None]:
#export
def parse_config(config_file: str) -> FlowConfig:
    """Create a `FlowConfig` object using the contents of `config_file`.

    The file is parsed with `yaml.safe_load` and its top-level keys are passed to
    `FlowConfig` as keyword arguments."""
    with open(config_file, 'r') as config_stream:
        raw_config = yaml.safe_load(config_stream)
        return FlowConfig(**raw_config)


def setup_client(config: ClusterConfig) -> Client:
    """Spins up a cluster with the specifications defined in `config` and returns a client connected to it.

    `config.class_name` is the dotted import path of the cluster class, which is
    instantiated with `config.class_kwargs`."""
    module_path, class_name = config.class_name.rsplit('.', maxsplit=1)
    cluster_factory = getattr(importlib.import_module(module_path), class_name)
    client = Client(cluster_factory(**config.class_kwargs))
    # wait for the number of workers given in the kwargs; 0 if `n_workers` isn't there
    client.wait_for_workers(config.class_kwargs.get('n_workers', 0))
    return client


In [None]:
#distributed
# `cfg` is the distributed config loaded in the previous distributed cell
client = setup_client(cfg.distributed.cluster)
# The sample config is expected to describe a local cluster...
assert isinstance(client.cluster, LocalCluster)
# ...with as many workers as requested in its kwargs
assert len(client.scheduler_info()['workers']) == cfg.distributed.cluster.class_kwargs['n_workers']
# Shut down both the cluster and the client
client.cluster.close()
client.close()