In [None]:
#default_exp cli

# CLI

> Run the forecasting pipeline with configuration files

In [None]:
#export
from pathlib import Path

import pandas as pd
import typer

from mlforecast.api import (
    S3Path,
    _is_s3_path,
    _path_as_str,
    fcst_from_config,
    parse_config,
    perform_backtest,
    read_data,
    setup_client,
)

In [None]:
#exporti
# Typer application object; `run_forecast` below is registered on it as a command.
app = typer.Typer()

In [None]:
#export
@app.command()
def run_forecast(config_file: str):
    """Run the forecasting pipeline using the configuration defined in `config_file`."""
    # NOTE(review): the docstring is kept verbatim; Typer presumably shows it as
    # the command's help text, so expanding it would change CLI output -- confirm.
    config = parse_config(config_file)
    is_distributed = config.distributed is not None
    # Bind `client` unconditionally so the cleanup in `finally` always has a
    # defined name to inspect; it stays None when no cluster is configured.
    client = None
    if config.distributed is not None:  # mypy
        client = setup_client(config.distributed.cluster)
    try:
        data = read_data(config.data, is_distributed)

        # Outputs are written under <prefix>/<output>; the prefix is treated as
        # an S3 URI or a local path depending on `_is_s3_path`.
        prefix = config.data.prefix
        path = S3Path.from_uri(prefix) if _is_s3_path(prefix) else Path(prefix)
        output_path = path / config.data.output
        output_path.mkdir(exist_ok=True)

        fcst = fcst_from_config(config)
        # Backtest and forecast are independent, optional stages.
        if config.backtest is not None:
            perform_backtest(fcst, data, config, output_path)
        if config.forecast is not None:
            fcst.fit(data)
            preds = fcst.predict(config.forecast.horizon)
            # Pick the writer matching the configured format, e.g. `to_csv`.
            writer = getattr(preds, f'to_{config.data.format}')
            write_path = _path_as_str(output_path / 'forecast')
            # Only pandas input gets the format appended as a file extension;
            # for any other input type the bare `forecast` path is used.
            if isinstance(data, pd.DataFrame):
                write_path += f'.{config.data.format}'
            writer(write_path)
    finally:
        # Errors propagate unchanged; just make sure the cluster and client
        # are shut down when one was created.
        if client is not None:
            client.cluster.close()
            client.close()


In [None]:
#hide
import shutil
import tempfile
from pprint import pprint

import dask.dataframe as dd
import yaml
from mlforecast.utils import generate_daily_series

In [None]:
#hide
# Run the pipeline end to end for csv and parquet inputs: write the series into
# a throwaway directory, point the local sample config at it and call the CLI
# function directly.
series = generate_daily_series(20, 100, 200)
config_name = 'local.yaml'

for data_format in ('csv', 'parquet'):
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        train_file = f'train.{data_format}'
        save_series = getattr(series, f'to_{data_format}')
        save_series(workdir / train_file)

        sample_config = Path(f'../sample_configs/{config_name}')
        with sample_config.open('rt') as src:
            cfg = yaml.safe_load(src)
        # Redirect the sample config to the temporary data.
        cfg['data'].update(prefix=str(workdir), input=train_file, format=data_format)

        config_path = workdir / config_name
        with config_path.open('wt') as dst:
            yaml.dump(cfg, dst)
        run_forecast(config_path)

In [None]:
#distributed
#hide
# Same end-to-end check as the local case, but with the series split into two
# dask partitions and the distributed sample config.
series_ddf = dd.from_pandas(series, npartitions=2)
config_name = 'distributed.yaml'
train_dir = 'train'

for data_format in ('csv', 'parquet'):
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        save_partitions = getattr(series_ddf, f'to_{data_format}')
        save_partitions(workdir / train_dir)

        sample_config = Path(f'../sample_configs/{config_name}')
        with sample_config.open('rt') as src:
            cfg = yaml.safe_load(src)
        # Redirect the sample config to the temporary data.
        cfg['data'].update(prefix=str(workdir), input=train_dir, format=data_format)

        config_path = workdir / config_name
        with config_path.open('wt') as dst:
            yaml.dump(cfg, dst)
        run_forecast(config_path)

## Example

If you want to run the forecasting pipeline from the CLI, you first need to save your data and define a configuration file. Sample configurations are provided in `sample_configs/local.yaml` and `sample_configs/distributed.yaml`.

### Local

We can run the forecasting pipeline defined in `sample_configs/local.yaml` by saving our data and using the following configuration:

In [None]:
# Load the local sample configuration and pretty-print it.
sample_config = Path('../sample_configs/local.yaml')
cfg = yaml.safe_load(sample_config.read_text())
pprint(cfg)

This will look for a single file called `train` in the `data` directory (`data.prefix/data.input`).

In [None]:
#hide
data_path = Path('data')
# exist_ok keeps this cell re-runnable when `data` is left over from an earlier
# (possibly interrupted) run; the train file below is simply rewritten.
data_path.mkdir(exist_ok=True)
series.to_parquet(data_path/'train')

In [None]:
# Confirm that the training data is a single file at data/train.
Path('data/train').is_file()

Then we just call `mlforecast` with this configuration.

In [None]:
# Shell out to the `mlforecast` command with the local sample configuration.
!mlforecast ../sample_configs/local.yaml

We can see our results have been saved.

In [None]:
# Show the contents of the output directory after the run.
list(Path('data/outputs').iterdir())

In [None]:
#hide
try:
    assert 'forecast.parquet' in [file.name for file in (data_path/'outputs').iterdir()]
finally:
    # Remove the scratch directory even when the check fails; otherwise the
    # leftover `data` folder breaks the next cell that recreates it.
    shutil.rmtree(data_path)

### Distributed

We can also use the CLI to run the distributed forecasting pipeline. To do this, we need to save our data in partitions and fill the `distributed` key of the configuration file (instead of `local`). A sample configuration is provided in `sample_configs/distributed.yaml`:

In [None]:
# Load the distributed sample configuration and pretty-print it.
sample_config = Path('../sample_configs/distributed.yaml')
cfg = yaml.safe_load(sample_config.read_text())
pprint(cfg)

Notice that we use `cluster.class_name` to define the cluster that we want to instantiate and `cluster.class_kwargs` to define its keyword arguments. This example will use a `dask.distributed.LocalCluster`; however, you can specify any other cluster class you want.

In [None]:
#distributed
#hide
data_path = Path('data')
# exist_ok keeps this cell re-runnable when `data` is left over from an earlier
# (possibly interrupted) run.
data_path.mkdir(exist_ok=True)
# Two partitions, written as a directory of parquet files under data/train.
series_ddf = dd.from_pandas(series, npartitions=2)
series_ddf.to_parquet('data/train')

In [None]:
#distributed
# Show the files written under data/train.
list(Path('data/train').iterdir())

Note that we have split our data into two partitions and have specified that we want two workers from our cluster (`distributed.cluster.class_kwargs.n_workers`). **If you're using a remote cluster, use remote storage like S3**.

To run the pipeline we just call `mlforecast` with this configuration.

In [None]:
#distributed
# Shell out to the `mlforecast` command with the distributed sample configuration.
!mlforecast ../sample_configs/distributed.yaml

We can see our results have been saved.

In [None]:
#distributed
# Show the contents of the output directory after the distributed run.
list(Path('data/outputs').iterdir())

In [None]:
#distributed
#hide
try:
    # The distributed run is expected to leave `forecast` as a directory.
    assert (data_path/'outputs'/'forecast').is_dir()
finally:
    # Remove the scratch directory even when the check fails, so a later run
    # does not trip over a leftover `data` folder.
    shutil.rmtree(data_path)