In [None]:
#| default_exp distributed.timegpt

In [None]:
#| hide 
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
import fugue
import fugue.api as fa
from fugue import transform, DataFrame, FugueWorkflow, ExecutionEngine
from fugue.collections.yielded import Yielded
from fugue.constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT
from fugue.execution.factory import make_execution_engine
from triad import Schema

In [None]:
#| export
def _cotransform(
    df1: Any,
    df2: Any,
    using: Any,
    schema: Any = None,
    params: Any = None,
    partition: Any = None,
    engine: Any = None,
    engine_conf: Any = None,
    force_output_fugue_dataframe: bool = False,
    as_local: bool = False,
) -> Any:
    """Zip two dataframes by partition and apply `using` to each zipped pair.

    A `FugueWorkflow` is built that zips `df1` with `df2` using `partition`,
    runs the transformer on the zipped data with the same partition spec and
    yields the output under the name ``"result"``.

    Returns the yielded fugue dataframe when `force_output_fugue_dataframe`
    is set or `df1` is already a fugue `DataFrame`/`Yielded`; otherwise a
    pandas dataframe for local results and the native object for
    non-local ones.
    """
    workflow = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0})

    left = workflow.create_data(df1)
    right = workflow.create_data(df2)
    zipped = left.zip(right, partition=partition)
    transformed = zipped.transform(
        using=using,
        schema=schema,
        params=params,
        pre_partition=partition,
    )
    transformed.yield_dataframe_as("result", as_local=as_local)
    workflow.run(engine, conf=engine_conf)

    output = workflow.yields["result"].result  # type:ignore
    keep_fugue = force_output_fugue_dataframe or isinstance(df1, (DataFrame, Yielded))
    if keep_fugue:
        return output
    if output.is_local:  # type:ignore
        return output.as_pandas()  # type:ignore
    return output.native  # type:ignore

In [None]:
#| export
class _DistributedTimeGPT:
    """Run TimeGPT calls on a distributed engine through Fugue.

    Every public method partitions the input dataframe by the series
    identifier and applies the matching private worker (`_forecast`,
    `_forecast_x`, `_detect_anomalies`, `_cross_validation`) to each
    partition. Each worker instantiates its own `_TimeGPT` client from the
    settings stored on this object.
    """

    def __init__(
            self,
            token: Optional[str] = None,
            environment: Optional[str] = None,
            max_retries: int = 6,
            retry_interval: int = 10,
            max_wait_time: int = 60,
        ):
        """Store the settings later forwarded to `_TimeGPT` by the workers."""
        self.token = token
        self.environment = environment
        self.max_retries = max_retries
        self.retry_interval = retry_interval
        self.max_wait_time = max_wait_time

    def _distribute_method(
            self,
            method: Callable,
            df: fugue.AnyDataFrame,
            kwargs: dict,
            schema: str,
            num_partitions: Optional[int],
            id_col: str,
            X_df: Optional[fugue.AnyDataFrame] = None,
        ):
        """Apply `method` to every partition of `df` (and `X_df`, if given).

        Parameters
        ----------
        method : worker applied to each partition.
        df : input dataframe; the execution engine is inferred from it.
        kwargs : keyword arguments forwarded to the worker.
        schema : output schema of the worker.
        num_partitions : number of partitions; when `None` the current
            parallelism of the engine is used.
        id_col : column the data is partitioned by.
        X_df : optional second dataframe, zipped with `df` partition by
            partition.

        Raises
        ------
        ValueError
            If `id_col` is missing from `df` or `X_df`, or if `df` and `X_df`
            resolve to different engines.
        """
        if id_col not in fa.get_column_names(df):
            raise ValueError(
                'Distributed environment is meant to forecasts '
                'multiple time series at once. You did not provide '
                'an identifier for each time series.'
            )
        engine = make_execution_engine(infer_by=[df])
        if num_partitions is None:
            num_partitions = engine.get_current_parallelism()
        partition = dict(by=id_col, num=num_partitions, algo='coarse')
        # every worker receives num_partitions=1 (local num_partitions)
        params = dict(kwargs={**kwargs, 'num_partitions': 1})
        if X_df is None:
            result_df = fa.transform(
                df,
                method,
                params=params,
                schema=schema,
                engine=engine,
                partition=partition,
                as_fugue=True,
            )
            return fa.get_native_as_df(result_df)
        # both inputs are zipped by `id_col`, so it has to exist in `X_df` too
        if id_col not in fa.get_column_names(X_df):
            raise ValueError(
                f'The exogenous dataframe `X_df` does not contain '
                f'the identifier column `{id_col}`.'
            )
        # check same engine
        engine_x = make_execution_engine(infer_by=[X_df])
        if repr(engine) != repr(engine_x):
            raise ValueError(
                'Target variable and exogenous variables '
                'have different engines. Please provide the same '
                'distributed engine for both inputs.'
            )
        result_df = _cotransform(
            df,
            X_df,
            method,
            params=params,
            schema=schema,
            partition=partition,
            engine=engine,
        )
        return fa.get_native_as_df(result_df)

    def forecast(
            self,
            df: fugue.AnyDataFrame,
            h: int,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            X_df: Optional[fugue.AnyDataFrame] = None,
            level: Optional[List[Union[int, float]]] = None,
            quantiles: Optional[List[float]] = None,
            fewshot_steps: int = 0,
            fewshot_loss: str = 'default',
            clean_ex_first: bool = True,
            validate_token: bool = False,
            add_history: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
            model: str = 'timegpt-1',
            num_partitions: Optional[int] = None,
        ) -> fugue.AnyDataFrame:
        """Forecast every series of `df` in a distributed fashion.

        All arguments except `df`, `X_df` and `num_partitions` are forwarded
        unchanged to `_TimeGPT._forecast` on each partition. When `X_df` is
        given it is zipped with `df` by `id_col` and passed along as well.

        Raises `ValueError` when both `level` and `quantiles` are provided.
        """
        kwargs = dict(
            h=h,
            freq=freq,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            level=level,
            quantiles=quantiles,
            fewshot_steps=fewshot_steps,
            fewshot_loss=fewshot_loss,
            clean_ex_first=clean_ex_first,
            validate_token=validate_token,
            add_history=add_history,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
            model=model,
        )
        schema = self._get_forecast_schema(
            id_col=id_col,
            time_col=time_col,
            level=level,
            quantiles=quantiles,
        )
        fcst_df = self._distribute_method(
            method=self._forecast if X_df is None else self._forecast_x,
            df=df,
            kwargs=kwargs,
            schema=schema,
            num_partitions=num_partitions,
            id_col=id_col,
            X_df=X_df,
        )
        return fcst_df

    def detect_anomalies(
            self,
            df: fugue.AnyDataFrame,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            level: Union[int, float] = 99,
            clean_ex_first: bool = True,
            validate_token: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
            model: str = 'timegpt-1',
            num_partitions: Optional[int] = None,
        ) -> fugue.AnyDataFrame:
        """Detect anomalies for every series of `df` in a distributed fashion.

        All arguments except `df` and `num_partitions` are forwarded
        unchanged to `_TimeGPT._detect_anomalies` on each partition. The
        output has the columns `id_col`, `time_col` and ``anomaly``.
        """
        kwargs = dict(
            freq=freq,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            level=level,
            clean_ex_first=clean_ex_first,
            validate_token=validate_token,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
            model=model,
        )
        schema = self._get_anomalies_schema(id_col=id_col, time_col=time_col)
        anomalies_df = self._distribute_method(
            method=self._detect_anomalies,
            df=df,
            kwargs=kwargs,
            schema=schema,
            num_partitions=num_partitions,
            id_col=id_col,
            X_df=None,
        )
        return anomalies_df

    def cross_validation(
            self,
            df: fugue.AnyDataFrame,
            h: int,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            level: Optional[List[Union[int, float]]] = None,
            quantiles: Optional[List[float]] = None,
            fewshot_steps: int = 0,
            fewshot_loss: str = 'default',
            clean_ex_first: bool = True,
            validate_token: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
            model: str = 'timegpt-1',
            n_windows: int = 1,
            step_size: Optional[int] = None,
            num_partitions: Optional[int] = None,
        ) -> fugue.AnyDataFrame:
        """Cross-validate every series of `df` in a distributed fashion.

        All arguments except `df` and `num_partitions` are forwarded
        unchanged to `_TimeGPT._cross_validation` on each partition. The
        output schema is the forecast schema plus a ``cutoff`` column.

        Raises `ValueError` when both `level` and `quantiles` are provided.
        """
        kwargs = dict(
            h=h,
            freq=freq,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            level=level,
            quantiles=quantiles,
            fewshot_steps=fewshot_steps,
            fewshot_loss=fewshot_loss,
            clean_ex_first=clean_ex_first,
            validate_token=validate_token,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
            model=model,
            n_windows=n_windows,
            step_size=step_size,
        )
        schema = self._get_forecast_schema(
            id_col=id_col,
            time_col=time_col,
            level=level,
            quantiles=quantiles,
            cv=True,
        )
        fcst_df = self._distribute_method(
            method=self._cross_validation,
            df=df,
            kwargs=kwargs,
            schema=schema,
            num_partitions=num_partitions,
            id_col=id_col,
        )
        return fcst_df

    def _instantiate_timegpt(self):
        """Build a `_TimeGPT` client from the settings stored on `self`."""
        # imported here rather than at module level
        # NOTE(review): presumably to avoid a circular import — confirm
        from nixtlats.timegpt import _TimeGPT
        timegpt = _TimeGPT(
            token=self.token,
            environment=self.environment,
            max_retries=self.max_retries,
            retry_interval=self.retry_interval,
            max_wait_time=self.max_wait_time,
        )
        return timegpt

    # NOTE: the signatures of the workers below are handed to Fugue as the
    # transformer, so their parameters and annotations are left untouched.

    def _forecast(
            self,
            df: pd.DataFrame,
            kwargs,
        ) -> pd.DataFrame:
        """Worker: forecast one partition with a fresh `_TimeGPT` client."""
        timegpt = self._instantiate_timegpt()
        return timegpt._forecast(df=df, **kwargs)

    def _forecast_x(
            self,
            df: pd.DataFrame,
            X_df: pd.DataFrame,
            kwargs,
        ) -> pd.DataFrame:
        """Worker: forecast one partition using the zipped `X_df` partition."""
        timegpt = self._instantiate_timegpt()
        return timegpt._forecast(df=df, X_df=X_df, **kwargs)

    def _detect_anomalies(
            self,
            df: pd.DataFrame,
            kwargs,
        ) -> pd.DataFrame:
        """Worker: detect anomalies on one partition."""
        timegpt = self._instantiate_timegpt()
        return timegpt._detect_anomalies(df=df, **kwargs)

    def _cross_validation(
            self,
            df: pd.DataFrame,
            kwargs,
        ) -> pd.DataFrame:
        """Worker: cross-validate one partition."""
        timegpt = self._instantiate_timegpt()
        return timegpt._cross_validation(df=df, **kwargs)

    @staticmethod
    def _get_forecast_schema(id_col, time_col, level, quantiles, cv=False):
        """Build the output schema of `forecast` / `cross_validation`.

        Column order: `id_col`, `time_col`, ``cutoff`` (only when `cv`),
        ``TimeGPT``, then either the ``TimeGPT-lo-*`` columns from the highest
        to the lowest level followed by the ``TimeGPT-hi-*`` columns from the
        lowest to the highest, or the ``TimeGPT-q-*`` columns in ascending
        order.

        Raises `ValueError` when both `level` and `quantiles` are provided.
        """
        # validate first so nothing is built for an invalid request
        if (level is not None) and (quantiles is not None):
            raise ValueError(
                "you should include `level` or `quantiles` but not both."
            )
        fields = [f'{id_col}:string', f'{time_col}:datetime']
        if cv:
            fields.append('cutoff:datetime')
        fields.append('TimeGPT:double')
        # collecting fields in a list keeps the schema well formed when
        # `level` or `quantiles` is an empty list (no dangling commas)
        if level is not None:
            level = sorted(level)
            fields.extend(f'TimeGPT-lo-{lv}:double' for lv in reversed(level))
            fields.extend(f'TimeGPT-hi-{lv}:double' for lv in level)
        if quantiles is not None:
            quantiles = sorted(quantiles)
            fields.extend(f'TimeGPT-q-{int(q * 100)}:double' for q in quantiles)
        return Schema(','.join(fields))

    @staticmethod
    def _get_anomalies_schema(id_col, time_col):
        """Build the output schema of `detect_anomalies`."""
        schema = f'{id_col}:string,{time_col}:datetime,anomaly:int'
        return Schema(schema)

In [None]:
#| hide
import os

from fastcore.test import test_eq
from dotenv import load_dotenv
from utilsforecast.data import generate_series

# load environment variables from a `.env` file
# NOTE(review): presumably this provides the TimeGPT token, since the client
# used by the tests is created without an explicit one — confirm
load_dotenv()

In [None]:
#| hide
def test_forecast(
        df: fugue.AnyDataFrame, 
        horizon: int = 12,
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Check the row count and column layout of a distributed forecast.

    The expected number of rows is `horizon` rows per series found in `df`,
    and the expected columns are `id_col`, `time_col`, ``TimeGPT`` plus the
    lo/hi interval columns when `level` is passed through `fcst_kwargs`.
    """
    fcst_df = distributed_timegpt.forecast(
        df=df, 
        h=horizon,
        id_col=id_col,
        time_col=time_col,
        **fcst_kwargs,
    )
    fcst_df = fa.as_pandas(fcst_df)
    # derive both factors from the inputs instead of a module-level
    # `n_series` and a hard-coded 12, so other horizons/datasets also work
    n_series = fa.as_pandas(df)[id_col].nunique()
    test_eq(n_series * horizon, len(fcst_df))
    cols = fcst_df.columns.to_list()
    exp_cols = [id_col, time_col, 'TimeGPT']
    if 'level' in fcst_kwargs:
        level = sorted(fcst_kwargs['level'])
        # lo columns go from the highest to the lowest level, hi columns
        # the other way around
        exp_cols.extend([f'TimeGPT-lo-{lv}' for lv in reversed(level)])
        exp_cols.extend([f'TimeGPT-hi-{lv}' for lv in level])
    test_eq(cols, exp_cols)

In [None]:
#| hide
from fastcore.test import test_fail

In [None]:
#| hide
def test_forecast_diff_results_diff_models(
        df: fugue.AnyDataFrame, 
        horizon: int = 12, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Check that the two models produce different ``TimeGPT`` forecasts.

    Both forecasts run with a single partition; comparing them with
    `assert_frame_equal` is expected to fail on the ``TimeGPT`` column.
    """
    def _forecast_with(model_name):
        raw_fcst = distributed_timegpt.forecast(
            df=df,
            h=horizon,
            num_partitions=1,
            id_col=id_col,
            time_col=time_col,
            model=model_name,
            **fcst_kwargs
        )
        return fa.as_pandas(raw_fcst)

    fcst_default = _forecast_with('timegpt-1')
    fcst_long = _forecast_with('timegpt-1-long-horizon')

    def _compare():
        sort_cols = [id_col, time_col]
        pd.testing.assert_frame_equal(
            fcst_default.sort_values(sort_cols).reset_index(drop=True),
            fcst_long.sort_values(sort_cols).reset_index(drop=True),
        )

    test_fail(_compare, contains='(column name="TimeGPT") are different')

In [None]:
#| hide
def test_forecast_same_results_num_partitions(
        df: fugue.AnyDataFrame, 
        horizon: int = 12, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Check that forecasts do not depend on the number of partitions.

    The forecast is computed with one and with two partitions and both
    outputs, sorted by `id_col` and `time_col`, must be equal.
    """
    def _sorted_forecast(n_parts):
        raw_fcst = distributed_timegpt.forecast(
            df=df,
            h=horizon,
            num_partitions=n_parts,
            id_col=id_col,
            time_col=time_col,
            **fcst_kwargs
        )
        as_pandas = fa.as_pandas(raw_fcst)
        return as_pandas.sort_values([id_col, time_col]).reset_index(drop=True)

    single_partition = _sorted_forecast(1)
    two_partitions = _sorted_forecast(2)
    pd.testing.assert_frame_equal(single_partition, two_partitions)

In [None]:
#| hide
def test_cv_same_results_num_partitions(
        df: fugue.AnyDataFrame, 
        horizon: int = 12, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Check that cross-validation does not depend on the partition count.

    Cross-validation is computed with one and with two partitions and both
    outputs must be equal once sorted.
    """
    fcst_df = distributed_timegpt.cross_validation(
        df=df, 
        h=horizon, 
        num_partitions=1,
        id_col=id_col,
        time_col=time_col,
        **fcst_kwargs
    )
    fcst_df = fa.as_pandas(fcst_df)
    fcst_df_2 = distributed_timegpt.cross_validation(
        df=df, 
        h=horizon, 
        num_partitions=2,
        id_col=id_col,
        time_col=time_col,
        **fcst_kwargs
    )
    fcst_df_2 = fa.as_pandas(fcst_df_2)
    # overlapping windows (e.g. n_windows=2, step_size=1 with h=12) repeat
    # the same (id, time) pair under different cutoffs, so `cutoff` has to
    # be part of the sort key for the row order to be deterministic
    sort_cols = [id_col, 'cutoff', time_col]
    pd.testing.assert_frame_equal(
        fcst_df.sort_values(sort_cols).reset_index(drop=True),
        fcst_df_2.sort_values(sort_cols).reset_index(drop=True),
    )

In [None]:
#| hide
def test_forecast_dataframe(df: fugue.AnyDataFrame):
    """Run the cross-validation and forecast checks on `df`."""
    cv_cases = (
        dict(n_windows=2, step_size=1),
        dict(n_windows=3, step_size=None, horizon=1),
        dict(model='timegpt-1-long-horizon', horizon=1),
    )
    for cv_kwargs in cv_cases:
        test_cv_same_results_num_partitions(df, **cv_kwargs)
    test_forecast_diff_results_diff_models(df)
    # first without intervals, then with two levels
    for extra_kwargs in (dict(), dict(level=[90, 80])):
        test_forecast(df, num_partitions=1, **extra_kwargs)
    test_forecast_same_results_num_partitions(df)

In [None]:
#| hide
def test_forecast_dataframe_diff_cols(df: fugue.AnyDataFrame, id_col: str = 'id_col', time_col: str = 'time_col'):
    """Run the forecast checks on `df` using custom id/time column names."""
    col_kwargs = dict(id_col=id_col, time_col=time_col)
    # first without intervals, then with two levels
    for extra_kwargs in (dict(), dict(level=[90, 80])):
        test_forecast(df, num_partitions=1, **col_kwargs, **extra_kwargs)
    test_forecast_same_results_num_partitions(df, **col_kwargs)

In [None]:
#| hide
def test_forecast_x(
        df: fugue.AnyDataFrame, 
        X_df: fugue.AnyDataFrame,
        horizon: int = 24,
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Check a distributed forecast that uses exogenous variables.

    Verifies the row count (`horizon` rows per series found in `X_df`) and
    the column layout, then checks that the ``TimeGPT`` values differ from
    those of a forecast computed without `X_df`.
    """
    shared_kwargs = dict(h=horizon, id_col=id_col, time_col=time_col)

    # forecast with exogenous variables
    with_ex = fa.as_pandas(
        distributed_timegpt.forecast(df=df, X_df=X_df, **shared_kwargs, **fcst_kwargs)
    )
    n_series = fa.as_pandas(X_df)[id_col].nunique()
    test_eq(n_series * horizon, len(with_ex))

    expected_cols = [id_col, time_col, 'TimeGPT']
    if 'level' in fcst_kwargs:
        ordered_levels = sorted(fcst_kwargs['level'])
        expected_cols += [f'TimeGPT-lo-{lv}' for lv in reversed(ordered_levels)]
        expected_cols += [f'TimeGPT-hi-{lv}' for lv in ordered_levels]
    test_eq(with_ex.columns.to_list(), expected_cols)

    # forecast without exogenous variables
    without_ex = fa.as_pandas(
        distributed_timegpt.forecast(df=df, **shared_kwargs, **fcst_kwargs)
    )
    sort_cols = [id_col, time_col]
    values_with_ex = with_ex.sort_values(sort_cols)['TimeGPT'].values
    values_without_ex = without_ex.sort_values(sort_cols)['TimeGPT'].values
    equal_arrays = np.array_equal(values_with_ex, values_without_ex)
    assert not equal_arrays, 'Forecasts with and without ex vars are equal'

In [None]:
#| hide
def test_forecast_x_same_results_num_partitions(
        df: fugue.AnyDataFrame, 
        X_df: fugue.AnyDataFrame,
        horizon: int = 24, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **fcst_kwargs,
    ):
    """Compare a forecast with `X_df` against one computed without it.

    The first forecast uses `X_df` and one partition, the second one uses no
    exogenous variables and two partitions; their ``TimeGPT`` values must
    differ.

    NOTE(review): the function name suggests both calls should receive
    `X_df` and produce equal results — confirm whether omitting `X_df` in
    the second call is intended.
    """
    common_kwargs = dict(h=horizon, id_col=id_col, time_col=time_col)
    sort_cols = [id_col, time_col]

    fcst_with_ex = fa.as_pandas(
        distributed_timegpt.forecast(
            df=df,
            X_df=X_df,
            num_partitions=1,
            **common_kwargs,
            **fcst_kwargs
        )
    )
    fcst_without_ex = fa.as_pandas(
        distributed_timegpt.forecast(
            df=df,
            num_partitions=2,
            **common_kwargs,
            **fcst_kwargs
        )
    )
    equal_arrays = np.array_equal(
        fcst_with_ex.sort_values(sort_cols)['TimeGPT'].values,
        fcst_without_ex.sort_values(sort_cols)['TimeGPT'].values
    )
    assert not equal_arrays, 'Forecasts with and without ex vars are equal'

In [None]:
#| hide
def test_forecast_x_dataframe(df: fugue.AnyDataFrame, X_df: fugue.AnyDataFrame):
    """Run the exogenous-variable forecast checks on `df` and `X_df`."""
    # first without intervals, then with two levels
    for extra_kwargs in (dict(), dict(level=[90, 80])):
        test_forecast_x(df, X_df, num_partitions=1, **extra_kwargs)
    test_forecast_x_same_results_num_partitions(df, X_df)

In [None]:
#| hide
def test_forecast_x_dataframe_diff_cols(df: fugue.AnyDataFrame, X_df: fugue.AnyDataFrame, id_col: str = 'id_col', time_col: str = 'time_col'):
    """Run the exogenous-variable checks using custom id/time column names."""
    col_kwargs = dict(id_col=id_col, time_col=time_col)
    # first without intervals, then with two levels
    for extra_kwargs in (dict(), dict(level=[90, 80])):
        test_forecast_x(df, X_df, num_partitions=1, **col_kwargs, **extra_kwargs)
    test_forecast_x_same_results_num_partitions(df, X_df, **col_kwargs)

In [None]:
#| hide
def test_anomalies(
        df: fugue.AnyDataFrame, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **anomalies_kwargs,
    ):
    """Check the series ids and column layout of distributed anomalies.

    The output must contain the same unique ids as `df` and exactly the
    columns `id_col`, `time_col` and ``anomaly``.
    """
    detected = fa.as_pandas(
        distributed_timegpt.detect_anomalies(
            df=df,
            id_col=id_col,
            time_col=time_col,
            **anomalies_kwargs,
        )
    )
    input_ids = fa.as_pandas(df)[id_col].unique()
    output_ids = detected[id_col].unique()
    test_eq(input_ids, output_ids)
    test_eq(detected.columns.to_list(), [id_col, time_col, 'anomaly'])

In [None]:
#| hide
def test_anomalies_same_results_num_partitions(
        df: fugue.AnyDataFrame, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **anomalies_kwargs,
    ):
    """Check that anomaly detection does not depend on the partition count.

    Anomalies are computed with one and with two partitions and both
    outputs, sorted by `id_col` and `time_col`, must be equal.
    """
    def _sorted_anomalies(n_parts):
        raw_anomalies = distributed_timegpt.detect_anomalies(
            df=df,
            num_partitions=n_parts,
            id_col=id_col,
            time_col=time_col,
            **anomalies_kwargs
        )
        as_pandas = fa.as_pandas(raw_anomalies)
        return as_pandas.sort_values([id_col, time_col]).reset_index(drop=True)

    single_partition = _sorted_anomalies(1)
    two_partitions = _sorted_anomalies(2)
    pd.testing.assert_frame_equal(single_partition, two_partitions)

In [None]:
#| hide
def test_anomalies_diff_results_diff_models(
        df: fugue.AnyDataFrame, 
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        **anomalies_kwargs,
    ):
    """Check that the two models flag different anomalies.

    Both runs use a single partition; comparing them with
    `assert_frame_equal` is expected to fail on the ``anomaly`` column.
    """
    anomalies_df = distributed_timegpt.detect_anomalies(
        df=df, 
        num_partitions=1,
        id_col=id_col,
        time_col=time_col,
        model='timegpt-1',
        **anomalies_kwargs
    )
    anomalies_df = fa.as_pandas(anomalies_df)
    anomalies_df_2 = distributed_timegpt.detect_anomalies(
        df=df, 
        num_partitions=1,
        id_col=id_col,
        time_col=time_col,
        model='timegpt-1-long-horizon',
        **anomalies_kwargs
    )
    anomalies_df_2 = fa.as_pandas(anomalies_df_2)
    test_fail(
        lambda: pd.testing.assert_frame_equal(
            anomalies_df.sort_values([id_col, time_col]).reset_index(drop=True),
            anomalies_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
        ),
        # the anomalies output has no `TimeGPT` column; the only value
        # column that can differ is `anomaly`
        contains='(column name="anomaly") are different',
    )


In [None]:
#| hide
def test_anomalies_dataframe(df: fugue.AnyDataFrame):
    """Run the anomaly detection checks on `df`."""
    # first with the default level, then with level=90
    for extra_kwargs in (dict(), dict(level=90)):
        test_anomalies(df, num_partitions=1, **extra_kwargs)
    test_anomalies_same_results_num_partitions(df)

In [None]:
#| hide
def test_anomalies_dataframe_diff_cols(df: fugue.AnyDataFrame, id_col: str = 'id_col', time_col: str = 'time_col'):
    """Run the anomaly detection checks using custom id/time column names."""
    col_kwargs = dict(id_col=id_col, time_col=time_col)
    # first with the default level, then with level=90
    for extra_kwargs in (dict(), dict(level=90)):
        test_anomalies(df, num_partitions=1, **col_kwargs, **extra_kwargs)
    test_anomalies_same_results_num_partitions(df, **col_kwargs)
    # @A: document behavior with exogenous variables in distributed environments.  
    #test_anomalies_same_results_num_partitions(df, id_col=id_col, time_col=time_col, date_features=True, clean_ex_first=False)

In [None]:
#| hide
def test_quantiles(df: fugue.AnyDataFrame, id_col: str = 'id_col', time_col: str = 'time_col'):
    """Check the quantile columns returned by forecast and cross-validation.

    For each method the output must contain one ``TimeGPT-q-*`` column per
    requested quantile, and within every row those columns must be
    monotonically increasing.
    """
    test_qls = list(np.arange(0.1, 1, 0.1))
    exp_q_cols = [f"TimeGPT-q-{int(q * 100)}" for q in test_qls]
    def test_method_qls(method, **kwargs):
        df_qls = method(
            df=df, 
            h=12, 
            id_col=id_col,
            time_col=time_col, 
            quantiles=test_qls,
            **kwargs
        )
        df_qls = fa.as_pandas(df_qls)
        missing_cols = [col for col in exp_q_cols if col not in df_qls.columns]
        assert not missing_cols, f'Missing quantile columns: {missing_cols}'
        # test monotonicity of quantiles: only the quantile columns are
        # compared (ids and timestamps are not comparable with floats) and
        # the result is actually asserted
        monotonic_rows = df_qls[exp_q_cols].apply(
            lambda row: row.is_monotonic_increasing, axis=1
        )
        assert monotonic_rows.all(), 'Quantile forecasts are not monotonically increasing'
    test_method_qls(distributed_timegpt.forecast)
    test_method_qls(distributed_timegpt.forecast, add_history=True)
    test_method_qls(distributed_timegpt.cross_validation)

In [None]:
#| hide
# client shared by every test helper in this notebook (they read it as a
# global); it is created with the default arguments, i.e. no explicit token
distributed_timegpt = _DistributedTimeGPT()

In [None]:
#| hide
# size of the synthetic dataset used by the engine-specific cells below
n_series = 4
horizon = 7

# synthetic series; ids are cast to str to match the `string` type that the
# distributed output schemas declare for the id column
series = generate_series(n_series, min_length=100)
series['unique_id'] = series['unique_id'].astype(str)

# same data with non-default id/time column names
series_diff_cols = series.copy()
renamer = {'unique_id': 'id_col', 'ds': 'time_col'}
series_diff_cols = series_diff_cols.rename(columns=renamer)

In [None]:
#| hide
# data for exogenous tests
# both files are downloaded on every run, so this cell needs network access;
# `df_x` is used as the history and `future_ex_vars_df` as `X_df` below
df_x = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-with-ex-vars.csv')
future_ex_vars_df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-future-ex-vars.csv')

## Spark

In [None]:
#| hide
from pyspark.sql import SparkSession

In [None]:
#| hide
# Spark session and Spark versions of the synthetic series
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(series)
spark_diff_cols_df = spark.createDataFrame(series_diff_cols)

In [None]:
#| hide
# quantile checks on Spark; column names are passed explicitly because the
# defaults of `test_quantiles` are the renamed columns
test_quantiles(spark_df, id_col="unique_id", time_col="ds")

In [None]:
#| hide
# forecast and anomaly checks on Spark, with default and renamed columns
test_forecast_dataframe(spark_df)
test_forecast_dataframe_diff_cols(spark_diff_cols_df)
test_anomalies_dataframe(spark_df)
test_anomalies_dataframe_diff_cols(spark_diff_cols_df)

In [None]:
#| hide
# test exogenous variables
# history and future exogenous values are both converted to Spark so that
# they resolve to the same engine
spark_df_x = spark.createDataFrame(df_x)
spark_future_ex_vars_df = spark.createDataFrame(future_ex_vars_df)
test_forecast_x_dataframe(spark_df_x, spark_future_ex_vars_df)

In [None]:
#| hide
# test x different cols
# same exogenous data with the id/time columns renamed through `renamer`
spark_df_x_diff_cols = spark.createDataFrame(df_x.rename(columns=renamer))
spark_future_ex_vars_df_diff_cols = spark.createDataFrame(future_ex_vars_df.rename(columns=renamer))
test_forecast_x_dataframe_diff_cols(spark_df_x_diff_cols, spark_future_ex_vars_df_diff_cols)

In [None]:
#| hide
# stop the Spark session before the Dask and Ray sections run
spark.stop()

## Dask

In [None]:
#| hide
import dask.dataframe as dd

In [None]:
#| hide
# Dask versions of the synthetic series, split into two partitions
dask_df = dd.from_pandas(series, npartitions=2)
dask_diff_cols_df = dd.from_pandas(series_diff_cols, npartitions=2)

In [None]:
#| hide
# quantile checks on Dask; column names are passed explicitly because the
# defaults of `test_quantiles` are the renamed columns
test_quantiles(dask_df, id_col="unique_id", time_col="ds")

In [None]:
#| hide
# forecast and anomaly checks on Dask, with default and renamed columns
test_forecast_dataframe(dask_df)
test_forecast_dataframe_diff_cols(dask_diff_cols_df)
test_anomalies_dataframe(dask_df)
test_anomalies_dataframe_diff_cols(dask_diff_cols_df)

In [None]:
#| hide
# test exogenous variables
# history and future exogenous values are both converted to Dask so that
# they resolve to the same engine
dask_df_x = dd.from_pandas(df_x, npartitions=2)
dask_future_ex_vars_df = dd.from_pandas(future_ex_vars_df, npartitions=2)
test_forecast_x_dataframe(dask_df_x, dask_future_ex_vars_df)

In [None]:
#| hide
# test x different cols
# same exogenous data with the id/time columns renamed through `renamer`
dask_df_x_diff_cols = dd.from_pandas(df_x.rename(columns=renamer), npartitions=2)
dask_future_ex_vars_df_diff_cols = dd.from_pandas(future_ex_vars_df.rename(columns=renamer), npartitions=2)
test_forecast_x_dataframe_diff_cols(dask_df_x_diff_cols, dask_future_ex_vars_df_diff_cols)

## Ray

In [None]:
#| hide
import ray
from ray.cluster_utils import Cluster

In [None]:
#| hide
# local Ray cluster with a 2-CPU head node
ray_cluster = Cluster(
    initialize_head=True,
    head_node_args={"num_cpus": 2}
)
# connect to that cluster; `ignore_reinit_error` lets the cell be re-run
ray.init(address=ray_cluster.address, ignore_reinit_error=True)
# add mock node to simulate a cluster
mock_node = ray_cluster.add_node(num_cpus=2)
# Ray versions of the synthetic series
ray_df = ray.data.from_pandas(series)
ray_diff_cols_df = ray.data.from_pandas(series_diff_cols)

In [None]:
#| hide
# quantile checks on Ray; hidden like the equivalent Spark and Dask cells
test_quantiles(ray_df, id_col="unique_id", time_col="ds")

In [None]:
#| hide
# forecast and anomaly checks on Ray, with default and renamed columns
test_forecast_dataframe(ray_df)
test_forecast_dataframe_diff_cols(ray_diff_cols_df)
test_anomalies_dataframe(ray_df)
test_anomalies_dataframe_diff_cols(ray_diff_cols_df)

In [None]:
#| hide
# test exogenous variables
# history and future exogenous values are both converted to Ray datasets so
# that they resolve to the same engine
ray_df_x = ray.data.from_pandas(df_x)
ray_future_ex_vars_df = ray.data.from_pandas(future_ex_vars_df)
test_forecast_x_dataframe(ray_df_x, ray_future_ex_vars_df)

In [None]:
#| hide
# test x different cols
# same exogenous data with the id/time columns renamed through `renamer`
ray_df_x_diff_cols = ray.data.from_pandas(df_x.rename(columns=renamer))
ray_future_ex_vars_df_diff_cols = ray.data.from_pandas(future_ex_vars_df.rename(columns=renamer))
test_forecast_x_dataframe_diff_cols(ray_df_x_diff_cols, ray_future_ex_vars_df_diff_cols)

In [None]:
#| hide
# disconnect the driver first, then tear down the nodes of the local test
# cluster started above so no Ray processes are left behind
ray.shutdown()
ray_cluster.shutdown()