In [None]:
#|default_exp distributed.fugue

# Fugue

> Fugue backend: runs time series preprocessing and prediction per partition on a Fugue execution engine.

In [None]:
#|export
import copy
from typing import Any, Iterable, List, Optional

import cloudpickle
import pandas as pd
from fugue import transform

from mlforecast.core import (
    DateFeature,
    Differences,
    Freq,
    LagTransforms,
    Lags,
    TimeSeries,
)

In [None]:
#|exporti
def _retrieve_df(items: List[List[Any]]) -> Iterable[pd.DataFrame]:
    """Yield the unpickled second element of every two-element row in `items`.

    The first element of each row is ignored; the second is passed to
    `cloudpickle.loads` and the result is yielded as-is.
    """
    yield from (cloudpickle.loads(frame_blob) for _unused, frame_blob in items)

In [None]:
#|export
class FugueMLForecast:
    """Run `TimeSeries` preprocessing and prediction per partition via `fugue.transform`.

    Parameters
    ----------
    models
        Stored as `self.models`; the methods of this class do not read it.
    freq, lags, lag_transforms, date_features, differences, num_threads
        Forwarded positionally, in that order, to `TimeSeries`.
    engine
        Forwarded as `engine=` to every `fugue.transform` call made by this class.
    """

    # Output schema of `preprocess` when the caller does not provide one.
    # It only matches the default column names with a single `lag1` feature.
    _default_preprocess_schema = 'unique_id:string,ds:datetime,y:double,lag1:double'

    def __init__(
        self,
        models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
        engine = None,
    ):
        self.models = models
        self.base_ts = TimeSeries(
            freq, lags, lag_transforms, date_features, differences, num_threads
        )
        self.engine = engine
        # Filled in by `preprocess`; `predict` refuses to run while this is None.
        self.partition_results = None

    def _preprocess_partition(
        self,
        part: pd.DataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
    ) -> List[List[Any]]:
        """Fit a copy of `self.base_ts` on `part` and return both results pickled.

        Returns a single row `[pickled fitted TimeSeries, pickled transformed frame]`.
        """
        # NOTE(review): this is passed to fugue as a bound method, so `self` travels
        # with it if the engine serializes the function -- TODO confirm this is
        # acceptable on distributed engines.
        # Work on a deep copy so `self.base_ts` itself is never fitted.
        ts = copy.deepcopy(self.base_ts)
        transformed = ts.fit_transform(part, id_col, time_col, target_col)
        return [[cloudpickle.dumps(ts), cloudpickle.dumps(transformed)]]

    def preprocess(
        self,
        data,
        id_col: str,
        time_col: str,
        target_col: str,
        schema: Optional[str] = None,
    ):
        """Transform `data` partition by partition and return the transformed frame.

        The intermediate result (one `ts:binary,df:binary` row per call of
        `_preprocess_partition`) is kept in `self.partition_results` so that
        `predict` can reuse the fitted objects.

        Parameters
        ----------
        data
            Input passed as the first argument to `fugue.transform`.
        id_col, time_col, target_col
            Column names forwarded to `TimeSeries.fit_transform`; `id_col` is also
            the partitioning key.
        schema
            Fugue schema of the returned frame. When omitted, the previous
            hard-coded value 'unique_id:string,ds:datetime,y:double,lag1:double'
            is used; pass an explicit schema for any other set of columns.
        """
        if schema is None:
            schema = self._default_preprocess_schema
        self.partition_results = transform(
            data,
            self._preprocess_partition,
            params={'id_col': id_col, 'time_col': time_col, 'target_col': target_col},
            schema='ts:binary,df:binary',
            partition={'by': id_col, 'num': 2},
            engine=self.engine,
        )
        return transform(
            self.partition_results,
            _retrieve_df,
            schema=schema,
            engine=self.engine,
        )

    @staticmethod
    def _predict(items: List[List[Any]], model, horizon) -> Iterable[pd.DataFrame]:
        """Unpickle the first element of each row and yield its predictions.

        Static on purpose: it reads no instance state, so handing it to fugue
        does not carry the instance along with the function.
        """
        for serialized_ts, _ in items:
            ts = cloudpickle.loads(serialized_ts)
            yield ts.predict(model, horizon).reset_index()

    def predict(self, model, horizon: int, schema: Optional[str] = None):
        """Predict `horizon` steps with `model` from the partitions stored by `preprocess`.

        Parameters
        ----------
        model
            Forwarded to `TimeSeries.predict` for every stored partition.
        horizon
            Forwarded to `TimeSeries.predict`.
        schema
            Fugue schema of the returned frame. When omitted it is
            'unique_id:string,ds:datetime,<model class name>:double'.
            NOTE(review): assumes the prediction column is named after the model's
            class, as in the `DummyModel` example -- confirm against `TimeSeries.predict`.

        Raises
        ------
        RuntimeError
            If `preprocess` has not been called yet.
        """
        if self.partition_results is None:
            raise RuntimeError(
                'No preprocessed partitions available; call `preprocess` before `predict`.'
            )
        if schema is None:
            schema = f'unique_id:string,ds:datetime,{type(model).__name__}:double'
        return transform(
            self.partition_results,
            self._predict,
            schema=schema,
            params={'model': model, 'horizon': horizon},
            # Use the same engine as `preprocess` instead of fugue's default.
            engine=self.engine,
        )

In [None]:
from mlforecast.utils import generate_daily_series

In [None]:
# Build the demo data in one chain: move the index into columns and store the ids as strings.
series = (
    generate_daily_series(100)
    .reset_index()
    .assign(unique_id=lambda frame: frame['unique_id'].astype(str))
)
series.head(2)

Unnamed: 0,unique_id,ds,y
0,id_00,2000-01-01,0.49765
1,id_00,2000-01-02,1.290925


In [None]:
# Daily frequency, matching the series produced by `generate_daily_series`.
fcst = FugueMLForecast([], freq='D', lags=[1])
fcst.preprocess(series, 'unique_id', 'ds', 'y')

NativeExecutionEngine doesn't respect num_partitions 2


Unnamed: 0,unique_id,ds,y,lag1
0,id_00,2000-01-02,1.290925,0.497650
1,id_00,2000-01-03,2.207184,1.290925
2,id_00,2000-01-04,3.237349,2.207184
3,id_00,2000-01-05,4.311755,3.237349
4,id_00,2000-01-06,5.169004,4.311755
...,...,...,...,...
26898,id_99,2000-06-25,6.477150,5.246337
26899,id_99,2000-06-26,0.431850,6.477150
26900,id_99,2000-06-27,1.447339,0.431850
26901,id_99,2000-06-28,2.081776,1.447339


In [None]:
class DummyModel:
    """Toy model whose prediction is the `lag1` entry of its input."""

    def predict(self, X):
        """Return `X['lag1']` unchanged."""
        lag1_values = X['lag1']
        return lag1_values


model = DummyModel()

In [None]:
# Forecast two steps ahead from the partitions stored by the earlier `preprocess` call.
fcst.predict(model, horizon=2)

Unnamed: 0,unique_id,ds,DummyModel
0,id_00,2000-08-09 01:00:00,4.357781
1,id_00,2000-08-09 02:00:00,4.357781
2,id_01,2000-04-06 01:00:00,3.365699
3,id_01,2000-04-06 02:00:00,3.365699
4,id_02,2000-06-15 01:00:00,2.113542
...,...,...,...
195,id_97,2000-04-06 02:00:00,2.097209
196,id_98,2000-06-25 01:00:00,4.489870
197,id_98,2000-06-25 02:00:00,4.489870
198,id_99,2000-06-29 01:00:00,3.259480
