In [0]:
import pandas as pd

import fugue_bigquery.api as fbqa
from fugue_notebook import setup
from statsforecast import StatsForecast
from statsforecast.distributed.fugue import FugueBackend
from statsforecast.models import *

# Initialise Fugue's notebook integration before any SQL cell runs.
# NOTE(review): the `%%fsql` cell magic used below presumably depends on this
# call, and is_lab=True presumably targets JupyterLab - confirm against the
# fugue_notebook documentation.
setup(is_lab=True)

In [0]:
%%fsql spark
-- Step 1: query the public Iowa liquor sales table through the bigquery
-- connection, summing bottles_sold per store, item and day for dates from
-- 2015-01-01 to 2021-12-24 (BETWEEN includes both end dates).
CONNECT bigquery SELECT
    store_number,
    item_description,
    date,
    SUM(bottles_sold) AS total_bottles_sold
FROM
  `bigquery-public-data.iowa_liquor_sales.sales`
WHERE date BETWEEN DATE("2015-01-01") AND DATE("2021-12-24")
GROUP BY store_number, item_description, date

-- Step 2: reshape to three columns named unique_id, ds and y, where unique_id
-- joins store_number and item_description with an underscore.
-- NOTE(review): this SELECT has no FROM clause; presumably FugueSQL feeds it
-- the result of the previous statement - confirm.
-- The result is persisted and yielded under the name data, which the
-- forecasting cell later reads via data.native.
SELECT 
    CONCAT(store_number, '_', item_description) AS unique_id,
    date AS ds,
    total_bottles_sold AS y

PERSIST
YIELD DATAFRAME AS data

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [0]:
# Fugue execution backend that StatsForecast is handed in the next cell.
# NOTE(review): `spark` is not defined in the visible cells - presumably the
# hosting runtime injects the SparkSession; confirm before running elsewhere.
backend = FugueBackend(
    spark,
    # Engine configuration; the key name suggests it switches Fugue's Spark
    # engine to pandas UDFs - verify against the Fugue configuration docs.
    {'fugue.spark.use_pandas_udf': True},
    persist=True,
)

In [0]:
# Models to employ. Every seasonal model uses a season length of 7, and the
# list order is kept exactly as it was.
candidate_models = [
    Naive(),
    MSTL(season_length=7, trend_forecaster=AutoETS(model='ZZN')),
    AutoETS(season_length=7),
    AutoCES(season_length=7),
    SeasonalNaive(season_length=7),
]

sf = StatsForecast(
    models=candidate_models,
    # Series frequency as a pandas offset alias, see
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    freq='D',
    # NOTE(review): presumably the model StatsForecast substitutes when one of
    # the models above fails on a series - confirm in the StatsForecast docs.
    fallback_model=Naive(),
    # Distributed backend; remove this argument to run the pipeline locally.
    backend=backend,
)

# `data` is the dataframe yielded by the %%fsql cell; `.native` is taken to be
# the underlying Spark DataFrame, since `repartition` is called on it.
# It is repartitioned into 512 partitions keyed on unique_id.
series_by_id = data.native.repartition(512, 'unique_id')

# Forecast with h=7; with freq='D' this is presumably 7 days ahead per series.
y_pred = sf.forecast(df=series_by_id, h=7)