In [None]:
pip install prophet neuralforecast

In [None]:
import logging
from time import time

import pandas as pd
from neuralforecast.data.datasets.m5 import M5, M5Evaluation
from prophet import Prophet
from pyspark.sql.types import *

# only let ERROR-and-above records through the 'py4j' logger
# NOTE(review): this configures the 'py4j' logger, not a prophet logger —
# confirm py4j is the intended source of the informational noise
logging.getLogger('py4j').setLevel(logging.ERROR)

In [None]:
# schema of the training data set:
# item_id (string), timestamp (date), demand (double)
train_schema = (
    StructType()
    .add('item_id', StringType())
    .add('timestamp', DateType())
    .add('demand', DoubleType())
)
 
# read the training parquet file from S3 into a Spark dataframe
# NOTE(review): `header` is normally a CSV reader option, and it is unclear
# whether `schema` passed as a keyword is actually applied by the parquet
# reader — confirm against the PySpark version in use (the usual form is
# spark.read.schema(...).parquet(...)).
train = spark.read.parquet(
  's3://m5-benchmarks/data/train/target.parquet', 
  header=True, 
  schema=train_schema
 )
 
# register the dataframe as the temporary view 'train' so it can be queried with SQL
train.createOrReplaceTempView('train')


In [None]:
# rename the columns of the 'train' view to unique_id / ds / y,
# casting the timestamp column to a date
sql_statement = '''
  SELECT
    item_id AS unique_id,
    CAST(timestamp as date) as ds,
    demand as y
  FROM train
  '''

# run the query, repartition by series id across the default number of
# partitions, and mark the result for caching
m5_history = spark.sql(sql_statement).repartition(
    sc.defaultParallelism, ['unique_id']
).cache()

In [None]:
def forecast( history_pd: pd.DataFrame ) -> pd.DataFrame:
    """Fit a Prophet model on one series and forecast the following 28 days.

    Written to be passed to ``groupBy('unique_id').applyInPandas``, so it
    receives the rows of a single ``unique_id`` per call.

    NOTE: keep exactly one positional parameter. ``applyInPandas`` passes the
    grouping key as an additional first argument when the function declares
    two arguments, so adding a parameter here would change how it is invoked.

    Parameters
    ----------
    history_pd : pd.DataFrame
        History of one series with columns ``unique_id``, ``ds`` (date) and
        ``y`` (observed demand). Rows containing missing values are dropped
        before fitting.

    Returns
    -------
    pd.DataFrame
        Forecast rows only (no history), with columns ``unique_id``, ``ds``
        and ``yhat``.

    Raises
    ------
    Any exception raised by Prophet while fitting or predicting propagates
    unchanged; nothing is caught here.
    """
    # forecast horizon, in days
    horizon_days = 28

    # remove rows with missing values before fitting
    observed_pd = history_pd.dropna()

    # linear trend with weekly and yearly seasonality, combined multiplicatively
    model = Prophet(
        growth='linear',
        daily_seasonality=False,
        weekly_seasonality=True,
        yearly_seasonality=True,
        seasonality_mode='multiplicative'
    )
    model.fit( observed_pd )

    # build the future dates only (history excluded); 'D' is the canonical
    # pandas alias for calendar-day frequency
    future_pd = model.make_future_dataframe(
        periods=horizon_days,
        freq='D',
        include_history=False
    )
    forecast_pd = model.predict( future_pd )

    # tag every forecast row with the series id taken from the history
    forecast_pd['unique_id'] = observed_pd['unique_id'].unique()[0]

    # keep only the fields the caller's result schema expects
    return forecast_pd[['unique_id', 'ds', 'yhat']]

In [None]:
# schema of the frames returned by forecast():
# unique_id (string), ds (date), yhat (float)
result_schema = (
    StructType()
    .add('unique_id', StringType())
    .add('ds', DateType())
    .add('yhat', FloatType())
)

In [None]:
# time the full distributed run: forecast() is applied once per unique_id
# and the combined output is collected into a pandas dataframe
init = time()
results = m5_history.groupBy('unique_id').applyInPandas(
    forecast,
    schema=result_schema,
).toPandas()
end = time()
print(f'Minutes taken by Prophet on a Spark cluster: {(end - init) / 60}')


In [None]:
# reshape the long forecasts to wide form: one row per unique_id,
# one column per ds value
Y_hat = (
    results
    .set_index(['unique_id', 'ds'])
    .unstack()
    .droplevel(level=0, axis=1)  # drop the outer column level left by unstack
    .reset_index()
)

In [None]:
# keep only the last item returned by M5.load
*_, S_df = M5.load('./data')
# left-join the wide forecasts onto S_df so every row of S_df is kept
Y_hat = S_df.merge(Y_hat, on=['unique_id'], how='left')

In [None]:
# evaluate the merged forecasts with M5Evaluation, pointing it at the ./data directory
wrmsse = M5Evaluation.evaluate(directory='./data', y_hat=Y_hat)

In [None]:
# display the evaluation result (the wrmsse table rendered below)
wrmsse

Unnamed: 0,wrmsse
Total,0.7718
Level1,0.507905
Level2,0.586328
Level3,0.666686
Level4,0.549358
Level5,0.655003
Level6,0.647176
Level7,0.747047
Level8,0.743422
Level9,0.824667
