In [0]:
#Install Dependencies
!pip install prophet
!pip install databricks-sdk --upgrade
!pip install mlflow
!pip install grpcio
!pip install grpcio-status
!pip install pandas
dbutils.library.restartPython()

In [0]:
# TODO: point catalog / schema / table at your own data and choose names for
# the model and the serving endpoint.

# Source table, read below as {catalog}.{schema}.{table}
catalog, schema, table = "johannes_oehler", "vectorlab", "forecast_data"

# Number of most-recent rows held out for testing, and number of periods to forecast
forecast_horizon = 10

# Catalog, schema and model name used to organize the model within the MLflow
# model registry (update all three to your own values)
model_catalog = "johannes_oehler"
model_schema = "vectorlab"
model_name = "prophet_forecast"

# Name of the serving endpoint
serving_endpoint_name = "forecast_joe"

In [0]:
# Load sales summed per (date, store) from the configured source table
query = (
    f"SELECT date, store, SUM(sales) as sales "
    f"FROM {catalog}.{schema}.{table} "
    f"GROUP BY date, store "
    f"ORDER BY date desc"
)
df = spark.sql(query)

# Keep a single store so the rest of the notebook works on one series
df = df.where(df["store"] == 1)

# Train/test split: the newest `forecast_horizon` rows are the test set,
# every older row is training data.
n_train_rows = df.count() - forecast_horizon
oldest_first = df.orderBy(df["date"].asc())
newest_first = df.orderBy(df["date"].desc())

train_df = oldest_first.limit(n_train_rows).orderBy(df["date"].desc())
test_df = newest_first.limit(forecast_horizon).toPandas()

# Quick look at both splits
train_df.show(5)
test_df.head(5)

In [0]:
from pyspark.sql.functions import col, lit

# Remove training rows that have no 'sales' value
cleaned_df = train_df.dropna(subset=["sales"])
cleaned_df.show(5)

# Approximate 25th / 75th percentiles of 'sales' (relative error 0.05) and the
# resulting inter-quartile range. The upper outlier bound is Q3 + 1.5 * IQR;
# the lower bound is fixed at 0 rather than derived from the IQR.
quartiles = cleaned_df.approxQuantile("sales", [0.25, 0.75], 0.05)
first_quartile, third_quartile = quartiles[0], quartiles[1]
IQR = third_quartile - first_quartile
lower_bound = 0
upper_bound = third_quartile + 1.5 * IQR

# Keep rows with lower_bound < sales <= upper_bound
above_lower = col("sales") > lit(lower_bound)
within_upper = col("sales") <= lit(upper_bound)
no_outliers_df = cleaned_df.filter(above_lower & within_upper)

# Preview the filtered DataFrame
no_outliers_df.show(5)

In [0]:
from prophet import Prophet
from pyspark.sql.functions import col, to_date

# Prophet requires at the minimum 2 columns - ds & y.
# Cast first and alias last so the output column is explicitly named "y"
# instead of relying on Spark carrying an inner alias through the cast.
train_df = (
    no_outliers_df
    .select(
        to_date(col("date")).alias("ds"),
        col("store"),
        col("sales").cast("double").alias("y"),
    )
    .orderBy(col("ds").desc())
)

# Set model parameters: 95% uncertainty interval, linear trend, additive
# daily / weekly / yearly seasonality components.
# NOTE(review): the query upstream groups by `date`, so the series looks like
# one row per day; confirm that daily_seasonality=True is intended for data at
# that granularity.
prophet_model = Prophet(
    interval_width=0.95,
    growth='linear',
    daily_seasonality=True,
    weekly_seasonality=True,
    yearly_seasonality=True,
    seasonality_mode='additive',
)

# Fit the model to the historical data (collected to pandas on the driver)
history_pd = train_df.toPandas()
prophet_model.fit(history_pd)

In [0]:
from pyspark.sql.functions import col, lit, to_date
from prophet import Prophet
import mlflow
from mlflow.pyfunc import PythonModel, log_model
from mlflow.models.signature import infer_signature
import pandas as pd
import time

In [0]:

class ProphetWrapper(PythonModel):
    """MLflow pyfunc wrapper around an already-fitted Prophet model.

    Args:
        model: Fitted Prophet model; stored on the instance.
        horizon: Number of future periods to forecast. When omitted, the
            notebook-level ``forecast_horizon`` is captured at construction
            time, so ``predict`` no longer reads a global when it is called.
    """

    def __init__(self, model, horizon=None):
        self.model = model
        # Snapshot the horizon now instead of looking up a global at predict time.
        self.horizon = forecast_horizon if horizon is None else horizon

    def predict(self, context, model_input):
        """Forecast over the training history plus ``self.horizon`` future days.

        ``context`` and ``model_input`` are accepted but not used: the forecast
        dates are generated from the fitted model itself.

        Returns:
            DataFrame with columns ``ds``, ``yhat``, ``yhat_lower``, ``yhat_upper``.
        """
        future_pd = self.model.make_future_dataframe(
            periods=self.horizon,
            freq="D",  # canonical pandas alias for calendar-day frequency
            include_history=True,
        )
        forecast_pd = self.model.predict(future_pd)
        return forecast_pd[["ds", "yhat", "yhat_lower", "yhat_upper"]]

wrapped_model = ProphetWrapper(prophet_model)

with mlflow.start_run(run_name="prophet_training") as run:
    # Input/output examples for the signature.
    # The output example is restricted to the columns the wrapper's predict()
    # returns, so the logged signature describes the model's real output
    # rather than Prophet's full prediction frame.
    input_example = history_pd.head()[["ds", "y"]]
    output_example = prophet_model.predict(input_example)[
        ["ds", "yhat", "yhat_lower", "yhat_upper"]
    ]
    signature = infer_signature(input_example, output_example)

    # Log the model artifact
    model_path = "forecasting_model"
    log_model(
        artifact_path=model_path,
        python_model=wrapped_model,
        signature=signature,
        input_example=input_example,
    )

    # Log hyperparameters / metadata in a single call
    mlflow.log_params(
        {
            "interval_width": prophet_model.interval_width,
            "growth": prophet_model.growth,
            "seasonality_mode": prophet_model.seasonality_mode,
            "daily_seasonality": prophet_model.daily_seasonality,
            "weekly_seasonality": prophet_model.weekly_seasonality,
            "yearly_seasonality": prophet_model.yearly_seasonality,
        }
    )

    # Save run_id for evaluation
    run_id = run.info.run_id
    print(f"Training run logged with run_id: {run_id}")



In [0]:
import mlflow
from mlflow.pyfunc import PythonModel, log_model
from mlflow.models.signature import infer_signature
import pandas as pd
import pickle

# Save the fitted Prophet model as a pickle file on local disk so it can be
# attached to the logged MLflow model as an artifact.
local_model_path = "/tmp/prophet_model.pkl"
with open(local_model_path, "wb") as f:
    pickle.dump(prophet_model, f)


class ProphetWrapper(PythonModel):
    """MLflow pyfunc wrapper that loads a pickled Prophet model in load_context.

    Args:
        model_path: Local path of the pickled model. Used only as a fallback
            when the loading context carries no logged artifact.
        forecast_horizon: Number of future periods to forecast.
    """

    # Key under which the pickle is registered in log_model(artifacts=...)
    ARTIFACT_KEY = "prophet_model"

    def __init__(self, model_path, forecast_horizon):
        self.model_path = model_path
        self.forecast_horizon = forecast_horizon

    def load_context(self, context):
        """Load the fitted Prophet model from the logged artifact.

        The pickle is resolved through ``context.artifacts`` so the model can
        be loaded wherever MLflow materializes its artifacts, not only on the
        machine that still has the original local file. If the context has no
        such artifact, the path given at construction is used instead.
        """
        import pickle

        logged_artifacts = getattr(context, "artifacts", None) or {}
        pickle_path = logged_artifacts.get(self.ARTIFACT_KEY, self.model_path)
        # The pickle is written by this notebook above; only load pickles
        # from a trusted source.
        with open(pickle_path, "rb") as f:
            self.model = pickle.load(f)

    def predict(self, context, model_input: pd.DataFrame) -> pd.DataFrame:
        """Forecast over the training history plus ``forecast_horizon`` future days.

        ``context`` and ``model_input`` are accepted but not used: the forecast
        dates are generated from the fitted model itself.

        Returns:
            DataFrame with columns ``ds``, ``yhat``, ``yhat_lower``, ``yhat_upper``.
        """
        future_pd = self.model.make_future_dataframe(
            periods=self.forecast_horizon,
            freq="D",  # canonical pandas alias for calendar-day frequency
            include_history=True,
        )
        forecast_pd = self.model.predict(future_pd)
        return forecast_pd[["ds", "yhat", "yhat_lower", "yhat_upper"]]


# Wrap the saved model. `forecast_horizon` comes from the configuration cell
# at the top of the notebook; it is deliberately not re-assigned here so the
# forecast length stays in sync with the train/test split.
wrapped_model = ProphetWrapper(local_model_path, forecast_horizon)

# Start MLflow run and log the model
with mlflow.start_run(run_name="prophet_training") as run:
    input_example = history_pd.head()[["ds", "y"]]
    # Restrict the output example to the columns predict() returns so the
    # signature describes the wrapper's real output.
    output_example = prophet_model.predict(input_example)[
        ["ds", "yhat", "yhat_lower", "yhat_upper"]
    ]
    signature = infer_signature(input_example, output_example)

    log_model(
        artifact_path="forecasting_model",
        python_model=wrapped_model,
        # Ship the pickle with the model; load_context reads it back via
        # context.artifacts[ProphetWrapper.ARTIFACT_KEY].
        artifacts={ProphetWrapper.ARTIFACT_KEY: local_model_path},
        signature=signature,
        input_example=input_example,
    )

    run_id = run.info.run_id
    print(f"Training run logged with run_id: {run_id}")

# Verify artifacts: print the top-level artifact paths recorded for the run
from mlflow.tracking import MlflowClient
client = MlflowClient()
artifacts = client.list_artifacts(run_id)
for a in artifacts:
    print(a.path)
