In [0]:
import mlflow
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import time

In [0]:
# Unity Catalog location (catalog.schema.table) of the sales data queried below.
catalog, schema, table = "johannes_oehler", "vectorlab", "forecast_data"

# Number of most recent rows held out as the evaluation window.
forecast_horizon = 10

In [0]:
# NOTE: `forecast_horizon` is defined once, in the data-location config cell
# above; it is intentionally not repeated here so the two cannot diverge.

# MLflow training run info (from the training notebook).
run_id = "ce305aba0b684d2f9f568e036bdf4098"
model_path = "forecasting_model"  # artifact path the model was logged under

# Unity Catalog coordinates used when registering the model.
model_catalog = "johannes_oehler"
model_schema = "vectorlab"
model_name = "prophet_forecasting"
# Fully qualified three-level name: <catalog>.<schema>.<model>.
model_name_uc = f"{model_catalog}.{model_schema}.{model_name}"

# Serving endpoint (optional).
serving_endpoint_name = "forecast_joe"

In [0]:
# Load the model artifact logged under the training run as a generic MLflow pyfunc model.
loaded_model = mlflow.pyfunc.load_model(
    model_uri=f"runs:/{run_id}/{model_path}",
)

In [0]:
# Daily sales per store, aggregated from the configured Unity Catalog table.
query = f"SELECT date, store, SUM(sales) as sales FROM {catalog}.{schema}.{table} GROUP BY date, store ORDER BY date desc"
df = spark.sql(query)

# Restrict to a single store (store 1) to keep the calculations simple.
df = df.where(df["store"] == 1)

# Hold-out set: the `forecast_horizon` most recent rows, newest first.
test_df = df.sort(df["date"].desc()).limit(forecast_horizon)

In [0]:
from pyspark.sql.functions import col, lit

# Drop rows whose 'sales' value is missing.
cleaned_df = test_df.na.drop(subset=["sales"])

# Approximate 1st and 3rd quartiles of 'sales' (relative error 0.05).
quartiles = cleaned_df.approxQuantile("sales", [0.25, 0.75], 0.05)

# approxQuantile returns an empty list when the column has no values to
# summarise; fail with an explicit message instead of an IndexError below.
if len(quartiles) < 2:
    raise ValueError(
        "Cannot compute outlier bounds: the test set contains no rows with a non-null 'sales' value."
    )

# Inter-quartile range and the resulting outlier bounds.
IQR = quartiles[1] - quartiles[0]
# Fixed floor rather than Q1 - 1.5 * IQR; together with the strict '>' in the
# filter below this also removes rows with zero or negative sales.
lower_bound = 0
upper_bound = quartiles[1] + 1.5 * IQR

# Keep only rows with lower_bound < sales <= upper_bound.
no_outliers_df = cleaned_df.filter(
    (col("sales") > lit(lower_bound))
    & (col("sales") <= lit(upper_bound))
)

# Preview the filtered DataFrame.
no_outliers_df.show(5)

In [0]:
from prophet import Prophet
from pyspark.sql.functions import col, to_date

# Reshape the filtered test rows: 'date' becomes a date column named 'ds',
# 'sales' becomes a double column named 'y', 'store' is kept; newest first.
test_df = (
    no_outliers_df
    .select(
        to_date(col("date")).alias("ds"),
        col("store"),
        col("sales").cast("double").alias("y"),
    )
    .sort(col("ds").desc())
)

In [0]:
# Run the loaded pyfunc model over the prepared test frame to get its predictions.
forecast = loaded_model.predict(data=test_df)

In [0]:
# Collect the actuals to pandas and parse 'ds' as datetime so it can serve as join key.
test_df_pd = (
    test_df.select("ds", "y")
    .toPandas()
    .assign(ds=lambda frame: pd.to_datetime(frame["ds"]))
)

# Parse the forecast's 'ds' the same way so both join keys share one dtype.
forecast["ds"] = pd.to_datetime(forecast["ds"])

# Left join keeps every actual row; a date without a forecast gets NaN in 'yhat'.
eval_df = pd.merge(test_df_pd, forecast[["ds", "yhat"]], how="left", on="ds")

In [0]:
# eval_df is built with a left merge, so a test date without a matching
# forecast carries NaN in 'yhat'. scikit-learn's metric functions raise on NaN
# input, so score only the rows that have both an actual and a prediction.
scored_df = eval_df.dropna(subset=["y", "yhat"])
n_dropped = len(eval_df) - len(scored_df)

if scored_df.empty:
    raise ValueError(
        "No test rows have both an actual value and a prediction; cannot compute metrics."
    )
if n_dropped:
    # Make the exclusion visible: the metrics below cover fewer rows than the test set.
    print(
        f"Warning: {n_dropped} of {len(eval_df)} test rows lacked an actual value "
        "or a matching forecast and were excluded from the metrics."
    )

y_true = scored_df["y"]
y_pred = scored_df["yhat"]

# Mean absolute error and root mean squared error of the forecast.
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

In [0]:
# Evaluate a registered Unity Catalog model version with MLflow's built-in
# regressor evaluation. `mlflow` and `pd` are imported in the first cell.

# Reuse the fully qualified name from the configuration cell instead of
# repeating it as a literal, so this cell cannot drift from the config.
model_name = model_name_uc
model_version = "1"  # or whatever version you just registered
model_uri = f"models:/{model_name}/{model_version}"

# Collect the test set to pandas with the dtypes used for evaluation:
# 'ds' as datetime, 'y' (the target) as float.
test_df_pd = test_df.select("ds", "y").toPandas()
test_df_pd["ds"] = pd.to_datetime(test_df_pd["ds"])
test_df_pd["y"] = test_df_pd["y"].astype(float)

# Left as the cell's last expression so the evaluation result is displayed.
mlflow.models.evaluate(
    model=model_uri,
    data=test_df_pd,
    targets="y",
    model_type="regressor"
)

In [0]:
# Register the model logged by the training run in Unity Catalog.
# `run_id`, `model_path` and `model_name_uc` are taken from the configuration
# cell rather than re-declared here, so the registered artifact is always the
# same one that was loaded earlier in the notebook.
model_version = mlflow.register_model(
    f"runs:/{run_id}/{model_path}",  # MLflow run path to the model
    model_name_uc,
    # Block until the new version reports READY (up to 300 s) instead of
    # sleeping for a fixed, arbitrary interval afterwards.
    await_registration_for=300,
)

print(f"Model registered in Unity Catalog as {model_name_uc}, version {model_version.version}")
