# Config

In [1]:
import numpy as np
import pandas as pd
import json
from pyspark.sql import SparkSession
from prophet import Prophet, serialize
from prophet.diagnostics import cross_validation, performance_metrics
import mlflow
from mlflow.models import infer_signature

# Path of the input CSV that Spark reads in the ingest step.
DATASOURCE = "/data/ts-spark_ch1_ds2.csv"
# Artifact sub-path used when logging the model to the MLflow run and when
# resolving the model URI afterwards.
ARTIFACT_DIR = "model"
# Seed NumPy's global random number generator.
np.random.seed(20244)

Importing plotly failed. Interactive plots will not work.


# DataOps

## Ingest data from source

In [2]:
# Get (or create) the Spark session bound to the master at spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ts-spark_ch4_data-ml-ops_time_series_prophet_notebook")
    .getOrCreate()
)

# Read the CSV with its first line as the header and let Spark infer the
# column types from the data.
sdf = spark.read.csv(DATASOURCE, header=True, inferSchema=True)

## Transform data

In [3]:
# Bring the two columns of interest to the driver as a pandas frame and rename
# them to the `ds` / `y` column names that Prophet's fit() works with.
pdf = sdf.select("date", "daily_min_temperature").toPandas()
pdf.columns = ["ds", "y"]

# Values that cannot be parsed as numbers become NaN so they can be dropped.
pdf["y"] = pd.to_numeric(pdf["y"], errors="coerce")

# Discard the last two rows (presumably trailing non-data lines in the source
# file -- TODO confirm), then remove rows with missing values.
# The result is assigned back to `pdf`: DataFrame.dropna() returns a new frame,
# so calling it without assignment only affects what the cell displays and
# leaves the NaN rows in the frame handed to later cells.
pdf = pdf.drop(index=pdf.index[-2:]).dropna()

# Last expression of the cell: show the cleaned frame.
pdf

Unnamed: 0,ds,y
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8
...,...,...
3643,1990-12-25,12.9
3644,1990-12-26,14.6
3645,1990-12-27,14.0
3646,1990-12-28,13.6


# ModelOps

## Train and log model

In [4]:
# Point the MLflow client at the tracking server and select the experiment
# that this notebook's runs are recorded under.
mlflow.set_tracking_uri("http://mlflow-server:5000")
mlflow.set_experiment('ts-spark_ch4_data-ml-ops_time_series_prophet_notebook')

with mlflow.start_run():
    # Fit a Prophet model with default settings on the prepared frame.
    model = Prophet()
    model.fit(pdf)

    # Snapshot the model's simple attributes; these are logged as run
    # parameters and printed as JSON below.
    param = dict(
        (attr, getattr(model, attr)) for attr in serialize.SIMPLE_ATTRIBUTES
    )

    # Time-series cross-validation: 700 days of initial training data, a new
    # cutoff every 30 days, each evaluated over a 90-day horizon.
    cv_metrics_name = ["mse", "rmse", "mae", "mdape", "smape", "coverage"]
    cv_params = cross_validation(
        model=model,
        initial="700 days",
        period="30 days",
        horizon="90 days",
        parallel="threads",
        disable_tqdm=True,
    )
    # Reduce each selected metric column to its mean to get one scalar per
    # metric for logging.
    _cv_metrics = performance_metrics(cv_params)
    cv_metrics = dict((n, _cv_metrics[n].mean()) for n in cv_metrics_name)

    # Infer the model signature from the training history (input) and an
    # in-sample plus 30-period forecast (output).
    train = model.history
    predictions = model.predict(model.make_future_dataframe(periods=30))
    signature = infer_signature(train, predictions)

    # Record the model, its parameters and the CV metrics on the active run.
    mlflow.prophet.log_model(
        model,
        artifact_path=ARTIFACT_DIR,
        signature=signature,
    )
    mlflow.log_params(param)
    mlflow.log_metrics(cv_metrics)
    # URI of the logged model artifact; the forecasting cell loads from it.
    model_uri = mlflow.get_artifact_uri(ARTIFACT_DIR)

    print(f"CV params: \n{json.dumps(param, indent=2)}")
    print(f"CV metrics: \n{json.dumps(cv_metrics, indent=2)}")
    print(f"Model URI: {model_uri}")

2024/04/30 18:49:50 INFO mlflow.tracking.fluent: Experiment with name 'ts-spark_ch4_data-ml-ops_time_series_prophet_notebook' does not exist. Creating a new experiment.
18:49:51 - cmdstanpy - INFO - Chain [1] start processing
18:49:51 - cmdstanpy - INFO - Chain [1] done processing
18:49:53 - cmdstanpy - INFO - Chain [1] start processing
18:49:53 - cmdstanpy - INFO - Chain [1] start processing
18:49:53 - cmdstanpy - INFO - Chain [1] start processing
18:49:53 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] done processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] start processing
18:49:54 - cmdstanpy - INFO - Chain [1] done processing
18:49:54 - cmdstanpy - INFO - Chain 

CV params: 
{
  "growth": "linear",
  "n_changepoints": 25,
  "specified_changepoints": false,
  "changepoint_range": 0.8,
  "yearly_seasonality": "auto",
  "weekly_seasonality": "auto",
  "daily_seasonality": "auto",
  "seasonality_mode": "additive",
  "seasonality_prior_scale": 10.0,
  "changepoint_prior_scale": 0.05,
  "holidays_prior_scale": 10.0,
  "mcmc_samples": 0,
  "interval_width": 0.8,
  "uncertainty_samples": 1000,
  "y_scale": 26.3,
  "y_min": 0.0,
  "scaling": "absmax",
  "logistic_floor": false,
  "country_holidays": null,
  "component_modes": {
    "additive": [
      "yearly",
      "weekly",
      "additive_terms",
      "extra_regressors_additive",
      "holidays"
    ],
    "multiplicative": [
      "multiplicative_terms",
      "extra_regressors_multiplicative"
    ]
  },
  "holidays_mode": "additive"
}
CV metrics: 
{
  "mse": 7.933370571149654,
  "rmse": 2.816072480474402,
  "mae": 2.2336426897094794,
  "mdape": 0.1743597502078967,
  "smape": 0.22119437920557894,

## Forecast with model

In [5]:
# Load the model back from the artifact URI recorded by the training run, so
# the forecast exercises the logged artifact rather than the in-memory object.
_model = mlflow.prophet.load_model(model_uri)

# Predict over the history plus 30 further periods.
forecast = _model.predict(_model.make_future_dataframe(30))
# Python f-strings interpolate with "{...}"; a leading "$" would be printed
# literally in front of the table.
print(f"forecast:\n{forecast.head(10)}")

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

forecast:
$          ds      trend  yhat_lower  yhat_upper  trend_lower  trend_upper  \
0 1981-01-01  11.839497   11.874776   18.524958    11.839497    11.839497   
1 1981-01-02  11.837740   11.601939   18.410193    11.837740    11.837740   
2 1981-01-03  11.835982   12.056064   18.841682    11.835982    11.835982   
3 1981-01-04  11.834224   11.864106   18.528551    11.834224    11.834224   
4 1981-01-05  11.832467   12.165678   18.753201    11.832467    11.832467   
5 1981-01-06  11.830709   12.371939   19.258229    11.830709    11.830709   
6 1981-01-07  11.828951   12.106755   19.083532    11.828951    11.828951   
7 1981-01-08  11.827194   12.319995   19.202247    11.827194    11.827194   
8 1981-01-09  11.825436   12.524928   19.314604    11.825436    11.825436   
9 1981-01-10  11.823678   12.231800   19.161781    11.823678    11.823678   

   additive_terms  additive_terms_lower  additive_terms_upper    weekly  \
0        3.317405              3.317405              3.317405  0.0