In [1]:
# Welcome to your new notebook
# Type here in the cell editor to add code!
from pyspark.sql import functions as F


StatementMeta(, d6872138-7d89-4d61-a782-6e1be21637d2, 3, Finished, Available, Finished)

In [17]:
# Full silver-layer orders staging table (all columns).
orders_table_name = "stg_silver_orders"
df_silver_orders = spark.table(orders_table_name)
# Preview while developing: display(df_silver_orders)

StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 22, Finished, Available, Finished)

In [18]:
# Orders reduced to the join key and the order time.
order_columns = ["order_id", "timestamp"]
df_silver_orders_select = spark.table("stg_silver_orders").select(*order_columns)
# Preview while developing: display(df_silver_orders_select)

StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 23, Finished, Available, Finished)

In [19]:
# Per-order item quantities: join key plus one quantity column per product.
item_columns = [
    "order_id",
    "fried_chicken_qty",
    "fries_qty",
    "drink_qty",
    "ice_cream_qty",
]
df_silver_items_select = spark.table("stg_silver_items").select(*item_columns)
# Preview while developing: display(df_silver_items_select)

StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 24, Finished, Available, Finished)

In [20]:
# Per-order weather readings: join key, two numeric readings and the condition text.
weather_columns = ["order_id", "temperature", "humidity", "condition"]
df_silver_weather_select = spark.table("stg_silver_weather").select(*weather_columns)
# Preview while developing: display(df_silver_weather_select)
# List the distinct condition values:
# df_silver_weather_select.select("condition").distinct().show(truncate=False)

StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 25, Finished, Available, Finished)

In [21]:

# Attach item quantities and weather readings to every order (left joins on order_id).
orders_with_items = df_silver_orders_select.join(
    df_silver_items_select, on="order_id", how="left"
)
df_joined = orders_with_items.join(df_silver_weather_select, on="order_id", how="left")

# Numeric encoding of the weather condition; any other value maps to 0.
condition_codes = [("rainy", 1), ("snowy", 2), ("sunny", 3), ("cloudy", 4)]
condition_expr = None
for condition_name, condition_value in condition_codes:
    matches = F.col("condition") == condition_name
    if condition_expr is None:
        condition_expr = F.when(matches, condition_value)
    else:
        condition_expr = condition_expr.when(matches, condition_value)

# Drop every row that has a null in any column, then add the encoded condition.
df_clean = df_joined.na.drop().withColumn("condition_code", condition_expr.otherwise(0))

# df_clean.columns
# display(df_clean)


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 26, Finished, Available, Finished)

In [22]:
from pyspark.sql import functions as F

# Aggregate the cleaned orders into 10-minute tumbling buckets.
# NOTE: the following cell rebuilds df_bucketed with the same aggregation plus a
# qty_10m total, so this version is only useful for inspecting the raw buckets.
df_bucketed = (
    df_clean
    .withColumn("ts", F.col("timestamp").cast("timestamp"))
    .groupBy(F.window("ts", "10 minutes").alias("w"))
    .agg(
        F.sum("fried_chicken_qty").alias("fried_chicken_sum"),
        F.sum("fries_qty").alias("fries_sum"),
        F.sum("drink_qty").alias("drink_sum"),
        F.sum("ice_cream_qty").alias("ice_cream_sum"),
        F.avg("temperature").alias("temperature_avg"),
        F.avg("humidity").alias("humidity_avg"),
        # F.first() depends on row order, which is not stable after the groupBy
        # shuffle. Taking the minimum of (ts, condition_code) deterministically
        # selects the condition of the earliest order in the bucket.
        F.min(F.struct("ts", "condition_code"))
         .getField("condition_code")
         .alias("condition_code")
    )
    # bucket start time
    .withColumn("t10", F.col("w.start"))
    .drop("w")
    # bucket_time HH:MM
    .withColumn("bucket_time", F.date_format(F.col("t10"), "HH:mm"))
    # week day (1=sunday ... 7=saturday)
    .withColumn("day_of_week", F.dayofweek("t10"))
    .orderBy("t10")
)

# df_bucketed.show(20, truncate=False)


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 27, Finished, Available, Finished)

In [23]:
from pyspark.sql import functions as F, Window as W

# Aggregate the cleaned orders into 10-minute tumbling buckets and total the sales.
df_bucketed = (
    df_clean
    .withColumn("ts", F.col("timestamp").cast("timestamp"))
    .groupBy(F.window("ts", "10 minutes").alias("w"))
    .agg(
        F.sum("fried_chicken_qty").alias("fried_chicken_sum"),
        F.sum("fries_qty").alias("fries_sum"),
        F.sum("drink_qty").alias("drink_sum"),
        F.sum("ice_cream_qty").alias("ice_cream_sum"),
        F.avg("temperature").alias("temperature_avg"),
        F.avg("humidity").alias("humidity_avg"),
        # F.first() depends on row order, which is not stable after the groupBy
        # shuffle. Taking the minimum of (ts, condition_code) deterministically
        # selects the condition of the earliest order in the bucket.
        F.min(F.struct("ts", "condition_code"))
         .getField("condition_code")
         .alias("condition_code")
    )
    .withColumn("t10", F.col("w.start"))
    .drop("w")
    .withColumn("bucket_time", F.date_format(F.col("t10"), "HH:mm"))
    .withColumn("day_of_week", F.dayofweek("t10"))
    # total items sold in the bucket
    .withColumn("qty_10m",
        F.col("fried_chicken_sum") +
        F.col("fries_sum") +
        F.col("drink_sum") +
        F.col("ice_cream_sum")
    )
)

# Label: sales of the bucket that starts exactly 10 minutes later.
# The series is a single global one, so the window has no partitionBy (Spark will
# move all buckets to one partition, which is fine for this small table).
w = W.orderBy("t10")
next_t10 = F.lead("t10", 1).over(w)
next_qty = F.lead("qty_10m", 1).over(w)

# lead() works on row position, not on time: when a 10-minute bucket is missing
# the "next" row can lie arbitrarily later. Keep the label only when the next row
# really is the following bucket; otherwise leave it NULL so downstream
# `qty_next_10m IS NOT NULL` filters drop the row instead of training on a wrong label.
follows_directly = next_t10 == (F.col("t10") + F.expr("INTERVAL 10 MINUTES"))

df_labeled = (
    df_bucketed
    .withColumn("qty_next_10m", F.when(follows_directly, next_qty))
    .orderBy("t10")
)

# df_labeled.show(20, truncate=False)


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 28, Finished, Available, Finished)

In [13]:
from pyspark.sql import functions as F, Window as W
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow

# === 1) Feature engineering
# Single global time series, hence a window ordered by bucket start without partitionBy.
w = W.orderBy("t10")
df2 = (df_labeled
       .withColumn("hour", F.hour("t10"))
       .withColumn("day_of_week", F.dayofweek("t10"))
       .withColumn("lag1", F.lag("qty_10m", 1).over(w))
       .na.fill({
           "temperature_avg": 0.0,
           "humidity_avg": 0.0,
           "condition_code": 0,
           "qty_10m": 0,
           "lag1": 0
       })
       .where(F.col("qty_next_10m").isNotNull())   # drop buckets without a label
)

# === 2) Time-based split: the most recent TEST_DAYS days are held out for testing
TEST_DAYS = 7
max_t = df2.select(F.max("t10")).first()[0]
# date_sub yields a DATE, so the test set starts at midnight TEST_DAYS days before max_t.
test_from = F.date_sub(F.lit(max_t), TEST_DAYS)
train = df2.where(F.col("t10") <  test_from)
test  = df2.where(F.col("t10") >= test_from)

# === 3) Feature vector
features = [
    "hour","day_of_week",
    "temperature_avg","humidity_avg","condition_code",
    "qty_10m","lag1"
]
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="keep")

train_v = (assembler.transform(train)
           .select("features", F.col("qty_next_10m").cast("double").alias("label"))
           .na.drop(subset=["label"]))

test_v  = (assembler.transform(test)
           .select("features", F.col("qty_next_10m").cast("double").alias("label"))
           .na.drop(subset=["label"]))

# Fail early with a clear message if either side of the split is empty.
# take(1) fetches at most one row and avoids converting the DataFrame to an RDD.
if not test_v.take(1):
    raise ValueError("The test dataset is empty after filtering NULL values. Please increase the data range or adjust the train/test split.")
# With less than TEST_DAYS days of history every row lands in the test set and
# fitting would otherwise fail on an empty training set.
if not train_v.take(1):
    raise ValueError(
        f"The training dataset is empty: the data covers less than the {TEST_DAYS}-day "
        "test window. Please increase the data range or adjust the train/test split."
    )

# === 4) Training + evaluation
mlflow.set_experiment("eh_10m_forecast_simple")
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

with mlflow.start_run(run_name="LR_safe"):
    model = LinearRegression(featuresCol="features", labelCol="label", maxIter=100, regParam=0.1).fit(train_v)
    pred  = model.transform(test_v)

    # Score only rows with a usable (non-null, non-NaN) prediction.
    pred = pred.filter(F.col("prediction").isNotNull())
    pred = pred.filter(~F.isnan("prediction"))

    rmse = evaluator.evaluate(pred)
    mae  = pred.select(F.avg(F.abs(F.col("label") - F.col("prediction")))).first()[0]

    mlflow.log_metric("rmse", float(rmse))
    mlflow.log_metric("mae",  float(mae))
    mlflow.spark.log_model(model, "model")

    print(f"LR OK. RMSE={rmse:.3f}, MAE={mae:.3f}")


StatementMeta(, d6872138-7d89-4d61-a782-6e1be21637d2, 15, Finished, Available, Finished)

LR OK. RMSE=57.640, MAE=43.173


In [14]:
from pyspark.ml.regression import GBTRegressor
# Gradient-boosted trees on the train/test vectors built for the linear baseline,
# logged as a separate MLflow run.
with mlflow.start_run(run_name="GBT_safe"):
    gbt = GBTRegressor(
        featuresCol="features",
        labelCol="label",
        maxDepth=5,
        maxIter=200,
        stepSize=0.05,
    )
    model = gbt.fit(train_v)

    # Keep only rows with a usable (non-null, non-NaN) prediction.
    scored = model.transform(test_v)
    pred = scored.filter(F.col("prediction").isNotNull()).filter(~F.isnan("prediction"))

    rmse = evaluator.evaluate(pred)
    abs_error = F.abs(F.col("label") - F.col("prediction"))
    mae = pred.select(F.avg(abs_error)).first()[0]

    mlflow.log_metric("rmse", float(rmse))
    mlflow.log_metric("mae", float(mae))
    mlflow.spark.log_model(model, "model")


StatementMeta(, d6872138-7d89-4d61-a782-6e1be21637d2, 16, Finished, Available, Finished)



# A fully numeric feature engineering pipeline with cyclic time encoding (sin/cos), lags, and rolling averages — a mapping better suited for machine learning models.

In [24]:
from pyspark.sql import functions as F, Window as W
import math

# Window over the whole series ordered by bucket start, plus the frame of the
# three preceding buckets used for the rolling mean.
w = W.orderBy("t10")
previous_three = w.rowsBetween(-3, -1)
two_pi = 2*math.pi

# --- time features: raw hour / weekday (1..7) and their sin/cos encoding ---
with_time = (
    df_labeled
    .withColumn("hour", F.hour("t10").cast("int"))
    .withColumn("day_of_week", F.dayofweek("t10").cast("int"))
)
hour_angle = two_pi*F.col("hour")/24.0
dow_angle = two_pi*F.col("day_of_week")/7.0
with_cyclic = (
    with_time
    .withColumn("hour_sin", F.sin(hour_angle))
    .withColumn("hour_cos", F.cos(hour_angle))
    .withColumn("dow_sin", F.sin(dow_angle))
    .withColumn("dow_cos", F.cos(dow_angle))
)

# --- numeric weather: explicit types, plus a rain flag (condition_code 1 = rainy) ---
with_weather = with_cyclic
for weather_col, weather_type in [
    ("temperature_avg", "double"),
    ("humidity_avg", "double"),
    ("condition_code", "int"),
]:
    with_weather = with_weather.withColumn(weather_col, F.col(weather_col).cast(weather_type))
with_weather = with_weather.withColumn(
    "is_rain", (F.col("condition_code") == F.lit(1)).cast("int")
)

# --- sales: current bucket, two lags, mean of the previous three buckets, delta ---
with_sales = (
    with_weather
    .withColumn("qty_10m", F.col("qty_10m").cast("double"))
    .withColumn("lag1", F.lag("qty_10m", 1).over(w))
    .withColumn("lag2", F.lag("qty_10m", 2).over(w))
    .withColumn("roll_mean_3", F.avg("qty_10m").over(previous_three))
    .withColumn("change", F.col("qty_10m") - F.col("lag1"))
)

# --- label as double, nulls in features replaced by zero, unlabeled rows removed ---
zero_fill = {
    "temperature_avg": 0.0,
    "humidity_avg": 0.0,
    "condition_code": 0,
    "is_rain": 0,
    "qty_10m": 0.0,
    "lag1": 0.0,
    "lag2": 0.0,
    "roll_mean_3": 0.0,
    "change": 0.0
}
df_num = (
    with_sales
    .withColumn("qty_next_10m", F.col("qty_next_10m").cast("double"))
    .na.fill(zero_fill)
    .where(F.col("qty_next_10m").isNotNull())
)

# Model inputs (all numeric); the label column is appended in the select below.
feature_cols = [
    "hour_sin","hour_cos","dow_sin","dow_cos",
    "temperature_avg","humidity_avg","condition_code","is_rain",
    "qty_10m","lag1","lag2","roll_mean_3","change"
]

df_train_num = df_num.select(*feature_cols, "qty_next_10m")

df_train_num.show(10, truncate=False)

# df_train_num.write.mode("overwrite").saveAsTable("tbl_ml_train_10m_numeric")


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 29, Finished, Available, Finished)

+-------------------+---------------------+-------------------+-------------------+------------------+------------------+--------------+-------+-------+------+------+------------------+------+------------+
|hour_sin           |hour_cos             |dow_sin            |dow_cos            |temperature_avg   |humidity_avg      |condition_code|is_rain|qty_10m|lag1  |lag2  |roll_mean_3       |change|qty_next_10m|
+-------------------+---------------------+-------------------+-------------------+------------------+------------------+--------------+-------+-------+------+------+------------------+------+------------+
|-0.2588190451025208|-0.9659258262890683  |0.43388373911755823|-0.900968867902419 |18.753846153846144|47.72307692307693 |4             |0      |1048.0 |0.0   |0.0   |0.0               |0.0   |1240.0      |
|-0.2588190451025208|-0.9659258262890683  |0.43388373911755823|-0.900968867902419 |18.746666666666673|46.459999999999965|3             |0      |1240.0 |1048.0|0.0   |1048.0    

# The transformation above represents the final modification of the input data for the model. Below, the model is trained on the entire dataset based on this prepared feature set.

In [11]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import mlflow

# Same feature list as the columns selected into df_train_num.
time_features = ["hour_sin", "hour_cos", "dow_sin", "dow_cos"]
weather_features = ["temperature_avg", "humidity_avg", "condition_code", "is_rain"]
sales_features = ["qty_10m", "lag1", "lag2", "roll_mean_3", "change"]
feature_cols = time_features + weather_features + sales_features

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")

# Assemble the feature vector and expose the target as a double "label";
# rows whose label is null are dropped.
label_col = F.col("qty_next_10m").cast("double").alias("label")
assembled = assembler.transform(df_train_num)
all_v = assembled.select("features", label_col).dropna(subset=["label"])

# Fit linear regression on the full history and log the model to MLflow.
mlflow.set_experiment("eh_10m_forecast_simple")

with mlflow.start_run(run_name="LR_refit_numeric_all"):
    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        maxIter=100,
        regParam=0.1,
        standardization=True,
    )
    lr_model_all = lr.fit(all_v)

    # Record how the model was trained alongside the artifact.
    mlflow.log_param("refit_on_all_history", True)
    mlflow.log_param("features", ",".join(feature_cols))
    mlflow.spark.log_model(lr_model_all, "model")

print("✅ LR_refit_numeric_all: Artifacts → model → Register.")


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 13, Finished, Available, Finished)



✅ LR_refit_numeric_all: gotowy do rejestracji w MLflow (Artifacts → model → Register).


# Forecast for the next 10 minutes: run the registered ML model on the latest observed bucket.

In [25]:
from pyspark.sql import functions as F, Window as W
from pyspark.ml.feature import VectorAssembler
import mlflow, math
from pyspark.sql import functions as F, Window as W
import math

# Window over the whole series ordered by bucket start (single global time series).
w = W.orderBy("t10")

# Rebuild the model's input features for every bucket. Lags and the rolling mean
# need the preceding buckets, so features are computed on the full history and
# only the latest bucket is scored below. Unlabeled rows are kept on purpose:
# the latest bucket has no label yet.
df_all = (
    df_labeled
    # time
    .withColumn("hour", F.hour("t10").cast("int"))
    .withColumn("day_of_week", F.dayofweek("t10").cast("int"))
    .withColumn("hour_sin", F.sin(2*math.pi*F.col("hour")/24.0))
    .withColumn("hour_cos", F.cos(2*math.pi*F.col("hour")/24.0))
    .withColumn("dow_sin",  F.sin(2*math.pi*F.col("day_of_week")/7.0))
    .withColumn("dow_cos",  F.cos(2*math.pi*F.col("day_of_week")/7.0))
    # weather: same explicit casts as the training and backtest feature code, so
    # the scoring input cannot drift in type from what the model was fitted on
    .withColumn("temperature_avg", F.col("temperature_avg").cast("double"))
    .withColumn("humidity_avg",    F.col("humidity_avg").cast("double"))
    .withColumn("condition_code",  F.col("condition_code").cast("int"))
    .withColumn("is_rain", (F.col("condition_code") == F.lit(1)).cast("int"))  # 1 = rainy
    # sales
    .withColumn("qty_10m", F.col("qty_10m").cast("double"))
    .withColumn("lag1", F.lag("qty_10m", 1).over(w))
    .withColumn("lag2", F.lag("qty_10m", 2).over(w))
    .withColumn("roll_mean_3", F.avg("qty_10m").over(w.rowsBetween(-3, -1)))
    .withColumn("change", F.col("qty_10m") - F.col("lag1"))
    .na.fill({
        "temperature_avg": 0.0, "humidity_avg": 0.0, "condition_code": 0, "is_rain": 0,
        "qty_10m": 0.0, "lag1": 0.0, "lag2": 0.0, "roll_mean_3": 0.0, "change": 0.0
    })
)

# Feature vector; the column list must match the one the model was trained with.
feature_cols = [
    "hour_sin","hour_cos","dow_sin","dow_cos",
    "temperature_avg","humidity_avg","condition_code","is_rain",
    "qty_10m","lag1","lag2","roll_mean_3","change"
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")

# Score only the most recent bucket. Fail with a clear message when there is no
# data at all instead of silently producing an empty forecast.
latest_t10 = df_all.select(F.max("t10")).first()[0]
if latest_t10 is None:
    raise ValueError("No 10-minute buckets available to score (df_labeled is empty).")

feat_vec = (assembler
            .transform(df_all.where(F.col("t10") == latest_t10))
            .select("t10", "features"))

# Registered model, version 1, from the MLflow model registry.
model_uri = "models:/ml_model_10m_forecast/1"

model = mlflow.spark.load_model(model_uri)
pred = model.transform(feat_vec)

# The forecast applies to the bucket that starts 10 minutes after the observed one.
result = (pred
    .select(
        F.col("t10").alias("observed_window_start"),
        F.col("prediction").alias("forecast_qty_next_10m")
    )
    .withColumn("forecast_window_start", F.col("observed_window_start") + F.expr("INTERVAL 10 MINUTES"))
    .withColumn("generated_at", F.current_timestamp())
)

result.show(truncate=False)

# result.write.mode("overwrite").saveAsTable("tbl_gold_10m_forecast_live")


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 30, Finished, Available, Finished)

2025/09/09 18:04:15 INFO mlflow.spark: 'models:/ml_model_10m_forecast/1' resolved as 'abfss://bed316b0-e26c-4a89-a790-a007cce0e3fd@onelakenortheurope.pbidedicated.windows.net/ad323cd2-6a65-44f4-80de-81db41647e52/Data/4e66a0fb-7db4-4c41-880f-1437a4a43cba/artifacts'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

2025/09/09 18:04:15 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
2025/09/09 18:04:16 INFO mlflow.spark: File 'models:/ml_model_10m_forecast/1/sparkml' not found on DFS. Will attempt to upload the file.
2025/09/09 18:04:16 INFO mlflow.spark: Copied SparkML model to Files/tmp/mlflow/6e894341-9e11-4a89-8cc9-6fe30b16504c


+---------------------+---------------------+---------------------+--------------------------+
|observed_window_start|forecast_qty_next_10m|forecast_window_start|generated_at              |
+---------------------+---------------------+---------------------+--------------------------+
|2025-09-09 17:40:00  |67.35692445813964    |2025-09-09 17:50:00  |2025-09-09 18:04:17.682474|
+---------------------+---------------------+---------------------+--------------------------+



StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 31, Finished, Available, Finished)

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.

Backtests

In [27]:
# === BACKTEST + GLOBAL ACCURACY (sMAPE-accuracy / MAPE-accuracy / Accuracy@10%) ===
from pyspark.sql import functions as F, Window as W
from pyspark.ml.feature import VectorAssembler
import mlflow, math
from datetime import datetime

# ──────────────────────────────────────────────────────────────────────────────
# FEATURING (sin/cos time + rolling)
# ──────────────────────────────────────────────────────────────────────────────
from datetime import timezone  # local import: only `datetime` is imported at the top of this cell


def _fmt_pct(value):
    """Render a percentage metric for printing; a missing (None) metric prints as 'n/a'."""
    if value is None:
        return "n/a"
    return f"{value:.2f}%"


# Window over the whole series ordered by bucket start (single global time series).
w = W.orderBy("t10")

df_all = (
    df_labeled
    # time
    .withColumn("hour", F.hour("t10").cast("int"))
    .withColumn("day_of_week", F.dayofweek("t10").cast("int"))   # 1..7
    .withColumn("hour_sin", F.sin(2*math.pi*F.col("hour")/24.0))
    .withColumn("hour_cos", F.cos(2*math.pi*F.col("hour")/24.0))
    .withColumn("dow_sin",  F.sin(2*math.pi*F.col("day_of_week")/7.0))
    .withColumn("dow_cos",  F.cos(2*math.pi*F.col("day_of_week")/7.0))
    # weather
    .withColumn("temperature_avg", F.col("temperature_avg").cast("double"))
    .withColumn("humidity_avg",    F.col("humidity_avg").cast("double"))
    .withColumn("condition_code",  F.col("condition_code").cast("int"))
    .withColumn("is_rain", (F.col("condition_code") == F.lit(1)).cast("int"))  # 1 = rainy
    # sales
    .withColumn("qty_10m", F.col("qty_10m").cast("double"))
    .withColumn("lag1", F.lag("qty_10m", 1).over(w))
    .withColumn("lag2", F.lag("qty_10m", 2).over(w))
    .withColumn("roll_mean_3", F.avg("qty_10m").over(w.rowsBetween(-3, -1)))
    .withColumn("change", F.col("qty_10m") - F.col("lag1"))
    # label
    .withColumn("qty_next_10m", F.col("qty_next_10m").cast("double"))
    # replace nulls in features with zero, then keep only labeled buckets
    .na.fill({
        "temperature_avg": 0.0, "humidity_avg": 0.0, "condition_code": 0, "is_rain": 0,
        "qty_10m": 0.0, "lag1": 0.0, "lag2": 0.0, "roll_mean_3": 0.0, "change": 0.0
    })
    .where(F.col("qty_next_10m").isNotNull())
)

feature_cols = [
    "hour_sin","hour_cos","dow_sin","dow_cos",
    "temperature_avg","humidity_avg","condition_code","is_rain",
    "qty_10m","lag1","lag2","roll_mean_3","change"
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")
df_vec = assembler.transform(df_all).select("t10","qty_next_10m","features")

# Stop if nothing is left after the filters; take(1) fetches at most one row.
if not df_vec.take(1):
    raise ValueError("No data for backtest after filters (qty_next_10m IS NOT NULL).")

# ──────────────────────────────────────────────────────────────────────────────
# Load the model from the MLflow registry
# ──────────────────────────────────────────────────────────────────────────────
# NOTE(review): if this registered version is the model refit on the full history,
# the backtest below is in-sample and will look better than live performance — confirm.
model_uri = "models:/ml_model_10m_forecast/1"
model = mlflow.spark.load_model(model_uri)

# ──────────────────────────────────────────────────────────────────────────────
# Backtest: score every labeled bucket
# ──────────────────────────────────────────────────────────────────────────────
pred = model.transform(df_vec).filter(F.col("prediction").isNotNull())

eval_df = (pred
    .select(
        F.col("t10").alias("observed_window_start"),
        F.col("qty_next_10m").alias("actual_qty_next_10m"),
        F.col("prediction").alias("forecast_qty_next_10m")
    )
    .withColumn("forecast_window_start", F.col("observed_window_start") + F.expr("INTERVAL 10 MINUTES"))
)

#eval_df.write.mode("overwrite").saveAsTable("tbl_gold_10m_forecast_eval")

# ──────────────────────────────────────────────────────────────────────────────
# sMAPE-accuracy + MAPE-accuracy + Accuracy@10%
# Every metric is None when it cannot be computed (e.g. no rows survived).
# ──────────────────────────────────────────────────────────────────────────────
eps = F.lit(1e-9)  # keeps the sMAPE denominator non-zero when forecast and actual are both 0
err = F.col("forecast_qty_next_10m") - F.col("actual_qty_next_10m")

# sMAPE
smape = 2 * F.abs(err) / (F.abs(F.col("forecast_qty_next_10m")) + F.abs(F.col("actual_qty_next_10m")) + eps)
smape_val = eval_df.select(F.avg(smape).alias("smape")).first()["smape"]
# avg() over zero rows yields NULL; do not crash on float(None).
accuracy_smape = (1.0 - float(smape_val)) * 100.0 if smape_val is not None else None

# MAPE (rows with a zero actual are excluded, the ratio is undefined there)
mape = F.abs(err) / F.when(F.abs(F.col("actual_qty_next_10m")) > 0, F.abs(F.col("actual_qty_next_10m")))
mape_row = (eval_df.where(F.abs(F.col("actual_qty_next_10m")) > 0)
            .select(F.avg(mape).alias("mape"), F.count(F.lit(1)).alias("rows_used"))
            .first())
accuracy_mape = (1.0 - float(mape_row["mape"])) * 100.0 if mape_row["mape"] is not None else None
rows_used = int(mape_row["rows_used"])

# Accuracy@10%: share of buckets whose relative error is at most 10%.
# Rows with a zero actual have a NULL relative error and therefore count as misses.
rel_err = F.when(F.abs(F.col("actual_qty_next_10m")) > 0,
                 F.abs(err) / F.abs(F.col("actual_qty_next_10m")))
acc10 = (eval_df
         .withColumn("hit", F.when(rel_err <= 0.10, 1).otherwise(0))
         .select(F.avg("hit").alias("acc10"))
         .first()["acc10"])
accuracy_at_10pct = float(acc10) * 100.0 if acc10 is not None else None

# The metrics may be None, so format through _fmt_pct instead of a bare ':.2f',
# which raises TypeError on None.
print(f"sMAPE-accuracy (global): {_fmt_pct(accuracy_smape)}")
print(f"MAPE-accuracy  (global, excl. zeros): {_fmt_pct(accuracy_mape)} | rows_used={rows_used}")
print(f"Accuracy@10%   (global): {_fmt_pct(accuracy_at_10pct)}")

# ──────────────────────────────────────────────────────────────────────────────
# One-row metrics frame (ready to append to a metrics table)
# ──────────────────────────────────────────────────────────────────────────────
# A timezone-aware UTC datetime is converted to the correct instant regardless of
# the driver's local timezone; a naive utcnow() value would be read as local time.
metrics_out = [(accuracy_smape,
                accuracy_mape,
                accuracy_at_10pct,
                datetime.now(timezone.utc))]

metrics_df = spark.createDataFrame(
    metrics_out,
    schema="accuracy_smape DOUBLE, accuracy_mape DOUBLE, accuracy_at_10pct DOUBLE, computed_utc TIMESTAMP"
)

# metrics_df.write.mode("append").saveAsTable("tbl_gold_10m_backtest_metrics")
display(metrics_df)


StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 33, Finished, Available, Finished)

2025/09/09 18:16:43 INFO mlflow.spark: 'models:/ml_model_10m_forecast/1' resolved as 'abfss://bed316b0-e26c-4a89-a790-a007cce0e3fd@onelakenortheurope.pbidedicated.windows.net/ad323cd2-6a65-44f4-80de-81db41647e52/Data/4e66a0fb-7db4-4c41-880f-1437a4a43cba/artifacts'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

2025/09/09 18:16:43 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
2025/09/09 18:16:44 INFO mlflow.spark: File 'models:/ml_model_10m_forecast/1/sparkml' not found on DFS. Will attempt to upload the file.
2025/09/09 18:16:44 INFO mlflow.spark: Copied SparkML model to Files/tmp/mlflow/59001758-6eab-4281-b7e6-dc63ee7904a1


sMAPE-accuracy (global): 20.26%
MAPE-accuracy  (global, excl. zeros): -77.99% | rows_used=1242
Accuracy@10%   (global): 6.28%


SynapseWidget(Synapse.DataFrame, 0fac431d-fbcd-4dd5-ad32-08c14787f313)

StatementMeta(, b0c27f56-91fb-4768-bd14-2e2710fcf3c0, 34, Finished, Available, Finished)

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.

StatementMeta(, , -1, Finished, , Finished)

RejectSilentExecuteRequest: Livy session has failed. Error code: RejectSilentExecuteRequest. Rejected silent execute_request as there is no active session.