In [0]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# =========================
# CONFIG
# =========================
# The catalog name contains a hyphen, which is why it is wrapped in
# backticks when interpolated into the USE CATALOG statement below.
catalog_name = "electricity-project"
gold_schema = "gold"

# Both table identifiers are derived from gold_schema so the schema name is
# defined in exactly one place and cannot drift from the schema that is
# created/selected below.
input_table = f"{gold_schema}.price_model_training_data"
output_table = f"{gold_schema}.price_model_parameters"

# =========================
# CATALOG + SCHEMA
# =========================
# Select the catalog, make sure the target schema exists, then make it the
# session default.
spark.sql(f"USE CATALOG `{catalog_name}`")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {gold_schema}")
spark.sql(f"USE SCHEMA {gold_schema}")

# =========================
# MODEL VERSION (MONTHLY)
# =========================
# The version tag is the current date formatted as yyyyMM, evaluated by
# Spark SQL, so a new version is produced once per calendar month.
version_query = "SELECT date_format(current_date(), 'yyyyMM') AS v"
version_rows = spark.sql(version_query).collect()
current_version = version_rows[0]["v"]

# =========================
# CHECK IF MODEL EXISTS
# =========================
# Training is skipped when the parameters table already holds at least one
# row for the current version. The `and` short-circuits, so the table is
# only read when it actually exists.
already_trained = (
    spark.catalog.tableExists(output_table)
    and spark.table(output_table)
    .filter(F.col("model_version") == current_version)
    .limit(1)  # one matching row is enough to decide
    .count() > 0
)

if already_trained:
    dbutils.notebook.exit(
        f"Model version {current_version} already exists. Skipping training."
    )

# =========================
# READ TRAINING DATA
# =========================
df = spark.table(input_table)

# =========================
# DEFINE FEATURE COLUMNS
# =========================
# Three fixed features, followed by every column whose name starts with
# "hour_", followed by every column whose name starts with "day_of_week_".
# (Presumably the prefixed columns are indicator/dummy columns -- confirm
# against the training table schema.)
# The order matters: coefficients are later matched to names by position.
base_features = ["price_lag_24", "temperature", "trend"]
hour_features = [name for name in df.columns if name.startswith("hour_")]
day_of_week_features = [
    name for name in df.columns if name.startswith("day_of_week_")
]

feature_cols = base_features + hour_features + day_of_week_features

# =========================
# DROP ROWS WITH MISSING FEATURES
# =========================
# Every column that is fed to the assembler, plus the label, must be free of
# null/NaN values: VectorAssembler's default handleInvalid="error" fails the
# job on the first invalid value it meets. Filtering only a few of the
# feature columns would leave the remaining ones (trend and the prefixed
# hour_/day_of_week_ columns) able to break training.
required_cols = ["price_nok"] + feature_cols
train_df = df.dropna(subset=required_cols)

# =========================
# VECTOR ASSEMBLER
# =========================
# Pack the feature columns, in feature_cols order, into a single vector
# column named "features".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

train_vec = assembler.transform(train_df)

# =========================
# TRAIN LINEAR REGRESSION
# =========================
# Regress price_nok on the assembled "features" vector with an intercept
# term; every other estimator parameter is left at its library default.
regression_settings = {
    "featuresCol": "features",
    "labelCol": "price_nok",
    "fitIntercept": True,
}
lr = LinearRegression(**regression_settings)

model = lr.fit(train_vec)

# =========================
# EXTRACT COEFFICIENTS
# =========================
# Pair each feature name with its fitted coefficient by position, converting
# each coefficient to a plain Python float before handing the rows to Spark.
named_coefficients = list(
    zip(feature_cols, map(float, model.coefficients))
)

# One row per feature.
coef_df = spark.createDataFrame(
    named_coefficients,
    schema=["feature_name", "coefficient"],
)

# =========================
# MODEL METADATA
# =========================
# Summarise the rows used for training: earliest and latest "datetime" value
# and the number of rows in train_df.
training_summary = train_df.agg(
    F.min("datetime").alias("train_start"),
    F.max("datetime").alias("train_end"),
    F.count("*").alias("n_observations"),
).collect()
training_stats = training_summary[0]

# Constant columns appended to every coefficient row, in this order.
metadata_columns = {
    "intercept": F.lit(float(model.intercept)),
    "model_version": F.lit(current_version),
    "trained_at": F.current_timestamp(),
    "train_start": F.lit(training_stats["train_start"]),
    "train_end": F.lit(training_stats["train_end"]),
    "n_observations": F.lit(training_stats["n_observations"]),
}

final_df = coef_df
for column_name, column_value in metadata_columns.items():
    final_df = final_df.withColumn(column_name, column_value)

# =========================
# WRITE MODEL PARAMETERS
# =========================
# Append mode adds this version's rows to the Delta table without touching
# rows written for earlier versions.
parameters_writer = final_df.write.format("delta").mode("append")
parameters_writer.saveAsTable(output_table)


In [0]:
# %sql
# SELECT *
# FROM `electricity-project`.gold.day_ahead_price_forecast
# ORDER BY datetime;


In [0]:
# %sql
# SELECT
#   avg(abs(actual_price - predicted_price)) AS mae
# FROM `electricity-project`.gold.day_ahead_price_forecast;
