In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# =========================
# CONFIG
# =========================
# Catalog that holds the project's schemas. The name contains a hyphen, which
# is why it is back-quoted wherever it is interpolated into SQL.
catalog_name = "electricity-project"

# Schema names. The table identifiers below are derived from these so that the
# schema created/selected later and the tables actually read and written
# cannot drift apart when one of the names is changed.
silver_schema = "silver"
gold_schema = "gold"

# Schema-qualified (two-part) table names.
input_table = f"{silver_schema}.price_features"
output_table = f"{gold_schema}.day_ahead_price_forecast"

# =========================
# CATALOG + SCHEMA
# =========================
# Select the catalog, make sure the gold schema exists, then make it the
# current schema. The statements run in exactly this order.
setup_statements = (
    f"USE CATALOG `{catalog_name}`",
    f"CREATE SCHEMA IF NOT EXISTS {gold_schema}",
    f"USE SCHEMA {gold_schema}",
)
for statement in setup_statements:
    spark.sql(statement)

# =========================
# READ FEATURES
# =========================
df = spark.table(input_table)

# =========================
# DEFINE FEATURE COLUMNS
# =========================
# Three fixed inputs first, followed by every column whose name starts with
# "hour_" and then every column whose name starts with "day_of_week_"
# (presumably one-hot indicators -- confirm against the silver table).
base_features = ["price_lag_24", "temperature", "trend"]
hour_features = [name for name in df.columns if name.startswith("hour_")]
weekday_features = [name for name in df.columns if name.startswith("day_of_week_")]

feature_cols = [*base_features, *hour_features, *weekday_features]

# =========================
# DROP ROWS THAT CANNOT BE USED
# =========================
# VectorAssembler raises on null inputs by default, so a null in ANY feature
# column (not only the lag and the temperature) would abort the job later on.
# Remove such rows up front.
model_df = df.dropna(subset=feature_cols)

# =========================
# TRAIN / FORECAST SPLIT
# =========================
# The most recent 24 hours of available data are held out and scored by the
# model; everything up to the start of that window is used for training.

max_dt = model_df.agg(F.max("datetime")).first()[0]

if max_dt is None:
    # The aggregate is NULL when no usable rows remain. Stop here with a clear
    # message instead of failing later inside the model fit.
    raise ValueError(
        f"No usable rows in {input_table}: cannot determine the latest datetime"
    )

# Exclusive lower bound of the held-out window. The window is half-open,
# (max_dt - 24h, max_dt], so it spans exactly 24 hours; a closed window would
# include one extra timestamp (25 rows if the data is hourly -- TODO confirm
# the cadence of the silver table).
forecast_start = F.lit(max_dt) - F.expr("INTERVAL 24 HOURS")

# Training rows additionally need a label; a null price cannot be fitted.
train_df = model_df.filter(
    (F.col("datetime") <= forecast_start) & F.col("price_nok").isNotNull()
)
forecast_df = model_df.filter(F.col("datetime") > forecast_start)

# =========================
# VECTOR ASSEMBLER
# =========================
# Pack the selected feature columns into a single vector column named
# "features"; the same assembler is applied to both splits.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_vec, forecast_vec = (
    assembler.transform(split) for split in (train_df, forecast_df)
)

# =========================
# LINEAR REGRESSION MODEL
# =========================
# Column wiring for the regressor: the assembled vector is the input,
# price_nok is the label, and predictions are written to predicted_price.
# No other estimator settings are overridden.
lr_columns = {
    "featuresCol": "features",
    "labelCol": "price_nok",
    "predictionCol": "predicted_price",
}
lr = LinearRegression(**lr_columns)

model = lr.fit(train_vec)

# =========================
# GENERATE FORECAST
# =========================
# Score the held-out rows with the fitted model.
predictions = model.transform(forecast_vec)

# =========================
# FINAL GOLD OUTPUT
# =========================
# Keep the timestamp, the observed price (exposed as actual_price) and the
# model prediction, sorted by timestamp.
output_columns = [
    F.col("datetime"),
    F.col("price_nok").alias("actual_price"),
    F.col("predicted_price"),
]
gold_df = predictions.select(*output_columns).orderBy("datetime")

# =========================
# WRITE GOLD
# =========================
# Store the result as a Delta table, replacing whatever the table held before.
gold_writer = gold_df.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(output_table)


In [0]:
%sql
-- Full contents of the gold forecast table, oldest timestamp first.
SELECT forecast.*
FROM `electricity-project`.gold.day_ahead_price_forecast AS forecast
ORDER BY forecast.datetime ASC;


In [0]:
%sql
-- Mean absolute error between the stored actual and predicted prices,
-- computed over the rows of the gold forecast table.
SELECT
  AVG(ABS(forecast.actual_price - forecast.predicted_price)) AS mae
FROM `electricity-project`.gold.day_ahead_price_forecast AS forecast;
