In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# =========================
# CONFIG
# =========================
# Catalog that holds every table this notebook reads or writes. The name
# contains a hyphen, so it has to be back-quoted wherever it is used in SQL.
catalog_name = "electricity-project"

# Schema names inside that catalog.
silver_schema, gold_schema = "silver", "gold"

# Schema-qualified table names, resolved against the current catalog.
features_table = f"{silver_schema}.price_features"             # model input features
params_table = f"{gold_schema}.price_model_parameters"         # fitted intercept / coefficients
output_table = f"{gold_schema}.day_ahead_price_forecast"       # forecasts written by this notebook

# =========================
# CATALOG + SCHEMA
# =========================
# Select the project catalog, make sure the gold schema exists, and make it
# the current schema. The statements run in this exact order.
for setup_statement in (
    f"USE CATALOG `{catalog_name}`",
    f"CREATE SCHEMA IF NOT EXISTS {gold_schema}",
    f"USE SCHEMA {gold_schema}",
):
    spark.sql(setup_statement)

# =========================
# LOAD LATEST MODEL PARAMETERS
# =========================
params_df = spark.table(params_table)

# A global MAX always yields exactly one row; its value is None when the
# table has no rows (or no non-null model_version).
latest_version = (
    params_df
    .select(F.max("model_version").alias("v"))
    .collect()[0]["v"]
)

# Fail early with a clear message instead of letting the intercept lookup
# below hit an IndexError on an empty result.
if latest_version is None:
    raise ValueError(
        f"No model parameters found in {params_table}; "
        "cannot produce a forecast without a trained model."
    )

# All parameter rows that belong to the most recent model version.
model_params = params_df.filter(F.col("model_version") == latest_version)

# =========================
# EXTRACT INTERCEPT
# =========================
# Read the intercept from a single (arbitrary) parameter row of the selected
# model version.
# NOTE(review): presumably every row of one version carries the same
# intercept value - confirm against the training notebook.
intercept_rows = model_params.select("intercept").limit(1).collect()
intercept = intercept_rows[0]["intercept"]

# =========================
# EXTRACT COEFFICIENTS
# =========================
# Bring the (feature_name, coefficient) pairs to the driver and index them by
# feature name. If a feature name occurs more than once, the last row wins.
coefficient_rows = model_params.select("feature_name", "coefficient").collect()

coef_dict = dict(
    (row["feature_name"], row["coefficient"])
    for row in coefficient_rows
)

# =========================
# LOAD FEATURE DATA
# =========================
df = spark.table(features_table)

# =========================
# FORECAST WINDOW (TRAILING 24 HOURS OF FEATURE ROWS)
# =========================
# The window is anchored on the latest timestamp present in the feature
# table: rows with datetime in (max_dt - 24h, max_dt] are scored.
max_dt = df.select(F.max("datetime")).collect()[0][0]

# An empty feature table yields max_dt = None, which would otherwise turn
# into a NULL comparison and silently write zero forecasts.
if max_dt is None:
    raise ValueError(
        f"No feature rows found in {features_table}; nothing to forecast."
    )

# Wrap max_dt in a literal explicitly instead of relying on the Column
# reverse-subtraction operator.
window_start = F.lit(max_dt) - F.expr("INTERVAL 24 HOURS")

forecast_df = df.filter(F.col("datetime") > window_start)

# Every row of this run is labelled with the calendar date of max_dt.
forecast_date = F.to_date(F.lit(max_dt))

# =========================
# APPLY LINEAR MODEL
# =========================
# prediction = intercept + sum(feature_i * coefficient_i), built as a single
# column expression so Spark evaluates it row by row.
weighted_terms = [
    F.col(name) * F.lit(weight)
    for name, weight in coef_dict.items()
]

prediction_expr = F.lit(intercept)
for term in weighted_terms:
    prediction_expr = prediction_expr + term

# Keep only the timestamp plus the forecast and its bookkeeping columns.
result_df = forecast_df.select(
    F.col("datetime"),
    prediction_expr.alias("predicted_price"),
    F.lit(latest_version).alias("model_version"),
    forecast_date.alias("forecast_for_date"),
    F.current_timestamp().alias("generated_at"),
)

# =========================
# MERGE / APPEND FORECASTS
# =========================
if not spark.catalog.tableExists(output_table):
    # First run: create the Delta table from this batch of forecasts.
    forecast_writer = result_df.write.format("delta").mode("overwrite")
    forecast_writer.saveAsTable(output_table)

else:
    # Later runs: insert only rows whose (datetime, model_version,
    # forecast_for_date) key is not in the table yet. There is no "matched"
    # clause, so existing rows are left untouched.
    merge_builder = (
        DeltaTable.forName(spark, output_table).alias("t")
        .merge(
            result_df.alias("s"),
            """
            t.datetime = s.datetime
            AND t.model_version = s.model_version
            AND t.forecast_for_date = s.forecast_for_date
            """
        )
    )

    merge_builder.whenNotMatchedInsertAll().execute()


In [0]:
# %sql
# DROP TABLE `electricity-project`.gold.day_ahead_price_forecast




In [0]:
 #%sql
 #DROP TABLE IF EXISTS `electricity-project`.gold.day_ahead_price_forecast;


In [0]:
%sql
-- Earliest / latest timestamp and total row count of the observed-weather table.
SELECT MIN(datetime) AS min_dt,
       MAX(datetime) AS max_dt,
       COUNT(*)      AS n_rows
FROM `electricity-project`.bronze.weather_observed;


In [0]:
%sql
-- Earliest / latest timestamp and total row count of the observed-weather table.
-- NOTE(review): this cell runs the same query as the cell directly before it;
-- confirm whether a different table was intended here.
SELECT
    MIN(datetime) AS min_dt
  , MAX(datetime) AS max_dt
  , COUNT(*)      AS n_rows
FROM `electricity-project`.bronze.weather_observed;


In [0]:
%sql
-- Uniqueness check: lists every datetime that occurs more than once in the
-- feature table. An empty result means datetime is unique there.
SELECT datetime, COUNT(*)
FROM `electricity-project`.silver.price_features
GROUP BY datetime
HAVING COUNT(*) > 1;


In [0]:
%sql
-- Feature completeness: COUNT(col) counts only non-NULL values, so the gap to
-- total_rows is the number of rows missing that feature.
SELECT
    COUNT(*)            AS total_rows
  , COUNT(price_lag_24) AS lag_available
  , COUNT(temperature)  AS temp_available
FROM `electricity-project`.silver.price_features;


In [0]:
%sql
-- Peek at the ten earliest feature rows (a few representative columns only).
SELECT datetime, price_nok, price_lag_24, temperature, hour_12, day_of_week_3
FROM `electricity-project`.silver.price_features
ORDER BY datetime ASC
LIMIT 10;


In [0]:
%sql
-- Time span and size of the model training data set.
SELECT MIN(datetime) AS train_start,
       MAX(datetime) AS train_end,
       COUNT(*)      AS n_rows
FROM `electricity-project`.gold.price_model_training_data;


In [0]:
%sql
-- One row per stored model version: number of parameter rows and the earliest
-- trained_at recorded for that version.
SELECT model_version,
       COUNT(*)        AS n_features,
       MIN(trained_at) AS trained_at
FROM `electricity-project`.gold.price_model_parameters
GROUP BY model_version
ORDER BY model_version ASC;


In [0]:
%sql
-- Compare forecasts with actual prices hour by hour.
-- The forecast table can hold several rows for the same datetime (one per
-- model_version / forecast_for_date), so those key columns are returned and
-- included in the ordering: each row is then attributable to a specific
-- forecast and the LIMIT picks a deterministic set.
SELECT
  a.datetime,
  a.price_nok AS actual,
  f.predicted_price,
  ABS(a.price_nok - f.predicted_price) AS abs_error,
  f.model_version,
  f.forecast_for_date
FROM `electricity-project`.gold.actual_prices a
JOIN `electricity-project`.gold.day_ahead_price_forecast f
  USING (datetime)
ORDER BY a.datetime ASC, f.model_version ASC, f.forecast_for_date ASC
LIMIT 10;


In [0]:
%sql
-- Latest timestamp present in the actual-prices table.
SELECT
  MAX(datetime)
FROM
  `electricity-project`.gold.actual_prices;


In [0]:
%sql
-- Time span and row count of the forecasts written so far.
SELECT
  MIN(datetime),
  MAX(datetime),
  COUNT(*)
FROM
  `electricity-project`.gold.day_ahead_price_forecast;


In [0]:
%sql
-- Show every column of the stored forecasts (no filter, no ordering).
SELECT
  *
FROM
  `electricity-project`.gold.day_ahead_price_forecast;