In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# =========================
# CONFIG
# =========================
# Catalog to switch into. The name contains a hyphen, so it has to be
# backtick-quoted whenever it is interpolated into a SQL statement.
catalog_name = "electricity-project"
# Schema that receives this notebook's output table.
gold_schema = "gold"

# Source feature table, given as a two-part schema.table name.
input_table = "silver.price_features"
# Destination table. Built from gold_schema so the schema named in the config
# and the schema actually written to cannot drift apart.
output_table = f"{gold_schema}.price_model_training_data"

# =========================
# CATALOG + SCHEMA
# =========================
# Select the working catalog, make sure the output schema exists, and make it
# the current schema. Both identifiers are backtick-quoted so that names
# containing hyphens or other special characters are parsed as a single
# identifier rather than breaking the statement.
spark.sql(f"USE CATALOG `{catalog_name}`")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{gold_schema}`")
spark.sql(f"USE SCHEMA `{gold_schema}`")

# =========================
# READ SILVER FEATURES
# =========================
df = spark.read.table(input_table)

# =========================
# EXPLICIT TIME-SERIES PARTITION
# =========================
# Every row receives the same constant key so that the window below carries an
# explicit partitionBy clause (intended to avoid Spark's warning about windows
# defined without a partition). Because the key is constant, all rows still
# belong to one window partition and are ordered together by datetime.
df = df.withColumn("ts_partition", F.lit(1))

w = Window.partitionBy("ts_partition").orderBy("datetime")

# =========================
# CREATE WEATHER LAG (t-24)
# =========================
# lag() counts rows, not elapsed time: each row gets the temperature found
# 24 rows earlier in datetime order; the first 24 rows get null.
# NOTE(review): this only means "24 hours earlier" if the table holds exactly
# one row per hour with no gaps or duplicates -- confirm against the source.
df = df.withColumn(
    colName="temperature_lag_24",
    col=F.lag(F.col("temperature"), offset=24).over(w),
)

# =========================
# KEEP ONLY MODEL-READY ROWS
# =========================
# A row is kept only when all three listed columns are populated; a missing
# value in any one of them removes the row.
#
# =========================
# DROP TECH COLUMN
# =========================
# ts_partition was only a helper key for the window and is not written out.
training_df = (
    df.na.drop(
        how="any",
        subset=["price_nok", "price_lag_24", "temperature_lag_24"],
    )
    .drop("ts_partition")
)

# =========================
# WRITE GOLD (SNAPSHOT)
# =========================
# Snapshot semantics: the Delta table is overwritten on every run, so it always
# holds the result of the most recent execution only.
training_df.write.saveAsTable(output_table, format="delta", mode="overwrite")

# =========================
# SANITY CHECKS
# =========================
# Check the persisted table instead of training_df. training_df is lazy and
# uncached, so every action on it would re-read the input table and re-run the
# window computation from scratch. Reading the table back avoids that repeated
# work and validates the data that was actually written.
written_df = spark.table(output_table)

print("Training rows:", written_df.count())

# rows vs. distinct_hours: a difference means some datetime values repeat.
# null_temp_lag_24: counts rows whose lagged temperature is still null.
written_df.select(
    F.count("*").alias("rows"),
    F.countDistinct("datetime").alias("distinct_hours"),
    F.sum(F.when(F.col("temperature_lag_24").isNull(), 1).otherwise(0)).alias("null_temp_lag_24")
).show()


In [0]:
# %sql
# SELECT
#   min(datetime),
#   max(datetime),
#   count(*)
# FROM `electricity-project`.gold.actual_prices;

In [0]:
# %sql
# SELECT *
# FROM `electricity-project`.gold.actual_prices
# ORDER BY datetime
# LIMIT 10;

In [0]:
# %sql
# SELECT *
# FROM `electricity-project`.gold.actual_prices
# ORDER BY datetime DESC
# LIMIT 10;
