In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# =========================
# CONFIG
# =========================
# Catalog and schema this notebook runs in. The catalog name contains a
# hyphen, so it is backtick-quoted wherever it is interpolated into SQL.
catalog_name = "electricity-project"
silver_schema = "silver"

# Source tables and the table this notebook writes.
prices_table = "silver.electricity_prices_cleaned"
weather_table = "silver.weather_cleaned"
output_table = "silver.price_weather_joined"

# =========================
# CATALOG + SCHEMA
# =========================
# Set the session's current catalog first, then the current schema within it.
for use_statement in (
    f"USE CATALOG `{catalog_name}`",
    f"USE SCHEMA {silver_schema}",
):
    spark.sql(use_statement)

# =========================
# DETERMINE WATERMARK
# =========================
# The watermark is the largest `datetime` already present in the output table.
# It stays None when the table does not exist yet; max() over a table with no
# non-null `datetime` values also yields None.
max_dt = None
if spark.catalog.tableExists(output_table):
    watermark_row = (
        spark.table(output_table)
        .select(F.max("datetime").alias("max_dt"))
        .first()
    )
    max_dt = watermark_row["max_dt"]

# =========================
# READ INPUTS (LOOKBACK)
# =========================
prices_df = spark.table(prices_table)
weather_df = spark.table(weather_table)

if max_dt is not None:
    # Incremental run: reprocess everything from 48 hours before the watermark.
    #
    # The start is truncated to midnight because the forward fill further down
    # is partitioned by calendar date (to_date("datetime")). If the lookback
    # began mid-day, the earliest day would be only partially loaded: its first
    # rows would no longer see the earlier same-day temperatures, the fill
    # would yield nulls, and the merge's whenMatchedUpdateAll would overwrite
    # values that an earlier run had filled correctly.
    # date_trunc and to_date both use the session time zone, so the day
    # boundary here matches the window's partition boundary.
    lookback_start = F.date_trunc(
        "day",
        F.lit(max_dt) - F.expr("INTERVAL 48 HOURS"),
    )

    prices_df = prices_df.filter(F.col("datetime") >= lookback_start)
    weather_df = weather_df.filter(F.col("datetime") >= lookback_start)

# =========================
# JOIN (PRICES AS BACKBONE)
# =========================
# Left join keeps every price row; `temperature` is null where the weather
# input has no row for that datetime.
price_with_weather = prices_df.join(weather_df, on="datetime", how="left")

# Keep only the join key and the two measures carried downstream.
joined_df = price_with_weather.select("datetime", "price_nok", "temperature")

# =========================
# BOUNDED FORWARD FILL
# =========================
# Frame = the current row plus up to 48 preceding rows, ordered by datetime.
# Rows are partitioned by calendar date, so a fill never carries a value
# across midnight into the next day.
calendar_day = F.to_date("datetime")
window_ffill = Window.partitionBy(calendar_day).orderBy("datetime").rowsBetween(-48, 0)

# Most recent non-null temperature inside the frame (the row's own value when
# it is present; null when the whole frame is null).
filled_temperature = F.last("temperature", ignorenulls=True).over(window_ffill)

final_updates_df = joined_df.withColumn("temperature", filled_temperature)

# =========================
# MERGE INTO OUTPUT
# =========================
if not spark.catalog.tableExists(output_table):
    # No output table yet: create it as a Delta table from this batch.
    batch_writer = final_updates_df.write.format("delta").mode("overwrite")
    batch_writer.saveAsTable(output_table)
else:
    # Table already exists: upsert keyed on datetime. Matching rows have all
    # columns replaced by the batch values; unmatched batch rows are inserted.
    delta_out = DeltaTable.forName(spark, output_table)

    upsert = delta_out.alias("t").merge(
        final_updates_df.alias("s"),
        "t.datetime = s.datetime",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()


In [0]:
%sql
-- Sanity check: datetime range covered by the joined table and its row count.
SELECT min(datetime), max(datetime), count(*)
FROM
  `electricity-project`.silver.price_weather_joined;


In [0]:
%sql
-- Sanity check: the ten rows with the smallest datetime.
SELECT
  *
FROM
  `electricity-project`.silver.price_weather_joined
ORDER BY
  datetime ASC
LIMIT 10;


In [0]:
%sql
-- Sanity check: the ten rows with the largest datetime.
SELECT
  *
FROM
  `electricity-project`.silver.price_weather_joined
ORDER BY
  datetime DESC
LIMIT 10;
