In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# =========================
# CONFIG
# =========================
# Catalog the pipeline switches into; the hyphen in the name is why it is
# backtick-quoted wherever it appears in SQL.
catalog_name = "electricity-project"
# Source table, addressed as <schema>.<table> relative to the active catalog.
bronze_table = "bronze.electricity_prices_no5"
# Target schema for the cleaned data.
silver_schema = "silver"
# Target table. The qualified name is derived from `silver_schema` so the
# schema that gets created and the schema that gets written to cannot drift
# apart when one of them is edited.
silver_table = f"{silver_schema}.electricity_prices_cleaned"

# =========================
# CATALOG + SCHEMA
# =========================
# Session setup, executed strictly in this order: select the catalog, make
# sure the silver schema exists, then make it the current schema.
setup_statements = (
    f"USE CATALOG `{catalog_name}`",
    f"CREATE SCHEMA IF NOT EXISTS {silver_schema}",
    f"USE SCHEMA {silver_schema}",
)

for setup_statement in setup_statements:
    spark.sql(setup_statement)

# =========================
# DETERMINE WATERMARK
# =========================
# Start from "no watermark"; the bronze read treats None as a full load.
silver_max_dt = None

if spark.catalog.tableExists(silver_table):
    # A global aggregate always produces exactly one row. Its value is NULL
    # (None in Python) when the silver table exists but contains no rows, so
    # an empty table is handled the same way as a missing one.
    watermark_row = (
        spark.table(silver_table)
        .agg(F.max("datetime").alias("max_dt"))
        .first()
    )
    silver_max_dt = watermark_row["max_dt"]

# =========================
# READ BRONZE (INCREMENTAL)
# =========================
# Unfiltered bronze table; used as-is on a full load.
bronze_all_df = spark.table(bronze_table)

# With a watermark, keep only rows strictly newer than the latest datetime
# already present in silver.
# NOTE(review): rows at or before the watermark are never re-read, so a bronze
# row that is corrected after it was loaded will not reach silver - confirm
# this is the intended behaviour.
bronze_df = (
    bronze_all_df
    if silver_max_dt is None
    else bronze_all_df.filter(F.col("datetime") > F.lit(silver_max_dt))
)

# =========================
# CLEANING LOGIC
# =========================
# Rows before this instant are excluded from silver.
earliest_kept_datetime = "2023-01-01 00:00:00"

# Row-level rules: a price must be present and the timestamp must not be
# older than the cutoff. The cutoff is a predicate on the deduplication key,
# so applying it before or after the dedup keeps the same set of datetimes.
has_price = F.col("price_nok").isNotNull()
on_or_after_cutoff = F.col("datetime") >= F.lit(earliest_kept_datetime)

# Keep only the two silver columns and at most one row per datetime.
# NOTE(review): when several bronze rows share a datetime, nothing here
# chooses which one survives - confirm that duplicates carry the same price.
silver_updates_df = (
    bronze_df
    .filter(has_price & on_or_after_cutoff)
    .select("datetime", "price_nok")
    .dropDuplicates(["datetime"])
)

# =========================
# WRITE SILVER (MERGE)
# =========================
# Decide between the first-run create path and the incremental upsert path.
silver_already_exists = spark.catalog.tableExists(silver_table)

if not silver_already_exists:
    # First run: materialise the cleaned rows as a new Delta table.
    silver_writer = silver_updates_df.write.format("delta").mode("overwrite")
    silver_writer.saveAsTable(silver_table)

else:
    # Later runs: upsert keyed on datetime. Matching rows are overwritten
    # with the incoming values and unseen datetimes are inserted.
    silver_delta = DeltaTable.forName(spark, silver_table)

    upsert = silver_delta.alias("t").merge(
        silver_updates_df.alias("s"),
        "t.datetime = s.datetime",
    )
    upsert = upsert.whenMatchedUpdateAll()
    upsert = upsert.whenNotMatchedInsertAll()
    upsert.execute()


In [0]:
%sql
-- Sanity check after the load: earliest datetime, latest datetime and total
-- row count of the cleaned silver table.
SELECT
  min(datetime),
  max(datetime),
  count(*)
FROM
  `electricity-project`.silver.electricity_prices_cleaned;

In [0]:
# Optional spot check (disabled): uncomment to preview the five earliest silver rows.
# spark.table("silver.electricity_prices_cleaned") \
#      .orderBy("datetime") \
#      .limit(5) \
#      .show(truncate=False)

In [0]:
%sql
-- Total number of rows currently stored in the cleaned silver table.
SELECT
  count(*)
FROM
  `electricity-project`.silver.electricity_prices_cleaned;


In [0]:
%sql
-- Bronze data-quality profile: total rows, number of distinct datetime
-- values, and how many rows have no price. Comparing the first two shows
-- whether bronze contains duplicate datetimes.
SELECT
  COUNT(*) AS bronze_rows,
  COUNT(DISTINCT datetime) AS bronze_unique_hours,
  SUM(
    CASE
      WHEN price_nok IS NULL THEN 1
      ELSE 0
    END
  ) AS null_prices
FROM
  `electricity-project`.bronze.electricity_prices_no5;
