In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from pyspark.sql.window import Window

# =========================
# CONFIG
# =========================
# Catalog name; it contains a hyphen, so SQL statements must back-quote it.
catalog_name = "electricity-project"

# Source table, written as <schema>.<table>.
bronze_table = "bronze.weather_observed"

# Target schema, and the target table that lives inside it.
silver_schema = "silver"
silver_table = f"{silver_schema}.weather_cleaned"

# =========================
# CATALOG + SCHEMA
# =========================
# Session setup, executed in this order: select the catalog, create the silver
# schema if it is missing, then make that schema the current one.
for setup_statement in (
    f"USE CATALOG `{catalog_name}`",
    f"CREATE SCHEMA IF NOT EXISTS {silver_schema}",
    f"USE SCHEMA {silver_schema}",
):
    spark.sql(setup_statement)

# =========================
# DETERMINE WATERMARK
# =========================
# The watermark is the latest `datetime` already stored in silver.
# It stays None when the silver table does not exist yet; MAX over an
# empty table also yields None.
silver_max_dt = None

if spark.catalog.tableExists(silver_table):
    watermark_rows = (
        spark.table(silver_table)
        .agg(F.max("datetime").alias("max_dt"))
        .collect()
    )
    # A global aggregate returns exactly one row.
    silver_max_dt = watermark_rows[0]["max_dt"]

# =========================
# READ BRONZE (LOOKBACK)
# =========================
# Number of hours before the silver watermark that are re-read from bronze on
# an incremental run. Named here so the window can be tuned in one place.
# NOTE(review): presumably sized to pick up late or corrected bronze rows —
# confirm the intended window with the data owner.
lookback_hours = 48

bronze_df = spark.table(bronze_table)

# Without a watermark (first load or empty silver table) the whole bronze
# table is processed; otherwise only rows at or after (watermark - lookback).
if silver_max_dt is not None:
    lookback_start = (
        F.lit(silver_max_dt) - F.expr(f"INTERVAL {lookback_hours} HOURS")
    )
    bronze_df = bronze_df.filter(F.col("datetime") >= lookback_start)

# =========================
# DETERMINISTIC DEDUPLICATION
# =========================
# Keep exactly one row per `datetime`: the most recently ingested one.
# `ingestion_ts` alone does not give a total order (rows loaded in the same
# statement can share a timestamp), so `temperature` is used as a tie-breaker.
# Only `datetime` and `temperature` are selected below, so any rows still tied
# after both keys are identical in the output and the result is reproducible.
window_spec = (
    Window
    .partitionBy("datetime")
    .orderBy(
        F.col("ingestion_ts").desc(),
        # Arbitrary but stable: prefer a non-null, then the highest, reading.
        F.col("temperature").desc_nulls_last(),
    )
)

silver_updates_df = (
    bronze_df
    # A NULL `datetime` can never satisfy the merge condition
    # `t.datetime = s.datetime`, so such rows would be inserted again on every
    # full load. Drop them here; the incremental `>=` filter already does.
    .filter(F.col("datetime").isNotNull())
    .withColumn("rn", F.row_number().over(window_spec))
    .filter(F.col("rn") == 1)
    .select("datetime", "temperature")
)

# =========================
# MERGE INTO SILVER
# =========================
if not spark.catalog.tableExists(silver_table):
    # No silver table yet: create it directly from the deduplicated rows.
    silver_writer = silver_updates_df.write.format("delta").mode("overwrite")
    silver_writer.saveAsTable(silver_table)
else:
    # Silver table exists: upsert keyed on `datetime`. Matching rows are
    # overwritten with the source values, unmatched rows are inserted.
    silver_delta = DeltaTable.forName(spark, silver_table)
    upsert = silver_delta.alias("t").merge(
        silver_updates_df.alias("s"),
        "t.datetime = s.datetime",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()


In [0]:
%sql
-- Weather (silver): earliest and latest datetime, plus the total row count.
-- Columns are aliased so the result grid is self-describing.
SELECT
  min(datetime) AS min_datetime,
  max(datetime) AS max_datetime,
  count(*)      AS row_count
FROM `electricity-project`.silver.weather_cleaned;

In [0]:
# Show the five earliest silver rows without truncating column values.
# The two-part table name is resolved against the session's current catalog.
(
    spark.table("silver.weather_cleaned")
    .orderBy("datetime")
    .limit(5)
    .show(truncate=False)
)

In [0]:
%sql
-- Most recent silver row, i.e. the row that defines the current watermark.
SELECT
  datetime,
  temperature
FROM
  `electricity-project`.silver.weather_cleaned
ORDER BY
  datetime DESC
LIMIT 1;

In [0]:
# Manual test statement, intentionally left commented out.
# NOTE(review): presumably used to add one bronze observation by hand before
# re-running the load cell; the queries in the following cells look up this
# same timestamp in silver and bronze.
# The value order is assumed to match the column order of
# bronze.weather_observed — TODO confirm before uncommenting.
# %sql INSERT INTO `electricity-project`.bronze.weather_observed
# VALUES (
#   TIMESTAMP '2026-01-11 19:00:00',
#   2.9,
#   CURRENT_DATE(),
#   current_timestamp()
# );



In [0]:
%sql
-- Silver row(s) stored for the probe timestamp.
SELECT
  *
FROM
  `electricity-project`.silver.weather_cleaned
WHERE
  datetime = TIMESTAMP '2026-01-11 19:00:00';

In [0]:
%sql
-- Number of bronze rows recorded for the probe timestamp. A count above 1
-- means bronze holds several versions of that datetime.
SELECT
  datetime,
  COUNT(*) AS row_count
FROM
  `electricity-project`.bronze.weather_observed
WHERE
  datetime = TIMESTAMP '2026-01-11 19:00:00'
GROUP BY
  datetime;


In [0]:
%sql
-- Every bronze row recorded for the probe timestamp, with all columns.
SELECT
  *
FROM
  `electricity-project`.bronze.weather_observed
WHERE
  datetime = TIMESTAMP '2026-01-11 19:00:00';