In [0]:
# Silver-layer DDL, kept in a named variable so the statement can be inspected
# before it is executed. IF NOT EXISTS makes re-running this cell a no-op once
# the table is present.
silver_table_ddl = """
    CREATE TABLE IF NOT EXISTS data.silver.weather_data_silver (
        YYYYMMDD_HH TIMESTAMP,
        STN BIGINT,
        DD BIGINT,
        FH BIGINT,
        FF BIGINT,
        FX BIGINT,
        T BIGINT,
        T10N DOUBLE,
        TD BIGINT,
        SQ BIGINT,
        Q BIGINT,
        DR BIGINT,
        RH BIGINT,
        P BIGINT,
        VV BIGINT,
        N DOUBLE,
        U BIGINT,
        WW DOUBLE,
        IX BIGINT,
        M BIGINT,
        R BIGINT,
        S BIGINT,
        O BIGINT,
        Y DOUBLE,
        api_query_timestamp TIMESTAMP
    )
    USING DELTA
"""

# Run the DDL against the notebook's Spark session, then confirm in the cell output.
spark.sql(silver_table_ddl)

print("✅ Silver table created/verified: data.silver.weather_data_silver")

In [0]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable

def process_silver_batch(batch_df, batch_id):
    """
    Deduplicate one micro-batch and upsert it into the silver Delta table.

    Used as the ``foreachBatch`` callback of the bronze -> silver stream.

    Steps:
      1. Cache the micro-batch, because it is scanned by several actions
         (row count, dedup count, MERGE) and would otherwise be recomputed
         from the source each time.
      2. Within the batch, keep only the row with the latest
         ``api_query_timestamp`` for each ``YYYYMMDD_HH``.
      3. MERGE into ``data.silver.weather_data_silver`` on ``YYYYMMDD_HH``:
         matched rows are overwritten only when the source row is strictly
         newer; unmatched rows are inserted as-is.

    Args:
        batch_df: The micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: The micro-batch identifier; only used in log messages.

    Returns:
        None. The function works purely through its side effect on the
        silver table.
    """
    # Columns overwritten on a matched row (everything except the merge key).
    update_columns = (
        "STN", "DD", "FH", "FF", "FX", "T", "T10N", "TD", "SQ", "Q", "DR",
        "RH", "P", "VV", "N", "U", "WW", "IX", "M", "R", "S", "O", "Y",
        "api_query_timestamp",
    )

    # Cache once; released in the `finally` below even if the merge fails.
    batch_df.persist()
    try:
        # Count a single time and reuse the result for logging and the empty check.
        row_count = batch_df.count()
        print(f"Processing batch {batch_id} with {row_count} rows")

        if row_count == 0:
            print("Empty batch, skipping")
            return

        # Deduplicate within batch: newest api_query_timestamp wins per YYYYMMDD_HH.
        # NOTE(review): the key ignores STN. If the bronze feed can carry more
        # than one station per hour, rows of other stations are dropped here
        # and in the merge condition below — confirm the feed is single-station.
        newest_first = Window.partitionBy("YYYYMMDD_HH").orderBy(
            col("api_query_timestamp").desc()
        )
        deduped_df = (
            batch_df.withColumn("row_num", row_number().over(newest_first))
            .filter(col("row_num") == 1)
            .drop("row_num")
        )

        print(f"After deduplication: {deduped_df.count()} rows")

        # Use the session attached to the micro-batch rather than the notebook
        # global, so the callback does not depend on driver-side globals.
        silver_table = DeltaTable.forName(
            batch_df.sparkSession, "data.silver.weather_data_silver"
        )

        (
            silver_table.alias("target")
            .merge(
                deduped_df.alias("source"),
                "target.YYYYMMDD_HH = source.YYYYMMDD_HH",
            )
            .whenMatchedUpdate(
                # Only overwrite when the incoming observation is strictly newer.
                condition="source.api_query_timestamp > target.api_query_timestamp",
                set={name: f"source.{name}" for name in update_columns},
            )
            .whenNotMatchedInsertAll()
            .execute()
        )

        print(f"✅ Batch {batch_id} processed successfully")
    finally:
        batch_df.unpersist()

# Cell-level confirmation: emitted once the definition of process_silver_batch above has executed.
print("✅ Batch processing function defined")

In [0]:
# Where the streaming query keeps its progress between runs
checkpoint_location = "/Volumes/source/source_schema/source_volume/checkpoints/weather_bronze_to_silver"

# Streaming source: the bronze table
df_bronze_stream = spark.readStream.table("data.bronze.weather_data_bronze")

print("Starting streaming query from bronze to silver...")

# Configure the sink step by step: every micro-batch is handed to
# process_silver_batch, progress is tracked at the checkpoint, and the
# availableNow trigger is used.
stream_writer = df_bronze_stream.writeStream
stream_writer = stream_writer.foreachBatch(process_silver_batch)
stream_writer = stream_writer.option("checkpointLocation", checkpoint_location)
stream_writer = stream_writer.trigger(availableNow=True)

# Launch the query and block this cell until it terminates
query = stream_writer.start()
query.awaitTermination()

print("✅ Streaming query completed")