In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Read the raw minute bars from the bronze layer
bronze_df = spark.table("bronze_stock_minutes")

# Normalize the schema: parse the event time from "Datetime", then cast each
# bronze column to its typed, lower-case counterpart. The pairs are applied
# in the order listed.
typed_columns = [
    ("timestamp", F.to_timestamp("Datetime")),
    ("open", F.col("Open").cast("double")),
    ("high", F.col("High").cast("double")),
    ("low", F.col("Low").cast("double")),
    ("close", F.col("Close").cast("double")),
    ("volume", F.col("Volume").cast("long")),
    ("ticker", F.col("Ticker")),
]

silver_df = bronze_df
for column_name, column_expr in typed_columns:
    silver_df = silver_df.withColumn(column_name, column_expr)

# Drop rows without a usable key, then deduplicate by ticker + timestamp.
#
# Why the NULL filter: the key comparison used when merging into the silver
# table is a plain `=`, which is never true when either side is NULL. A row
# whose ticker or timestamp is NULL (e.g. a Datetime value that could not be
# parsed) would therefore never match an existing row and would be inserted
# again on every run, accumulating duplicates in the target.
#
# NOTE: ordering the window by a constant imposes no real order, so which of
# several duplicates survives is arbitrary. If bronze carries a column that
# identifies the most recent record, order by it here instead — TODO confirm.
window_spec = Window.partitionBy("ticker", "timestamp").orderBy(F.lit(1))

silver_df = (
    silver_df
    .filter(F.col("ticker").isNotNull() & F.col("timestamp").isNotNull())
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .drop("row_num")
)

# Project the final silver layout: business columns first, followed by the
# time at which this run processed the row.
business_columns = ["ticker", "timestamp", "open", "high", "low", "close", "volume"]

silver_df = silver_df.select(
    *business_columns,
    F.current_timestamp().alias("ingestion_time"),
)

# Name of the Delta table that receives the cleaned minute bars
silver_table = "silver_stock_minutes"

# A row is identified by its (ticker, timestamp) pair
merge_condition = "target.ticker = source.ticker AND target.timestamp = source.timestamp"

if not spark.catalog.tableExists(silver_table):
    # Table is not in the catalog yet: create it from the current batch
    silver_df.write.format("delta").saveAsTable(silver_table)
else:
    # Table already exists: upsert the batch, updating all columns of rows
    # whose key matches and inserting the rows that have no match
    delta_table = DeltaTable.forName(spark, silver_table)
    upsert = (
        delta_table.alias("target")
        .merge(silver_df.alias("source"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()


StatementMeta(, a35ddb27-8658-40d3-83ba-3ae924b369cb, 3, Finished, Available, Finished)