In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Load the raw (bronze) daily stock table.
bronze_df = spark.table("bronze_stock_daily")

# Project the bronze columns onto the typed, lower-case silver schema in a
# single select. "Datetime" is carried along for now only because the
# de-duplication step below orders on it; the final select leaves it out.
silver_df = bronze_df.select(
    F.col("ticker"),
    F.to_date("Datetime").alias("date"),
    F.col("Open").cast("double").alias("open"),
    F.col("High").cast("double").alias("high"),
    F.col("Low").cast("double").alias("low"),
    F.col("Close").cast("double").alias("close"),
    F.col("Volume").cast("long").alias("volume"),
    F.col("Datetime"),
)

# Rows without a complete (ticker, date) key are dropped: an equality-based
# merge on these columns can never match a NULL key, so such rows would be
# inserted again as duplicates on every run.
silver_df = silver_df.filter(
    F.col("ticker").isNotNull() & F.col("date").isNotNull()
)

# Deduplicate by ticker + date, keeping one deterministic row per key.
# The row with the greatest "Datetime" wins
# (NOTE(review): assumes "Datetime" sorts chronologically, i.e. it is a
# timestamp or an ISO-8601 string -- confirm against the bronze schema).
# The value columns act as tie-breakers so that repeated runs over the same
# input always keep the same values; rows that tie on all of them are
# identical in every column that is selected below.
window_spec = Window.partitionBy("ticker", "date").orderBy(
    F.col("Datetime").desc(),
    *[
        F.col(name).desc_nulls_last()
        for name in ("volume", "close", "open", "high", "low")
    ],
)

silver_df = (
    silver_df
    .withColumn("row_num", F.row_number().over(window_spec))
    .filter(F.col("row_num") == 1)
    .drop("row_num")
)

# Add ingestion timestamp and define final column order
# (this also removes the helper "Datetime" column).
silver_df = (
    silver_df
    .withColumn("ingestion_time", F.current_timestamp())
    .select("ticker", "date", "open", "high", "low", "close", "volume", "ingestion_time")
)

# Name of the Delta table that receives the cleaned daily rows.
silver_table = "silver_stock_daily"

if not spark.catalog.tableExists(silver_table):
    # No table yet: create it directly from the DataFrame.
    silver_df.write.format("delta").saveAsTable(silver_table)
else:
    # Table already present: update the rows whose (ticker, date) key exists
    # in the target and insert the remaining ones.
    merge_condition = "target.ticker = source.ticker AND target.date = source.date"
    delta_table = DeltaTable.forName(spark, silver_table)
    upsert = (
        delta_table.alias("target")
        .merge(silver_df.alias("source"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()


StatementMeta(, c2a78d58-ed77-49c5-a8d0-59a79a424eb6, 3, Finished, Available, Finished)