In [3]:
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Build or incrementally extend the `dim_trading_minutes` dimension: one row
# per distinct `timestamp` found in the gold table, plus date/hour/minute
# helper columns and a sequential `trading_minute_id`.

# Load gold table
gold = spark.read.table("gold_stock_minutes_indicators")

# Try to load the existing dim table. Only a catalog/analysis failure (the
# table cannot be resolved) is treated as "does not exist yet"; any other
# error (interrupt, storage failure, ...) propagates instead of silently
# triggering a rebuild.
try:
    dim = spark.read.table("dim_trading_minutes")
    dim_exists = True
except AnalysisException:
    dim_exists = False

# Latest timestamp already present in the dim. This stays None both when the
# table is missing and when it exists but holds no rows; comparing against a
# NULL bound would match nothing and leave the dim empty forever, so both
# cases take the full-build path below.
max_ts = dim.agg(F.max("timestamp")).collect()[0][0] if dim_exists else None

if max_ts is not None:
    # Only timestamps strictly newer than what the dim already holds.
    # (NULL timestamps never satisfy the comparison, so they are excluded.)
    new_minutes = (
        gold.filter(F.col("timestamp") > max_ts)
            .select("timestamp")
            .dropDuplicates()
    )

    # Combine old + new
    updated_dim = (
        dim.select("timestamp")
           .unionByName(new_minutes)
           .dropDuplicates(["timestamp"])
    )

else:
    # First creation (or the existing dim is empty): take every distinct,
    # non-null timestamp from gold.
    updated_dim = (
        gold.select("timestamp")
            .dropna()
            .dropDuplicates()
    )

# Add helper columns
updated_dim = (
    updated_dim
    .withColumn("date", F.to_date("timestamp"))
    .withColumn("hour", F.hour("timestamp"))
    .withColumn("minute", F.minute("timestamp"))
)

# Add trading_minute_id as the 1-based rank of each timestamp. Because the
# incremental path only ever appends timestamps greater than the current
# maximum, rows already in the dim keep the id they had on earlier runs.
# The un-partitioned window performs the sort itself, so no separate
# orderBy is needed beforehand.
window_spec = Window.orderBy("timestamp")
updated_dim = updated_dim.withColumn("trading_minute_id", F.row_number().over(window_spec))

# Save table
# NOTE(review): on the incremental path this overwrites the same table that
# `dim` is read from; presumably the table format in use allows that --
# confirm for the target environment.
updated_dim.write.mode("overwrite").saveAsTable("dim_trading_minutes")


StatementMeta(, 339454e4-b0ec-4934-8d9b-3ebe2ad44eed, 5, Finished, Available, Finished)