In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
%sql
-- Preview every column and row of the bronze table.
SELECT *
FROM bronze_table;

In [0]:
# DDL schema used to parse the JSON payload stored in the `value` column.
schema = (
    "customer_id string, email string, first_name string, last_name string, "
    "gender string, street string, city string, country_code string, "
    "row_status string, row_time timestamp"
)

# Batch read: keep only records of the customers topic.
bronze_customers = spark.read.table("bronze_table").filter("topic = 'customers'")

# Decode `value` as a string, parse it with `schema`, and flatten the struct.
parsed_customers = bronze_customers.select(
    from_json(col("value").cast("string"), schema).alias("v")
).select("v.*")

# Retain only rows whose row_status is "insert" or "update".
customers_df = parsed_customers.filter(
    col("row_status").isin(["insert", "update"])
)

display(customers_df)

In [0]:
from pyspark.sql.window import Window

# Order each customer's rows from newest to oldest by row_time.
window = Window.partitionBy("customer_id").orderBy(col("row_time").desc())

# Keep exactly one row per customer_id: the one with the latest row_time.
# row_number() is used instead of rank() because rank() gives tied row_time
# values the same rank, which would let several rows per customer pass the
# `rank = 1` filter. With ties, which of the tied rows is kept is arbitrary.
ranked_df = (customers_df.withColumn("rank", row_number().over(window))
        .filter("rank = 1")
        .drop("rank"))

In [0]:
 # Render `ranked_df`, i.e. the rows of `customers_df` that passed the `rank = 1` filter.
 display(ranked_df)

In [0]:
from pyspark.sql.window import Window

def batch_upsert(microBatchDF, batchId):
    """Upsert the newest insert/update row per customer into `customers_silver`.

    Written for use with `writeStream.foreachBatch`. The micro-batch is
    filtered to rows whose `row_status` is "insert" or "update", reduced to a
    single row per `customer_id` (the one with the latest `row_time`),
    registered as the temp view `ranked_updates`, and merged into
    `customers_silver`.

    Args:
        microBatchDF: DataFrame holding one micro-batch. It must contain the
            `customer_id`, `row_status` and `row_time` columns.
        batchId: Micro-batch identifier supplied by `foreachBatch`; not used.

    Returns:
        None.
    """
    window = Window.partitionBy("customer_id").orderBy(col("row_time").desc())

    # row_number() (not rank()) guarantees at most one source row per
    # customer_id. rank() gives tied row_time values the same rank, so several
    # rows for one key could reach the MERGE, which can then fail on a matched
    # key or insert duplicates on an unmatched one. With ties, which of the
    # tied rows is kept is arbitrary.
    (microBatchDF.filter(col("row_status").isin(["insert", "update"]))
                .withColumn("rank", row_number().over(window))
                .filter("rank = 1")
                .drop("rank")
                .createOrReplaceTempView("ranked_updates"))
    
    # Update only when the incoming row is strictly newer than the stored one;
    # insert customers that are not present in the target yet.
    query = """
        Merge into customers_silver c
        using ranked_updates r
        on c.customer_id = r.customer_id
        when matched and c.row_time < r.row_time
            then update set *
        when not matched
            then insert *
    """

    # Run the MERGE through the micro-batch's own session, the same session
    # in which the temp view above was registered.
    microBatchDF.sparkSession.sql(query)

In [0]:
%sql
-- Target table for the customer upserts; created only if it does not exist yet.
CREATE TABLE IF NOT EXISTS customers_silver (
  customer_id STRING,
  email       STRING,
  first_name  STRING,
  last_name   STRING,
  gender      STRING,
  street      STRING,
  city        STRING,
  country     STRING,
  row_time    TIMESTAMP
)

In [0]:
# Load the country lookup data from its JSON location and preview it.
country_lookup_path = "dbfs:/mnt/demo-datasets/DE-Pro/bookstore/country_lookup"
df_country_lookup = spark.read.json(country_lookup_path)

display(df_country_lookup)

In [0]:
# Streaming read of the customers topic, parsed with the DDL `schema`.
customers_stream = (
    spark.readStream.table("bronze_table")
    .filter("topic = 'customers'")
    .select(from_json(col("value").cast("string"), schema).alias("v"))
    .select("v.*")
)

# Attach the lookup columns by joining country_code against the lookup's
# `code`; the lookup side is given a broadcast hint.
customers_enriched = customers_stream.join(
    broadcast(df_country_lookup),
    col("country_code") == col("code"),
    "inner",
)

# Hand every micro-batch to batch_upsert; availableNow processes the data
# currently available and then stops.
query = (
    customers_enriched.writeStream
    .foreachBatch(batch_upsert)
    .option("checkpointLocation", "dbfs:/mnt/demo_pro/checkpoints/customers_silver")
    .trigger(availableNow=True)
    .start()
)

# Block until the stream has finished.
query.awaitTermination()


In [0]:
%sql
-- Inspect the contents of the silver table.
SELECT *
FROM customers_silver

In [0]:
# Compare the total row count with the number of distinct customer_id values;
# the two numbers are equal when no customer_id appears more than once.
# NOTE(review): the name `count` shadows `pyspark.sql.functions.count` brought
# in by the star import — confirm no later cell relies on that function.
silver_customers = spark.table("customers_silver")
count = silver_customers.count()
expected_count = silver_customers.select("customer_id").distinct().count()
print(count, expected_count)