In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as f

# Step 1: Attach Delta table
events_delta = DeltaTable.forPath(spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"
)

In [0]:
# Step 2: Load deduplicated existing events
deduped_events = events_delta.toDF()

In [0]:
# Step 3: Simulate incremental data
incremental_df = deduped_events.limit(100)\
.withColumn("price", f.col("price") + 10)

display(incremental_df.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:01:20.000Z,view,21400135,2053013561579406073,electronics.clocks,casio,175.26,565921426,f7ad8fc9-15ee-4c4d-bfda-1f80311d24bf
2019-11-01T00:03:09.000Z,view,10201206,2053013553224352013,kids.dolls,llorens,48.59,538620608,792e0551-a420-413c-96ad-4484f7ec48e8
2019-11-01T00:18:51.000Z,view,16000004,2053013558223962683,,rondell,53.73,524843394,d58bd43b-758d-491d-91d6-054b72729a2d
2019-11-01T00:33:29.000Z,view,1500075,2053013552955916539,computers.peripherals.printer,canon,156.46,515958621,fed55ee9-df03-445c-902c-586f12caace9
2019-11-01T00:41:47.000Z,view,1005074,2053013555631882655,electronics.smartphone,samsung,1152.62,515836513,a7b0597c-004e-4a50-9947-3ecde295a1b2


In [0]:
# Step 4: Handle NULLs before MERGE
incremental_df_clean = incremental_df \
.dropna(subset=["user_session", "event_time"])\
.fillna({
    "price": 0.0,
    "brand": "unknown",
    "category_code": "unknown",
    "category_id": -1
    })

display(incremental_df_clean.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:01:20.000Z,view,21400135,2053013561579406073,electronics.clocks,casio,175.26,565921426,f7ad8fc9-15ee-4c4d-bfda-1f80311d24bf
2019-11-01T00:03:09.000Z,view,10201206,2053013553224352013,kids.dolls,llorens,48.59,538620608,792e0551-a420-413c-96ad-4484f7ec48e8
2019-11-01T00:18:51.000Z,view,16000004,2053013558223962683,unknown,rondell,53.73,524843394,d58bd43b-758d-491d-91d6-054b72729a2d
2019-11-01T00:33:29.000Z,view,1500075,2053013552955916539,computers.peripherals.printer,canon,156.46,515958621,fed55ee9-df03-445c-902c-586f12caace9
2019-11-01T00:41:47.000Z,view,1005074,2053013555631882655,electronics.smartphone,samsung,1152.62,515836513,a7b0597c-004e-4a50-9947-3ecde295a1b2


In [0]:
# Step 5: MERGE incremental data
merge_summary = events_delta.alias("t").merge(
    incremental_df_clean.alias("s"),
    "t.user_session = s.user_session AND t.event_time = s.event_time"
).whenMatchedUpdateAll()\
 .whenNotMatchedInsertAll()\
 .execute()
 
 # Shows updated/inserted rows
display(merge_summary)  

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
100,100,0,0


In [0]:
# Step 6: Verify
events_df = events_delta.toDF()
print("Total rows after merge:", events_df.count())
display(events_df.orderBy("event_time", ascending=False).limit(5))

Total rows after merge: 67351679


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-30T23:59:59.000Z,view,1004233,2053013555631882655,electronics.smartphone,apple,1312.52,579969851,90aca71c-ed8a-4670-866a-761ebacb732d
2019-11-30T23:59:59.000Z,view,2701706,2053013563911439225,appliances.kitchen.refrigerators,samsung,566.27,531607492,368ddc8b-5db9-40fb-b7ff-b6582a1192c0
2019-11-30T23:59:59.000Z,view,1004833,2053013555631882655,electronics.smartphone,samsung,167.03,557794415,6fecf566-ebb0-4e70-a243-cdc13ce044cb
2019-11-30T23:59:58.000Z,view,15700137,2053013559733912211,,,277.74,532714000,02b4131c-0112-4231-aafa-ceaa08e77c1b
2019-11-30T23:59:58.000Z,view,28719425,2053013565639492569,apparel.shoes,baden,62.81,545223467,734c5eef-0742-4f8b-9d22-48f75b0bc359
