In [0]:
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.source")  # Create source schema if it doesn't exist
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.target")  # Create target schema if it doesn't exist

DataFrame[]

In [0]:
sales2 = spark.sql("select * from samples.bakehouse.sales_suppliers")
sales2.write.mode("overwrite").saveAsTable("workspace.source.sales2")

In [0]:
source = spark.read.table('workspace.source.sales2')
source.display()

supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved
4000000,Cacao Wonders,cacao,South America,Guayaquil,Las Peñas,M,-79.8974,-2.1791,Y
4000001,Coconut Grove,coconut,Asia,Manila,Intramuros,S,121.0221,14.6042,Y
4000002,Almond Delights,almonds,Europe,Valencia,Ruzafa,L,-0.3762,39.4699,Y
4000003,Sugar Cane Harvest,cane sugar,South America,Sao Paulo,Vila Madalena,XL,-46.6333,-23.5489,Y
4000004,Vanilla Valley,vanilla,North America,Mexico City,Roma Norte,M,-99.1332,19.4326,Y
4000005,Pecan Pleasures,pecans,North America,Atlanta,Virginia-Highland,S,-84.3888,33.749,Y
4000006,Hazelnut Haven,hazelnuts,Europe,Istanbul,Kadıköy,XXL,28.9784,41.0082,Y
4000007,Cinnamon Spice,cinnamon,Asia,Colombo,Galle Face Green,L,79.8612,6.9271,Y
4000008,Cashew Corner,cashews,Asia,Goa,Anjuna Beach,XL,73.8067,15.3173,Y
4000009,Maple Monarch,maple syrup,North America,Montreal,Plateau Mont-Royal,M,-73.5673,45.5017,Y


In [0]:
# Create a hash key by concatenating all columns into a single string column 'RowHash'
from pyspark.sql import functions as F

# Concatenate all columns in 'source' DataFrame into 'RowHash'
source = source.withColumn('RowHash', F.sha2(F.concat_ws('', *source.columns), 256))

In [0]:
display(source)

supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved,RowHash
4000000,Cacao Wonders,cacao,South America,Guayaquil,Las Peñas,M,-79.8974,-2.1791,Y,19e37f9076246ae6ffb69e9be7ed2073d94656fb55aadc78ee09807ffd86ff36
4000001,Coconut Grove,coconut,Asia,Manila,Intramuros,S,121.0221,14.6042,Y,dd8379700727ada5448616395cee167291078ba63a10ae62f20ea3708b19004a
4000002,Almond Delights,almonds,Europe,Valencia,Ruzafa,L,-0.3762,39.4699,Y,2618ef1944d020b72ae94ce0343b054c4818855400650829c903c0a0bec5004c
4000003,Sugar Cane Harvest,cane sugar,South America,Sao Paulo,Vila Madalena,XL,-46.6333,-23.5489,Y,98327b4f32a42f832fa286d6fd6c96ba0734010fc2fdb4cb3ac62aa321f2b06c
4000004,Vanilla Valley,vanilla,North America,Mexico City,Roma Norte,M,-99.1332,19.4326,Y,e6d6a60d578c4b2907df80d8b136e67370b976e9a0754f68a4dd914058b61839
4000005,Pecan Pleasures,pecans,North America,Atlanta,Virginia-Highland,S,-84.3888,33.749,Y,eb24f9d60f76123cd4744b9bff18c66b5c1babd2a3950faef75a353837c96dd0
4000006,Hazelnut Haven,hazelnuts,Europe,Istanbul,Kadıköy,XXL,28.9784,41.0082,Y,432fd8759f3212a0b0fb1b90a232b03f65ff6883d1cf4247f71366737d55d732
4000007,Cinnamon Spice,cinnamon,Asia,Colombo,Galle Face Green,L,79.8612,6.9271,Y,5bb8fa1ebe1c59e25cf6022593adba52ad61c9fb79d47eccd81d402afa43597c
4000008,Cashew Corner,cashews,Asia,Goa,Anjuna Beach,XL,73.8067,15.3173,Y,edfa576dd2e0c945b119604b91e8d4680a64e069945367ff633b7ae2d72be8fe
4000009,Maple Monarch,maple syrup,North America,Montreal,Plateau Mont-Royal,M,-73.5673,45.5017,Y,2de584aa231a74956a5ac5dfd69a06aab3050410b11f8c89c095b95b4feaee24


In [0]:
# Add three new columns to source :
# 1. 'IndCurrent': Set to 1 for all rows, indicating the current/active record.
# 2. 'CreatedDate': Set to the current timestamp, representing when the record was created.
# 3. 'ModifiedDate': Set to the current timestamp, representing when the record was last modified.
source = source.withColumn("IndCurrent", F.lit(1)) \
    .withColumn("CreatedDate", F.current_timestamp()) \
    .withColumn("ModifiedDate", F.current_timestamp())

In [0]:
from pyspark.sql.window import Window
# Define the window specification
window_spec = Window.orderBy(F.monotonically_increasing_id())

# Add a row number column based on the window specification
source = source.withColumn("storage_id", F.row_number().over(window_spec))

first_cols = ["storage_id"]
other_cols = [col for col in source.columns if col not in first_cols]
source = source.select(first_cols + other_cols)



In [0]:
#table_name= 'workspace.target.sales2'
#source.write.format('delta').mode('append').saveAsTable(table_name)

In [0]:
from pyspark.sql.functions import col, when

Source = source.withColumn(
    "ingredient",
    when(col("supplierID") == "4000000", "Mumbai").otherwise(col("ingredient"))
)



In [0]:
cols= Source.columns
cols = cols[1:-4]

In [0]:
from pyspark.sql import functions as F

# Concatenate all columns in 'source' DataFrame into 'RowHash'
SourceDf = Source.withColumn('RowHash', F.sha2(F.concat_ws('', *cols), 256))



In [0]:
SourceDf=SourceDf.drop("storage_Id")



In [0]:
TargetDf=spark.read.table(table_name).select(['supplierID','RowHash','storage_id']).withColumnRenamed('RowHash','TargetHash')
SourceDf=SourceDf.join(TargetDf, on =['supplierID'], how='left').withColumn('Flag', F.when(col('TargetHash').isNull() | (col('TargetHash') != col('RowHash')), 'New').when(col('TargetHash') == col('RowHash'), 'NoChange').otherwise('Update'))
# Drop the TargetHash column
SourceDf=SourceDf.drop('TargetHash')
SourceDf=SourceDf.filter(col("Flag") == "New")
SourceDf.display()



supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved,RowHash,IndCurrent,CreatedDate,ModifiedDate,storage_id,Flag
4000025,Nutmeg Nirvana,nutmeg,Asia,Banda Aceh,Peunayong,M,95.3198,5.5577,Y,9aeb18dc8f6e19a8354a0fdccf7d8d72716e85a5cf85024c9835fe38482cb8c6,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000012,Coffee Collective,coffee,South America,Medellin,El Poblado,XXL,-75.5638,6.2518,Y,4b60aa6f629df63cc6d694ea0213d6de5a03cd5c849243b953474ef81a2199c3,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000011,Oat Oasis,oats,Europe,Edinburgh,Stockbridge,L,-3.1883,55.9533,Y,6ef1ac8bb1bd9456c4817028fc2d038187bc3577e9a62a0f1386d949b127ba24,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000018,Raisin Ranch,raisins,Asia,Kabul,Chicken Street,XXL,69.1763,34.521,Y,bfaa5e74c298d71cf0d035a20644361602075b8b7a33f46fa9dbd3563206a152,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000023,Fennel Fields,fennel seeds,Europe,Florence,Santo Spirito,L,11.2558,43.7695,Y,632cc73f404913cb5d284f4bfb0c40bf4637be189e91085de194b404e6e92bb8,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000022,Poppy Peaks,poppy seeds,Europe,Krakow,Kazimierz,S,19.9368,50.0647,Y,29615452d067ea16096d2061788bbd51a75b01c975a86aebf5dfe2d21a2f274f,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000002,Almond Delights,almonds,Europe,Valencia,Ruzafa,L,-0.3762,39.4699,Y,2618ef1944d020b72ae94ce0343b054c4818855400650829c903c0a0bec5004c,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000014,Molasses Mills,molasses,Central America,Havana,Vedado,XL,-82.3665,23.1136,Y,fed099934e5785df6b1b7ffa7eb61a9cd02fefa6c13aac2da93a82348d68f361,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000005,Pecan Pleasures,pecans,North America,Atlanta,Virginia-Highland,S,-84.3888,33.749,Y,eb24f9d60f76123cd4744b9bff18c66b5c1babd2a3950faef75a353837c96dd0,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New
4000001,Coconut Grove,coconut,Asia,Manila,Intramuros,S,121.0221,14.6042,Y,dd8379700727ada5448616395cee167291078ba63a10ae62f20ea3708b19004a,1,2025-07-09T17:07:59.017Z,2025-07-09T17:07:59.017Z,28,New


In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit, col, udf
from pyspark.sql.types import StringType
import uuid

# Configuration
table_name = "workspace.target.sales2"
key_column = "supplierID"
hash_column = "RowHash"
is_current_column = "IndCurrent"
surrogate_key_column = "storage_id"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# Add new columns to source DataFrame
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType())
SourceDf = SourceDf \
    .withColumn(surrogate_key_column, uuid_udf()) \
    .withColumn(created_column, current_timestamp()) \
    .withColumn(is_current_column, lit(1))

# Use aliases properly
src = SourceDf.alias("src")
tgt = target_table.alias("tgt")

# Use column expressions (not strings) in merge condition
tgt.merge(
    source=src,
    condition=(
        (col(f"tgt.{key_column}") == col(f"src.{key_column}")) &
        (col(f"tgt.{is_current_column}") == lit(1))
    )
).whenMatchedUpdate(
    condition=col(f"tgt.{hash_column}") != col(f"src.{hash_column}"),
    set={
        is_current_column: lit(0)
    }
).whenNotMatchedInsertAll().execute()



[0;31m---------------------------------------------------------------------------[0m
[0;31mUnsupportedOperationException[0m             Traceback (most recent call last)
File [0;32m<command-6559142975587852>, line 40[0m
[1;32m     26[0m tgt [38;5;241m=[39m target_table[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mtgt[39m[38;5;124m"[39m)
[1;32m     28[0m [38;5;66;03m# Use column expressions (not strings) in merge condition[39;00m
[1;32m     29[0m tgt[38;5;241m.[39mmerge(
[1;32m     30[0m     source[38;5;241m=[39msrc,
[1;32m     31[0m     condition[38;5;241m=[39m(
[1;32m     32[0m         (col([38;5;124mf[39m[38;5;124m"[39m[38;5;124mtgt.[39m[38;5;132;01m{[39;00mkey_column[38;5;132;01m}[39;00m[38;5;124m"[39m) [38;5;241m==[39m col([38;5;124mf[39m[38;5;124m"[39m[38;5;124msrc.[39m[38;5;132;01m{[39;00mkey_column[38;5;132;01m}[39;00m[38;5;124m"[39m)) [38;5;241m&[39m
[1;32m     33[0m         (col([38;5;124mf[39m[38;5;124m"[39

In [0]:
SourceDf = SourceDf.drop('storage_id','Flag')
max_storage_id = spark.sql(f"select max(storage_id) as max_id from {table_name}").first()['max_id']
next_storage_id = 1 if not max_storage_id or max_storage_id == 0 else max_storage_id + 1

SourceDf = SourceDf.withColumn('storage_id', lit(next_storage_id))
SourceDf = SourceDf.withColumn('IndCurrent', lit(1))
SourceDf.write.format('delta').mode('append').saveAsTable(table_name)

