In [0]:
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.scd_implementation_source")
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.scd_implementation_target")

DataFrame[]

In [0]:
customer_data = spark.sql("select * from samples.tpch.customer")
customer_data.write.mode("overwrite").saveAsTable(
    "workspace.scd_implementation_source.customer_data"
)

In [0]:
source = spark.read.table("workspace.scd_implementation_source.customer_data")

In [0]:
from pyspark.sql.functions import col

# Filter the DataFrame to show only rows where 'franchiseID' is '3000001'
# Display the filtered DataFrame for inspection
source.filter(col("c_custkey") == "412450").display()

c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment
412450,Customer#000412450,fUD6IoGdtF,20,30-293-696-5047,4406.28,BUILDING,refully final dolphins after the carefully bold packages sleep quickly express deposits. fluffily


In [0]:
from pyspark.sql.functions import col, when

# Update the 'city' column in SourceDf:
# For rows where 'franchiseID' equals '3000001', set the 'city' value to 'Tokyo Modified'.
# For all other rows, retain the original 'city' value.
source = source.withColumn(
    "customer_priority",
    when(col("c_mktsegment") == "BUILDING", "Priority Customer").otherwise(
        "Not Priority"
    ),
)

# Display rows where 'franchiseID' is '3000001' to verify the 'city' column update.
source.filter(col("customer_priority") == "Priority Customer").display()

In [0]:
source = source.drop("customer_priority")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

window_spec = Window.orderBy(F.monotonically_increasing_id())
source = source.withColumn("storage_id", F.row_number().over(window_spec))

first_cols = ["storage_id"]
other_cols = [col for col in source.columns if col not in first_cols]
source = source.select(first_cols + other_cols)

display(source)

In [0]:
from pyspark.sql import functions as F

# Load Data From Source and concatenate all columns into 'ConCatValue'
source = source.withColumn('RowHash', F.sha2(F.concat_ws('', *source.columns), 256))




In [0]:
source = source.withColumn("IndCurrent", F.lit(1)) \
    .withColumn("CreatedDate", F.current_timestamp()) \
    .withColumn("ModifiedDate", F.current_timestamp())

display(source)



storage_id,c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment,RowHash,IndCurrent,CreatedDate,ModifiedDate
1,412445,Customer#000412445,"0QAB3OjYnbP6mA0B,kgf",21,31-421-403-4333,5358.33,BUILDING,arefully blithely regular epi,5ce2bee572c3f0f172439bfe3b0e93b08dd0bc6e09584cfc57f77171e5f59914,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
2,412446,Customer#000412446,"5u8MSbyiC7J,7PuY4Ivaq1JRbTCMKeNVqg",20,30-487-949-7942,9441.59,MACHINERY,"sleep according to the fluffily even forges. fluffily careful packages after the ironic, silent deposi",ea653dab01e597f6c8b2845d0cf146074f7315d8c4f78b7d1bd881ddda666425,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
3,412447,Customer#000412447,HC4ZT62gKPgrjr ceoaZgFOunlUogr7GO,7,17-797-466-6308,7868.75,AUTOMOBILE,aggle blithely among the carefully express excus,e4130b027c7a967dc399f7405d71c3de47a52b4534d327d73a60ebd85fc4311c,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
4,412448,Customer#000412448,hJok1MMrDgH,6,16-541-510-4964,6060.98,MACHINERY,ly silent requests boost slyly. express courts sleep according to the fluf,49b2e6bdf2fa497c2a2b4c6bd2705c8d5c8a34c34f7c3d5bf1632e56ee517c98,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
5,412449,Customer#000412449,"zAt1nZNG01gOhIqgyDtDa S,Y0VSofZJs1dd",14,24-710-983-5536,4973.84,HOUSEHOLD,"refully final theodolites. final, slow excuses sleep quickly! quickly ironic idea",b29de27bb1062bc2f91ea4ab48d31486258fd07e63a1c2756c45507a6e4c8071,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
6,412450,Customer#000412450,fUD6IoGdtF,20,30-293-696-5047,4406.28,BUILDING,refully final dolphins after the carefully bold packages sleep quickly express deposits. fluffily,ee19bf79c40ef105f2f0806ca32b936f6291daa08c59132a48fb1c50bf82bbd3,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
7,412451,Customer#000412451,W2Ge0Qd8adH,20,30-590-724-6711,2290.38,BUILDING,slow asymptotes will are carefully final packages. slyly regular fox,02be2b3720bfb904a3e2e81840dc3df4a3486b7e5bf8a2eb5a6c009835d55d5f,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
8,412452,Customer#000412452,Ij4xiPIeNEP1uR5p7H,10,20-492-590-3363,3426.64,AUTOMOBILE,sleep slyly after the sometimes even ideas. slyly express theodolites dazzle furiously ironic dependenci,eb2cbf824cdee971715eb13fe43f3c0dba3d54c70ee9b977df0a71e538969302,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
9,412453,Customer#000412453,4DmSxDPMmfidKQB3W50FIzkjZESEW3LPgLBuQbic,21,31-480-724-9665,4592.14,MACHINERY,"against the slyly regular requests-- pending, pending accounts boost quic",ea4a30201ad964d9671406453a6e82464b17a0ddecb62e84a16c25a930d1fb68,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z
10,412454,Customer#000412454,ZQfKDMUyEfn,9,19-898-261-2669,2035.91,FURNITURE,quickly. blithely special theodolites about the excus,f30f5cf881ba903349d423c0fbd7ee4e74d45bace070f56e0dc7e827ed774957,1,2025-07-09T15:08:19.697Z,2025-07-09T15:08:19.697Z


In [0]:
%sql
DESCRIBE TABLE workspace.scd_implementation_source.customer_data

col_name,data_type,comment
c_custkey,bigint,
c_name,string,
c_address,string,
c_nationkey,bigint,
c_phone,string,
c_acctbal,"decimal(18,2)",
c_mktsegment,string,
c_comment,string,


In [0]:
source.write.mode("append").saveAsTable(
    "workspace.scd_implementation_target.customer_data"
)



In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, col

# Static configuration
table_name = "workspace.scd_implementation_target.customer_data"
key_column = "c_custkey"
timestamp_column = "ModifiedDate"
hash_column = "RowHash"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# Aliases
src = source.alias("src")
tgt = target_table.alias("tgt")

# Columns to update (exclude key, timestamp, and created date)
columns_to_update = [
    col_name
    for col_name in source.columns
    if col_name not in [key_column, timestamp_column, created_column]
]

# Construct SET dictionary for update
set_dict = {col_name: col(f"src.{col_name}") for col_name in columns_to_update}
set_dict[timestamp_column] = current_timestamp()  # Add ModifiedDate explicitly

# Perform SCD Type 1 MERGE
tgt.merge(src, f"tgt.{key_column} = src.{key_column}").whenMatchedUpdate(
    condition=col(f"src.{hash_column}") != col(f"tgt.{hash_column}"), set=set_dict
).whenNotMatchedInsertAll().execute()



DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]