In [0]:
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.scd_implementation_source")
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.scd_implementation_target")

DataFrame[]

In [0]:
customer_data = spark.sql("select * from samples.tpch.customer")
customer_data.write.mode("overwrite").saveAsTable(
    "workspace.scd_implementation_source.customer_data"
)

In [0]:
source = spark.read.table("workspace.scd_implementation_source.customer_data")

In [0]:
from pyspark.sql import functions as F

# Load Data From Source and concatenate all columns into 'ConCatValue'
source = source.withColumn("ConCatValue", F.concat_ws("", *source.columns))

display(source)

In [0]:
# Add IndCurrent, CreatedDate, and ModifiedDate columns
source = source.withColumn("IndCurrent", F.lit(1)).withColumn(
    "CreatedDate", F.current_timestamp()
)

In [0]:
%sql
DESCRIBE TABLE workspace.scd_implementation_source.customer_data

In [0]:
from pyspark.sql.window import Window

window_spec = Window.orderBy(F.monotonically_increasing_id())
source = source.withColumn("storage_id", F.row_number().over(window_spec))

first_cols = ["storage_id"]
other_cols = [col for col in source.columns if col not in first_cols]
source = source.select(first_cols + other_cols)

display(source)

In [0]:
# Generate SHA-256 hash of concatenated column values and drop 'ConCatValue'
source = source.withColumn("RowHash", F.sha2(F.col("ConCatValue"), 256)).drop(
    "ConCatValue"
)
display(source)

In [0]:
source.write.mode("append").saveAsTable(
    "workspace.scd_implementation_target.customer_data"
)



In [0]:
SourceDf = spark.read.table("workspace.scd_implementation_source.customer_data")
TargetDf = spark.read.table("workspace.scd_implementation_target.customer_data")

In [0]:
from pyspark.sql.functions import col

# Filter the DataFrame to show only rows where 'franchiseID' is '3000001'
# Display the filtered DataFrame for inspection
SourceDf.filter(col("c_custkey") == "412450").display()

In [0]:
from pyspark.sql.functions import col, when

# Update the 'city' column in SourceDf:
# For rows where 'franchiseID' equals '3000001', set the 'city' value to 'Tokyo Modified'.
# For all other rows, retain the original 'city' value.
SourceDf = SourceDf.withColumn(
    "customer_priority",
    when(col("c_mktsegment") == "BUILDING", "Priority Customer").otherwise(
        "Not Priority"
    ),
)

# Display rows where 'franchiseID' is '3000001' to verify the 'city' column update.
SourceDf.filter(col("customer_priority") == "Priority Customer").display()

In [0]:
# Concatenate all columns in 'source' DataFrame into 'RowHash'
SourceDf = SourceDf.withColumn("RowHash", F.concat_ws("", *SourceDf.columns))
SourceDf.display()

In [0]:
SourceDf = (
    SourceDf.withColumn("IndCurrent", F.lit(1))
    .withColumn("CreatedDate", F.current_timestamp())
    .withColumn("ModifiedDate", F.current_timestamp())
)
SourceDf.display()

In [0]:
SourceDf = SourceDf.drop("customer_priority")

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, col

# Static configuration
table_name = "workspace.scd_implementation_target.customer_data"
key_column = "c_custkey"
timestamp_column = "ModifiedDate"
hash_column = "RowHash"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# Aliases
src = SourceDf.alias("src")
tgt = target_table.alias("tgt")

# Columns to update (exclude key, timestamp, and created date)
columns_to_update = [
    col_name
    for col_name in SourceDf.columns
    if col_name not in [key_column, timestamp_column, created_column]
]

# Construct SET dictionary for update
set_dict = {col_name: col(f"src.{col_name}") for col_name in columns_to_update}
set_dict[timestamp_column] = current_timestamp()  # Add ModifiedDate explicitly

# Perform SCD Type 1 MERGE
tgt.merge(src, f"tgt.{key_column} = src.{key_column}").whenMatchedUpdate(
    condition=col(f"src.{hash_column}") != col(f"tgt.{hash_column}"), set=set_dict
)

<delta.connect.tables.DeltaMergeBuilder at 0xffae64fdd790>

In [0]:
SourceTable='workspace.scd_implementation_source.customer_data'
TargetTable='workspace.scd_implementation_target.customer_data'
SourceDf = spark.read.table("workspace.scd_implementation_source.customer_data")
TargetDf = spark.read.table("workspace.scd_implementation_target.customer_data")

In [0]:
#join with Target Table and create Flag
TargetDf=spark.read.table(TargetTable).select(['c_custkey','RowHash','storage_id']).withColumnRenamed('RowHash','TargetHash')
SourceDf=SourceDf.join(TargetDf, on =['c_custkey'], how='left').withColumn('Flag', F.when(col('TargetHash').isNull() | (col('TargetHash') != col('RowHash')), 'New').when(col('TargetHash') == col('RowHash'), 'NoChange').otherwise('Update'))
# Drop the TargetHash column
SourceDf=SourceDf.drop('TargetHash')
SourceDf=SourceDf.filter(col("Flag") == "New")


In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit, col, udf
from pyspark.sql.types import StringType
from pyspark.sql.window import Window
import uuid

# Configuration
table_name = "workspace.scd_implementation_target.customer_data"
key_column = "c_custkey"
hash_column = "RowHash"
is_current_column = "IndCurrent"
surrogate_key_column = "storage_id"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# Add new columns to source DataFrame
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType())
SourceDf = SourceDf \
    .withColumn(surrogate_key_column, uuid_udf()) \
    .withColumn(created_column, current_timestamp()) \
    .withColumn(is_current_column, lit(1))

# Use aliases properly
src = SourceDf.alias("src")
tgt = target_table.alias("tgt")

# Use column expressions (not strings) in merge condition
tgt.merge(
    source=src,
    condition=(
        (col(f"tgt.{key_column}") == col(f"src.{key_column}")) &
        (col(f"tgt.{is_current_column}") == lit(1))
    )
).whenMatchedUpdate(
    condition=col(f"tgt.{hash_column}") != col(f"src.{hash_column}"),
    set={
        is_current_column: lit(0)
    }
)

<delta.connect.tables.DeltaMergeBuilder at 0xffae642f9610>