In [0]:
# Fully qualified (catalog.schema.table) names of the two tables this notebook
# works with: the table changes are read from, and the table they are merged into.
_CATALOG = 'workspace'
SourceTable = f'{_CATALOG}.de_practice_source.sales'
TargetTable = f'{_CATALOG}.de_practice_target.sales'

In [0]:
# Load both tables as DataFrames in one pass: SourceDf holds the incoming rows,
# TargetDf holds the current contents of the table that will be merged into.
SourceDf, TargetDf = (
    spark.read.table(table_name) for table_name in (SourceTable, TargetTable)
)

In [0]:
from pyspark.sql.functions import col

# Baseline check before the source data is modified: pull out the rows that
# belong to franchise 3000001 and render them so the current 'city' is visible.
franchise_rows_before = SourceDf.filter(col("franchiseID") == "3000001")
franchise_rows_before.display()

# Observed at this point: 'city' is 'Tokyo' for franchiseID 3000001.

In [0]:
from pyspark.sql.functions import col, when

# Simulate a change arriving from the source system:
# rows whose 'franchiseID' is '3000001' get 'city' overwritten with 'Delhi';
# every other row keeps the 'city' value it already had.
is_changed_franchise = col("franchiseID") == "3000001"
city_after_change = when(is_changed_franchise, "Delhi").otherwise(col("city"))

SourceDf = SourceDf.withColumn("city", city_after_change)

# Show the affected rows so the overwrite can be verified:
# 'city' should now read 'Delhi' for franchiseID 3000001.
SourceDf.filter(is_changed_franchise).display()

In [0]:
# Build 'RowHash', a fingerprint of each row's business columns, used later to
# detect whether a row differs between source and target.
from pyspark.sql import functions as F

# Columns this notebook derives itself. They are left out of the hash input so
# that re-running this cell after the audit columns were added does not fold
# 'RowHash' or the (always changing) timestamps into the fingerprint.
_DERIVED_COLUMNS = {"RowHash", "IndCurrent", "CreatedDate", "ModifiedDate"}
hash_input_columns = [
    column_name for column_name in SourceDf.columns
    if column_name not in _DERIVED_COLUMNS
]

# Why not a plain concat_ws('', ...):
#   * with an empty separator ("ab", "c") and ("a", "bc") produce the same string;
#   * concat_ws silently skips NULLs, so a NULL moving between columns is invisible.
# Each value is therefore cast to string, NULLs are replaced by an explicit
# marker, values are joined with a delimiter, and the result is hashed with
# SHA-256 so 'RowHash' is a fixed-width hex string.
# NOTE(review): values differ from the previous plain concatenation, so the first
# merge after this change will see every matched row as changed and rewrite it
# once; any other job comparing against 'RowHash' must use the same formula.
SourceDf = SourceDf.withColumn(
    "RowHash",
    F.sha2(
        F.concat_ws(
            "||",
            *[
                F.coalesce(F.col(column_name).cast("string"), F.lit("<NULL>"))
                for column_name in hash_input_columns
            ],
        ),
        256,
    ),
)


In [0]:
# Attach the audit columns to SourceDf, in this order:
#   IndCurrent   - constant 1 on every row (flags the row as the current record)
#   CreatedDate  - current timestamp (when the record was created)
#   ModifiedDate - current timestamp (when the record was last modified)
load_timestamp = F.current_timestamp()
audit_expressions = {
    "IndCurrent": F.lit(1),
    "CreatedDate": load_timestamp,
    "ModifiedDate": load_timestamp,
}
for audit_name, audit_expr in audit_expressions.items():
    SourceDf = SourceDf.withColumn(audit_name, audit_expr)

In [0]:
# Show the franchise 3000001 rows again, now including the RowHash and audit columns.
franchise_rows_enriched = SourceDf.filter(col("franchiseID") == "3000001")
franchise_rows_enriched.display()

## 📘 What is SCD Type 1?

**SCD (Slowly Changing Dimension) Type 1** is a technique used in data warehousing to handle changes in dimension data **without preserving historical records**.

---

### 🔹 Scenario Example:

- **Primary Key:** `franchiseID`
- **Changed Attribute:** `city`

If a record with `franchiseID = 3000001` originally had `city = 'Tokyo'` and the city is later changed to `'Delhi'`, **SCD Type 1** simply **overwrites** the old value:

| franchiseID | city  |
|-------------|-------|
| 3000001     | Delhi |

> The original value `'Tokyo'` is lost — **no history is maintained**.

---

### 🔹 Key Characteristics:
- Updates data **in place**.
- Updates the `ModifiedDate` column whenever a row is overwritten.
- **No audit trail** or history of prior values.
- Ensures the table always shows the **most recent information**.

---

### 📄 Supporting Document:
- [Microsoft Docs – Slowly Changing Dimensions (SCD)](https://learn.microsoft.com/en-us/azure/data-factory/tutorial-incremental-copy-overview#slowly-changing-data)


In [0]:
# Snapshot of the target table for one franchiseID *before* the SCD Type 1 merge,
# so the effect of the merge can be compared against it afterwards.
pre_merge_query = "select * from workspace.de_practice_target.sales where franchiseID='3000001'"
pre_merge_rows = spark.sql(pre_merge_query)
display(pre_merge_rows)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, col

# SCD Type 1 upsert of SourceDf into the target Delta table:
#   * key matches and the row hash differs -> overwrite the target row in place
#     and stamp ModifiedDate with the current timestamp;
#   * key has no match in the target       -> insert the source row as-is;
#   * key matches and the hash is equal    -> leave the target row untouched.

# Static configuration
table_name = "workspace.de_practice_target.sales"
key_column = "franchiseID"
timestamp_column = "ModifiedDate"
hash_column = "RowHash"
created_column = "CreatedDate"

# Reference Delta table
target_table = DeltaTable.forName(spark, table_name)

# Aliases used to qualify column names inside the merge expressions
src = SourceDf.alias("src")
tgt = target_table.alias("tgt")

# Columns copied from source on update. The key never changes on a match,
# CreatedDate must keep the value from the original insert, and ModifiedDate
# is set explicitly below rather than copied from the source row.
columns_to_update = [
    col_name for col_name in SourceDf.columns
    if col_name not in [key_column, timestamp_column, created_column]
]

# Construct SET dictionary for update
set_dict = {col_name: col(f"src.{col_name}") for col_name in columns_to_update}
set_dict[timestamp_column] = current_timestamp()  # Add ModifiedDate explicitly

# Null-safe "hash differs" test. A plain `!=` evaluates to NULL when either side
# is NULL (for example a target row whose RowHash was never populated), and a
# NULL condition means the row is silently skipped instead of updated.
row_changed = ~col(f"src.{hash_column}").eqNullSafe(col(f"tgt.{hash_column}"))

# NOTE(review): Delta MERGE raises an error when several source rows match the
# same target row - confirm that franchiseID is unique within SourceDf.

# Perform SCD Type 1 MERGE
(
    tgt.merge(src, f"tgt.{key_column} = src.{key_column}")
    .whenMatchedUpdate(condition=row_changed, set=set_dict)
    .whenNotMatchedInsertAll()
    .execute()
)


In [0]:
# Same franchiseID as the pre-merge snapshot, queried again *after* the SCD Type 1
# merge so the overwritten values in the target table can be inspected.
post_merge_query = "select * from workspace.de_practice_target.sales where franchiseID='3000001'"
post_merge_rows = spark.sql(post_merge_query)
display(post_merge_rows)