In [0]:
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import *

# **SCD - 1**

In [0]:
from delta.tables import DeltaTable

# Location of the Delta table used throughout this SCD-1 example.
users_delta_path = "/FileStore/delta/Users"

# Initial rows: (user, sales).
old_data = [("user1", 100), ("user2", 200)]
old_df = spark.createDataFrame(old_data, ["user", "sales"])

# First-time load: (re)create the Delta table from the initial rows.
old_df.write.format("delta").mode("overwrite").save(users_delta_path)

# Incoming rows: "user2" is already in the table, "user3" is not.
new_data = [("user3", 500), ("user2", 400)]
new_df = spark.createDataFrame(new_data, ["user", "sales"])

# A DeltaTable handle is required for merge(); a plain DataFrame reader cannot do it.
delta_tbl = DeltaTable.forPath(spark, users_delta_path)

# SCD-1 upsert: a matching user gets its sales overwritten in place (no history
# is kept), a user that is not in the table yet is inserted as a new row.
(
    delta_tbl.alias("target")
    .merge(new_df.alias("source"), "target.user = source.user")
    .whenMatchedUpdate(set={"sales": "source.sales"})
    .whenNotMatchedInsert(
        values={
            "user": "source.user",
            "sales": "source.sales",
        }
    )
    .execute()
)

# Read the table back to inspect the outcome of the merge.
result_df = spark.read.format("delta").load(users_delta_path)

result_df.display()

# **SCD - 2**

SCD - 2 
Steps: 
-         Step 1: Create a dataframe as the source and load it into a Delta table.
-         Step 2: In a real pipeline the table has 2 more columns, start date and end date, where start date is filled with the current date.
-                 Because we are working with dummy data, we add them while creating the dataframe instead of including them in the old data.
-         Step 3: Create another dataframe for the new set of data.
-         Step 4: Create a Delta table object to perform any advanced action on it:
>                                If the Delta table is saved at a file path, use DeltaTable.forPath()
>                                If the Delta table is saved as a SQL managed table, use DeltaTable.forName()
-         Step 5: Now apply the operations on it.

**DATA CHECK**

In [0]:
%sql
-- Data check: list every row currently stored in customer_table_scd2.
SELECT *
FROM customer_table_scd2

**CREATE DELTA TABLE FROM OLD DATASET**

In [0]:
# Initial customer snapshot: (cust_id, name, address).
old_data = [
    ("cust1", "Oin", "Kolkata"),
    ("cust2", "Kou", "Contai"),
]

# Build the frame and attach the SCD-2 tracking columns:
#   start_date -> today's date
#   end_date   -> null, typed as date
#   flag       -> 'Y'
# The merge cells look for target rows with flag = 'Y' and a null end_date.
old_df = (
    spark.createDataFrame(old_data, ["cust_id", "name", "address"])
    .withColumn("start_date", current_date())
    .withColumn("end_date", lit(None).cast("date"))
    .withColumn("flag", lit("Y"))
)

# First-time load: (re)create the managed Delta table from the snapshot.
(
    old_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customer_table_scd2")
)

**NEW INCOMING DATA**

In [0]:
# Incoming batch: (cust_id, name, address, update_dt as a yyyy-MM-dd string).
new_data = [
    ("cust1", "Oin", "Pune", "2025-05-25"),
    ("cust3", "Ree", "Bagnan", "2025-05-25"),
]

# update_dt arrives as a string; cast it to a real date column.
new_df = (
    spark.createDataFrame(new_data, ["cust_id", "name", "address", "update_dt"])
    .withColumn("update_dt", col("update_dt").cast("date"))
)

new_df.display()

FOR TESTING SCD FUNCTIONALITY, CREATE A DIFFERENT DATASET

In [0]:
# Second test batch: a single row for cust1 dated 2025-05-27.
# Note: this rebinds `new_data`; `new_df` from the previous cell is left untouched.
new_data = [
    ("cust1", "Oin", "Kolkata", "2025-05-27"),
]

# Same shape as new_df: update_dt is cast from string to date.
new_df1 = (
    spark.createDataFrame(new_data, ["cust_id", "name", "address", "update_dt"])
    .withColumn("update_dt", col("update_dt").cast("date"))
)

new_df1.display()

**PERFORM SCD - 2**

USING PYSPARK CODE TO UPDATE EXISTING RECORDS AND INSERT BRAND-NEW RECORDS

In [0]:
from delta.tables import DeltaTable

delta_table_scd2 = DeltaTable.forName(spark, "customer_table_scd2")

# A source row is paired only with the open version of the same customer
# (flag 'Y' and no end_date); closed versions never match.
merge_condition = "Source.cust_id == Target.cust_id and Target.flag = 'Y' and Target.end_date is null"

# Applied to a matched row whose address changed: close the open version.
# This merge does not insert the replacement version for that customer.
close_open_version = {
    "end_date": col("Source.update_dt").cast("date"),
    "flag": lit("N"),
}

# Applied to a source row with no open version in the target: insert it as
# a new open version starting at update_dt.
insert_open_version = {
    "cust_id": col("Source.cust_id"),
    "name": col("Source.name"),
    "address": col("Source.address"),
    "start_date": col("Source.update_dt"),
    "end_date": lit(None).cast("date"),
    "flag": lit("Y"),
}

(
    delta_table_scd2.alias("Target")
    .merge(new_df.alias("Source"), merge_condition)
    .whenMatchedUpdate(
        condition="Target.address != Source.address",
        set=close_open_version,
    )
    .whenNotMatchedInsert(values=insert_open_version)
    .execute()
)

USING SQL CODE TO UPDATE EXISTING RECORDS AND INSERT BRAND-NEW RECORDS

In [0]:
# Expose the incoming batch to the %sql cells under the view name `new_source`.
# Only one DataFrame can back the view at a time, so exactly one registration
# is kept. new_df1 (the test batch) is the one that was effectively in use;
# replace it with new_df here to run the SQL cells against the first batch.
new_df1.createOrReplaceTempView("new_source")

In [0]:
%sql
-- SCD-2 merge: pair each source row with the open version of the same customer
-- (flag = 'Y' and end_date is null).
merge into customer_table_scd2 as target
using new_source as source
on target.cust_id = source.cust_id and target.flag = 'Y' and isnull(target.end_date)

-- Close the open version only when the address actually changed. Without this
-- condition an unchanged source row would expire the current version, and no
-- replacement would be inserted because the follow-up insert only adds rows
-- whose address differs.
when matched and target.address <> source.address then
update
set 
target.end_date = source.update_dt,
target.flag = 'N'

-- Customers with no open version are inserted as a new open version.
when not matched then
insert (cust_id,name,address,start_date,end_date,flag)
values (source.cust_id,source.name,source.address,source.update_dt,null,'Y')

In [0]:
%sql
-- SCD-2 follow-up: insert the new open version for customers whose previous
-- version was closed by the merge.
insert into customer_table_scd2(cust_id,name,address,start_date,end_date,flag) 
select
  s.cust_id,
  s.name,
  s.address,
  s.update_dt,
  null as end_date,
  'Y' as flag
from new_source s
-- Semi-join instead of a plain join: a source row is emitted at most once,
-- however many closed versions the customer has. Only the version closed by
-- this batch qualifies (the merge sets end_date to the source update_dt).
where exists (
  select 1
  from customer_table_scd2 t
  where t.cust_id = s.cust_id
    and t.flag = 'N'
    and t.end_date = s.update_dt
    and t.address <> s.address
)
-- Skip customers that already have an open version, so re-running this cell
-- does not insert the same row again.
and not exists (
  select 1
  from customer_table_scd2 c
  where c.cust_id = s.cust_id
    and c.flag = 'Y'
)

In [0]:
%sql
-- List the recorded versions of customer_table_scd2 (one row per table version).
DESCRIBE HISTORY customer_table_scd2

In [0]:
%sql
-- Roll customer_table_scd2 back to an earlier table version.
-- NOTE(review): version 26 is hard-coded; confirm against the DESCRIBE HISTORY
-- output that this version exists and is the intended one before running.
RESTORE customer_table_scd2 TO VERSION AS OF 26