
### Scenario

A retail company maintains a product catalog in its data warehouse. Product details such as name, category, and price may change over time due to rebranding, category updates, or pricing adjustments. To preserve the historical data for accurate reporting and trend analysis, the company needs to implement a Slowly Changing Dimension (SCD) Type II mechanism in PySpark, ensuring old records are retained with effective date ranges while new versions are inserted as separate records.

In [0]:
%sql
-- Source table for the item catalog feed. Nothing here enforces uniqueness
-- on item_id, so the same item may appear in several rows.
CREATE TABLE IF NOT EXISTS pyspark_cata.source.items (
    item_id       INT,
    item_category STRING,
    item_brand    STRING,
    item_name     STRING,
    price         INT,
    updated_at    TIMESTAMP
)

In [0]:
%sql
-- Seed the source table with the initial catalog (ten items).
-- The column list is spelled out so the values stay aligned with the right
-- columns even if the table's column order changes later.
INSERT INTO pyspark_cata.source.items
    (item_id, item_category, item_brand, item_name, price, updated_at)
VALUES
    (1,  'phones',  'apple',     'iphone',   1000, current_timestamp()),
    (2,  'phones',  'samsung',   'galaxy',   1200, current_timestamp()),
    (3,  'phones',  'google',    'pixel',     800, current_timestamp()),
    (4,  'laptops', 'apple',     'macbook',  2000, current_timestamp()),
    (5,  'laptops', 'dell',      'inspiron', 1500, current_timestamp()),
    (6,  'laptops', 'hp',        'pavilion', 1800, current_timestamp()),
    (7,  'tablets', 'amazon',    'fire',      300, current_timestamp()),
    (8,  'tablets', 'google',    'pixel',     400, current_timestamp()),
    (9,  'tablets', 'apple',     'ipad',      600, current_timestamp()),
    (10, 'tablets', 'microsoft', 'surface',   500, current_timestamp())

num_affected_rows,num_inserted_rows
10,10


In [0]:
%sql
-- Show every row currently in the source table.
SELECT *
FROM pyspark_cata.source.items

item_id,item_category,item_brand,item_name,price,updated_at
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z
8,tablets,google,pixel,400,2025-12-19T16:03:54.170Z
9,tablets,apple,ipad,600,2025-12-19T16:03:54.170Z
10,tablets,microsoft,surface,500,2025-12-19T16:03:54.170Z


In [0]:
# First-run bootstrap: when DimItems does not exist yet, create it from the
# source rows, marking each one as the active version with an open-ended
# end_time. When the table already exists this cell does nothing.
dim_items_exists = spark.catalog.tableExists('pyspark_cata.source.DimItems')
if not dim_items_exists:
    spark.sql("""
              CREATE TABLE pyspark_cata.source.DimItems
              SELECT *,
                    current_timestamp() AS start_time,
                    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_time,
                    'Y' as is_active
              FROM pyspark_cata.source.items
              """)

In [0]:
%sql
-- Show every row currently in the DimItems dimension table.
SELECT *
FROM pyspark_cata.source.DimItems

item_id,item_category,item_brand,item_name,price,updated_at,start_time,end_time,is_active
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
8,tablets,google,pixel,400,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
9,tablets,apple,ipad,600,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
10,tablets,microsoft,surface,500,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Stage the source rows for the SCD Type 2 merge.
#   1. Keep only the newest row per item_id (highest updated_at).
#   2. Add the SCD bookkeeping columns (start_time, end_time, is_active).
#   3. Expose the result as the temp view `src` used by the MERGE cells.
df = spark.sql("""
               SELECT * FROM pyspark_cata.source.items
               """)

# Rank each item's rows newest-first and keep rank 1 only. The original code
# computed the rank and dropped it immediately without filtering, so no
# de-duplication took place; the filter has to run before the helper column
# is dropped.
newest_first = Window.partitionBy("item_id").orderBy(col("updated_at").desc())
df = (
    df.withColumn("dedup", row_number().over(newest_first))
    .filter(col("dedup") == 1)
    .drop("dedup")
)
df.createOrReplaceTempView("src_temp")

# Every staged row is a candidate "current" version: it starts now, has an
# open-ended end_time sentinel and is flagged active.
df = spark.sql("""
               SELECT *,
                    current_timestamp() AS start_time,
                    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_time,
                    'Y' as is_active
               FROM src_temp
              """)
df.createOrReplaceTempView("src")
display(df)

item_id,item_category,item_brand,item_name,price,updated_at,start_time,end_time,is_active
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
8,tablets,google,pixel,400,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
9,tablets,apple,ipad,600,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y
10,tablets,microsoft,surface,500,2025-12-19T16:03:54.170Z,2025-12-19T19:03:50.442Z,9999-12-31T23:59:59.000Z,Y


Merge 1


In [0]:
%sql
-- SCD2 step 1: close the active version of every item whose staged row
-- differs from it. Only rows flagged is_active = 'Y' take part in the match.
MERGE INTO pyspark_cata.source.DimItems AS trg
USING src AS src
ON trg.item_id = src.item_id
AND trg.is_active = 'Y'

-- <=> is null-safe equality: NOT (a <=> b) is TRUE when exactly one side is
-- NULL, whereas a <> b would evaluate to NULL and skip the update.
-- The change test is parenthesised so it reads as one condition.
WHEN MATCHED AND (
       NOT (src.item_category <=> trg.item_category)
    OR NOT (src.item_brand    <=> trg.item_brand)
    OR NOT (src.item_name     <=> trg.item_name)
    OR NOT (src.price         <=> trg.price)
    OR NOT (src.updated_at    <=> trg.updated_at)
)
THEN UPDATE SET
    trg.end_time = current_timestamp(),
    trg.is_active = 'N'

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
0,0,0,0


Merge 2

In [0]:
%sql

-- SCD2 step 2: insert a row for every staged item that has no active version
-- in the dimension. That covers item_ids not present at all and item_ids
-- whose only versions are flagged is_active = 'N'.
MERGE INTO pyspark_cata.source.DimItems AS dim
USING src AS stg
    ON  dim.item_id = stg.item_id
    AND dim.is_active = 'Y'
WHEN NOT MATCHED THEN
    INSERT *

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
0,0,0,0


In [0]:
%sql
-- Show every row currently in the DimItems dimension table.
SELECT *
FROM pyspark_cata.source.DimItems

item_id,item_category,item_brand,item_name,price,updated_at,start_time,end_time,is_active
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
8,tablets,google,pixel,400,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
9,tablets,apple,ipad,600,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
10,tablets,microsoft,surface,500,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y


Inserting New Records

In [0]:
%sql
-- Second batch: new prices for items 1-3 plus three brand-new items (11-13).
-- The column list is spelled out so the values stay aligned with the right
-- columns even if the table's column order changes later.
INSERT INTO pyspark_cata.source.items
    (item_id, item_category, item_brand, item_name, price, updated_at)
VALUES
    (1,  'phones', 'apple',   'iphone',       1100, current_timestamp()),
    (2,  'phones', 'samsung', 'galaxy',       1250, current_timestamp()),
    (3,  'phones', 'google',  'pixel',         700, current_timestamp()),
    (11, 'watch',  'apple',   'apple watch',   300, current_timestamp()),
    (12, 'watch',  'samsung', 'galaxy watch',  250, current_timestamp()),
    (13, 'watch',  'google',  'pixel watch',   100, current_timestamp())

num_affected_rows,num_inserted_rows
6,6


In [0]:
%sql
-- Show every row currently in the source table.
SELECT *
FROM pyspark_cata.source.items

item_id,item_category,item_brand,item_name,price,updated_at
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z
8,tablets,google,pixel,400,2025-12-19T16:03:54.170Z
9,tablets,apple,ipad,600,2025-12-19T16:03:54.170Z
10,tablets,microsoft,surface,500,2025-12-19T16:03:54.170Z


In [0]:
# SCD Type 2 load of DimItems from the items source table.
#
# The newest row per item_id is staged in the temp view `src` together with
# the SCD bookkeeping columns. If DimItems already exists, two MERGE
# statements close changed versions and insert their replacements (plus any
# brand-new items); otherwise DimItems is created from the staged rows.
df = spark.sql("""
            SELECT * FROM pyspark_cata.source.items
            """)

# Rank each item's rows newest-first and keep rank 1 only. The filter must
# run BEFORE the helper column is dropped: the original dropped `dedup` and
# then filtered on it, which only works if the engine re-resolves the
# dropped column.
newest_first = Window.partitionBy("item_id").orderBy(col("updated_at").desc())
df = (
    df.withColumn("dedup", row_number().over(newest_first))
    .filter(col("dedup") == 1)
    .drop("dedup")
)
df.createOrReplaceTempView("src_temp")

# Every staged row is a candidate "current" version: it starts now, has an
# open-ended end_time sentinel and is flagged active.
df = spark.sql("""
            SELECT *,
                    current_timestamp() AS start_time,
                    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_time,
                    'Y' as is_active
            FROM src_temp
            """)
df.createOrReplaceTempView("src")

if spark.catalog.tableExists('pyspark_cata.source.DimItems'):
    # Step 1: close the active version of every item whose staged row differs.
    # <=> is null-safe equality, so a change to or from NULL is detected too
    # (a plain <> would evaluate to NULL and skip the update).
    # NOTE(review): the two MERGE statements take separate current_timestamp()
    # values, so the closed version's end_time is slightly earlier than the
    # new version's start_time; confirm whether that gap is acceptable.
    spark.sql("""
            MERGE INTO pyspark_cata.source.DimItems AS trg
            USING src AS src
            ON trg.item_id = src.item_id
            AND trg.is_active = 'Y'

            WHEN MATCHED AND (
                   NOT (src.item_category <=> trg.item_category)
                OR NOT (src.item_brand    <=> trg.item_brand)
                OR NOT (src.item_name     <=> trg.item_name)
                OR NOT (src.price         <=> trg.price)
                OR NOT (src.updated_at    <=> trg.updated_at)
            )
            THEN UPDATE SET
                trg.end_time = current_timestamp(),
                trg.is_active = 'N'
            """)
    # Step 2: items closed in step 1 no longer have an active row, so they
    # fall into NOT MATCHED here together with item_ids never seen before.
    spark.sql("""
            MERGE INTO pyspark_cata.source.DimItems AS trg
            USING src AS src
            ON trg.item_id = src.item_id
            AND trg.is_active = 'Y'

            WHEN NOT MATCHED
            THEN INSERT *
            """)
else:
    # First run: build the dimension from the staged (de-duplicated) rows so
    # that each item_id gets exactly one active version.
    spark.sql("""
            CREATE TABLE pyspark_cata.source.DimItems
            AS SELECT * FROM src
            """)

In [0]:
%sql
-- Show every version of every item. item_id alone is not unique once an item
-- has history, so start_time is added as a tie-breaker to list each item's
-- versions in chronological order.
SELECT *
FROM pyspark_cata.source.DimItems
ORDER BY item_id, start_time

item_id,item_category,item_brand,item_name,price,updated_at,start_time,end_time,is_active
1,phones,apple,iphone,1000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,2025-12-19T19:15:28.074Z,N
1,phones,apple,iphone,1100,2025-12-19T19:08:00.025Z,2025-12-19T19:15:34.190Z,9999-12-31T23:59:59.000Z,Y
2,phones,samsung,galaxy,1200,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,2025-12-19T19:15:28.074Z,N
2,phones,samsung,galaxy,1250,2025-12-19T19:08:00.025Z,2025-12-19T19:15:34.190Z,9999-12-31T23:59:59.000Z,Y
3,phones,google,pixel,700,2025-12-19T19:08:00.025Z,2025-12-19T19:15:34.190Z,9999-12-31T23:59:59.000Z,Y
3,phones,google,pixel,800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,2025-12-19T19:15:28.074Z,N
4,laptops,apple,macbook,2000,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
5,laptops,dell,inspiron,1500,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
6,laptops,hp,pavilion,1800,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
7,tablets,amazon,fire,300,2025-12-19T16:03:54.170Z,2025-12-19T16:08:12.443Z,9999-12-31T23:59:59.000Z,Y
