In [187]:
from pyspark.sql.functions import * 
from pyspark.sql.window import Window
from pyspark.sql import SparkSession

In [188]:
spark = SparkSession.builder \
        .appName("SCD Implementation application") \
        .config("spark.sql.shuffle.partitions",3) \
        .master("local[2]") \
        .getOrCreate()

In [189]:
DATE_FORMAT = "yyyy-MM-dd"
future_date = "9999-12-31"
source_url = "/user/itv005857/scd_demo/source"
destination_url = "/user/itv005857/scd_demo/target"
primary_key = ["customerid"]
slowly_changing_cols = [ "email","phone","address", "city", "state", "zipcode"]
implementation_cols = ["effective_date","end_date","active_flag"]

In [190]:
customers_source_schema = "customerid long,firstname string, lastname string, email string, phone string, address string, city string, state string, zipcode long"

In [191]:
customers_target_schema = "customerid long,firstname string, lastname string, email string, phone string, address string, city string, state string, zipcode long, customer_skey long, effective_date date, end_date date, active_flag boolean"

In [192]:
customers_source_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(customers_source_schema) \
.load(source_url)

In [193]:
customers_source_df.show()

+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|customerid|  firstname|lastname|               email|   phone|      address|      city|state|zipcode|
+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|         1|       John|     Doe|   johndoe@gmail.com|555-1234|  123 Main St|   Anytown|   CA|  12345|
|         2|       Jane|   Smith| janesmith@email.com|555-5679|  456 Oak Ave|  Sometown|   NY|  67890|
|         3|     Robert| Johnson|robertjohnson@ema...|555-8765|   123 Elm Ln|Harborcity|   FL|  87654|
|         4|      Alice|Williams|alicewilliams@ema...|555-4321| 234 Cedar Dr|  Yourtown|   FL|  89012|
|         5|    Michael|   Brown|michaelbrown@emai...|555-9876| 567 Elm Blvd| Theirtown|   IL|  45678|
|         6|      Emily|  Miller|emilymiller@email...|555-6543| 890 Birch Rd|   Newcity|   WA|  23456|
|         7|      David|   Jones|davidjones@email.com|555-2345|678 Maple 

In [195]:
window_def = Window.orderBy("customerid")

In [196]:
enhanced_customers_source_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(customers_source_schema) \
.load(source_url) \
.withColumn("customer_skey",row_number().over(window_def)) \
.withColumn("effective_date",date_format(current_date(), DATE_FORMAT)) \
.withColumn("end_date",date_format(lit(future_date), DATE_FORMAT)) \
.withColumn("active_flag", lit(True))

In [168]:
enhanced_customers_source_df.show()

+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+-------------+--------------+----------+-----------+
|customerid|  firstname|lastname|               email|   phone|      address|      city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+-------------+--------------+----------+-----------+
|         1|       John|     Doe|   johndoe@gmail.com|555-1234|  123 Main St|   Anytown|   CA|  12345|            1|    2024-02-13|9999-12-31|       true|
|         2|       Jane|   Smith| janesmith@email.com|555-5679|  456 Oak Ave|  Sometown|   NY|  67890|            2|    2024-02-13|9999-12-31|       true|
|         3|     Robert| Johnson|robertjohnson@ema...|555-8765|   123 Elm Ln|Harborcity|   FL|  87654|            3|    2024-02-13|9999-12-31|       true|
|         4|      Alice|Williams|alicewilliams@ema...|555-4321| 234 Ce

In [128]:
enhanced_customers_source_df.write.mode('overwrite') \
.option("header",True) \
.option("delimiter",",") \
.csv(destination_url)

In [197]:
customers_target_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(customers_target_schema) \
.load(destination_url)

In [198]:
customers_target_df.show()

+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|customerid|  firstname|lastname|               email|   phone|      address|     city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|         1|       John|     Doe|   johndoe@email.com|555-1234|  123 Main St|  Anytown|   CA|  12345|            1|    2024-02-13|9999-12-31|       true|
|         2|       Jane|   Smith| janesmith@email.com|555-5678|  456 Oak Ave| Sometown|   NY|  67890|            2|    2024-02-13|9999-12-31|       true|
|         3|     Robert| Johnson|robertjohnson@ema...|555-8765|  789 Pine Ln|Othercity|   TX|  34567|            3|    2024-02-13|9999-12-31|       true|
|         4|      Alice|Williams|alicewilliams@ema...|555-4321| 234 Cedar Dr

In [199]:
customers_source_df.show()

+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|customerid|  firstname|lastname|               email|   phone|      address|      city|state|zipcode|
+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|         1|       John|     Doe|   johndoe@gmail.com|555-1234|  123 Main St|   Anytown|   CA|  12345|
|         2|       Jane|   Smith| janesmith@email.com|555-5679|  456 Oak Ave|  Sometown|   NY|  67890|
|         3|     Robert| Johnson|robertjohnson@ema...|555-8765|   123 Elm Ln|Harborcity|   FL|  87654|
|         4|      Alice|Williams|alicewilliams@ema...|555-4321| 234 Cedar Dr|  Yourtown|   FL|  89012|
|         5|    Michael|   Brown|michaelbrown@emai...|555-9876| 567 Elm Blvd| Theirtown|   IL|  45678|
|         6|      Emily|  Miller|emilymiller@email...|555-6543| 890 Birch Rd|   Newcity|   WA|  23456|
|         7|      David|   Jones|davidjones@email.com|555-2345|678 Maple 

In [200]:
max_sk = customers_target_df.agg({"customer_skey": "max"}).collect()[0][0]

In [201]:
print(max_sk)

10


In [202]:
active_customers_target_df = customers_target_df.where(col("active_flag")==True)

In [203]:
inactive_customers_target_df = customers_target_df.where(col("active_flag")==False)

In [204]:
active_customers_target_df.show()

+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|customerid|  firstname|lastname|               email|   phone|      address|     city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|         1|       John|     Doe|   johndoe@email.com|555-1234|  123 Main St|  Anytown|   CA|  12345|            1|    2024-02-13|9999-12-31|       true|
|         2|       Jane|   Smith| janesmith@email.com|555-5678|  456 Oak Ave| Sometown|   NY|  67890|            2|    2024-02-13|9999-12-31|       true|
|         3|     Robert| Johnson|robertjohnson@ema...|555-8765|  789 Pine Ln|Othercity|   TX|  34567|            3|    2024-02-13|9999-12-31|       true|
|         4|      Alice|Williams|alicewilliams@ema...|555-4321| 234 Cedar Dr

In [205]:
active_customers_target_df.join(customers_source_df, "customerid" , "full_outer").show()

+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|customerid|  firstname|lastname|               email|   phone|      address|     city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|  firstname|lastname|               email|   phone|      address|      city|state|zipcode|
+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+
|         7|      David|   Jones|davidjones@email.com|555-2345|678 Maple Ave| Yourcity|   GA|  78901|            7|    2024-02-13|9999-12-31|       true|      David|   Jones|davidjones@email.com|555-2345|678 Maple Ave|  Yourcity|   GA|  78901|
|         6|      Emily|

In [206]:
def column_renamer(df, suffix, append):
   
    if append:
        new_column_names = list(map(lambda x: x+suffix, df.columns))
        
    else:
        new_column_names = list(map(lambda x: x.replace(suffix,""), df.columns))
        
    return df.toDF(*new_column_names)

def get_hash(df, keys_list):
 
    columns = [col(column) for column in keys_list]
    
    if columns:
        return df.withColumn("hash_md5", md5(concat_ws("", *columns)))
    else:
        return df.withColumn("hash_md5", md5(lit(1)))


In [207]:
active_customers_target_df_hash = column_renamer(get_hash(active_customers_target_df, slowly_changing_cols), suffix="_target", append=True)
customers_source_df_hash = column_renamer(get_hash(customers_source_df, slowly_changing_cols), suffix="_source", append=True)


In [208]:
active_customers_target_df_hash.show()

+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+---------------------+---------------+------------------+--------------------+
|customerid_target|firstname_target|lastname_target|        email_target|phone_target|address_target|city_target|state_target|zipcode_target|customer_skey_target|effective_date_target|end_date_target|active_flag_target|     hash_md5_target|
+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+---------------------+---------------+------------------+--------------------+
|                1|            John|            Doe|   johndoe@email.com|    555-1234|   123 Main St|    Anytown|          CA|         12345|                   1|           2024-02-13|     9999-12-31|              true|31ddb0d8a5baa88cc...|
|                2|            Jane|

In [209]:
customers_source_df_hash.show()

+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+
|customerid_source|firstname_source|lastname_source|        email_source|phone_source|address_source|city_source|state_source|zipcode_source|     hash_md5_source|
+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+
|                1|            John|            Doe|   johndoe@gmail.com|    555-1234|   123 Main St|    Anytown|          CA|         12345|8db85f2fdc7c5c1e4...|
|                2|            Jane|          Smith| janesmith@email.com|    555-5679|   456 Oak Ave|   Sometown|          NY|         67890|4b77ad952717438be...|
|                3|          Robert|        Johnson|robertjohnson@ema...|    555-8765|    123 Elm Ln| Harborcity|          FL|         87654|e194f30cc658c18bf...|
|                4|   

In [210]:
merged_df = active_customers_target_df_hash.join(customers_source_df_hash, col("customerid_source") ==  col("customerid_target") , "full_outer") \
.withColumn("Action", when(col("hash_md5_source") == col("hash_md5_target")  , 'NOCHANGE')\
.when(col("customerid_source").isNull(), 'DELETE')\
.when(col("customerid_target").isNull(), 'INSERT')\
.otherwise('UPDATE'))

In [211]:
merged_df.show()

+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+---------------------+---------------+------------------+--------------------+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+--------+
|customerid_target|firstname_target|lastname_target|        email_target|phone_target|address_target|city_target|state_target|zipcode_target|customer_skey_target|effective_date_target|end_date_target|active_flag_target|     hash_md5_target|customerid_source|firstname_source|lastname_source|        email_source|phone_source|address_source|city_source|state_source|zipcode_source|     hash_md5_source|  Action|
+-----------------+----------------+---------------+--------------------+------------+--------------+-----------+------------+--------------+--------------------+----------------

In [212]:
unchanged_records = column_renamer(merged_df.filter(col("action") == 'NOCHANGE'), suffix="_target", append=False).select(active_customers_target_df.columns)

In [213]:
unchanged_records.show()

+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|customerid|  firstname|lastname|               email|   phone|      address|     city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+-----------+--------+--------------------+--------+-------------+---------+-----+-------+-------------+--------------+----------+-----------+
|         7|      David|   Jones|davidjones@email.com|555-2345|678 Maple Ave| Yourcity|   GA|  78901|            7|    2024-02-13|9999-12-31|       true|
|         6|      Emily|  Miller|emilymiller@email...|555-6543| 890 Birch Rd|  Newcity|   WA|  23456|            6|    2024-02-13|9999-12-31|       true|
|         9|Christopher|  Taylor|christophertaylor...|555-8765|   234 Oak Ln| Thistown|   PA|  12345|            9|    2024-02-13|9999-12-31|       true|
|         5|    Michael|   Brown|michaelbrown@emai...|555-9876| 567 Elm Blvd

In [216]:
insert_records = column_renamer(merged_df.filter(col("action") == 'INSERT'), suffix="_source", append=False) \
                .select(customers_source_df.columns)\
                .withColumn("row_number",row_number().over(window_def))\
                .withColumn("customer_skey",col("row_number")+ max_sk)\
                .withColumn("effective_date",date_format(current_date(),DATE_FORMAT))\
                .withColumn("end_date",date_format(lit(future_date),DATE_FORMAT))\
                .withColumn("active_flag", lit(True))\
                .drop("row_number")

insert_records.show()

+----------+---------+--------+--------------------+--------+------------+----------+-----+-------+-------------+--------------+----------+-----------+
|customerid|firstname|lastname|               email|   phone|     address|      city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+---------+--------+--------------------+--------+------------+----------+-----+-------+-------------+--------------+----------+-----------+
|        11|    Grace|  Turner|graceturner@email...|555-1122|  567 Oak St|  Cityview|   CA|  98765|           11|    2024-02-13|9999-12-31|       true|
|        12|   Connor|   Evans|connorevans@email...|555-2233|890 Pine Ave|Townsville|   TX|  54321|           12|    2024-02-13|9999-12-31|       true|
+----------+---------+--------+--------------------+--------+------------+----------+-----+-------+-------------+--------------+----------+-----------+



In [217]:
max_sk = insert_records.agg({"customer_skey": "max"}).collect()[0][0]

In [218]:
print(max_sk)

12


In [219]:
update_records = column_renamer(merged_df.filter(col("action") == 'UPDATE'), suffix="_target", append=False)\
                .select(active_customers_target_df.columns)\
                .withColumn("end_date", date_format(current_date(),DATE_FORMAT))\
                .withColumn("active_flag", lit(False))\
            .unionByName(
            column_renamer(merged_df.filter(col("action") == 'UPDATE'), suffix="_source", append=False)\
                .select(customers_source_df.columns)\
                .withColumn("effective_date",date_format(current_date(),DATE_FORMAT))\
                .withColumn("end_date",date_format(lit(future_date),DATE_FORMAT))\
                .withColumn("row_number",row_number().over(window_def))\
                .withColumn("customer_skey",col("row_number")+ max_sk)\
                .withColumn("active_flag", lit(True))\
                .drop("row_number")
                )


In [220]:
update_records.show()

+----------+---------+--------+--------------------+--------+-----------+----------+-----+-------+-------------+--------------+----------+-----------+
|customerid|firstname|lastname|               email|   phone|    address|      city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+---------+--------+--------------------+--------+-----------+----------+-----+-------+-------------+--------------+----------+-----------+
|         1|     John|     Doe|   johndoe@email.com|555-1234|123 Main St|   Anytown|   CA|  12345|            1|    2024-02-13|2024-02-13|      false|
|         3|   Robert| Johnson|robertjohnson@ema...|555-8765|789 Pine Ln| Othercity|   TX|  34567|            3|    2024-02-13|2024-02-13|      false|
|         2|     Jane|   Smith| janesmith@email.com|555-5678|456 Oak Ave|  Sometown|   NY|  67890|            2|    2024-02-13|2024-02-13|      false|
|         1|     John|     Doe|   johndoe@gmail.com|555-1234|123 Main St|   Anytown|   CA|  12

In [221]:
max_sk = update_records.agg({"customer_skey": "max"}).collect()[0][0]

In [222]:
print(max_sk)

15


In [223]:
delete_records = column_renamer(merged_df.filter(col("action") == 'DELETE'), suffix="_target", append=False)\
                .select(active_customers_target_df.columns)\
                .withColumn("end_date", date_format(current_date(),DATE_FORMAT))\
                .withColumn("active_flag", lit(False))

delete_records.show()

+----------+---------+--------+--------------------+--------+-------------+--------+-----+-------+-------------+--------------+----------+-----------+
|customerid|firstname|lastname|               email|   phone|      address|    city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+---------+--------+--------------------+--------+-------------+--------+-----+-------+-------------+--------------+----------+-----------+
|        10|   Olivia|   Clark|oliviaclark@email...|555-3456|567 Cedar Ave|Thatcity|   TN|  67890|           10|    2024-02-13|2024-02-13|      false|
+----------+---------+--------+--------------------+--------+-------------+--------+-----+-------+-------------+--------------+----------+-----------+



In [224]:
resultant_df = inactive_customers_target_df \
            .unionByName(unchanged_records)\
            .unionByName(insert_records)\
            .unionByName(update_records)\
            .unionByName(delete_records)

resultant_df.show()

+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+-------------+--------------+----------+-----------+
|customerid|  firstname|lastname|               email|   phone|      address|      city|state|zipcode|customer_skey|effective_date|  end_date|active_flag|
+----------+-----------+--------+--------------------+--------+-------------+----------+-----+-------+-------------+--------------+----------+-----------+
|         7|      David|   Jones|davidjones@email.com|555-2345|678 Maple Ave|  Yourcity|   GA|  78901|            7|    2024-02-13|9999-12-31|       true|
|         6|      Emily|  Miller|emilymiller@email...|555-6543| 890 Birch Rd|   Newcity|   WA|  23456|            6|    2024-02-13|9999-12-31|       true|
|         9|Christopher|  Taylor|christophertaylor...|555-8765|   234 Oak Ln|  Thistown|   PA|  12345|            9|    2024-02-13|9999-12-31|       true|
|         5|    Michael|   Brown|michaelbrown@emai...|555-9876| 567 El

In [186]:
spark.stop()