In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# Spark session.
# The PostgreSQL JDBC driver jar is needed on spark.jars for the JDBC read/write
# cells. Its location can be supplied through the POSTGRES_JDBC_JAR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
postgres_jdbc_jar = os.environ.get(
    "POSTGRES_JDBC_JAR",
    r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar",
)

spark = (
    SparkSession.builder
    .appName("Transform Customer-Policy Relation Bronze to Silver")
    .config("spark.jars", postgres_jdbc_jar)
    .getOrCreate()
)


In [8]:
# Load Bronze data.
# Reads the staging table from the Bronze PostgreSQL database over JDBC.
# The password is taken from the PGPASSWORD environment variable so that no
# secret is stored in the notebook; a KeyError here means it is not set.
df_bronze = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db")
    .option("dbtable", "bronze.\"customer_policy_relation_staging\"")
    .option("user", "postgres")
    .option("password", os.environ["PGPASSWORD"])
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [9]:
# Preview the first five Bronze rows.
df_bronze.show(n=5)

+-----------+---------+----------+----------+-------+--------------------+--------------------+
|customer_id|policy_id|start_date|  end_date| status|src_transaction_date|         received_at|
+-----------+---------+----------+----------+-------+--------------------+--------------------+
|      C0001|    P8544|2023-07-05|      NULL| Active|          2025-01-21|2025-08-03 18:56:...|
|      C0002|    P0830|2021-04-10|2024-06-27|Expired|          2025-03-01|2025-08-03 18:56:...|
|      C0003|    P8159|2021-04-13|      NULL| Active|          2024-10-02|2025-08-03 18:56:...|
|      C0004|    P0330|2021-10-13|      NULL| Active|          2024-06-08|2025-08-03 18:56:...|
|      C0005|    P0379|2021-11-27|      NULL| Active|          2024-10-26|2025-08-03 18:56:...|
+-----------+---------+----------+----------+-------+--------------------+--------------------+
only showing top 5 rows



In [10]:

# Watermark cutoff: a yyyy-MM-dd date string. The transform cell keeps only rows
# whose SRC_TRANSACTION_DATE is on or after this date.
watermark_cutoff: str = "2024-01-01"

In [11]:
from pyspark.sql.functions import col, trim, upper, to_date, current_timestamp, when, concat_ws, sha2, expr, lit

# Shape Bronze rows into the Silver layout: trim the text columns, parse the
# date columns, upper-case the status, default the receive timestamp, apply the
# watermark filter, then append the UUID and key-hash columns.
date_pattern = "yyyy-MM-dd"

# Keep received_at when it is present; otherwise use the current timestamp.
received_or_now = when(col("received_at").isNull(), current_timestamp()).otherwise(col("received_at"))

# SHA-256 of the "||"-joined key columns (customer, policy, start date).
relation_key_hash = sha2(
    concat_ws("||", col("Customer_ID"), col("Policy_ID"), col("Start_Date")),
    256,
)

df_silver = (
    df_bronze
    .withColumn("Customer_ID", trim(col("customer_id")))
    .withColumn("Policy_ID", trim(col("policy_id")))
    .withColumn("Start_Date", to_date(trim(col("start_date")), date_pattern))
    .withColumn("End_Date", to_date(trim(col("end_date")), date_pattern))
    .withColumn("Status", upper(trim(col("status"))))
    .withColumn("SRC_TRANSACTION_DATE", to_date(trim(col("src_transaction_date")), date_pattern))
    .withColumn("Received_At", received_or_now)
    # Drop everything older than the watermark before adding per-row identifiers.
    .where(col("SRC_TRANSACTION_DATE") >= lit(watermark_cutoff))
    .withColumn("Record_UUID", expr("uuid()"))
    .withColumn("Record_Hash_ID", relation_key_hash)
)


In [12]:
# Preview the first five transformed rows.
df_silver.show(n=5)

+-----------+---------+----------+----------+-------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|Policy_ID|Start_Date|  End_Date| Status|SRC_TRANSACTION_DATE|         Received_At|         Record_UUID|      Record_Hash_ID|
+-----------+---------+----------+----------+-------+--------------------+--------------------+--------------------+--------------------+
|      C0001|    P8544|2023-07-05|      NULL| ACTIVE|          2025-01-21|2025-08-03 18:56:...|eb170f54-7087-42d...|fcc97f6e8bfcdd66a...|
|      C0002|    P0830|2021-04-10|2024-06-27|EXPIRED|          2025-03-01|2025-08-03 18:56:...|b85925e5-e004-42e...|16a39c3c35889958a...|
|      C0003|    P8159|2021-04-13|      NULL| ACTIVE|          2024-10-02|2025-08-03 18:56:...|0a41f54f-0933-42d...|13145eaf9a1d4457e...|
|      C0004|    P0330|2021-10-13|      NULL| ACTIVE|          2024-06-08|2025-08-03 18:56:...|0aa95e3b-3a6f-493...|7fc617ef3bdd73eca...|
|      C0005|    P0379|2021-11-27|

In [13]:

# Null and type validation.
# A row is kept only when both ID columns are present and non-blank and
# Start_Date is non-null. The IDs are trimmed in the transform cell, so a
# whitespace-only source value arrives here as "" rather than NULL; a plain
# isNotNull() check would let such rows through, hence the extra blank check.
df_clean = df_silver.filter(
    col("Customer_ID").isNotNull() & (trim(col("Customer_ID")) != "") &
    col("Policy_ID").isNotNull() & (trim(col("Policy_ID")) != "") &
    col("Start_Date").isNotNull()
)


In [14]:
# Deduplicate using composite key (Customer_ID, Policy_ID, Start_Date).
# dropDuplicates() keeps an arbitrary row per key, so which version of a
# relation survived could differ between runs. Rank the rows of each key by
# recency instead and keep the newest: latest SRC_TRANSACTION_DATE first, then
# latest Received_At, with NULLs ordered last.
dedupe_window = Window.partitionBy("Customer_ID", "Policy_ID", "Start_Date").orderBy(
    col("SRC_TRANSACTION_DATE").desc_nulls_last(),
    col("Received_At").desc_nulls_last(),
)

df_deduped = (
    df_clean
    .withColumn("_recency_rank", row_number().over(dedupe_window))
    .filter(col("_recency_rank") == 1)
    .drop("_recency_rank")  # helper column only; output schema is unchanged
)

In [15]:
# Write to Silver PostgreSQL table.
# The password is taken from the PGPASSWORD environment variable so that no
# secret is stored in the notebook; a KeyError here means it is not set.
(
    df_deduped.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db")
    .option("dbtable", "public.silver_customer_policy_relation")
    .option("user", "postgres")
    .option("password", os.environ["PGPASSWORD"])
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")  # existing contents of the target table are replaced on each run
    .save()
)


In [17]:
# Preview the first five deduplicated rows.
df_deduped.show(n=5)

+-----------+---------+----------+----------+-------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|Policy_ID|Start_Date|  End_Date| Status|SRC_TRANSACTION_DATE|         Received_At|         Record_UUID|      Record_Hash_ID|
+-----------+---------+----------+----------+-------+--------------------+--------------------+--------------------+--------------------+
|      C0001|    P8544|2023-07-05|      NULL| ACTIVE|          2025-01-21|2025-08-03 18:56:...|eb170f54-7087-42d...|fcc97f6e8bfcdd66a...|
|      C0002|    P0830|2021-04-10|2024-06-27|EXPIRED|          2025-03-01|2025-08-03 18:56:...|b85925e5-e004-42e...|16a39c3c35889958a...|
|      C0003|    P8159|2021-04-13|      NULL| ACTIVE|          2024-10-02|2025-08-03 18:56:...|0a41f54f-0933-42d...|13145eaf9a1d4457e...|
|      C0004|    P0330|2021-10-13|      NULL| ACTIVE|          2024-06-08|2025-08-03 18:56:...|0aa95e3b-3a6f-493...|7fc617ef3bdd73eca...|
|      C0005|    P0379|2021-11-27|