In [1]:
import getpass
import os

# NOTE: the original relative order of the lines below is kept on purpose;
# wildcard imports make import order significant for name resolution.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

In [2]:
# Define watermark cutoff date (ISO "yyyy-MM-dd" string).
# NOTE(review): none of the cells shown in this notebook read this value, so
# no date-based filtering is applied to the data — confirm whether a
# watermark filter is still intended.
watermark_cutoff = "2024-01-01"

In [3]:

# Create the Spark session with the PostgreSQL JDBC driver jar attached.
#
# The jar location is machine-specific, so it can be overridden with the
# POSTGRES_JDBC_JAR environment variable; when the variable is not set the
# original local path is used, which keeps existing behaviour unchanged.
postgres_jdbc_jar = os.environ.get(
    "POSTGRES_JDBC_JAR",
    r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar",
)

spark = (
    SparkSession.builder
    .appName("Transform Claims Bronze to Silver")
    .config("spark.jars", postgres_jdbc_jar)
    .getOrCreate()
)

In [4]:
# Read claims staging table from PostgreSQL (Bronze layer).
#
# Credentials are not stored in the notebook: the user name comes from
# BRONZE_DB_USER (default "postgres") and the password from
# BRONZE_DB_PASSWORD. If the password variable is missing or empty, the
# cell prompts for it so the secret never ends up in the saved file.
bronze_db_user = os.environ.get("BRONZE_DB_USER", "postgres")
bronze_db_password = os.environ.get("BRONZE_DB_PASSWORD") or getpass.getpass(
    "Bronze PostgreSQL password: "
)

df_bronze = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db")
    .option("dbtable", "bronze.claims_staging")
    .option("user", bronze_db_user)
    .option("password", bronze_db_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [5]:
# Preview the first five Bronze rows as loaded over JDBC.
df_bronze.show(n=5)

+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+
|claim_id|policy_id|claim_amount|      date|  status|investigation_required|src_transaction_date|         received_at|
+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+
|  CL0001|    P0932|      $10082|2025-02-01| Pending|                   Yes|          2024-05-12|2025-08-03 18:55:...|
|  CL0002|    P0588|      $37339|2025-03-11|Rejected|                    No|          2025-04-23|2025-08-03 18:55:...|
|  CL0003|    P5054|       $2982|2024-06-17|Rejected|                   Yes|          2025-03-22|2025-08-03 18:55:...|
|  CL0004|    P0827|      $28714|2024-05-09|Rejected|                   Yes|          2025-01-21|2025-08-03 18:55:...|
|  CL0005|    P0145|      $49841|2025-04-25|Approved|                   Yes|          2024-07-05|2025-08-03 18:55:...|
+--------+---------+------------+----------+----

In [6]:

# Clean and transform Bronze data to Silver.
#
# Each withColumn below either rewrites a Bronze column under its Silver
# name (the recorded output shows the lower-case source columns being
# replaced rather than duplicated) or appends a new technical column
# (Claim_UUID, Record_UUID, Record_Hash_ID).

# Normalise the raw flag once so matching ignores case and surrounding
# whitespace ("yes", " Yes", "TRUE" ...), consistent with how Status is
# trimmed below. Values that matched before still match.
investigation_flag = upper(trim(col("investigation_required")))

df_silver = (
    df_bronze
    # Source transaction date as a date, parsed with the yyyy-MM-dd pattern.
    .withColumn("SRC_TRANSACTION_DATE", to_date(col("src_transaction_date"), "yyyy-MM-dd"))
    # Overwrites the Bronze received_at with the timestamp of this run.
    .withColumn("received_at", current_timestamp())
    # Back-fill a generated UUID where the claim id is missing.
    .withColumn("Claim_ID", when(col("claim_id").isNull(), expr("uuid()")).otherwise(col("claim_id")))
    .withColumn("Claim_UUID", expr("uuid()"))
    .withColumn("Policy_ID", col("policy_id"))
    # Strip "$" and "," before casting; anything still non-numeric casts to NULL.
    .withColumn(
        "Claim_Amount",
        regexp_replace(trim(col("claim_amount")), "[$,]", "").cast("double"),
    )
    .withColumn("Date", to_date(col("date"), "yyyy-MM-dd"))
    .withColumn("Status", initcap(trim(col("status"))))
    # Map yes/no style text to a boolean; unrecognised or NULL input stays NULL.
    .withColumn(
        "Investigation_Required",
        when(investigation_flag.isin("YES", "Y", "TRUE", "1"), lit(True))
        .when(investigation_flag.isin("NO", "N", "FALSE", "0"), lit(False))
        .otherwise(lit(None)),
    )
    .withColumn("Record_UUID", expr("uuid()"))
    # SHA-256 over claim id, policy id and claim date joined with "||".
    # concat_ws skips NULL inputs, so a row with a NULL part hashes the
    # remaining parts only.
    .withColumn(
        "Record_Hash_ID",
        sha2(concat_ws("||", col("Claim_ID"), col("Policy_ID"), col("Date")), 256),
    )
)


In [7]:
# Preview the first five transformed rows to check types and derived columns.
df_silver.show(n=5)

+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Claim_ID|Policy_ID|Claim_Amount|      Date|  Status|Investigation_Required|SRC_TRANSACTION_DATE|         received_at|          Claim_UUID|         Record_UUID|      Record_Hash_ID|
+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  CL0001|    P0932|     10082.0|2025-02-01| Pending|                  true|          2024-05-12|2025-08-04 11:56:...|75b017f2-63dd-4b5...|46b4d269-0524-40a...|65c17440701e90a57...|
|  CL0002|    P0588|     37339.0|2025-03-11|Rejected|                 false|          2025-04-23|2025-08-04 11:56:...|49b68f66-3de8-4ef...|967bfefc-9126-481...|018c65251ae581c55...|
|  CL0003|    P5054|      2982.0|2024-06-17|Rejected|                  true|          2025

In [8]:
# Keep only rows that carry both business keys.
# Missing claim ids were already replaced with a generated UUID when
# df_silver was built, so in practice this drops rows without a policy id.
has_claim_id = col("Claim_ID").isNotNull()
has_policy_id = col("Policy_ID").isNotNull()

df_clean = df_silver.where(has_claim_id & has_policy_id)


In [9]:
# Preview the first five rows that survived the key filter.
df_clean.show(n=5)

+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Claim_ID|Policy_ID|Claim_Amount|      Date|  Status|Investigation_Required|SRC_TRANSACTION_DATE|         received_at|          Claim_UUID|         Record_UUID|      Record_Hash_ID|
+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  CL0001|    P0932|     10082.0|2025-02-01| Pending|                  true|          2024-05-12|2025-08-04 11:56:...|75b017f2-63dd-4b5...|46b4d269-0524-40a...|65c17440701e90a57...|
|  CL0002|    P0588|     37339.0|2025-03-11|Rejected|                 false|          2025-04-23|2025-08-04 11:56:...|49b68f66-3de8-4ef...|967bfefc-9126-481...|018c65251ae581c55...|
|  CL0003|    P5054|      2982.0|2024-06-17|Rejected|                  true|          2025

In [10]:
# Write the cleaned claims to the Silver layer in PostgreSQL.
#
# Credentials are not stored in the notebook: the user name comes from
# SILVER_DB_USER (default "postgres") and the password from
# SILVER_DB_PASSWORD. If the password variable is missing or empty, the
# cell prompts for it so the secret never ends up in the saved file.
#
# mode("overwrite") replaces the existing contents of the target table on
# every run.
silver_db_user = os.environ.get("SILVER_DB_USER", "postgres")
silver_db_password = os.environ.get("SILVER_DB_PASSWORD") or getpass.getpass(
    "Silver PostgreSQL password: "
)

(
    df_clean.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db")
    .option("dbtable", '"public"."silver_claims"')
    .option("user", silver_db_user)
    .option("password", silver_db_password)
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")
    .save()
)

In [42]:
# Preview the cleaned rows again after the write.
# NOTE(review): this cell's execution count (42) is out of sequence and its
# recorded output shows Claim_Amount as NULL, unlike the earlier previews —
# the saved output looks stale; re-run with Restart Kernel -> Run All.
df_clean.show(n=5)

+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Claim_ID|Policy_ID|Claim_Amount|      Date|  Status|Investigation_Required|SRC_TRANSACTION_DATE|         received_at|          Claim_UUID|         Record_UUID|      Record_Hash_ID|
+--------+---------+------------+----------+--------+----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  CL0001|    P0932|        NULL|2025-02-01| Pending|                  true|          2024-05-12|2025-08-04 09:33:...|41e341d5-14cb-4ff...|8e5e5dc4-90cb-43e...|65c17440701e90a57...|
|  CL0002|    P0588|        NULL|2025-03-11|Rejected|                 false|          2025-04-23|2025-08-04 09:33:...|a7b16386-a22e-4ec...|056f1482-a63f-475...|018c65251ae581c55...|
|  CL0003|    P5054|        NULL|2024-06-17|Rejected|                  true|          2025