In [1]:

import os
import uuid

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_date, trim, initcap, sha2, concat_ws, when, lit, expr, udf
)
from pyspark.sql.types import StringType, DoubleType

In [2]:
# Create (or reuse) the Spark session for the policy silver transformation.
# The PostgreSQL JDBC driver jar is registered through `spark.jars`.
# NOTE(review): the jar location is an absolute path on one machine; the
# notebook will not start elsewhere until this path is adjusted.
spark = (
    SparkSession.builder
    .appName("policy Silver Transformation")
    .config(
        "spark.jars",
        r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar",
    )
    .getOrCreate()
)

In [3]:
# Read the bronze-layer policies staging table from PostgreSQL over JDBC.
#
# Credentials are taken from the PGUSER / PGPASSWORD environment variables
# when they are set, so secrets no longer have to live in the notebook.
# The previous literals are kept only as fallbacks so existing local runs
# keep working.
# TODO: remove the fallback password once the environment is configured.
df_bronze = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db")
    .option("dbtable", "bronze.policies_staging")
    .option("user", os.environ.get("PGUSER", "postgres"))
    .option("password", os.environ.get("PGPASSWORD", "rathi"))
    .option("driver", "org.postgresql.Driver")
    .load()
)


In [4]:
# Watermark cutoff, stored as a yyyy-MM-dd date string.
# NOTE(review): no visible cell references `watermark_cutoff`; confirm whether
# a filter on SRC_TRANSACTION_DATE was intended or whether this can be removed.
watermark_cutoff = "2000-01-01"


In [5]:

# UUID generator UDF: takes no arguments and returns a random UUID4 string.
#
# uuid.uuid4() yields a new value on every call, so the UDF is explicitly
# marked non-deterministic. Spark assumes UDFs are deterministic by default,
# which lets the optimizer eliminate or duplicate invocations.
#
# NOTE: this flag does not make the values stable across actions. Every action
# that re-evaluates the plan (show(), write, ...) generates fresh UUIDs unless
# the DataFrame is persisted or materialised first.
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()


In [6]:
# Preview five bronze rows to inspect the raw column names and value formats.
df_bronze.show(n=5)

+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+
|policy_id|customer_id|policy_type|premium|coverage_amount| status|src_transaction_date|         received_at|
+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+
|    P0477|      C0657|     Health|$293/mo|        $292411|Expired|          2025-05-05|2025-08-03 18:57:...|
|    P5828|      C0061|       Home|$336/mo|        $196659| Active|          2025-05-05|2025-08-03 18:57:...|
|    P3060|      C0604|     Travel|$389/mo|        $594567| Active|          2025-05-05|2025-08-03 18:57:...|
|    P0299|      C0416|     Travel|$106/mo|         $66707|Expired|          2025-05-04|2025-08-03 18:57:...|
|    P0418|      C0152|       Home|$122/mo|        $478177| Active|          2025-05-04|2025-08-03 18:57:...|
+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+
only showing top 5 rows

In [7]:
from pyspark.sql.functions import regexp_replace, trim, initcap, to_date, col, sha2, concat_ws
from pyspark.sql.types import DoubleType

# Build the silver DataFrame from the bronze rows.
#
# Steps, in order:
#   1. rename the bronze columns to their silver names;
#   2. parse SRC_TRANSACTION_DATE as a yyyy-MM-dd date;
#   3. add Policy_UUID (random, from uuid_udf) and Record_Hash_ID (SHA-256 of
#      Policy_ID, Customer_ID and Policy_Type joined with "||");
#   4. trim and title-case Policy_Type and Status;
#   5. strip "$", ",", "/" and ASCII letters from Premium and Coverage_Amount,
#      then cast them to double.
#
# Record_Hash_ID is added before step 4, so it hashes the Policy_Type value
# as it arrived from bronze, not the trimmed/title-cased one.

# (bronze name, silver name) pairs, applied one after another.
silver_column_names = [
    ("policy_id", "Policy_ID"),
    ("customer_id", "Customer_ID"),
    ("policy_type", "Policy_Type"),
    ("premium", "Premium"),
    ("coverage_amount", "Coverage_Amount"),
    ("status", "Status"),
    ("src_transaction_date", "SRC_TRANSACTION_DATE"),
    ("received_at", "Received_At"),
]

# Characters removed from money-like strings such as "$293/mo".
money_noise_pattern = "[$,/a-zA-Z]"

# Columns that feed the record hash, in hashing order.
hash_key_columns = ["Policy_ID", "Customer_ID", "Policy_Type"]

df_renamed = df_bronze
for bronze_name, silver_name in silver_column_names:
    df_renamed = df_renamed.withColumnRenamed(bronze_name, silver_name)

df_silver = (
    df_renamed
    .withColumn(
        "SRC_TRANSACTION_DATE",
        to_date(col("SRC_TRANSACTION_DATE"), "yyyy-MM-dd"),
    )
    .withColumn("Policy_UUID", uuid_udf())
    .withColumn(
        "Record_Hash_ID",
        sha2(concat_ws("||", *[col(name) for name in hash_key_columns]), 256),
    )
    .withColumn("Policy_Type", initcap(trim(col("Policy_Type"))))
    .withColumn("Status", initcap(trim(col("Status"))))
    .withColumn(
        "Premium",
        regexp_replace(col("Premium"), money_noise_pattern, "").cast(DoubleType()),
    )
    .withColumn(
        "Coverage_Amount",
        regexp_replace(col("Coverage_Amount"), money_noise_pattern, "").cast(DoubleType()),
    )
)


In [8]:
# Preview five transformed rows to check the renamed, typed and derived columns.
df_silver.show(n=5)

+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|Policy_ID|Customer_ID|Policy_Type|Premium|Coverage_Amount| Status|SRC_TRANSACTION_DATE|         Received_At|         Policy_UUID|      Record_Hash_ID|
+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|    P0477|      C0657|     Health|  293.0|       292411.0|Expired|          2025-05-05|2025-08-03 18:57:...|ffb976f6-a12a-47b...|3cec7d06d589e532b...|
|    P5828|      C0061|       Home|  336.0|       196659.0| Active|          2025-05-05|2025-08-03 18:57:...|d960fd2e-8556-454...|f86e0d4f12d891678...|
|    P3060|      C0604|     Travel|  389.0|       594567.0| Active|          2025-05-05|2025-08-03 18:57:...|3f45d969-769b-4ae...|01df20e10b51645c1...|
|    P0299|      C0416|     Travel|  106.0|        66707.0|Expired|          2025-05-04|

In [9]:
# Keep only rows where every key column is populated: a row is dropped when
# Policy_ID, Customer_ID or Policy_Type is null.
required_columns = ["Policy_ID", "Customer_ID", "Policy_Type"]
df_clean = df_silver.dropna(how="any", subset=required_columns)

# Preview five of the surviving rows.
df_clean.show(n=5)


+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|Policy_ID|Customer_ID|Policy_Type|Premium|Coverage_Amount| Status|SRC_TRANSACTION_DATE|         Received_At|         Policy_UUID|      Record_Hash_ID|
+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|    P0477|      C0657|     Health|  293.0|       292411.0|Expired|          2025-05-05|2025-08-03 18:57:...|d0d2e28f-d817-46a...|3cec7d06d589e532b...|
|    P5828|      C0061|       Home|  336.0|       196659.0| Active|          2025-05-05|2025-08-03 18:57:...|e665a841-60a2-4e2...|f86e0d4f12d891678...|
|    P3060|      C0604|     Travel|  389.0|       594567.0| Active|          2025-05-05|2025-08-03 18:57:...|853616f1-f8b2-4c5...|01df20e10b51645c1...|
|    P0299|      C0416|     Travel|  106.0|        66707.0|Expired|          2025-05-04|

In [10]:
# Write the cleaned rows to the silver table in PostgreSQL over JDBC.
# mode("overwrite") replaces the existing contents of public.silver_policies.
#
# Credentials are taken from the PGUSER / PGPASSWORD environment variables
# when they are set, so secrets no longer have to live in the notebook.
# The previous literals are kept only as fallbacks so existing local runs
# keep working.
# TODO: remove the fallback password once the environment is configured.
(
    df_clean.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db")
    .option("dbtable", "public.silver_policies")
    .option("user", os.environ.get("PGUSER", "postgres"))
    .option("password", os.environ.get("PGPASSWORD", "rathi"))
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")
    .save()
)


In [11]:
# Preview five rows of df_clean again after the write.
# This is a separate action on the DataFrame, so the Policy_UUID values shown
# here need not match the ones displayed earlier or the ones written out.
df_clean.show(n=5)

+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|Policy_ID|Customer_ID|Policy_Type|Premium|Coverage_Amount| Status|SRC_TRANSACTION_DATE|         Received_At|         Policy_UUID|      Record_Hash_ID|
+---------+-----------+-----------+-------+---------------+-------+--------------------+--------------------+--------------------+--------------------+
|    P0477|      C0657|     Health|  293.0|       292411.0|Expired|          2025-05-05|2025-08-03 18:57:...|ccd35c87-7ead-452...|3cec7d06d589e532b...|
|    P5828|      C0061|       Home|  336.0|       196659.0| Active|          2025-05-05|2025-08-03 18:57:...|f54e6c45-39b2-41d...|f86e0d4f12d891678...|
|    P3060|      C0604|     Travel|  389.0|       594567.0| Active|          2025-05-05|2025-08-03 18:57:...|74d6a3ed-d24e-452...|01df20e10b51645c1...|
|    P0299|      C0416|     Travel|  106.0|        66707.0|Expired|          2025-05-04|