In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Standard-library imports are kept after the wildcard imports above so that a
# same-named export from pyspark can never shadow these modules.
import os
import uuid

# UDF to generate UUID
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


In [2]:
# Start Spark session
from pyspark.sql import SparkSession

# Location of the PostgreSQL JDBC driver jar. Set POSTGRES_JDBC_JAR to point at
# the jar on other machines; the default is the path this notebook was
# originally written against, so existing setups keep working unchanged.
jdbc_jar_path = os.environ.get(
    "POSTGRES_JDBC_JAR",
    r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar",
)

# NOTE: getOrCreate() hands back an already-running session when there is one,
# in which case the builder options below may not be applied — restart the
# kernel after changing the jar path.
spark = (
    SparkSession.builder
    .appName("underwriting silver Transformation")
    .config("spark.jars", jdbc_jar_path)
    .getOrCreate()
)

In [3]:
# Read Bronze table
# Connection settings are taken from the environment so the database password
# is not stored in the notebook. PG_PASSWORD is required (a KeyError is raised
# if it is unset); BRONZE_JDBC_URL and PG_USER fall back to the values this
# notebook originally used.
df_bronze = (
    spark.read
    .format("jdbc")
    .option(
        "url",
        os.environ.get("BRONZE_JDBC_URL", "jdbc:postgresql://localhost:5432/bronze_layer_db"),
    )
    .option("dbtable", "bronze.underwriting_staging")
    .option("user", os.environ.get("PG_USER", "postgres"))
    .option("password", os.environ["PG_PASSWORD"])
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [4]:
# Preview the first five Bronze rows to sanity-check the JDBC read.
df_bronze.show(n=5)

+-----------+---------+----------+---------------+--------------------+--------------------+
|customer_id|policy_id|risk_score|approval_status|src_transaction_date|         received_at|
+-----------+---------+----------+---------------+--------------------+--------------------+
|      C0344|    P0001|        35|       Approved|          2025-02-14|2025-08-03 18:57:...|
|      C0474|    P0002|         1|       Rejected|          2024-07-03|2025-08-03 18:57:...|
|      C0644|    P0003|        14|       Approved|          2024-06-15|2025-08-03 18:57:...|
|      C0956|    P0004|         3|       Approved|          2024-11-25|2025-08-03 18:57:...|
|      C0933|    P0005|        74|   Under Review|          2025-04-06|2025-08-03 18:57:...|
+-----------+---------+----------+---------------+--------------------+--------------------+
only showing top 5 rows



In [5]:
# Python-side UUID generator returning a random UUID4 as a string.
# Spark assumes Python UDFs are deterministic unless told otherwise, which lets
# the optimizer eliminate or repeat calls; a random-value UDF must therefore be
# flagged with asNondeterministic().
# NOTE(review): no visible cell calls uuid_udf (the silver transform uses the
# built-in SQL uuid() instead) — confirm whether this UDF is still needed.
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

In [6]:
from pyspark.sql.functions import col, when, to_date, sha2, concat_ws, expr, initcap, trim
from pyspark.sql.types import DoubleType, StringType

# A risk score is kept only if it is a plain non-negative decimal: at least one
# digit and at most one decimal point ("35", "35.", "35.5", ".5").
# The previous pattern "^[0-9.]+$" also let through strings such as "." or
# "1.2.3"; casting those to double raises when ANSI mode is enabled and yields
# NULL otherwise. Rejecting them here gives NULL in both modes.
RISK_SCORE_PATTERN = r"^([0-9]+\.?[0-9]*|\.[0-9]+)$"

# Bronze -> Silver transformation:
#   1. rename columns to the Silver naming convention,
#   2. parse SRC_TRANSACTION_DATE as a date (yyyy-MM-dd),
#   3. add a generated Underwriting_UUID per row,
#   4. add Record_Hash_ID = SHA-256 of Customer_ID||Policy_ID||Risk_Score
#      (computed from the Risk_Score value as it is *before* step 6),
#   5. trim and title-case Approval_Status,
#   6. null out Risk_Score values that are not plain decimals, then cast to double.
df_silver = (
    df_bronze
    .withColumnRenamed("customer_id", "Customer_ID")
    .withColumnRenamed("policy_id", "Policy_ID")
    .withColumnRenamed("risk_score", "Risk_Score")
    .withColumnRenamed("approval_status", "Approval_Status")
    .withColumnRenamed("src_transaction_date", "SRC_TRANSACTION_DATE")
    .withColumnRenamed("received_at", "Received_At")
    .withColumn("SRC_TRANSACTION_DATE", to_date(col("SRC_TRANSACTION_DATE"), "yyyy-MM-dd"))
    .withColumn("Underwriting_UUID", expr("uuid()"))
    .withColumn(
        "Record_Hash_ID",
        sha2(
            concat_ws(
                "||",
                col("Customer_ID").cast(StringType()),
                col("Policy_ID").cast(StringType()),
                col("Risk_Score").cast(StringType()),
            ),
            256,
        ),
    )
    .withColumn("Approval_Status", initcap(trim(col("Approval_Status"))))
    .withColumn(
        "Risk_Score",
        # The explicit string cast keeps the regex check well-defined whatever
        # type the Bronze column has; the value itself is cast to double as is.
        when(
            col("Risk_Score").cast(StringType()).rlike(RISK_SCORE_PATTERN),
            col("Risk_Score"),
        ).otherwise(None).cast(DoubleType()),
    )
)


In [7]:
# Keep only rows that carry every mandatory field and a strictly positive
# risk score; each condition is applied as its own chained filter step.
df_clean = (
    df_silver
    .where(col("Customer_ID").isNotNull())
    .where(col("Policy_ID").isNotNull())
    .where(col("Risk_Score").isNotNull())
    .where(col("Risk_Score") > 0)
    .where(col("Approval_Status").isNotNull())
    .where(col("SRC_TRANSACTION_DATE").isNotNull())
)


In [8]:
# Report how many rows the quality filter removed (each count() runs a Spark job).
rows_before_filter = df_silver.count()
print("Before filter:", rows_before_filter)
rows_after_filter = df_clean.count()
print("After filter:", rows_after_filter)


Before filter: 1371
After filter: 1371


In [9]:
# Preview the first five cleaned rows, including the generated UUID and hash columns.
df_clean.show(n=5)

+-----------+---------+----------+---------------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|Policy_ID|Risk_Score|Approval_Status|SRC_TRANSACTION_DATE|         Received_At|   Underwriting_UUID|      Record_Hash_ID|
+-----------+---------+----------+---------------+--------------------+--------------------+--------------------+--------------------+
|      C0344|    P0001|      35.0|       Approved|          2025-02-14|2025-08-03 18:57:...|75976a22-a945-4f8...|35a7720489028d209...|
|      C0474|    P0002|       1.0|       Rejected|          2024-07-03|2025-08-03 18:57:...|2446241c-a46c-4c5...|046be171a399dac4b...|
|      C0644|    P0003|      14.0|       Approved|          2024-06-15|2025-08-03 18:57:...|c94fcead-c2d3-465...|b2be355eaf84ba29c...|
|      C0956|    P0004|       3.0|       Approved|          2024-11-25|2025-08-03 18:57:...|a4fa3f09-f545-492...|1b33bb4c1b5f7d974...|
|      C0933|    P0005|      74.0|   Under Review|     

In [10]:
# Write to Silver table
df_clean.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db") \
    .option("dbtable", "public.silver_underwriting") \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()
