In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta

In [2]:
spark = SparkSession.builder \
    .appName("BronzeToSilver_Insurance_Batch") \
    .config("spark.jars", r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar") \
    .getOrCreate()


In [3]:
spark.sparkContext.setLogLevel("ERROR")

In [4]:
df_bronze = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db") \
    .option("dbtable", '"bronze"."insurance_dataset_staging"') \
    .option("user", "postgres") \
    .option("driver", "org.postgresql.Driver") \
    .option("password", "rathi") \
    .load()


In [5]:
# Define watermark-like logic: only process records from last 3 days
watermark_cutoff = (datetime.now() - timedelta(days=3)).strftime('%Y-%m-%d')

In [6]:
df_bronze.show(5)

+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+
|Customer_ID|            Name|       DOB|DL_Number|Aadhaar_Number|Age|Door_No|              Street|        City|   State|Country|Credit_Score|Risk_Profile|Claims_History|SRC_TRANSACTION_DATE|         received_at|
+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+
|      C0001|      Jamie Bell|1972-01-25|   227272|  662282154096| 74|    779|267 Roy Landing S...|  Floresside|KY 72134|  India|         803|         Low|             2|          2024-12-03|2025-08-03 19:09:...|
|      C0002|Carrie Nicholson|1990-02-19|   746082|  814740195173| 66|    809|754 Javier Creek ...|   East John|VI 14302|  India|         602|      

In [8]:
from pyspark.sql.functions import *

df_transformed = df_bronze \
    .withColumn("SRC_TRANSACTION_DATE", to_date(trim(col("SRC_TRANSACTION_DATE")), "yyyy-MM-dd")) \
    .withColumn("Customer_ID", trim(col("Customer_ID"))) \
    .withColumn("Customer_UUID", expr("uuid()")) \
    .withColumn("Name", trim(col("Name"))) \
    .withColumn("DOB", to_date(trim(col("DOB")), "yyyy-MM-dd")) \
    .withColumn("DL_Masked", concat(
        sha2(expr("substring(trim(DL_Number), 1, length(trim(DL_Number)) - 2)"), 256),
        expr("substring(trim(DL_Number), -2, 2)")
    )) \
    .withColumn("Aadhaar_Masked", concat(
        sha2(expr("substring(trim(Aadhaar_Number), 1, length(trim(Aadhaar_Number)) - 2)"), 256),
        expr("substring(trim(Aadhaar_Number), -2, 2)")
    )) \
    .withColumn("Age", col("Age").cast("int")) \
    .withColumn("Door_No", trim(col("Door_No"))) \
    .withColumn("Street", trim(col("Street"))) \
    .withColumn("City", trim(col("City"))) \
    .withColumn("State", trim(col("State"))) \
    .withColumn("Country", trim(col("Country"))) \
    .withColumn("Credit_Score", col("Credit_Score").cast("int")) \
    .withColumn("Risk_Profile", trim(col("Risk_Profile"))) \
    .withColumn("Claims_History", col("Claims_History").cast("int")) \
    .withColumn("received_at", current_timestamp()) \
    .drop("DL_Number", "Aadhaar_Number")



In [9]:
df_transformed.show(5)

+-----------+----------------+----------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|            Name|       DOB|Age|Door_No|              Street|        City|   State|Country|Credit_Score|Risk_Profile|Claims_History|SRC_TRANSACTION_DATE|         received_at|       Customer_UUID|           DL_Masked|      Aadhaar_Masked|
+-----------+----------------+----------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      C0001|      Jamie Bell|1972-01-25| 74|    779|267 Roy Landing S...|  Floresside|KY 72134|  India|         803|         Low|             2|          2024-12-03|2025-08-04 12:23:...|bfffaa92-58d2-496...|1134c0a7d44fdae1a...|9a94e5a8818d70fe2.

In [10]:
# Filter invalid records
df_clean = df_transformed.filter(
    (col("Age") > 0) &
    (col("Credit_Score").between(300, 900)) &
    (col("Claims_History") >= 0)
)


In [11]:


df_clean.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db") \
    .option("dbtable", '"public"."silver_insurance_dataset"') \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()


In [12]:
df_clean.show(5)

+-----------+----------------+----------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|            Name|       DOB|Age|Door_No|              Street|        City|   State|Country|Credit_Score|Risk_Profile|Claims_History|SRC_TRANSACTION_DATE|         received_at|       Customer_UUID|           DL_Masked|      Aadhaar_Masked|
+-----------+----------------+----------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      C0001|      Jamie Bell|1972-01-25| 74|    779|267 Roy Landing S...|  Floresside|KY 72134|  India|         803|         Low|             2|          2024-12-03|2025-08-04 12:23:...|bfffaa92-58d2-496...|1134c0a7d44fdae1a...|9a94e5a8818d70fe2.