In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_date, sha2, concat_ws, lit, expr, trim, initcap, udf
)
from pyspark.sql.types import StringType
import uuid

In [2]:
# Start Spark session
spark = SparkSession.builder \
    .appName("Customer Silver Transformation") \
    .config("spark.jars", r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar") \
    .getOrCreate()

In [3]:
# Read Bronze table
df_bronze = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db") \
    .option("dbtable", "bronze.customer_staging") \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .load()

In [4]:
df_bronze.show(5)

+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+
|customer_id|            name|       dob|dl_number|aadhaar_number|age|door_no|              street|        city|   state|country|credit_score|risk_profile|claims_history|src_transaction_date|         received_at|
+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+
|      C0001|      Jamie Bell|25-01-1972|   227272|   6.62282E+11| 74|    779|267 Roy Landing S...|  Floresside|KY 72134|    USA|         803|         Low|             2|          2024-12-03|2025-08-03 18:44:...|
|      C0002|Carrie Nicholson|19-02-1990|   746082|    8.1474E+11| 66|    809|754 Javier Creek ...|   East John|VI 14302|    USA|         602|      

In [5]:
# Watermark (for future streaming, simulate here with a filter on SRC_TRANSACTION_DATE)
watermark_cutoff = "2020-01-01"

In [13]:
# Watermark Cutoff
watermark_cutoff = "2020-01-01"
'''
# UDFs for masking
def mask_aadhaar(aadhaar):
    return aadhaar[:2] + "XXXXXX" + aadhaar[-2:] if aadhaar and len(aadhaar) == 12 else aadhaar

def mask_dl(dl):
    return dl[:2] + "XXXXXXX" + dl[-2:] if dl and len(dl) > 4 else dl

mask_aadhaar_udf = udf(mask_aadhaar, StringType())
mask_dl_udf = udf(mask_dl, StringType())
'''
# UUID Generator
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType())


In [14]:
# Transformation
df_silver = df_bronze \
    .withColumn("SRC_TRANSACTION_DATE", to_date(col("src_transaction_date"), "yyyy-MM-dd")) \
    .filter(col("SRC_TRANSACTION_DATE") >= lit(watermark_cutoff)) \
    .withColumnRenamed("customer_id", "Customer_ID") \
    .withColumnRenamed("name", "Name") \
    .withColumnRenamed("dob", "DOB") \
    .withColumnRenamed("dl_number", "DL_Number") \
    .withColumnRenamed("aadhaar_number", "Aadhaar_Number") \
    .withColumnRenamed("age", "Age") \
    .withColumnRenamed("door_no", "Door_No") \
    .withColumnRenamed("street", "Street") \
    .withColumnRenamed("city", "City") \
    .withColumnRenamed("state", "State") \
    .withColumnRenamed("country", "Country") \
    .withColumnRenamed("credit_score", "Credit_Score") \
    .withColumnRenamed("risk_profile", "Risk_Profile") \
    .withColumnRenamed("claims_history", "Claims_History") \
    .withColumnRenamed("received_at", "Received_At") \
    .withColumn("Customer_UUID", uuid_udf()) \
    .withColumn("Record_Hash_ID", sha2(concat_ws("||",
        col("Customer_ID"), col("Name"), col("DOB"),
        col("DL_Number"), col("Aadhaar_Number")
    ), 256)) \
    .withColumn("Name", initcap(trim(col("Name")))) \
    .withColumn("City", initcap(trim(col("City")))) \
    .withColumn("State", initcap(trim(col("State")))) \
    .withColumn("Country", initcap(trim(col("Country"))))


In [15]:
df_silver.show(5)

+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|            Name|       DOB|DL_Number|Aadhaar_Number|Age|Door_No|              Street|        City|   State|Country|Credit_Score|Risk_Profile|Claims_History|SRC_TRANSACTION_DATE|         Received_At|       Customer_UUID|      Record_Hash_ID|
+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+
|      C0001|      Jamie Bell|25-01-1972|   227272|   6.62282E+11| 74|    779|267 Roy Landing S...|  Floresside|Ky 72134|    Usa|         803|         Low|             2|          2024-12-03|2025-08-03 18:44:...|da8014d8-fc34-451...|00

In [17]:
from pyspark.sql.functions import col, sha2, concat_ws

# Fix scientific notation issue and convert to string first
df_masked = df_silver \
    .withColumn("aadhaar_number_str", col("Aadhaar_Number").cast("string")) \
    .withColumn("dl_number_str", col("DL_Number").cast("string")) \
    .withColumn("Masked_Aadhaar", sha2(col("aadhaar_number_str"), 256)) \
    .withColumn("Masked_DL", sha2(col("dl_number_str"), 256)) \
    .drop("aadhaar_number_str", "dl_number_str")  # Optional cleanup

df_masked.select("customer_id", "Masked_Aadhaar", "Masked_DL").show(truncate=False)


+-----------+----------------------------------------------------------------+----------------------------------------------------------------+
|customer_id|Masked_Aadhaar                                                  |Masked_DL                                                       |
+-----------+----------------------------------------------------------------+----------------------------------------------------------------+
|C0001      |af1e80f7c7c723533caf86e9e7c9289450ac9b94ee51e2a8881a6f6941239e85|4c3ad176dfac7b9dba006a3d3f770ef8c9f41cde9e44105d42d8dffcc1439726|
|C0002      |7cc1ce39bbd9deffedd647d2b864be0f74a5f29993f50e8a1f109bbd51a2a264|406df86a5296fc7babfdbdf5c9a6d67a19919dc6c3a3619e33d0be1d8c30fb94|
|C0003      |e9b450f3b789b7a44220cc1a640b2fb0bdf95dbb9b23cd6740badb89dfefb4d0|bdb6f48157c444eb2e46c67e30e0196e594539ed3b3ef6cfd3f1671d152b94fe|
|C0004      |005c369f4bac43f4b1f39770b6ce2972607f2dea22e746aa14629b15be10e63e|4b3d0cad1a0d4117254acdcc34a42467a0396d949f49e7816315f3a67e

In [18]:
# Filter nulls and quality rules
df_clean = df_masked.filter(
    (col("Customer_ID").isNotNull()) &
    (col("Name").isNotNull()) &
    (col("DOB").isNotNull()) &
    (col("Age") > 0) &
    (col("Credit_Score").isNotNull()) &
    (col("Risk_Profile").isNotNull())
)


In [19]:
df_clean.show(5)

+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Customer_ID|            Name|       DOB|DL_Number|Aadhaar_Number|Age|Door_No|              Street|        City|   State|Country|Credit_Score|Risk_Profile|Claims_History|SRC_TRANSACTION_DATE|         Received_At|       Customer_UUID|      Record_Hash_ID|      Masked_Aadhaar|           Masked_DL|
+-----------+----------------+----------+---------+--------------+---+-------+--------------------+------------+--------+-------+------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      C0001|      Jamie Bell|25-01-1972|   227272|   6.62282E+11| 74|    779|267 Roy Landing S...|  Floressi

In [20]:
# Write to Silver table
df_clean.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db") \
    .option("dbtable", "public.silver_customer") \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()
