In [5]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [6]:
# Create Spark session
spark = SparkSession.builder \
    .appName("Transform Customer Identity Bronze to Silver") \
    .config("spark.jars", r"C:\Users\Mind-Graph\Desktop\etlproject\jars\postgresql-42.7.2.jar") \
    .getOrCreate()

In [7]:
# Read from PostgreSQL (Bronze)
df_bronze = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/bronze_layer_db") \
    .option("dbtable", "bronze.customer_identity_staging") \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .load()

In [8]:
df_bronze.show(5)

+-----------+--------------+---------+--------------------+
|customer_id|aadhaar_number|dl_number|         received_at|
+-----------+--------------+---------+--------------------+
|      C0001|  662282154096|   227272|2025-08-03 18:56:...|
|      C0002|  814740195173|   746082|2025-08-03 18:56:...|
|      C0003|  312860089533|   340792|2025-08-03 18:56:...|
|      C0004|  746863388159|   920503|2025-08-03 18:56:...|
|      C0005|  982130250187|   566788|2025-08-03 18:56:...|
+-----------+--------------+---------+--------------------+
only showing top 5 rows



In [13]:


from pyspark.sql.functions import sha2, concat_ws, col, expr

df_silver = df_bronze \
    .withColumn("Customer_ID", col("customer_id")) \
    .withColumn("Masked_Aadhaar_Number", sha2(col("aadhaar_number").cast("string"), 256)) \
    .withColumn("Masked_DL_Number", sha2(col("dl_number").cast("string"), 256)) \
    .withColumn("Received_At", col("received_at")) \
    .withColumn("Record_UUID", expr("uuid()")) \
    .withColumn("Record_Hash_ID", sha2(concat_ws("||", col("Customer_ID"), col("Masked_Aadhaar_Number"), col("Masked_DL_Number")), 256))


In [14]:
df_silver.show(5)

+-----------+--------------+---------+--------------------+---------------------+--------------------+--------------------+--------------------+
|Customer_ID|aadhaar_number|dl_number|         Received_At|Masked_Aadhaar_Number|    Masked_DL_Number|         Record_UUID|      Record_Hash_ID|
+-----------+--------------+---------+--------------------+---------------------+--------------------+--------------------+--------------------+
|      C0001|  662282154096|   227272|2025-08-03 18:56:...| 53db24e255e42f4fc...|4c3ad176dfac7b9db...|c39bee25-03f3-441...|919b2320e1a03f843...|
|      C0002|  814740195173|   746082|2025-08-03 18:56:...| c4a479edab65f714a...|406df86a5296fc7ba...|84294531-23ba-4c6...|ae86a89cfa3812fe0...|
|      C0003|  312860089533|   340792|2025-08-03 18:56:...| 3437c1ab81807789a...|bdb6f48157c444eb2...|f8918752-b29b-494...|ce97395d57a7c88f4...|
|      C0004|  746863388159|   920503|2025-08-03 18:56:...| 83f6c92c818b0e31b...|4b3d0cad1a0d41172...|d8a32798-83ef-469...|a2d643a

In [15]:

# Optional: Null checks (only keep valid records)
df_clean = df_silver.filter(
    col("Customer_ID").isNotNull() &
    col("Masked_Aadhaar_Number").isNotNull() &
    col("Masked_DL_Number").isNotNull()
)

In [16]:
df_clean.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/silver_layer_db") \
    .option("dbtable", "public.silver_customer_identity") \
    .option("user", "postgres") \
    .option("password", "rathi") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()


In [17]:
df_clean.show(5)

+-----------+--------------+---------+--------------------+---------------------+--------------------+--------------------+--------------------+
|Customer_ID|aadhaar_number|dl_number|         Received_At|Masked_Aadhaar_Number|    Masked_DL_Number|         Record_UUID|      Record_Hash_ID|
+-----------+--------------+---------+--------------------+---------------------+--------------------+--------------------+--------------------+
|      C0001|  662282154096|   227272|2025-08-03 18:56:...| 53db24e255e42f4fc...|4c3ad176dfac7b9db...|c39bee25-03f3-441...|919b2320e1a03f843...|
|      C0002|  814740195173|   746082|2025-08-03 18:56:...| c4a479edab65f714a...|406df86a5296fc7ba...|84294531-23ba-4c6...|ae86a89cfa3812fe0...|
|      C0003|  312860089533|   340792|2025-08-03 18:56:...| 3437c1ab81807789a...|bdb6f48157c444eb2...|f8918752-b29b-494...|ce97395d57a7c88f4...|
|      C0004|  746863388159|   920503|2025-08-03 18:56:...| 83f6c92c818b0e31b...|4b3d0cad1a0d41172...|d8a32798-83ef-469...|a2d643a