In [0]:
# ============================================================
# ONE-CELL DATA QUALITY CORRECTION PIPELINE (SILVER-READY)
# ============================================================

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from collections import Counter

# ------------------------------------------------------------
# 1. LOAD RAW DATA
# ------------------------------------------------------------
customers_raw= spark.read.option("header",True).csv("/Volumes/hdfc_data_mentor/etl/shrutivolume/customer.csv")
branches_raw= spark.read.option("header",True).csv("/Volumes/hdfc_data_mentor/etl/shrutivolume/branches.csv")
loans_raw=spark.read.json("/Volumes/hdfc_data_mentor/etl/shrutivolume/loans.json")
disb_raw=spark.read.parquet("/Volumes/hdfc_data_mentor/etl/shrutivolume/disbursements.parquet")

# ------------------------------------------------------------
# 2. NULL HANDLING (MANDATORY FIELDS)
# ------------------------------------------------------------
customers_valid = customers_raw.filter("customer_id IS NOT NULL")
loans_valid = loans_raw.filter("loan_id IS NOT NULL AND customer_id IS NOT NULL")
disb_valid = disb_raw.filter("loan_id IS NOT NULL")
branches_valid = branches_raw.filter("branch_id IS NOT NULL")

# ------------------------------------------------------------
# 3. DATA TYPE & ENCODING FIXES
# ------------------------------------------------------------

# Fix comma-separated numeric values
loans_valid = loans_valid.withColumn(
    "loan_amount",
    F.regexp_replace("loan_amount", ",", "").cast("double")
)

# ------------------------------------------------------------
# 4. TIMESTAMP STANDARDIZATION (FINAL SAFE FIX)
# ------------------------------------------------------------
# Handles:
# yyyy-MM-dd HH:mm:ss
# dd-MM-yyyy HH:mm

loans_valid = loans_valid.withColumn(
    "update_ts",
    F.coalesce(
        F.expr("try_to_timestamp(update_ts, 'yyyy-MM-dd HH:mm:ss')"),
        F.expr("try_to_timestamp(update_ts, 'dd-MM-yyyy HH:mm')")
    )
)

customers_valid = customers_valid.withColumn(
    "update_ts",
    F.coalesce(
        F.expr("try_to_timestamp(update_ts, 'yyyy-MM-dd HH:mm:ss')"),
        F.expr("try_to_timestamp(update_ts, 'dd-MM-yyyy HH:mm')")
    )
)

disb_valid = disb_valid.withColumn(
    "update_ts",
    F.coalesce(
        F.expr("try_to_timestamp(update_ts, 'yyyy-MM-dd HH:mm:ss')"),
        F.expr("try_to_timestamp(update_ts, 'dd-MM-yyyy HH:mm')")
    )
)

# ------------------------------------------------------------
# 5. DEDUPLICATION (DETERMINISTIC)
# ------------------------------------------------------------

# Customers: latest record per customer
w_cust = Window.partitionBy("customer_id").orderBy(F.col("update_ts").desc_nulls_last())
customers_clean = (
    customers_valid
    .withColumn("rn", F.row_number().over(w_cust))
    .filter("rn = 1")
    .drop("rn")
)

# Loans: latest record per loan
w_loan = Window.partitionBy("loan_id").orderBy(F.col("update_ts").desc_nulls_last())
loans_clean = (
    loans_valid
    .withColumn("rn", F.row_number().over(w_loan))
    .filter("rn = 1")
    .drop("rn")
)

# Disbursements: latest per loan
w_disb = Window.partitionBy("loan_id").orderBy(F.col("update_ts").desc_nulls_last())
disb_clean = (
    disb_valid
    .withColumn("rn", F.row_number().over(w_disb))
    .filter("rn = 1")
    .drop("rn")
)

# ------------------------------------------------------------
# 6. CATEGORICAL STANDARDIZATION
# ------------------------------------------------------------
loans_clean = loans_clean.withColumn(
    "status",
    F.upper(F.trim("status"))
)

valid_status = ["APPROVED", "PENDING", "DISBURSED", "CLOSED"]
loans_clean = loans_clean.filter(F.col("status").isin(valid_status))

# ------------------------------------------------------------
# 7. REFERENTIAL INTEGRITY
# ------------------------------------------------------------
loans_clean = loans_clean.join(
    branches_valid.select("branch_id"),
    "branch_id",
    "inner"
)

# ------------------------------------------------------------
# 8. ADD STANDARDIZED METADATA (OPTIONAL)
# ------------------------------------------------------------
customers_clean = customers_clean.withColumn("record_source", F.lit("CUSTOMER_SYS"))
loans_clean = loans_clean.withColumn("record_source", F.lit("LOAN_SYS"))
disb_clean = disb_clean.withColumn("record_source", F.lit("DISBURSEMENT_SYS"))

# ------------------------------------------------------------
# 9. FINAL VALIDATION CHECKS
# ------------------------------------------------------------

def find_duplicate_columns(df):
    return [c for c, n in Counter(df.columns).items() if n > 1]

print("Customer duplicate columns:", find_duplicate_columns(customers_clean))
print("Loan duplicate columns:", find_duplicate_columns(loans_clean))
print("Disbursement duplicate columns:", find_duplicate_columns(disb_clean))

print("Final record counts")
print("Customers:", customers_clean.count())
print("Loans:", loans_clean.count())
print("Disbursements:", disb_clean.count())
print("Branches:", branches_valid.count())

# ============================================================
# OUTPUT DATAFRAMES (SILVER-READY)
# customers_clean
# loans_clean
# disb_clean
# branches_valid
# ============================================================


Customer duplicate columns: []
Loan duplicate columns: []
Disbursement duplicate columns: []
Final record counts
Customers: 30
Loans: 41
Disbursements: 29
Branches: 8


In [0]:
assert customers_clean.filter("customer_id IS NULL").count() == 0, "NULL customer_id in Silver"
assert loans_clean.filter("loan_id IS NULL OR customer_id IS NULL").count() == 0, "NULL keys in Silver loans"
assert disb_clean.filter("loan_id IS NULL").count() == 0, "NULL loan_id in Silver disbursements"
assert branches_valid.filter("branch_id IS NULL").count() == 0, "NULL branch_id in Silver branches"


In [0]:
assert customers_clean.groupBy("customer_id").count().filter("count > 1").count() == 0, "Duplicate customers in Silver"
assert loans_clean.groupBy("loan_id").count().filter("count > 1").count() == 0, "Duplicate loans in Silver"


In [0]:
invalid_branch_refs = loans_clean.join(
    branches_valid.select("branch_id"),
    "branch_id",
    "left_anti"
).count()

assert invalid_branch_refs == 0, "Broken branch reference in Silver loans"


In [0]:
assert loans_clean.filter("loan_amount < 0").count() == 0, "Negative loan amount found"


In [0]:
customers_clean.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("silver_customers")

loans_clean.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("silver_loans")

disb_clean.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("silver_disbursements")

branches_valid.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("silver_branches")


In [0]:
spark.table("silver_customers").count()
spark.table("silver_loans").count()
spark.table("silver_disbursements").count()
spark.table("silver_branches").count()


8