In [0]:
# Path to the raw loans JSON file.
loans_json_path = "/Volumes/hdfc_data_mentor/etl/shrutivolume/loans.json"

# Load the file into the raw DataFrame that the profiling cells below inspect.
loans_raw = spark.read.json(loans_json_path)



In [0]:
# Print the column names, data types, and nullability of the raw frame.
loans_raw.printSchema()


root
 |-- branch_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- loan_amount: string (nullable = true)
 |-- loan_id: string (nullable = true)
 |-- loan_type: string (nullable = true)
 |-- origination_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tenure_months: long (nullable = true)
 |-- update_ts: string (nullable = true)



In [0]:
# Total number of rows in the raw frame; the bare name on the last line displays it.
total_rows = loans_raw.count()
total_rows


42

In [0]:
# Importing `sum` by name from pyspark.sql.functions would shadow Python's
# built-in sum() for every later cell, so the Spark functions are reached
# through the `F` alias instead. `col` is still imported by name because
# later cells call it directly.
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Build a single-row frame holding, for each column, the number of rows in
# which that column is NULL. isNull() yields a boolean, which is cast to
# 0/1 so it can be summed.
null_summary = loans_raw.select([
    F.sum(F.col(c).isNull().cast("int")).alias(c) for c in loans_raw.columns
])

null_summary.show(truncate=False)


+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
|branch_id|customer_id|interest_rate|loan_amount|loan_id|loan_type|origination_date|status|tenure_months|update_ts|
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
|0        |1          |0            |0          |0      |0        |0               |0     |0            |0        |
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+



In [0]:
# Show every row where customer_id or loan_id is NULL.
missing_key = col("customer_id").isNull() | col("loan_id").isNull()
loans_raw.where(missing_key).show(truncate=False)


+---------+-----------+-------------+-----------+-------+---------+----------------+--------+-------------+-------------------+
|branch_id|customer_id|interest_rate|loan_amount|loan_id|loan_type|origination_date|status  |tenure_months|update_ts          |
+---------+-----------+-------------+-----------+-------+---------+----------------+--------+-------------+-------------------+
|BR103    |NULL       |11.0         |500,000    |L50042 |Personal |2025-11-04      |APPROVED|24           |2025-11-05 00:00:00|
+---------+-----------+-------------+-----------+-------+---------+----------------+--------+-------------+-------------------+



In [0]:
# List any loan_id value that appears on more than one row.
loan_id_counts = loans_raw.groupBy("loan_id").count()
loan_id_counts.where(col("count") > 1).show()


+-------+-----+
|loan_id|count|
+-------+-----+
+-------+-----+



In [0]:
# Number of rows per customer_id, highest count first.
rows_per_customer = loans_raw.groupBy("customer_id").count()
rows_per_customer.orderBy("count", ascending=False).show()


+-----------+-----+
|customer_id|count|
+-----------+-----+
|      C1026|    2|
|      C1021|    2|
|      C1018|    2|
|      C1024|    2|
|      C1014|    2|
|      C1012|    2|
|      C1006|    2|
|      C1023|    2|
|      C1011|    2|
|      C1013|    2|
|      C1017|    2|
|      C1010|    1|
|      C1015|    1|
|      C1029|    1|
|      C1027|    1|
|      C1005|    1|
|      C1002|    1|
|      C1020|    1|
|      C1016|    1|
|      C1022|    1|
+-----------+-----+
only showing top 20 rows


In [0]:
# Display up to 20 distinct raw loan_amount strings without truncation.
distinct_amounts = loans_raw.select("loan_amount").distinct()
distinct_amounts.show(n=20, truncate=False)


+-----------+
|loan_amount|
+-----------+
|1068906    |
|250,291    |
|609285     |
|261919     |
|2421753    |
|708111     |
|349469     |
|2,571,542  |
|874,717    |
|269606     |
|1,507,901  |
|2056019    |
|1435270    |
|2797175    |
|2629669    |
|2794122    |
|727,030    |
|602,913    |
|1418271    |
|2203139    |
+-----------+
only showing top 20 rows


In [0]:
# Show the loans whose raw loan_amount string contains a comma.
has_comma = col("loan_amount").contains(",")
loans_with_commas = loans_raw.where(has_comma)
loans_with_commas.select("loan_id", "loan_amount").show(truncate=False)


+-------+-----------+
|loan_id|loan_amount|
+-------+-----------+
|L50003 |727,030    |
|L50005 |1,914,865  |
|L50006 |1,507,901  |
|L50010 |1,751,301  |
|L50012 |2,764,833  |
|L50015 |2,975,871  |
|L50020 |874,717    |
|L50023 |602,913    |
|L50024 |250,291    |
|L50025 |2,345,326  |
|L50032 |2,571,542  |
|L50033 |2,431,087  |
|L50036 |1,686,846  |
|L50037 |2,885,601  |
|L50039 |2,778,717  |
|L50040 |523,372    |
|L50042 |500,000    |
+-------+-----------+



In [0]:
# Display up to 10 distinct raw update_ts strings without truncation.
distinct_update_ts = loans_raw.select("update_ts").distinct()
distinct_update_ts.show(n=10, truncate=False)


+-------------------+
|update_ts          |
+-------------------+
|2025-12-04 00:00:00|
|2025-11-15 00:00:00|
|2025-11-14 00:00:00|
|2025-11-21 00:00:00|
|2025-11-19 00:00:00|
|2025-11-16 00:00:00|
|2025-12-12 00:00:00|
|2025-12-06 00:00:00|
|2025-12-07 00:00:00|
|2025-11-12 00:00:00|
+-------------------+
only showing top 10 rows


In [0]:
from pyspark.sql.functions import to_timestamp

# Parse update_ts with the expected pattern and show the original string for
# every row whose parsed value is NULL.
# NOTE(review): with ANSI mode enabled, to_timestamp may raise on a malformed
# value instead of returning NULL - confirm the runtime setting.
parsed_ts = to_timestamp(col("update_ts"), "yyyy-MM-dd HH:mm:ss")
with_parsed_ts = loans_raw.withColumn("parsed_ts", parsed_ts)
unparsed_rows = with_parsed_ts.where(col("parsed_ts").isNull())
unparsed_rows.select("update_ts").show(truncate=False)


+---------+
|update_ts|
+---------+
+---------+



In [0]:
# Show the distinct values of loan_type, then of status.
for categorical_column in ("loan_type", "status"):
    loans_raw.select(categorical_column).distinct().show()


+---------+
|loan_type|
+---------+
| Personal|
|     Auto|
|Education|
|     Home|
+---------+

+---------+
|   status|
+---------+
|   CLOSED|
|DISBURSED|
| APPROVED|
|  PENDING|
+---------+



In [0]:
# Status values that are considered valid.
valid_status = ["APPROVED", "PENDING", "DISBURSED", "CLOSED"]

# Show rows whose status is not one of the valid values. A NULL status has to
# be tested explicitly: `~isin(...)` evaluates to NULL for a NULL input, and
# filter() keeps only rows where the predicate is true, so without the
# isNull() branch such rows would be silently left out of this check.
invalid_status = col("status").isNull() | ~col("status").isin(valid_status)

loans_raw.filter(invalid_status).show()


+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
|branch_id|customer_id|interest_rate|loan_amount|loan_id|loan_type|origination_date|status|tenure_months|update_ts|
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+



In [0]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for interest_rate.
loans_raw.select("interest_rate").summary().show()


+-------+------------------+
|summary|     interest_rate|
+-------+------------------+
|  count|                42|
|   mean|10.140476190476189|
| stddev|1.4347702329538685|
|    min|               8.5|
|    25%|               8.5|
|    50%|              10.5|
|    75%|              11.0|
|    max|              12.5|
+-------+------------------+



In [0]:
# Show rows whose tenure_months is zero or negative, or greater than 360.
tenure = col("tenure_months")
loans_raw.where((tenure <= 0) | (tenure > 360)).show()


+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
|branch_id|customer_id|interest_rate|loan_amount|loan_id|loan_type|origination_date|status|tenure_months|update_ts|
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+
+---------+-----------+-------------+-----------+-------+---------+----------------+------+-------------+---------+



In [0]:
from pyspark.sql.functions import regexp_replace

# Remove commas from the loan_amount string, then cast the result to double
# and store it in a new column, loan_amount_clean.
amount_without_commas = regexp_replace(col("loan_amount"), ",", "")
loans_num = loans_raw.withColumn(
    "loan_amount_clean", amount_without_commas.cast("double")
)


In [0]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for the numeric loan amount.
loans_num.select("loan_amount_clean").summary().show()


+-------+------------------+
|summary| loan_amount_clean|
+-------+------------------+
|  count|                42|
|   mean|1601841.0238095238|
| stddev|  909547.596054844|
|    min|          250291.0|
|    25%|          708111.0|
|    50%|         1686846.0|
|    75%|         2421753.0|
|    max|         2975871.0|
+-------+------------------+



In [0]:
# Approximate first and third quartiles of the cleaned amount (relative error 0.01).
q1, q3 = loans_num.approxQuantile("loan_amount_clean", [0.25, 0.75], 0.01)
iqr = q3 - q1

# Fences placed 1.5 * IQR below the first quartile and above the third.
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Keep only the rows whose amount falls outside the fences.
amount = col("loan_amount_clean")
outliers = loans_num.where((amount < lower_fence) | (amount > upper_fence))

outliers.select("loan_id", "loan_amount_clean").show(truncate=False)


+-------+-----------------+
|loan_id|loan_amount_clean|
+-------+-----------------+
+-------+-----------------+

