In [0]:
# Raw-zone input locations for the lending club project.
# Every path carries the explicit "dbfs:" scheme so all four resolve against
# the same filesystem regardless of which API consumes them.
customers_data_path = "dbfs:/FileStore/tables/lendingclubproject/raw/customers_data_csv"
loans_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv"
loans_repayment_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_repayments_csv"
loans_default_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_defaulters_csv"

In [0]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below, registered under the
# application name "LoansDataProcessing".
spark = (
    SparkSession.builder
    .appName("LoansDataProcessing")
    .getOrCreate()
)


In [0]:
%fs ls "dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv"

path,name,size,modificationTime
dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv/_SUCCESS,_SUCCESS,0,1757401769000
dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv/_committed_5081223458602308933,_committed_5081223458602308933,113,1757401769000
dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv/_started_5081223458602308933,_started_5081223458602308933,0,1757401745000
dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv/part-00000-tid-5081223458602308933-2f1773a8-5d1a-4a1f-897f-f27d7ed6ae93-376-1-c000.csv,part-00000-tid-5081223458602308933-2f1773a8-5d1a-4a1f-897f-f27d7ed6ae93-376-1-c000.csv,374481078,1757401766000


In [0]:
# DDL-style schema applied when reading the raw loans CSV.
# One "<column> <type>" entry per field, joined into a single schema string.
loans_schema = ", ".join(
    (
        "loan_id string",
        "member_id string",
        "loan_amount float",
        "funded_amount float",
        "loan_term_months string",
        "interest_rate float",
        "monthly_installment float",
        "issue_date string",
        "loan_status string",
        "loan_purpose string",
        "loan_title string",
    )
)

In [0]:
# Read the raw loans CSV with the explicit loans_schema; the first line of
# the file is treated as a header row.
# The location comes from loans_path (defined with the other raw paths)
# instead of a repeated literal, so the two can never drift apart.
loans_raw_df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(loans_schema)
    .load(loans_path)
)

In [0]:
# Print the column names and types of the raw loans DataFrame.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_timestamp

# Add an "ingested_date" column holding the current timestamp, then preview
# the first 10 rows.
loans_ingested_df = loans_raw_df.withColumn(
    colName="ingested_date",
    col=current_timestamp(),
)
loans_ingested_df.show(n=10)


+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
| loan_id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|       ingested_date|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|68407277|6d5091b3fcaaeb4ea...|     3600.0|       3600.0|       36 months|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2025-09-11 06:17:...|
|68355089|b5e7938b0a2da4cea...|    24700.0|      24700.0|       36 months|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|2025-09-11 06:17:...|
|68341763|91060b858433e8a61...|    

In [0]:
# Register the ingested DataFrame as the temp view "loans" so it can be
# queried through spark.sql.
loans_ingested_df.createOrReplaceTempView("loans")

In [0]:
# Total number of rows visible through the "loans" view.
spark.sql("select count(*)from loans").show()

+--------+
|count(1)|
+--------+
| 2260701|
+--------+



In [0]:
# Number of rows in the "loans" view whose loan_amount is null.
spark.sql("select count(*)from loans where loan_amount is null ").show()

+--------+
|count(1)|
+--------+
|      33|
+--------+



In [0]:
# Columns that must be populated; a null in any of them causes the row to be
# dropped by the null filter.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

# Keep only rows where every column listed in columns_to_check is non-null.
loans_null_check_df = loans_ingested_df.na.drop(subset=columns_to_check)

In [0]:
# Row count before the null filter, for comparison with loans_null_check_df.
loans_ingested_df.count()

Out[37]: 2260701

In [0]:
# Row count after dropping rows with nulls in columns_to_check.
loans_null_check_df.count()

Out[35]: 2260667

In [0]:

# Re-point the temp view name "loans" at the null-filtered DataFrame; this
# replaces the view previously registered from loans_ingested_df.
loans_null_check_df.createOrReplaceTempView("loans")

In [0]:
from pyspark.sql.functions import regexp_replace, col
# Turn textual terms such as "36 months" into an integer month count:
# remove the literal " months" suffix, then cast what is left to integer.
loans_cleaned_df = loans_null_check_df.withColumn(
    "loan_term_months",
    regexp_replace("loan_term_months", " months", "").cast("integer"),
)

In [0]:
# Spark's round is imported under an alias so that Python's built-in round()
# is not shadowed for the rest of the notebook.
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.functions import round as spark_round

# Derive the loan term in whole years from the integer month count:
# months / 12, rounded to 2 decimals, then cast to integer.
loans_cleaned_df = loans_cleaned_df.withColumn(
    "loan_term_years",
    spark_round(col("loan_term_months") / 12, 2).cast("integer"),
)
loans_cleaned_df.show()

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+---------------+
| loan_id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|       ingested_date|loan_term_years|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+---------------+
|68407277|6d5091b3fcaaeb4ea...|     3600.0|       3600.0|              36|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2025-09-11 06:40:...|              3|
|68355089|b5e7938b0a2da4cea...|    24700.0|      24700.0|              36|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Bu

In [0]:
# Print the schema after cleaning to confirm the new column types.
loans_cleaned_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingested_date: timestamp (nullable = false)
 |-- loan_term_years: integer (nullable = true)



In [0]:
# Recognised loan_purpose values; anything outside this list is re-labelled
# "other" when loans_pur_mod is built.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [0]:
from pyspark.sql.functions import when, col

# Normalise loan_purpose: values present in loan_purpose_lookup are kept
# unchanged, every other value is replaced by the catch-all "other".
loans_pur_mod = loans_cleaned_df.withColumn(
    "loan_purpose",
    when(
        col("loan_purpose").isin(*loan_purpose_lookup),
        col("loan_purpose"),
    ).otherwise("other"),
)

# Preview the first 10 rows of the normalised data.
loans_pur_mod.show(n=10)

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+---------------+
| loan_id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|       ingested_date|loan_term_years|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+---------------+
|68407277|6d5091b3fcaaeb4ea...|     3600.0|       3600.0|              36|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2025-09-11 07:59:...|              3|
|68355089|b5e7938b0a2da4cea...|    24700.0|      24700.0|              36|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Bu

In [0]:
# Persist the cleaned loans data as Parquet in the cleaned zone, overwriting
# whatever a previous run left at that location.
(
    loans_pur_mod.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_data_parquet")
    .save()
)

In [0]:
%fs ls dbfs:/FileStore/tables/lendingclubproject/cleaned/

path,name,size,modificationTime
dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_parquet/,customers_data_parquet/,0,0
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_data_parquet/,loans_data_parquet/,0,0
