In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("LendingClubProject_DC_loans") \
    .master("local[*]") \
    .getOrCreate()

In [4]:

loan_raw_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("Lending_club_project/raw/loans_data_csv")
loan_raw_df.show(2)

+--------+--------------------+---------+-----------+---------+--------+-----------+--------+-----------+------------------+------------------+
| loan_id|           member_id|loan_amnt|funded_amnt|     term|int_rate|installment| issue_d|loan_status|           purpose|             title|
+--------+--------------------+---------+-----------+---------+--------+-----------+--------+-----------+------------------+------------------+
|68407277|6d5091b3fcaaeb4ea...|   3600.0|     3600.0|36 months|   13.99|     123.03|Dec-2015| Fully Paid|debt_consolidation|Debt consolidation|
|68355089|b5e7938b0a2da4cea...|  24700.0|    24700.0|36 months|   11.99|     820.28|Dec-2015| Fully Paid|    small_business|          Business|
+--------+--------------------+---------+-----------+---------+--------+-----------+--------+-----------+------------------+------------------+
only showing top 2 rows



In [7]:
loan_raw_df.printSchema

<bound method DataFrame.printSchema of DataFrame[loan_id: string, member_id: string, loan_amnt: double, funded_amnt: double, term: string, int_rate: double, installment: double, issue_d: string, loan_status: string, purpose: string, title: string]>

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

loan_schema = StructType([
    StructField("loan_id", StringType(), True),
    StructField("member_id", StringType(), True),
    StructField("loan_amnt", FloatType(), True),
    StructField("funded_amnt", FloatType(), True),
    StructField("loan_term_months", StringType(), True),
    StructField("interest_rate", FloatType(), True),
    StructField("monthly_installment", FloatType(), True),
    StructField("issue_date", StringType(), True),
    StructField("loan_status", StringType(), True),
    StructField("loan_purpose", StringType(), True),
    StructField("loan_title", StringType(), True)
])

In [11]:
#Though we have renamed the columns giving header true will ignore and the schema as a header will be considered
loan_df = spark.read.format("csv").option("header","true").schema(loan_schema).load("Lending_club_project/raw/loans_data_csv")
loan_df.show(2)
loan_df.printSchema()


+--------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+------------------+
| loan_id|           member_id|loan_amnt|funded_amnt|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|        loan_title|
+--------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+------------------+
|68407277|6d5091b3fcaaeb4ea...|   3600.0|     3600.0|       36 months|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|Debt consolidation|
|68355089|b5e7938b0a2da4cea...|  24700.0|    24700.0|       36 months|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|          Business|
+--------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+------------------+
only

In [12]:
from pyspark.sql.functions import current_timestamp

timestamp_df = loan_df.withColumn("ingested_time",current_timestamp())

In [13]:
timestamp_df.createOrReplaceTempView("loantable")

In [14]:
spark.sql("SELECT count(*) FROM loantable WHERE loan_amnt IS NULL").show()

+--------+
|count(1)|
+--------+
|       6|
+--------+



In [15]:
spark.sql("SELECT * FROM loantable WHERE loan_amnt IS NULL").show()

+--------------------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------+----------+--------------------+
|             loan_id|           member_id|loan_amnt|funded_amnt|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|loan_purpose|loan_title|       ingested_time|
+--------------------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------+----------+--------------------+
|Total amount fund...|e3b0c44298fc1c149...|     null|       null|            null|         null|               null|      null|       null|        null|      null|2025-08-25 23:34:...|
|Total amount fund...|e3b0c44298fc1c149...|     null|       null|            null|         null|               null|      null|       null|        null|      null|2025-08-25 23:34:...|
|Total amount fund...|e3b0c44298fc1c149...|     null|       null|          

In [16]:
timestamp_df.printSchema

<bound method DataFrame.printSchema of DataFrame[loan_id: string, member_id: string, loan_amnt: float, funded_amnt: float, loan_term_months: string, interest_rate: float, monthly_installment: float, issue_date: string, loan_status: string, loan_purpose: string, loan_title: string, ingested_time: timestamp]>

In [26]:
from pyspark.sql.functions import col
# droping all the rows if any of the columns has NULL value
checklist_columns = ["loan_amnt", "funded_amnt", "loan_term_months", "interest_rate", "monthly_installment", "issue_date", "loan_status", "loan_purpose", "loan_title"]
loan_cleaned_df = timestamp_df.na.drop(subset=checklist_columns) #if any value is null this will be dropped



In [29]:
from pyspark.sql.functions import col
from functools import reduce

condition = reduce(lambda x, y: x | y, [col(c).isNull() for c in checklist_columns])
loan_cleaned_df.select(checklist_columns).filter(condition).count()
   # If checklist_columns = ["loan_amnt", "funded_amnt", "loan_term_months"]
   # Step 1: [col("loan_amnt").isNull(), col("funded_amnt").isNull(), col("loan_term_months").isNull()]
   # Step 2: reduce combines them as:
   # (col("loan_amnt").isNull() | col("funded_amnt").isNull()) | col("loan_term_months").isNull()

0

In [34]:
#converting the loan_term_months in to integer and year 
from pyspark.sql.functions import regexp_replace,col
loan_modified = loan_cleaned_df.withColumn("loan_term_months",(regexp_replace(col("loan_term_months")," months","").cast("int")/12).cast("int"))\
    .withColumnRenamed("loan_term_months","loan_term_years")
loan_modified.select("loan_term_years").show(1)
loan_modified.schema["loan_term_years"].dataType

+---------------+
|loan_term_years|
+---------------+
|              3|
+---------------+
only showing top 1 row



IntegerType()

In [35]:
loan_modified.createOrReplaceTempView("loantable")

In [None]:
#checking Loan_purpose column to clean to only have the neccessary column 
spark.sql("SELECT loan_purpose,count(*) AS Purpose FROM loantable group by loan_purpose order by Purpose DESC ").show()


+--------------------+-------+
|        loan_purpose|Purpose|
+--------------------+-------+
|  debt_consolidation| 408422|
|         credit_card| 162947|
|    home_improvement|  47874|
|               other|  42247|
|      major_purchase|  15522|
|             medical|   8476|
|                 car|   7066|
|      small_business|   6680|
|              moving|   4812|
|            vacation|   4728|
|               house|   3993|
|    renewable_energy|    400|
|             wedding|      9|
|         educational|      1|
|and also pay off ...|      1|
+--------------------+-------+



In [None]:
# working on loan
from pyspark.sql.functions import col, when
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card", 
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "car",
    "small_business",
    "moving",
    "vacation",
    "house",
    "renewable_energy",
    "wedding",
    "educational"
]
# condition for valid loan purposes
valid_condition = col("loan_purpose").isin(loan_purpose_lookup)

# Apply the logic
df_cleaned = loan_modified.withColumn("loan_purpose", 
                          when(valid_condition, col("loan_purpose"))
                          .otherwise("other"))
df_cleaned.createOrReplaceTempView("loantable")
spark.sql("SELECT loan_purpose,count(*) AS Purpose FROM loantable group by loan_purpose order by Purpose DESC ").show()


+------------------+-------+
|      loan_purpose|Purpose|
+------------------+-------+
|debt_consolidation| 408422|
|       credit_card| 162947|
|  home_improvement|  47874|
|             other|  42248|
|    major_purchase|  15522|
|           medical|   8476|
|               car|   7066|
|    small_business|   6680|
|            moving|   4812|
|          vacation|   4728|
|             house|   3993|
|  renewable_energy|    400|
|           wedding|      9|
|       educational|      1|
+------------------+-------+



In [42]:
df_cleaned.write.format("parquet").mode("overwrite")\
.option("path","Lending_club_project/cleaned/loan_data_parquet").save()

In [43]:
df_cleaned.repartition(1).write.format("csv").mode("overwrite").option("header","True")\
.option("path","Lending_club_project/cleaned/csv/loan_data_csv").save()