In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for the loan-repayments cleaning notebook.
# master("local[*]") runs Spark locally with as many worker threads as cores.
spark = (
    SparkSession.builder
    .appName("LendingClubProject_DC_loan_repayments")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# First pass over the raw repayments CSV: treat the first row as a header and let
# Spark infer the column types so the raw layout can be inspected.
loan_repayment_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Lending_club_project/raw/loans_repayments_data_csv")
)

In [None]:
# Inspect the inferred schema. Per the output below, the three total_rec_* columns are
# inferred as double, while total_pymnt and last_pymnt_amnt come back as strings.
# The amount columns should be floats, so an explicit schema is defined in a later cell.
loan_repayment_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_rec_prncp: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_pymnt_d: string (nullable = true)
 |-- next_pymnt_d: string (nullable = true)



In [4]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Explicit schema for the repayments file: descriptive column names, float amounts,
# and the two payment dates kept as strings. Fields are listed in the same order as
# the raw columns reported by printSchema(); every field is nullable.
loans_repayment_schema = (
    StructType()
    .add("loan_id", StringType(), True)
    .add("total_principal_received", FloatType(), True)
    .add("total_interest_received", FloatType(), True)
    .add("total_late_fee_received", FloatType(), True)
    .add("total_payment_received", FloatType(), True)
    .add("last_payment_amount", FloatType(), True)
    .add("last_payment_date", StringType(), True)
    .add("next_payment_date", StringType(), True)
)

In [7]:
# Re-read the raw CSV, this time applying the explicit schema defined above.
# The inferSchema option is intentionally not set: Spark skips type inference when a
# schema is supplied, so keeping it would only suggest the types are still inferred.
loan_repayment_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(loans_repayment_schema)
    .load("Lending_club_project/raw/loans_repayments_data_csv")
)
# Confirm the renamed columns and float types were applied.
loan_repayment_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [8]:
from pyspark.sql.functions import current_timestamp
# Tag every row with the ingestion timestamp in a new "ingest_date" column.
timestamp_df = loan_repayment_raw_df.withColumn(
    colName="ingest_date",
    col=current_timestamp(),
)


In [9]:
# Expose the timestamped frame to Spark SQL as "loanrepayments".
# createOrReplaceTempView is used instead of createTempView so that re-running this
# cell in the same session does not fail because the view already exists.
timestamp_df.createOrReplaceTempView("loanrepayments")

In [None]:
# Data-quality check: total_payment_received is not expected to be null on any row.
spark.sql(
    "SELECT COUNT(*) FROM loanrepayments WHERE total_payment_received IS NULL"
).show()

+--------+
|count(1)|
+--------+
|       7|
+--------+



In [13]:
# Look at the offending rows to see what a null total_payment_received looks like.
spark.sql(
    "SELECT * FROM loanrepayments WHERE total_payment_received IS NULL"
).show()

+--------------------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|             loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------------------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|Total amount fund...|                    null|                   null|                   null|                  null|               null|             null|             null|2025-08-26 01:30:...|
|Total amount fund...|                    null|                   null|                   null|                  null|               null|             null|             null|2025-08-26 01:30:...|
|Total amount fund..

In [26]:
# Amount columns that must all be populated for a row to be kept.
columns_check = [
    "total_principal_received",
    "total_interest_received",
    "total_late_fee_received",
    "total_payment_received",
    "last_payment_amount",
]

In [28]:
# Drop every row that has a null in any of the amount columns listed in columns_check.
filtered_df = timestamp_df.dropna(subset=columns_check)

# Point the "loanrepayments" view at the filtered data and re-run the null check;
# it should now return no rows.
filtered_df.createOrReplaceTempView("loanrepayments")
spark.sql(
    "SELECT * FROM loanrepayments WHERE total_payment_received IS NULL"
).show()

+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
|loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|ingest_date|
+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+



In [46]:
# Consistency check: count rows where a principal was received but the total payment is 0.
spark.sql(
    "SELECT count(*) FROM loanrepayments WHERE total_payment_received = 0.0 AND  total_principal_received !=0.0"
).show()

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [None]:
#Not needed but can use when there are issues with these columns calculation
from pyspark.sql.functions import when, col

# Rows whose total is 0 even though some principal was received.
needs_total_recompute = (col("total_payment_received") == 0.0) & (
    col("total_principal_received") != 0.0
)

# Total rebuilt from its components: principal + interest + late fees.
recomputed_total = (
    col("total_principal_received")
    + col("total_interest_received")
    + col("total_late_fee_received")
)

# Replace the total only on the inconsistent rows; all other rows keep their value.
total_payment_fixed_df = filtered_df.withColumn(
    "total_payment_received",
    when(needs_total_recompute, recomputed_total).otherwise(col("total_payment_received")),
)

In [52]:
# Number of rows whose total payment is still zero after the fix above.
total_payment_fixed_df.where("total_payment_received=0.0").count()

224

In [53]:
# Keep only the rows with a non-zero total payment.
total_payment_fixed_df1 = total_payment_fixed_df.where("total_payment_received!=0.0")

In [57]:
# last_payment_date is expected to hold a real date, not 0 or null:
# list the rows where it is missing.
total_payment_fixed_df1.where("last_payment_date IS NULL").show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
| loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|67255203|                     0.0|                    0.0|                    0.0|               1229.56|                0.0|             null|             null|2025-08-26 11:37:...|
|68132560|                     0.0|                    0.0|                    0.0|               4459.86|                0.0|             null|             null|2025-08-26 11:37:...|
|67869980|                     0.0|                    0.0|                    0

In [58]:
#last_payment_date 
from pyspark.sql.functions import when, col

# Replace the 0.0 placeholder in last_payment_date with null and leave every other
# value untouched. The comparison must be ==: with != the placeholder would be kept
# and the non-zero values would be the ones considered for nulling.
# NOTE(review): the column is a string, so Spark casts it for the numeric comparison;
# confirm the placeholder is stored as 0.0 (or 0) in the raw file.
#
# The input is total_payment_fixed_df1 rather than filtered_df, so that the rows with a
# zero total payment removed earlier stay out of the cleaned output.
payment_date_fixed_df = total_payment_fixed_df1.withColumn(
    "last_payment_date",
    when(col("last_payment_date") == 0.0, None).otherwise(col("last_payment_date")),
)

In [59]:
from pyspark.sql.functions import when, col

# Replace the 0.0 placeholder in next_payment_date with null and leave every other
# value untouched. The comparison must be ==: with != the placeholder would be kept
# and the non-zero values would be the ones considered for nulling.
# NOTE(review): the column is a string, so Spark casts it for the numeric comparison;
# confirm the placeholder is stored as 0.0 (or 0) in the raw file.
next_payment_date_fixed = payment_date_fixed_df.withColumn(
    "next_payment_date",
    when(col("next_payment_date") == 0.0, None).otherwise(col("next_payment_date")),
)

In [61]:
# Verify that no row still carries the 0.0 placeholder in next_payment_date.
next_payment_date_fixed.where("next_payment_date = 0.0").show()

+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
|loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|ingest_date|
+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
+-------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+



In [62]:
# Persist the cleaned repayments as Parquet, replacing any previous output at this path.
(
    next_payment_date_fixed.write
    .mode("overwrite")
    .parquet("Lending_club_project/cleaned/loan_repayments_parquet")
)

In [63]:
# Also write a CSV copy with a header row. repartition(1) collapses the data to a
# single partition first, so the output directory holds one part file.
(
    next_payment_date_fixed
    .repartition(1)
    .write
    .mode("overwrite")
    .option("header","True")
    .csv("Lending_club_project/cleaned/csv/loan_repayments_csv")
)