In [0]:
# Raw input locations under the lendingclubproject "raw" folder.
# Every path carries the explicit "dbfs:" scheme so all four are spelled the
# same way; loans_repayment_path previously omitted it.
raw_data_dir = "dbfs:/FileStore/tables/lendingclubproject/raw"
customers_data_path = raw_data_dir + "/customers_data_csv"
loans_path = raw_data_dir + "/loans_data_csv"
loans_repayment_path = raw_data_dir + "/loans_repayments_csv"
loans_default_path = raw_data_dir + "/loans_defaulters_csv"

In [0]:
# Schema string handed to the CSV reader: one "name type" entry per column,
# joined with commas (no spaces after the commas).
loan_rep_schema = ",".join(
    [
        "loan_id string",
        "total_principal_received float",
        "total_interest_received float",
        "total_late_fee_received float",
        "total_payment_received float",
        "last_payment_amount float",
        "last_payment_date string",
        "next_payment_date string",
    ]
)


In [0]:
# Folder holding the raw repayments CSV files (set here with a trailing slash).
loans_repayment_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_repayments_csv/"

# Configure a CSV reader that treats the first row as a header and applies the
# explicit schema string, then load the folder.
repayments_reader = (
    spark.read
    .format("csv")
    .schema(loan_rep_schema)
    .option("header", True)
)
loans_repayment_raw_df = repayments_reader.load(loans_repayment_path)

# Print the column names and types of the loaded DataFrame.
loans_repayment_raw_df.printSchema()


root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Add an "ingested_date" column populated from Spark's current_timestamp().
ingestion_ts = current_timestamp()
loans_repayment_ing = loans_repayment_raw_df.withColumn("ingested_date", ingestion_ts)

In [0]:
# Numeric repayment columns that are checked for nulls: the four
# "total_<kind>_received" columns followed by last_payment_amount.
columns_to_check = [
    "total_" + kind + "_received"
    for kind in ("principal", "interest", "late_fee", "payment")
] + ["last_payment_amount"]


In [0]:
# Drop every row that has a null in any of the checked columns, then show how
# many rows remain.
loans_repay_fil = loans_repayment_ing.na.drop(subset=columns_to_check)
loans_repay_fil.count()

Out[14]: 2260498

In [0]:
from pyspark.sql.functions import when, col

# A row needs repair when some principal was received but the recorded total
# payment is exactly zero.
payment_needs_repair = (
    (col("total_principal_received") != 0.0)
    & (col("total_payment_received") == 0.0)
)

# Replacement value: principal + interest + late fees.
payment_components_sum = (
    col("total_principal_received")
    + col("total_interest_received")
    + col("total_late_fee_received")
)

# Overwrite total_payment_received only on the rows flagged above; all other
# rows keep their original value.
loans_payments_fixed_df = loans_repay_fil.withColumn(
    "total_payment_received",
    when(payment_needs_repair, payment_components_sum).otherwise(
        col("total_payment_received")
    ),
)

loans_payments_fixed_df.show(10)


+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
| loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|       ingested_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             null|2025-09-11 08:21:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             null|2025-09-11 08:21:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
from pyspark.sql.functions import col

# Keep only rows whose total_payment_received is not exactly zero.
has_nonzero_payment = col("total_payment_received") != 0.0
loans_payments_fixed2_df = loans_payments_fixed_df.where(has_nonzero_payment)
loans_payments_fixed2_df.show(10)


+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
| loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|       ingested_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             null|2025-09-11 08:23:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             null|2025-09-11 08:23:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
from pyspark.sql.functions import when, col

# In both date columns the literal string "0.0" is replaced with null; any
# other value is kept unchanged.
last_date = col("last_payment_date")
next_date = col("next_payment_date")

# Step 1: clean last_payment_date.
loans_payments_ldate_fixed_df = loans_payments_fixed2_df.withColumn(
    "last_payment_date",
    when(last_date == "0.0", None).otherwise(last_date),
)

# Step 2: clean next_payment_date on top of the step-1 result.
loans_payments_ndate_fixed_df = loans_payments_ldate_fixed_df.withColumn(
    "next_payment_date",
    when(next_date == "0.0", None).otherwise(next_date),
)

loans_payments_ndate_fixed_df.show(10)


+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
| loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|       ingested_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             null|2025-09-11 08:25:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             null|2025-09-11 08:25:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
# Write the cleaned repayments DataFrame as Parquet to the "cleaned" folder,
# using overwrite mode for the target path.
(
    loans_payments_ndate_fixed_df.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet")
    .save()
)


In [0]:
# List the contents of the cleaned repayments Parquet folder and render the listing.
cleaned_repayments_listing = dbutils.fs.ls(
    "dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet"
)
display(cleaned_repayments_listing)


path,name,size,modificationTime
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/_SUCCESS,_SUCCESS,0,1757579395000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/_committed_4055122626246095289,_committed_4055122626246095289,824,1757579395000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/_started_4055122626246095289,_started_4055122626246095289,0,1757579382000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00000-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-392-1-c000.snappy.parquet,part-00000-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-392-1-c000.snappy.parquet,6328652,1757579392000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00001-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-393-1-c000.snappy.parquet,part-00001-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-393-1-c000.snappy.parquet,6370806,1757579394000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00002-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-394-1-c000.snappy.parquet,part-00002-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-394-1-c000.snappy.parquet,6300151,1757579394000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00003-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-395-1-c000.snappy.parquet,part-00003-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-395-1-c000.snappy.parquet,6537877,1757579394000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00004-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-396-1-c000.snappy.parquet,part-00004-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-396-1-c000.snappy.parquet,6226586,1757579394000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00005-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-397-1-c000.snappy.parquet,part-00005-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-397-1-c000.snappy.parquet,6438489,1757579394000
dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet/part-00006-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-398-1-c000.snappy.parquet,part-00006-tid-4055122626246095289-53742a91-c3b6-47cc-b273-35bf0454372a-398-1-c000.snappy.parquet,6330621,1757579394000
