In [0]:
from pyspark.sql.types import StructType

# Explicit schema for the raw loan-defaulters CSV, written as a DDL-style string of
# "column_name TYPE" pairs. It is passed to spark.read.schema(...) when the file is
# loaded, so column types are declared up front rather than inferred.
# NOTE(review): delinq_2yrs is declared INT here and is cast to integer again in the
# processing cell. Confirm the raw file stores it as whole-number text (not e.g. "1.0");
# otherwise those values may not parse as INT and could be read as null.
loan_defaulters_schema = """
member_id STRING,
delinq_2yrs INT,
delinq_amnt FLOAT,
pub_rec INT,
pub_rec_bankruptcies INT,
inq_last_6mths FLOAT,
total_rec_late_fee FLOAT,
mths_since_last_delinq FLOAT,
mths_since_last_record FLOAT
"""



In [0]:
# Load the raw loan-defaulters CSV (first row is a header) using the explicit
# schema declared above instead of letting Spark infer column types.
loans_def_raw_df = (
    spark.read
    .format("csv")
    .schema(loan_defaulters_schema)
    .option("header", True)
    .load("dbfs:/FileStore/tables/lendingclubproject/raw/loans_defaulters_csv")
)

In [0]:

from pyspark.sql.functions import col

# ---------------------------
# Process loans defaulters
# ---------------------------
# Cast delinq_2yrs to integer and replace any missing values in that column with 0.
loans_def_processed_df = (
    loans_def_raw_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .na.fill(0, subset=["delinq_2yrs"])
)

# Expose the processed frame to Spark SQL under the name "loan_defaulters".
loans_def_processed_df.createOrReplaceTempView("loan_defaulters")

print("✅ Loans Defaulters processed and temp view created.")


✅ Loans Defaulters processed and temp view created.


In [0]:
# Sanity check: count rows in the temp view whose delinq_2yrs is still null.
(
    spark
    .sql("select count(*) from loan_defaulters where delinq_2yrs is null ")
    .show()
)

+--------+
|count(1)|
+--------+
|       0|
+--------+



In [0]:
# Members with a delinquency signal: delinq_2yrs > 0 or mths_since_last_delinq > 0.
# mths_since_last_delinq is projected as an integer. The cast is aliased explicitly so
# the output column keeps the name "mths_since_last_delinq" instead of whatever name
# Spark auto-generates for a cast expression; this frame is later written to CSV
# (with header) and Parquet, where that name becomes part of the stored schema.
loans_def_delinq_df = spark.sql(
    """
    select
        member_id,
        delinq_2yrs,
        delinq_amnt,
        int(mths_since_last_delinq) as mths_since_last_delinq
    from loan_defaulters
    where delinq_2yrs > 0
       or mths_since_last_delinq > 0
    """
)

In [0]:
# Member ids having at least one public record, public-record bankruptcy,
# or inquiry in the last six months (any of the three counters above zero).
loans_def_records_enq_df = spark.sql(
    "select member_id from loan_defaulters where pub_rec > 0.0 or pub_rec_bankruptcies > 0.0 or inq_last_6mths > 0.0"
)

In [0]:
# Persist the delinquency subset as CSV with a header row, replacing any previous output.
(
    loans_def_delinq_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("dbfs:/FileStore/tables/lendingclubproject/raw/cleaned/loans_defaulters_deling_csv")
)


In [0]:
# Persist the delinquency subset as Parquet, replacing any previous output.
# The "header" option is CSV-specific; Parquet stores the schema in the file itself,
# so the option was dropped here.
(
    loans_def_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .save("dbfs:/FileStore/tables/lendingclubproject/raw/cleaned/loans_defaulters_deling_parquet")
)

In [0]:
# Persist the public-records / enquiries member list as CSV with a header row,
# replacing any previous output.
(
    loans_def_records_enq_df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save("dbfs:/FileStore/tables/lendingclubproject/raw/cleaned/loans_defaulters_records_enq_csv")
)

In [0]:
# Persist the public-records / enquiries member list as Parquet, replacing any
# previous output. The "header" option is CSV-specific; Parquet stores the schema
# in the file itself, so the option was dropped here.
(
    loans_def_records_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .save("dbfs:/FileStore/tables/lendingclubproject/raw/cleaned/loans_defaulters_records_enq_parquet")
)