In [0]:
# Raw-zone input locations for the Lending Club project.
# Every path carries the explicit `dbfs:` scheme so all four are spelled the
# same way and resolve identically regardless of the API that consumes them.
customers_data_path = "dbfs:/FileStore/tables/lendingclubproject/raw/customers_data_csv"
loans_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_data_csv"
loans_repayment_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_repayments_csv"
loans_default_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_defaulters_csv"

In [0]:
from pyspark.sql.types import StructType
from pyspark.sql.functions import col, lit

# ---------------------------
# 1️⃣ Define Schema (Strict)
# ---------------------------
# DDL-style schema string passed to the reader, so column names and types are
# fixed up front instead of being inferred from the file.
loan_defaulters_schema = """
member_id STRING,
delinq_2yrs FLOAT,
delinq_amnt FLOAT,
pub_rec FLOAT,
pub_rec_bankruptcies FLOAT,
inq_last_6mths FLOAT,
total_rec_late_fee FLOAT,
mths_since_last_delinq FLOAT,
mths_since_last_record FLOAT
"""

# ---------------------------
# 2️⃣ Read Raw CSV with Schema
# ---------------------------
# Read from `loans_default_path` (defined in the path-configuration cell at the
# top of the notebook) rather than repeating the literal, so the input location
# has a single source of truth.
loans_def_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loan_defaulters_schema)
    .load(loans_default_path)
)

# Quick sanity check of the applied schema and the first few rows.
print("✅ Raw Data Schema:")
loans_def_raw_df.printSchema()
loans_def_raw_df.show(5)

# ---------------------------
# 3️⃣ Default Missing Values
# ---------------------------
# loans_def_raw_df is read with an explicit schema, so these columns are always
# present; a Python-side `"col" in df.columns` check can never take its
# fallback branch. What can actually be absent is the *value* (null), so the
# 0.0 default is applied to nulls instead.
# The filters below only test `> 0.0`, which is false for both null and 0.0,
# so the rows selected downstream in this notebook are unchanged.
record_enquiry_cols = ["pub_rec", "pub_rec_bankruptcies", "inq_last_6mths"]
loans_def_processed_df = loans_def_raw_df.fillna(0.0, subset=record_enquiry_cols)

# ---------------------------
# 4️⃣ Create Temp View
# ---------------------------
# Expose the processed frame to Spark SQL under the name used by the queries.
loans_def_processed_df.createOrReplaceTempView("loan_defaulters")

# ---------------------------
# 5️⃣ SQL Transformation 1: Delinquency Filter
# ---------------------------
# Selects rows from the `loan_defaulters` view where delinq_2yrs or
# mths_since_last_delinq is positive; mths_since_last_delinq is cast to INT.
delinq_query = """
SELECT 
  member_id,
  delinq_2yrs,
  delinq_amnt,
  CAST(mths_since_last_delinq AS INT) AS mths_since_last_delinq
FROM loan_defaulters
WHERE delinq_2yrs > 0 OR mths_since_last_delinq > 0
"""
loans_def_delinq_df = spark.sql(delinq_query)

print("✅ Delinquency Records:")
loans_def_delinq_df.show(5)
loans_def_delinq_df.printSchema()

# ---------------------------
# 6️⃣ SQL Transformation 2: Public Records / Inquiries Filter
# ---------------------------
# Selects member_id for rows where any of pub_rec, pub_rec_bankruptcies or
# inq_last_6mths is positive.
records_enq_query = """
SELECT member_id
FROM loan_defaulters
WHERE pub_rec > 0.0 OR pub_rec_bankruptcies > 0.0 OR inq_last_6mths > 0.0
"""
loans_def_records_enq_df = spark.sql(records_enq_query)

print("✅ Public Records / Enquiries Records:")
loans_def_records_enq_df.show(5)

# ---------------------------
# 7️⃣ Write Outputs
# ---------------------------
# Nothing is written from this cell: the dedicated write cell that follows
# persists both DataFrames (CSV and Parquet). Writing the CSVs here as well
# ran the identical overwrite job twice on every "Run All".


✅ Raw Data Schema:
root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|6d5091b3fcaaeb4ea...|        0.0|        0.0|    0.0|                 0.0|           1.0|   

In [0]:
# Cleaned-zone folder plus the (DataFrame, output name) pairs persisted below.
# NOTE(review): "deling" looks like a typo for "delinq"; the folder name is kept
# as-is because other readers may depend on this exact path — confirm before renaming.
cleaned_dir = "dbfs:/FileStore/tables/lendingclubproject/cleaned"
cleaned_outputs = [
    (loans_def_delinq_df, "loans_defaulters_deling"),
    (loans_def_records_enq_df, "loans_defaulters_records_enq"),
]

# ---------------------------
# 7️⃣ Write Outputs (CSV)
# ---------------------------
# Each frame goes to <cleaned_dir>/<name>_csv with a header row, replacing any
# previous contents.
for output_df, output_name in cleaned_outputs:
    csv_writer = output_df.write.option("header", True).mode("overwrite")
    csv_writer.csv(f"{cleaned_dir}/{output_name}_csv")

print("✅ Data written successfully to cleaned CSV folders in DBFS.")

# ---------------------------
# 8️⃣ Write Outputs (Parquet)
# ---------------------------
# Same frames again, this time to <cleaned_dir>/<name>_parquet.
for output_df, output_name in cleaned_outputs:
    parquet_writer = output_df.write.format("parquet").mode("overwrite")
    parquet_writer.save(f"{cleaned_dir}/{output_name}_parquet")

print("✅ Data written successfully to cleaned Parquet folders in DBFS.")


✅ Data written successfully to cleaned CSV folders in DBFS.
✅ Data written successfully to cleaned Parquet folders in DBFS.
