In [0]:
# DBFS locations of the Lending Club datasets referenced by this notebook.
# Every path spells out the "dbfs:" scheme so that all four are written the same
# way and none of them relies on an implicit default filesystem.
customers_data_path = "dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_parquet"
loans_path = "dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_data_parquet"
# This path was previously the only one written without the "dbfs:" scheme.
loans_repayment_path = "dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet"
# Unlike the three paths above, this one points into the raw area, not the cleaned one.
loans_default_path = "dbfs:/FileStore/tables/lendingclubproject/raw/loans_defaulters_csv"

In [0]:
# Load the cleaned customers dataset. The location comes from the shared
# `customers_data_path` constant defined in the first cell, so the path is
# maintained in one place instead of being repeated as a literal.
customers_df = spark.read.parquet(customers_data_path)


In [0]:
# Read the cleaned customers Parquet dataset (path constant defined in the first cell).
customers_df = spark.read.parquet(customers_data_path)

# Reshape the data so it lines up with the lending_club.customers DDL, which is
# laid over the directory written below:
#   * ingest_date is declared as STRING there, so cast it.
#   * the zip code column is declared as address_zipcode, while the source data
#     names it address_zip_code (see the customers_delta preview in this notebook).
#     Without the rename the declared column has no matching column in the files.
#     withColumnRenamed is a no-op when the source column is absent, so this is
#     safe even if the upstream data already uses the DDL's name.
customers_string_df = (
    customers_df
    .withColumn("ingest_date", customers_df["ingest_date"].cast("string"))
    .withColumnRenamed("address_zip_code", "address_zipcode")
)

# Write the reshaped copy to its own directory, replacing any previous copy.
customers_string_df.write.mode("overwrite").parquet("dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_parquet_string")



In [0]:
# Step 1: make sure the lending_club schema exists before registering tables in it.
create_schema_sql = """
CREATE SCHEMA IF NOT EXISTS lending_club
"""
spark.sql(create_schema_sql)

# Step 2: register the external customers table over the string-typed Parquet copy.
# IF NOT EXISTS means an already-registered table definition is left as it is.
# NOTE(review): the customers_delta preview in this notebook shows the source
# column as address_zip_code, while this DDL declares address_zipcode. Confirm
# the files at LOCATION expose the declared name.
create_customers_table_sql = """
CREATE EXTERNAL TABLE IF NOT EXISTS lending_club.customers(
    member_id STRING,
    emp_title STRING,
    emp_length INT,
    home_ownership STRING,
    annual_income FLOAT,
    address_state STRING,
    address_zipcode STRING,
    address_country STRING,
    grade STRING,
    sub_grade STRING,
    verification_status STRING,
    total_high_credit_limit FLOAT,
    application_type STRING,
    join_annual_income FLOAT,
    verification_status_joint STRING,
    ingest_date STRING
)
STORED AS PARQUET
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_parquet_string'
"""
spark.sql(create_customers_table_sql)


Out[30]: DataFrame[]

In [0]:
# Load the cleaned customers Parquet dataset.
customers_df = spark.read.parquet("dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_parquet")

# Persist the same rows in Delta format, replacing whatever is already stored there.
delta_writer = customers_df.write.format("delta").mode("overwrite")
delta_writer.save("dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_delta")

# Register that directory as a table. No column list is declared here; only the
# format and the location are given.
register_customers_delta_sql = """
CREATE TABLE  if not exists lending_club.customers_delta
USING DELTA
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/customers_data_delta'
"""
spark.sql(register_customers_delta_sql)


Out[31]: DataFrame[]

In [0]:
%sql
-- Preview five rows of the customers Delta table.
SELECT * FROM lending_club.customers_delta LIMIT 5

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zip_code,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
077e913b4cc781e0dd14a5cdfcacd74ea1ec3c33361eb139910b98a9565f1dc8,General Manager,5,RENT,60000.0,VA,234xx,USA,B,B5,Not Verified,63700.0,Individual,,,2025-09-11T05:42:04.623+0000
a44e21e0c08364bdb5be50add96cef6d02974e70f63510912bb8bc4c4baa3a2e,Teacher,2,MORTGAGE,60000.0,IL,608xx,USA,A,A5,Not Verified,144106.0,Individual,,,2025-09-11T05:42:04.623+0000
f7775515526463b1c7a7641683e10c74beee2516b0831d8cb7fd989220137725,Nurse Educator,3,RENT,150000.0,CA,900xx,USA,B,B1,Not Verified,118480.0,Individual,,,2025-09-11T05:42:04.623+0000
4bb6598efcef04c07b40b1831bf58e763c5e75ca5885ddbf024c55ab3af1d7ee,Senior business analyst,4,RENT,80000.0,GA,303xx,USA,B,B2,Not Verified,124337.0,Individual,,,2025-09-11T05:42:04.623+0000
b779f27d707ec82e5bdd809a423dc361fde8ad35eabf9853730b23a216f18030,Nurse,7,MORTGAGE,76000.0,NC,281xx,USA,B,B3,Source Verified,410381.0,Individual,,,2025-09-11T05:42:04.623+0000


# Creating tables for the loans data

In [0]:
# Register the external loans_repayments table; every date-like column is declared as a string.
# NOTE(review): no cell visible in this notebook writes to this LOCATION, and the
# row-count summary at the end of the notebook reports 0 rows for this table.
# Confirm the directory is populated elsewhere.
create_loans_repayments_sql = """
CREATE EXTERNAL TABLE if not exists  lending_club.loans_repayments(
    loan_id string,
    total_principal_received float,
    total_interest_received float,
    total_late_fee_received float,
    total_payment_received float,
    last_payment_amount float,
    last_payment_date string,
    next_payment_date string,
    ingest_date string
)
STORED AS PARQUET
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_repayments_parquet_string'
"""
spark.sql(create_loans_repayments_sql)


Out[52]: DataFrame[]

In [0]:
# Register the external loans table; issue_date and ingest_date are declared as strings.
# NOTE(review): no cell visible in this notebook writes to this LOCATION, and the
# row-count summary at the end of the notebook reports 0 rows for this table.
# Confirm the directory is populated elsewhere.
create_loans_sql = """
CREATE EXTERNAL TABLE  if not exists lending_club.loans(
    loan_id string,
    member_id string,
    loan_amount float,
    funded_amount float,
    loan_term_years int,
    interest_rate float,
    monthly_installment float,
    issue_date string,
    loan_status string,
    loan_purpose string,
    loan_title string,
    ingest_date string
)
STORED AS PARQUET
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_parquet_string'
"""
spark.sql(create_loans_sql)


Out[53]: DataFrame[]

In [0]:
# Register the external table holding per-member delinquency figures.
# NOTE(review): no cell visible in this notebook writes to this LOCATION, and the
# row-count summary at the end of the notebook reports 0 rows for this table.
# Confirm the directory is populated elsewhere.
create_defaulters_delinq_sql = """
CREATE EXTERNAL TABLE  if not exists lending_club.loans_defaulters_delinq(
    member_id string,
    delinq_2yrs int,
    delinq_amnt float,
    mths_since_last_delinq int
)
STORED AS PARQUET
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_defaulters_delinq_parquet'
"""
spark.sql(create_defaulters_delinq_sql)


Out[54]: DataFrame[]

In [0]:
# Register the external table holding per-member public-record and enquiry counts.
# NOTE(review): no cell visible in this notebook writes to this LOCATION, and the
# row-count summary at the end of the notebook reports 0 rows for this table.
# Confirm the directory is populated elsewhere.
create_defaulters_rec_enq_sql = """
CREATE EXTERNAL TABLE  if not exists lending_club.loans_defaulters_detail_rec_enq(
    member_id string,
    pub_rec int,
    pub_rec_bankruptcies int,
    inq_last_6mths int
)
STORED AS PARQUET
LOCATION 'dbfs:/FileStore/tables/lendingclubproject/cleaned/loans_defaulters_detail_records_enq_parquet'
"""
spark.sql(create_defaulters_rec_enq_sql)


Out[55]: DataFrame[]

In [0]:
# Print everything registered under lending_club, with names shown untruncated.
lending_club_tables = spark.sql("SHOW TABLES IN lending_club")
lending_club_tables.show(truncate=False)


+------------+-------------------------------+-----------+
|database    |tableName                      |isTemporary|
+------------+-------------------------------+-----------+
|lending_club|customers                      |false      |
|lending_club|customers_delta                |false      |
|lending_club|customers_loan_t               |false      |
|lending_club|customers_loan_v               |false      |
|lending_club|loans                          |false      |
|lending_club|loans_defaulters_delinq        |false      |
|lending_club|loans_defaulters_detail_rec_enq|false      |
|lending_club|loans_repayments               |false      |
+------------+-------------------------------+-----------+



In [0]:
# (Re)define the consolidated customer/loan view.
# customers is the driving table; loans and both defaulters tables are
# left-joined on member_id, and loans_repayments is left-joined on loan_id.
# NOTE(review): if either defaulters table holds more than one row per
# member_id, these joins multiply rows. Confirm the keys are unique.
create_customers_loan_view_sql = """
CREATE OR REPLACE VIEW lending_club.customers_loan_v AS
SELECT
    l.loan_id,
    c.member_id,
    c.emp_title,
    c.emp_length,
    c.home_ownership,
    c.annual_income,
    c.address_state,
    c.address_zipcode,
    c.address_country,
    c.grade,
    c.sub_grade,
    c.verification_status,
    c.total_high_credit_limit,
    c.application_type,
    c.join_annual_income,
    c.verification_status_joint,
    l.loan_amount,
    l.funded_amount,
    l.loan_term_years,
    l.interest_rate,
    l.monthly_installment,
    l.issue_date,
    l.loan_status,
    l.loan_purpose,
    r.total_principal_received,
    r.total_interest_received,
    r.total_late_fee_received,
    r.last_payment_date,
    r.next_payment_date,
    d.delinq_2yrs,
    d.delinq_amnt,
    d.mths_since_last_delinq,
    e.pub_rec,
    e.pub_rec_bankruptcies,
    e.inq_last_6mths
FROM lending_club.customers c
LEFT JOIN lending_club.loans l
    ON c.member_id = l.member_id
LEFT JOIN lending_club.loans_repayments r
    ON l.loan_id = r.loan_id
LEFT JOIN lending_club.loans_defaulters_delinq d
    ON c.member_id = d.member_id
LEFT JOIN lending_club.loans_defaulters_detail_rec_enq e
    ON c.member_id = e.member_id
"""
spark.sql(create_customers_loan_view_sql)


Out[57]: DataFrame[]

In [0]:
# Materialize the consolidated customer/loan view into a table.
#
# Two deliberate changes compared with a plain "CREATE TABLE IF NOT EXISTS ... AS SELECT <columns>":
#   * OR REPLACE instead of IF NOT EXISTS: with IF NOT EXISTS the snapshot is built
#     once and never rebuilt, so it silently goes stale when the underlying tables
#     change. Re-running this cell now rebuilds the snapshot.
#   * The column list and joins are taken from lending_club.customers_loan_v (which
#     this notebook defines with the same SELECT) instead of being repeated here,
#     so the view and the table cannot drift apart. The view must exist first.
#
# NOTE(review): CREATE OR REPLACE TABLE needs a table format that supports it
# (e.g. Delta). The result columns recorded for this cell (num_affected_rows,
# num_inserted_rows) suggest the table is already created as Delta. Confirm the
# default table format before relying on this.
spark.sql("""
CREATE OR REPLACE TABLE lending_club.customers_loan_t
AS
SELECT * FROM lending_club.customers_loan_v
""")


Out[61]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# List everything registered under the lending_club schema.
# `tables_df` is kept as the unfiltered listing because a later cell reads it.
tables_df = spark.sql("SHOW TABLES IN lending_club")

# SHOW TABLES can also list session-scoped temporary views (isTemporary = true).
# Those do not live in lending_club, so qualifying them with the schema name
# would make REFRESH TABLE fail. Skip them.
persistent_table_names = [
    entry["tableName"] for entry in tables_df.collect() if not entry["isTemporary"]
]

# Invalidate Spark's cached metadata/file listing for each table so that
# subsequent reads see the files currently present at each location.
for table_name in persistent_table_names:
    print(f"Refreshing table: {table_name}")
    spark.sql(f"REFRESH TABLE lending_club.{table_name}")


Refreshing table: customers
Refreshing table: customers_delta
Refreshing table: customers_loan_t
Refreshing table: customers_loan_v
Refreshing table: loans
Refreshing table: loans_defaulters_delinq
Refreshing table: loans_defaulters_detail_rec_enq
Refreshing table: loans_repayments


In [0]:
# Print everything registered under lending_club, with names shown untruncated.
lending_club_tables = spark.sql("SHOW TABLES IN lending_club")
lending_club_tables.show(truncate=False)



+------------+-------------------------------+-----------+
|database    |tableName                      |isTemporary|
+------------+-------------------------------+-----------+
|lending_club|customers                      |false      |
|lending_club|customers_delta                |false      |
|lending_club|customers_loan_t               |false      |
|lending_club|customers_loan_v               |false      |
|lending_club|loans                          |false      |
|lending_club|loans_defaulters_delinq        |false      |
|lending_club|loans_defaulters_detail_rec_enq|false      |
|lending_club|loans_repayments               |false      |
+------------+-------------------------------+-----------+



In [0]:
from pyspark.sql import Row

# Re-list the schema here instead of reusing `tables_df` from an earlier cell, so
# this cell does not depend on stale notebook state and picks up tables created
# or dropped since that listing was taken. Temporary views are skipped because
# they are not part of lending_club and cannot be addressed with that prefix.
current_table_names = [
    entry["tableName"]
    for entry in spark.sql("SHOW TABLES IN lending_club").collect()
    if not entry["isTemporary"]
]

# Refresh each table first so the count reflects the files currently on storage,
# then collect one (table, row_count) record per table.
results = []
for table_name in current_table_names:
    spark.sql(f"REFRESH TABLE lending_club.{table_name}")
    row_count = spark.sql(f"SELECT COUNT(*) as cnt FROM lending_club.{table_name}").collect()[0]["cnt"]
    results.append(Row(table=table_name, row_count=row_count))

display(spark.createDataFrame(results))

# An empty table usually means its LOCATION was never populated; call it out
# explicitly rather than leaving a 0 buried in the summary.
empty_tables = [record["table"] for record in results if record["row_count"] == 0]
if empty_tables:
    print(f"WARNING: tables with 0 rows: {', '.join(empty_tables)}")


table,row_count
customers,2260633
customers_delta,2260633
customers_loan_t,2260633
customers_loan_v,2260633
loans,0
loans_defaulters_delinq,0
loans_defaulters_detail_rec_enq,0
loans_repayments,0
