In [4]:
print("Spark Version:", spark.version)
print("Hive Catalog Implementation:", spark.conf.get("spark.sql.catalogImplementation"))
print("Warehouse Directory:", spark.conf.get("spark.sql.warehouse.dir"))

# Check if Hive classes are available
try:
    from pyspark.sql import HiveContext
    print("HiveContext is available")
except ImportError:
    print("HiveContext not available")

Spark Version: 3.3.0
Hive Catalog Implementation: in-memory
Warehouse Directory: file:/home/jupyter/spark-warehouse
HiveContext is available


In [6]:
from pyspark.sql import SparkSession
# Create new SparkSession with Hive support
spark = SparkSession.builder \
    .appName("External_table_for_analysis") \
    .master("local[*]") \
    .enableHiveSupport() \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("spark.sql.warehouse.dir", "/home/jupyter/spark-warehouse") \
    .getOrCreate()

# Verify Hive support is now enabled
print("Hive Catalog Implementation:", spark.conf.get("spark.sql.catalogImplementation"))

Hive Catalog Implementation: hive


In [8]:
customer_data_df = spark.read.format("parquet").load("Lending_club_project/cleaned/customers_data_parquet")

In [9]:
customer_data_df.printSchema

<bound method DataFrame.printSchema of DataFrame[member_id: string, emp_title: string, emp_length: int, home_ownership: string, annual_income: float, address_state: string, address_zipcode: string, address_country: string, grade: string, sub_grade: string, verification_status: string, total_high_credit_limit: float, application_type: string, join_annual_income: float, verification_status_joint: string, ingest_date: timestamp]>

In [54]:
spark.sql("CREATE DATABASE lendingclubdata")

AnalysisException: Namespace 'lendingclubdata' already exists

In [None]:

# Create in default database
spark.sql("""
CREATE EXTERNAL TABLE lendingclubdata.customers(
    member_id string, 
    emp_title string, 
    emp_length int, 
    home_ownership string, 
    annual_income float, 
    address_state string, 
    address_zipcode string, 
    address_country string,
    grade string, 
    sub_grade string, 
    verification_status string, 
    total_high_credit_limit float, 
    application_type string,
    join_annual_income float, 
    verification_status_joint string, 
    ingest_date timestamp
) 
STORED AS PARQUET 
LOCATION 'Lending_club_project/cleaned/customers_data_parquet'
""")



AnalysisException: Table or view 'customers' already exists in database 'lendingclubdata'

In [None]:
# Here I need to create EXTERNAL TABLE as on above code but as i am working on local and having issues with hive metastore creating table which i can access across my cluster
spark.sql("CREATE DATABASE IF NOT EXISTS lendingclubdata")

spark.sql("""
DROP TABLE IF EXISTS lendingclubdata.customers
""")

spark.sql("""
CREATE TABLE lendingclubdata.customers (
    member_id string, 
    emp_title string, 
    emp_length int, 
    home_ownership string, 
    annual_income float, 
    address_state string, 
    address_zipcode string, 
    address_country string,
    grade string, 
    sub_grade string, 
    verification_status string, 
    total_high_credit_limit float, 
    application_type string,
    join_annual_income float, 
    verification_status_joint string, 
    ingest_date timestamp
) USING PARQUET
LOCATION 'file:////home/jupyter/Lending_club_project/cleaned/customers_data_parquet'
""")

spark.sql("REFRESH TABLE lendingclubdata.customers")

DataFrame[]

In [72]:
spark.sql("SELECT * FROM lendingclubdata.customers").show(2)

+--------------------+--------------------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+--------------------+
|           member_id|           emp_title|emp_length|home_ownership|annual_income|address_state|address_zipcode|address_country|grade|sub_grade|verification_status|total_high_credit_limit|application_type|join_annual_income|verification_status_joint|         ingest_date|
+--------------------+--------------------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+--------------------+
|0d3c568ff6944b11c...|Bookkeeper/Accoun...|        10|      MORTGAGE|      48000.0|           SC|          297xx|            USA|    C|       C5|       Not Verified|               2

In [73]:
spark.sql("DESCRIBE formatted lendingclubdata.customers").show(truncate= False)

+----------------------------+---------------+-------+
|col_name                    |data_type      |comment|
+----------------------------+---------------+-------+
|member_id                   |string         |null   |
|emp_title                   |string         |null   |
|emp_length                  |int            |null   |
|home_ownership              |string         |null   |
|annual_income               |float          |null   |
|address_state               |string         |null   |
|address_zipcode             |string         |null   |
|address_country             |string         |null   |
|grade                       |string         |null   |
|sub_grade                   |string         |null   |
|verification_status         |string         |null   |
|total_high_credit_limit     |float          |null   |
|application_type            |string         |null   |
|join_annual_income          |float          |null   |
|verification_status_joint   |string         |null   |
|ingest_da

In [None]:
#Loans data 
spark.sql("CREATE DATABASE IF NOT EXISTS lendingclubdata")

spark.sql("""
DROP TABLE IF EXISTS lendingclubdata.loans
""")


spark.sql("""
CREATE TABLE lendingclubdata.loans(loan_id string, member_id string, loan_amnt float, funded_amnt float, loan_term_months string,
           interest_rate float, monthly_installment float, issue_date string, loan_status string, 
          loan_purpose string, loan_title string, ingested_time timestamp)
          USING PARQUET LOCATION 'file:////home/jupyter/Lending_club_project/cleaned/loan_data_parquet'
          """)
spark.sql("REFRESH TABLE lendingclubdata.loans")

DataFrame[]

In [75]:
spark.sql("SELECT * FROM lendingclubdata.loans").show(2)

+---------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+------------------+--------------------+
|  loan_id|           member_id|loan_amnt|funded_amnt|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|        loan_title|       ingested_time|
+---------+--------------------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+------------------+--------------------+
|130943487|51989e17caab932ca...|   4800.0|     4800.0|            null|        15.04|             166.49|  Mar-2018| Fully Paid|           medical|  Medical expenses|2025-08-26 00:17:...|
|130514193|043da8398b63b2c44...|  40000.0|    40000.0|            null|        14.07|             932.19|  Mar-2018|    Current|debt_consolidation|Debt consolidation|2025-08-26 00:17:...|
+---------+--------------------+---------+-----------+------

In [76]:
#Loan repayments data
spark.sql("CREATE DATABASE IF NOT EXISTS lendingclubdata")

spark.sql("""
DROP TABLE IF EXISTS lendingclubdata.loan_repayments
""")


spark.sql("""
CREATE TABLE lendingclubdata.loan_repayments(loan_id string,total_principal_received float,total_interest_received float,
          total_late_fee_received float,total_payment_received float,last_payment_amount float,last_payment_date string,
          next_payment_date string,ingest_date timestamp) 
          USING PARQUET LOCATION 'file:////home/jupyter/Lending_club_project/cleaned/loan_repayments_parquet' """)

spark.sql("REFRESH TABLE lendingclubdata.loan_repayments")
spark.sql("SELECT * FROM lendingclubdata.loan_repayments").show()

+---------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|  loan_id|total_principal_received|total_interest_received|total_late_fee_received|total_payment_received|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+---------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|113619662|                  7500.0|                 357.79|                    0.0|              7857.791|            6642.78|         Feb-2018|             null|2025-08-26 11:43:...|
|111985436|                15937.81|                3123.29|                    0.0|               19061.1|            1002.33|         Feb-2019|         Apr-2019|2025-08-26 11:43:...|
|113569389|                10106.07|                3470.43|               

In [78]:

#loan delinquers
spark.sql("CREATE DATABASE IF NOT EXISTS lendingclubdata")

spark.sql("""
DROP TABLE IF EXISTS lendingclubdata.delinq
""")


spark.sql("""
CREATE TABLE lendingclubdata.delinq(member_id string,delinq_2yrs integer,delinq_amnt float,mths_since_last_delinq integer )
          USING PARQUET LOCATION 'file:////home/jupyter/Lending_club_project/cleaned/loans_defaulters_delinq_parquet' """)
spark.sql("REFRESH TABLE lendingclubdata.delinq")
spark.sql("SELECT * FROM lendingclubdata.delinq").show(2)

+--------------------+-----------+-----------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|mths_since_last_delinq|
+--------------------+-----------+-----------+----------------------+
|680fb82b4acb97795...|          0|        0.0|                    24|
|0e0a6f18a90718c2a...|          0|        0.0|                    54|
+--------------------+-----------+-----------+----------------------+
only showing top 2 rows



In [82]:
spark.sql("CREATE DATABASE IF NOT EXISTS lendingclubdata")

spark.sql("""
DROP TABLE IF EXISTS lendingclubdata.loan_defaulters_detail_rec
""")


spark.sql("""
CREATE TABLE lendingclubdata.loan_defaulters_detail_rec(member_id string, pub_rec int, pub_rec_bankruptcies int, inq_last_6mths int) USING PARQUET LOCATION 'file:////home/jupyter/Lending_club_project/cleaned/loans_defaulters_detailed_records_parquet'""")
spark.sql("REFRESH TABLE lendingclubdata.loan_defaulters_detail_rec")
spark.sql("SELECT * FROM lendingclubdata.loan_defaulters_detail_rec").show(2)

+--------------------+-------+--------------------+--------------+
|           member_id|pub_rec|pub_rec_bankruptcies|inq_last_6mths|
+--------------------+-------+--------------------+--------------+
|5c18f413cebed2192...|      1|                   1|             0|
|e98eb408f9863ef59...|      0|                   0|             0|
+--------------------+-------+--------------------+--------------+
only showing top 2 rows



In [13]:
spark.sql("SHOW TABLES FROM lendingclubdata").show(truncate=False)

+---------------+--------------------------+-----------+
|namespace      |tableName                 |isTemporary|
+---------------+--------------------------+-----------+
|lendingclubdata|customers                 |false      |
|lendingclubdata|delinq                    |false      |
|lendingclubdata|loan_defaulters_detail_rec|false      |
|lendingclubdata|loan_repayments           |false      |
|lendingclubdata|loans                     |false      |
+---------------+--------------------------+-----------+



In [14]:
spark.sql("show databases").show()

+---------------+
|      namespace|
+---------------+
|        default|
|lendingclubdata|
+---------------+



In [None]:
#Creating view  customer_complete_view
spark.sql("USE DATABASE lendingclubdata")
spark.sql("""
CREATE OR REPLACE VIEW customer_complete_view AS
SELECT 
    c.member_id,
    c.emp_title,
    c.emp_length,
    c.home_ownership,
    c.annual_income,
    c.address_state,
    c.address_zipcode,
    c.address_country,
    c.grade,
    c.sub_grade,
    c.verification_status,
    c.total_high_credit_limit,
    c.application_type,
    c.join_annual_income,
    c.verification_status_joint,
    
    -- Loans data
    l.loan_id,
    l.loan_amnt,
    l.funded_amnt,
    l.loan_term_months,
    l.interest_rate,
    l.monthly_installment,
    l.issue_date,
    l.loan_status,
    l.loan_purpose,
    l.loan_title,
    
    -- Loan repayments data
    lr.total_principal_received,
    lr.total_interest_received,
    lr.total_late_fee_received,
    lr.total_payment_received,
    lr.last_payment_amount,
    lr.last_payment_date,
    lr.next_payment_date,
    
    -- Delinquency data
    d.delinq_2yrs,
    d.delinq_amnt,
    d.mths_since_last_delinq,
    
    -- Loan defaulters detail data
    ldd.pub_rec,
    ldd.pub_rec_bankruptcies,
    ldd.inq_last_6mths

FROM customers c
LEFT JOIN loans l ON c.member_id = l.member_id
LEFT JOIN loan_repayments lr ON l.loan_id = lr.loan_id
LEFT JOIN delinq d ON c.member_id = d.member_id
LEFT JOIN loan_defaulters_detail_rec ldd ON c.member_id = ldd.member_id;

""")

DataFrame[]

In [18]:
spark.sql("SHOW VIEWS").show(truncate=False)


+---------------+----------------------+-----------+
|namespace      |viewName              |isTemporary|
+---------------+----------------------+-----------+
|lendingclubdata|customer_complete_view|false      |
+---------------+----------------------+-----------+



In [19]:
spark.sql("select * from customer_complete_view").show(3)

+--------------------+--------------------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+--------+---------+-----------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+-----------+----------------------+-------+--------------------+--------------+
|           member_id|           emp_title|emp_length|home_ownership|annual_income|address_state|address_zipcode|address_country|grade|sub_grade|verification_status|total_high_credit_limit|application_type|join_annual_income|verification_status_joint| loan_id|loan_amnt|funded_amnt|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|    

In [31]:
#Identifying the bad data
spark.sql("""
SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.customers GROUP BY member_id  ORDER BY total_cnt DESC;
          """).show(5,truncate=False)

+----------------------------------------------------------------+---------+
|member_id                                                       |total_cnt|
+----------------------------------------------------------------+---------+
|ca5fd93b4f9adf94118962b8d3e3d24e4810d891d1dbcaa0f98e641e16d4b4a2|3        |
|27bdc71bcc167a89e07e981246320dbdaddc8c67354ed022f3e835368da0fb0d|3        |
|ab24d776473f88620dd571bb74ad88889daee02904f82aa07ae91a4c69e6ec58|3        |
|460e1a4a960113bbfd4d3defca860fc4e7387f7b40270d5f78cb469a3baceda1|2        |
|912a3ce5af18a80b2ca3453db9a63d16baeecf27ae01f97bdb062770915ec8ac|2        |
+----------------------------------------------------------------+---------+
only showing top 5 rows



In [33]:
spark.sql(""" SELECT * FROM lendingclubdata.customers
           WHERE member_id = 'ca5fd93b4f9adf94118962b8d3e3d24e4810d891d1dbcaa0f98e641e16d4b4a2' ;
""").show()

+--------------------+---------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+--------------------+
|           member_id|emp_title|emp_length|home_ownership|annual_income|address_state|address_zipcode|address_country|grade|sub_grade|verification_status|total_high_credit_limit|application_type|join_annual_income|verification_status_joint|         ingest_date|
+--------------------+---------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+--------------------+
|ca5fd93b4f9adf941...|     null|         6|          RENT|      30000.0|           NY|          104xx|            USA|    C|       C5|           Verified|                31620.0|      Individual|              null|

In [34]:
spark.sql("""
SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.delinq GROUP BY member_id  ORDER BY total_cnt DESC;
          """).show(5,truncate=False)

+----------------------------------------------------------------+---------+
|member_id                                                       |total_cnt|
+----------------------------------------------------------------+---------+
|74a3e0f8c9e64b9ddb788e83b01a93e74b4f1a7f0653b17bbfa594b3e4e3aa29|2        |
|96bdf6e6b517986899048173d01be098b7b0600ad4bbabfc14a58198da2ae537|2        |
|a380e78c166ab255ce6c71c6ba6b2072a868c057d7681694e6fd8fab7b7158d5|2        |
|7a7514d802a29e28ac347989eb0341a74138f34d005cbf3187b8f37070f720bd|2        |
|678da331e66060c87b970cc880604cf86656ca341b6d22ffe629d141caf2c775|2        |
+----------------------------------------------------------------+---------+
only showing top 5 rows



In [35]:
spark.sql("""
SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.loan_defaulters_detail_rec GROUP BY member_id  ORDER BY total_cnt DESC;
          """).show(5,truncate=False)

+----------------------------------------------------------------+---------+
|member_id                                                       |total_cnt|
+----------------------------------------------------------------+---------+
|e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855|6        |
|ab24d776473f88620dd571bb74ad88889daee02904f82aa07ae91a4c69e6ec58|3        |
|27bdc71bcc167a89e07e981246320dbdaddc8c67354ed022f3e835368da0fb0d|3        |
|ca5fd93b4f9adf94118962b8d3e3d24e4810d891d1dbcaa0f98e641e16d4b4a2|3        |
|a6ae6eb048d08c8f35eb7fd7f7820b5f39af5c6800a59c68790275d696f1dd7c|2        |
+----------------------------------------------------------------+---------+
only showing top 5 rows



###### we can observe count of member_id in the table are multiple which implies these each id is having different data then which data should we consider ? This should be decided by upstream team for further processing

In [38]:
repeating_data = spark.sql("SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.customers GROUP BY member_id HAVING total_cnt > 1")

In [40]:
repeating_data.count()

293

In [45]:
repeating_customer_data = spark.sql("""SELECT member_id FROM (SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.customers
                            GROUP BY member_id HAVING total_cnt > 1)""")

In [46]:
repeating_customer_data.count()

293

In [None]:
repeating_loan_defaulters_df= spark.sql("""SELECT member_id FROM (SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.delinq
                            GROUP BY member_id HAVING total_cnt > 1)""")

In [50]:
repeating_loan_defaulters_df.count()

94

In [51]:
repeating_loan_defaulters_detail_df= spark.sql("""SELECT member_id FROM (SELECT member_id,count(*) AS total_cnt FROM lendingclubdata.loan_defaulters_detail_rec
                            GROUP BY member_id HAVING total_cnt > 1)""")

In [52]:
repeating_loan_defaulters_detail_df.count()

299

In [None]:
repeating_customer_data.repartition(1).write.format("csv").option("header","true").mode("overwrite").option("path","")