In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create (or reuse) the Spark session for the "loans" application on the
# standalone master, with 2 cores / 4g per executor and at most 8 cores total.
# NOTE(review): Hadoop's documented default-filesystem key is `fs.defaultFS`
# (i.e. `spark.hadoop.fs.defaultFS`); confirm whether `spark.hadoop.defaultFS`
# has any effect. Later cells read and write plain `/home/...` paths, so
# changing the key may change where those paths resolve.
spark = (
    SparkSession.builder
    .appName("loans")
    .master("spark://10.208.36.84:7077")
    .config("spark.hadoop.defaultFS", "hdfs://10.208.36.84:9000")
    .config("spark.executor.cores", 2)
    .config("spark.executor.memory", "4g")
    .config("spark.cores.max", "8")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/19 17:26:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/19 17:26:13 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load the raw loans CSV; the first row supplies column names and column
# types are inferred from the data.
loans_df = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load("/home/tushar/Documents/project/loans_data_raw")
)

                                                                                

In [4]:
# Show the column names and inferred types of the loaded frame.
loans_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)



In [5]:
# Give the raw columns descriptive names.
# The former rename of "loan_amt" was removed: the loaded schema has no such
# column (it is spelled "loan_amnt"), and withColumnRenamed silently does
# nothing for unknown names, so that call only hid a typo.
loans_df = (
    loans_df
    .withColumnRenamed("loan_amnt", "loan_amount")
    .withColumnRenamed("funded_amnt", "funded_amount")
    .withColumnRenamed("term", "loan_term_months")
    .withColumnRenamed("int_rate", "interest_rate")
    .withColumnRenamed("issue_d", "issue_date")
    .withColumnRenamed("purpose", "loan_purpose")
    .withColumnRenamed("title", "loan_title")
)

In [6]:
# Re-check the schema to verify the column renames took effect.
loans_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- funded_amount: double (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [7]:
# Convert loan_amount from double to float.
loans_df = loans_df.withColumn(
    "loan_amount",
    col("loan_amount").cast("float"),
)

In [8]:
# Verify that loan_amount is now typed as float.
loans_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: double (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [9]:
# Convert the remaining numeric columns to float, one column at a time and in
# the same order as before.
for numeric_col in ("funded_amount", "interest_rate", "installment"):
    loans_df = loans_df.withColumn(numeric_col, col(numeric_col).cast("float"))

In [10]:
# Verify that funded_amount, interest_rate and installment are now float.
loans_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [11]:
# Rename the installment column.
# NOTE(review): "monthly_installemnt" looks like a misspelling of
# "monthly_installment", but check_columns and the written output use this
# exact spelling, so it has to be corrected everywhere at once.
loans_df = loans_df.withColumnRenamed("installment","monthly_installemnt") 

In [12]:
# Add a column holding current_timestamp() for every row.
# NOTE(review): the column is named "ingest_data"; possibly "ingest_date" was
# intended. Confirm before renaming, since the name ends up in the saved files.
loans_df = loans_df.withColumn("ingest_data" , current_timestamp())

In [13]:
# Register the frame as temp view `loans` so it can be queried via spark.sql.
loans_df.createOrReplaceTempView('loans')

In [14]:
# Total row count before any null filtering.
spark.sql("select count(*) from loans").show()

+--------+
|count(1)|
+--------+
|  868845|
+--------+



In [15]:
# Columns that must be non-null for a row to be kept.
check_columns = [
    'loan_amount',
    'funded_amount',
    'loan_term_months',
    'interest_rate',
    'monthly_installemnt',
    'issue_date',
    'loan_status',
    'loan_purpose',
]

In [16]:
# Keep only rows where none of the checked columns is null.
loans = loans_df.dropna(how="any", subset=check_columns)

In [17]:
# Row count after dropping rows with nulls in the checked columns.
loans.count()

                                                                                

868836

In [18]:
# Expose the null-filtered frame to SQL as temp view `cloans`.
loans.createOrReplaceTempView('cloans')

In [19]:
# Preview ten rows of the null-filtered data.
spark.sql("select * from cloans limit 10").show()

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
| loan_id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installemnt|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_data|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|57267443|48e96acef66a1b3aa...|    15000.0|      15000.0|       36 months|        19.99|             557.38|  Aug-2015| Fully Paid|debt_consolidation|  Debt consolidation|2024-03-19 17:26:...|
|57186326|a26f0e7701a6ab87d...|     6000.0|       6000.0|       36 months|         6.89|             184.97|  Aug-2015| Fully Paid|debt_consolidation|  Debt consolidation|2024-03-19 17:26:...|
|57044889|eeb80dce9907d9aa4...|    

In [20]:
# Turn the textual term (e.g. "36 months") into whole years: strip the
# " months" suffix, cast to int, divide by 12 and truncate back to int.
term_months = regexp_replace(col("loan_term_months"), " months", "").cast('int')
loans = (
    loans
    .withColumn("loan_term_months", (term_months / 12).cast('int'))
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [21]:
# Display the leading rows to check the new loan_term_years column.
loans.show()

+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
| loan_id|           member_id|loan_amount|funded_amount|loan_term_years|interest_rate|monthly_installemnt|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_data|
+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|57267443|48e96acef66a1b3aa...|    15000.0|      15000.0|              3|        19.99|             557.38|  Aug-2015| Fully Paid|debt_consolidation|  Debt consolidation|2024-03-19 17:26:...|
|57186326|a26f0e7701a6ab87d...|     6000.0|       6000.0|              3|         6.89|             184.97|  Aug-2015| Fully Paid|debt_consolidation|  Debt consolidation|2024-03-19 17:26:...|
|57044889|eeb80dce9907d9aa4...|     9625

In [22]:
# Frequency of each loan_purpose value in `cloans`, most common first.
spark.sql("select loan_purpose,count(*) from cloans group by loan_purpose order by count(*) desc").show()



+--------------------+--------+
|        loan_purpose|count(1)|
+--------------------+--------+
|  debt_consolidation|  494400|
|         credit_card|  203469|
|    home_improvement|   57943|
|               other|   50978|
|      major_purchase|   18519|
|             medical|   10215|
|                 car|    8428|
|      small_business|    7979|
|              moving|    5785|
|            vacation|    5752|
|               house|    4876|
|    renewable_energy|     481|
|             wedding|       9|
|and also pay off ...|       1|
|         educational|       1|
+--------------------+--------+



                                                                                

In [23]:
# Purpose values that are kept as-is; anything else is folded into "other".
loan_purpose_list = [
    'debt_consolidation',
    'credit_card',
    'home_improvement',
    'other',
    'major_purchase',
    'medical',
    'car',
    'small_business',
    'moving',
    'vacation',
    'house',
    'renewable_energy',
    'wedding',
    'educational',
]

In [24]:
# Keep loan_purpose when it is one of the listed values; otherwise replace
# it with "other".
purpose = col('loan_purpose')
loans = loans.withColumn(
    'loan_purpose',
    when(purpose.isin(loan_purpose_list), purpose).otherwise("other"),
)

In [25]:
# Register the frame with normalized purposes as temp view `llons`.
# NOTE(review): "llons" looks like a typo, but the following query uses the
# same spelling, so both must be changed together.
loans.createOrReplaceTempView("llons")

In [26]:
# List the purposes that occur exactly once in `llons`.
spark.sql("select loan_purpose,count(*) from llons group by loan_purpose having count(*)=1 ").show()



+------------+--------+
|loan_purpose|count(1)|
+------------+--------+
| educational|       1|
+------------+--------+



                                                                                

In [27]:
# Per-purpose row counts computed with the DataFrame API.
purpose_counts = loans.groupby('loan_purpose').agg(count(col('loan_purpose')))
purpose_counts.show()

[Stage 16:>                                                         (0 + 6) / 6]

+------------------+-------------------+
|      loan_purpose|count(loan_purpose)|
+------------------+-------------------+
|           wedding|                  9|
|             other|              50979|
|    small_business|               7979|
|debt_consolidation|             494400|
|       credit_card|             203469|
|            moving|               5785|
|          vacation|               5752|
|  renewable_energy|                481|
|             house|               4876|
|               car|               8428|
|    major_purchase|              18519|
|           medical|              10215|
|  home_improvement|              57943|
|       educational|                  1|
+------------------+-------------------+



                                                                                

In [28]:
# Save the cleaned loans as Parquet, replacing any previous output.
(
    loans.write
    .format("parquet")
    .mode("overwrite")
    .save("/home/tushar/Documents/project/cleaned_loans/parquet")
)

                                                                                

In [29]:
# Save the cleaned loans as CSV, replacing any previous output.
# A header row is now written: without it the files carry no column names,
# while the raw input of this notebook was itself read with header=true.
(
    loans.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .option("path", "/home/tushar/Documents/project/cleaned_loans/csv")
    .save()
)

                                                                                