In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Hive-enabled session for the "loans" application on the
# standalone master, with 2 cores / 4g per executor and at most 8 cores overall.
# NOTE(review): Hadoop's default-filesystem key is normally `fs.defaultFS`, which
# would make the Spark-side setting `spark.hadoop.fs.defaultFS`. The key below is
# kept exactly as written -- confirm whether it has any effect on this cluster.
session_builder = (
    SparkSession.builder
    .appName("loans")
    .master("spark://10.208.36.84:7077")
    .config("spark.hadoop.defaultFS", "hdfs://10.208.36.84:9000")
    .config("spark.sql.warehouse.dir", "/home/hadoop/spark_workspace/shared_folder/tushar/Tables_data")
    .config("spark.executor.cores", 2)
    .config("spark.executor.memory", "4g")
    .config("spark.cores.max", "8")
    .enableHiveSupport()
)
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# List the databases visible to this session.
databases_df = spark.sql("show databases")
databases_df.show()

+--------------+
|     namespace|
+--------------+
|       default|
|project_trendy|
+--------------+



In [7]:
# Find member_ids that occur on more than one customers row, most repeated first.
duplicate_member_counts = spark.sql("select member_id ,count(*) as total from project_trendy.customers group by member_id having count(*)>1 order by total desc")
duplicate_member_counts.show()

+--------------------+-----+
|           member_id|total|
+--------------------+-----+
|e3b0c44298fc1c149...|    8|
|ca5fd93b4f9adf941...|    3|
|27bdc71bcc167a89e...|    3|
|ab24d776473f88620...|    3|
|df80f1a8b448b328b...|    2|
|4c644a03ab100b0ec...|    2|
|f94662f4716a1f62f...|    2|
|5d52e7773cb0efff3...|    2|
|ff198a907ac2a230d...|    2|
|65e7b35dec6f1a9ef...|    2|
|ab962e5163fc17ff2...|    2|
|2b14e817ac88a8df4...|    2|
|4e95006686e33fbdf...|    2|
|4f6d66d76996e7206...|    2|
|af04c388ce4174287...|    2|
|d8211286a676d20d6...|    2|
|d26d6fa0710cff15d...|    2|
|2fb9a177922c00368...|    2|
|be2e2c04f930aef3e...|    2|
|f31a1aa77df7d6a0b...|    2|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [11]:
# Inspect the customers rows behind the member_id prefix that was repeated most often.
top_duplicate_rows = spark.sql("select * from project_trendy.customers where member_id like 'e3b0c44298fc1c149%' ")
top_duplicate_rows.show()

+--------------------+---------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+
|           member_id|emp_title|emp_length|home_ownership|annual_income|address_state|address_zipcode|address_country|grade|sub_grade|verification_status|total_high_credit_limit|application_type|join_annual_income|verification_status_joint|
+--------------------+---------+----------+--------------+-------------+-------------+---------------+---------------+-----+---------+-------------------+-----------------------+----------------+------------------+-------------------------+
|e3b0c44298fc1c149...|     null|         6|          null|         null|         null|           null|            USA| null|     null|               null|                   null|            null|              null|                     null|
|e3b0c44298fc1c149...|     null|    

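<p>Apart from <code>emp_length</code> and <code>address_country</code>, every column in the rows shown for this <code>member_id</code> is null, so these rows carry no usable information and can be dropped.</p>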

In [17]:
# Switch the current database to project_trendy, then list its tables
# without truncating the displayed values.
spark.sql("use project_trendy")
project_tables = spark.sql("show tables")
project_tables.show(truncate=False)

+--------------+--------------------+-----------+
|     namespace|           tableName|isTemporary|
+--------------+--------------------+-----------+
|project_trendy|           customers|      false|
|project_trendy|      customers_loan|      false|
|project_trendy|    customers_loan_v|      false|
|project_trendy|               loans|      false|
|project_trendy|loans_defaulters_...|      false|
|project_trendy|loans_defaulters_...|      false|
|project_trendy|    loans_repayments|      false|
+--------------+--------------------+-----------+



In [20]:
# Collect every member_id that appears on more than one row of customers.
# No ORDER BY and no wrapper subquery: this frame is only counted, unioned and
# written out, so sorting by the duplicate count would be a wasted global sort.
bad_data_customer_df = spark.sql("""
    select member_id
    from project_trendy.customers
    group by member_id
    having count(*) > 1
""")

In [21]:
# Print the schema of the duplicated-customer-id frame to confirm which columns it carries.
bad_data_customer_df.printSchema()

root
 |-- member_id: string (nullable = true)



In [22]:
# Number of member_ids that are duplicated in customers.
duplicate_customer_id_count = bad_data_customer_df.count()
duplicate_customer_id_count

476

In [23]:
# Collect every member_id that appears on more than one row of loans_defaulters_delinq.
# No ORDER BY and no wrapper subquery: this frame is only counted, unioned and
# written out, so sorting by the duplicate count would be a wasted global sort.
bad_data_loan_def_delin1_df = spark.sql("""
    select member_id
    from project_trendy.loans_defaulters_delinq
    group by member_id
    having count(*) > 1
""")

In [24]:
# Number of member_ids that are duplicated in loans_defaulters_delinq.
duplicate_delinq_id_count = bad_data_loan_def_delin1_df.count()
duplicate_delinq_id_count

136

In [25]:
# Collect every member_id that appears on more than one row of
# loans_defaulters_detail_rec_enq.
# No ORDER BY and no wrapper subquery: this frame is only counted, unioned and
# written out, so sorting by the duplicate count would be a wasted global sort.
bad_data_defaulters_detail_rec_enq = spark.sql("""
    select member_id
    from project_trendy.loans_defaulters_detail_rec_enq
    group by member_id
    having count(*) > 1
""")

In [26]:
# Number of member_ids that are duplicated in loans_defaulters_detail_rec_enq.
duplicate_enquiry_id_count = bad_data_defaulters_detail_rec_enq.count()
duplicate_enquiry_id_count

123

In [29]:
# Export the duplicated customer ids as headered CSV, collapsed to one partition
# before writing and replacing whatever an earlier run left at the target path.
customer_ids_writer = (
    bad_data_customer_df
    .repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/bad_data/customers")
)
customer_ids_writer.save()

In [30]:
# Export the duplicated loans_defaulters_delinq ids as headered CSV, collapsed to
# one partition before writing and replacing any earlier output at the target path.
delinq_ids_writer = (
    bad_data_loan_def_delin1_df
    .repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/bad_data/defaulters_detail")
)
delinq_ids_writer.save()

In [31]:
# Export the duplicated loans_defaulters_detail_rec_enq ids as headered CSV,
# collapsed to one partition before writing and replacing any earlier output.
enquiry_ids_writer = (
    bad_data_defaulters_detail_rec_enq
    .repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/bad_data/defaulters_enquiry")
)
enquiry_ids_writer.save()

In [33]:
# Stack the duplicated ids from the three tables into one frame. union() keeps
# repeated ids, so de-duplication happens in a later step.
# The variable name (including its spelling) is referenced by later cells.
customer_ids = bad_data_customer_df.select("member_id")
delinq_ids = bad_data_loan_def_delin1_df.select("member_id")
enquiry_ids = bad_data_defaulters_detail_rec_enq.select("member_id")
bad_conolidated = customer_ids.union(delinq_ids).union(enquiry_ids)

In [36]:
# Total number of ids across the three sources, before de-duplication.
stacked_id_count = bad_conolidated.count()
stacked_id_count

                                                                                

735

In [34]:
# Collapse the stacked ids so that each member_id is listed exactly once.
bad_consolidated = (
    bad_conolidated
    .distinct()
)

In [35]:
# Number of unique member_ids flagged as bad across all three tables.
unique_bad_id_count = bad_consolidated.count()
unique_bad_id_count

                                                                                

476

In [37]:
# Export the unique bad member_ids as headered CSV, collapsed to one partition
# before writing and replacing any earlier output at the target path.
consolidated_ids_writer = (
    bad_consolidated
    .repartition(1)
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/bad_data/consolidated")
)
consolidated_ids_writer.save()

                                                                                

In [41]:
# Register the unique bad ids as a temporary view so the SQL filters can refer to it by name.
bad_data_view_name = "bad_data"
bad_consolidated.createOrReplaceTempView(bad_data_view_name)

In [45]:
# Keep only the customers rows whose member_id is absent from the bad_data view.
# A LEFT ANTI JOIN on null-safe equality (<=>) replaces `NOT IN (subquery)`:
# NOT IN is null-aware, so a single NULL member_id in bad_data would make the
# predicate never true and silently produce an empty result.
# With <=>, a NULL member_id is removed only when NULL itself is listed in bad_data.
custom_df = spark.sql("""
    select c.*
    from project_trendy.customers c
    left anti join bad_data b
        on c.member_id <=> b.member_id
""")

In [47]:
# Save the filtered customers rows as parquet, replacing any earlier output at the target path.
clean_customer_writer = (
    custom_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/removed_bad_data/customer")
)
clean_customer_writer.save()

                                                                                

In [48]:
# Keep only the loans_defaulters_delinq rows whose member_id is absent from the
# bad_data view.
# A LEFT ANTI JOIN on null-safe equality (<=>) replaces `NOT IN (subquery)`:
# NOT IN is null-aware, so a single NULL member_id in bad_data would make the
# predicate never true and silently produce an empty result.
# With <=>, a NULL member_id is removed only when NULL itself is listed in bad_data.
detail_df = spark.sql("""
    select d.*
    from project_trendy.loans_defaulters_delinq d
    left anti join bad_data b
        on d.member_id <=> b.member_id
""")

In [49]:
# Save the filtered delinquency rows as parquet, replacing any earlier output at the target path.
clean_detail_writer = (
    detail_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/removed_bad_data/default_detail")
)
clean_detail_writer.save()

                                                                                

In [50]:
# Keep only the loans_defaulters_detail_rec_enq rows whose member_id is absent
# from the bad_data view.
# A LEFT ANTI JOIN on null-safe equality (<=>) replaces `NOT IN (subquery)`:
# NOT IN is null-aware, so a single NULL member_id in bad_data would make the
# predicate never true and silently produce an empty result.
# With <=>, a NULL member_id is removed only when NULL itself is listed in bad_data.
enquiry_df = spark.sql("""
    select e.*
    from project_trendy.loans_defaulters_detail_rec_enq e
    left anti join bad_data b
        on e.member_id <=> b.member_id
""")

In [51]:
# Save the filtered enquiry rows as parquet, replacing any earlier output at the target path.
clean_enquiry_writer = (
    enquiry_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/home/tushar/Documents/project/removed_bad_data/default_enquiry")
)
clean_enquiry_writer.save()

                                                                                