In [1]:
from pyspark.sql import SparkSession
import getpass
# Account name of the user running this kernel; used to build the per-user
# warehouse directory below instead of hard-coding one account.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    # Warehouse directory lives under /user/<username>/warehouse.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Display the SparkSession object created in the previous cell.
spark

In [3]:
# First pass over the raw defaulters CSV: read the header row and let Spark
# infer the column types.
loans_def_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/public/trendytech/lendingclubproject/raw/loans_defaulters_csv")
)

In [4]:
# Preview the raw DataFrame loaded with an inferred schema.
loans_def_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2.0,0.0,0.0,0.0,0.0,0.0,11.0,
0dd2bbc517e3c8f9e...,0.0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0.0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0.0,0.0,0.0,0.0,0.0,0.0,,
aac68850fdac09fd0...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
3a423e4589e89f429...,0.0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0.0,0.0,0.0,0.0,1.0,0.0,,
c89986155a070db2e...,1.0,0.0,0.0,0.0,1.0,15.0,5.0,
118dc629b6e134419...,0.0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0.0,0.0,0.0,0.0,0.0,0.0,,


In [5]:
# Print the inferred column types; in the output below several numeric-looking
# columns (e.g. delinq_2yrs, pub_rec) come back as string.
loans_def_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [6]:
# Register the raw DataFrame as the temp view "loans_defaulters" so it can be queried with spark.sql.
loans_def_raw_df.createOrReplaceTempView("loans_defaulters")

In [7]:
# Row count per distinct delinq_2yrs value, largest group first.
spark.sql(
    "select delinq_2yrs,count(*) as total from loans_defaulters group by delinq_2yrs order by total desc"
)

delinq_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


### Defining Schema

In [8]:
# Explicit DDL-style schema string: member_id is a string, every other column is a float.
# Adjacent string literals are concatenated into one comma-separated schema string.
loans_defaulters_schema = (
    "member_id string,"
    "delinq_2yrs float,"
    "delinq_amnt float,"
    "pub_rec float,"
    "pub_rec_bankruptcies float,"
    "inq_last_6mths float,"
    "total_rec_late_fee float,"
    "mths_since_last_delinq float,"
    "mths_since_last_record float"
)

In [9]:
# Re-read the same CSV, this time applying the explicit schema instead of
# inferring types. This rebinds loans_def_raw_df.
loans_def_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(loans_defaulters_schema)
    .load("/public/trendytech/lendingclubproject/raw/loans_defaulters_csv")
)

In [10]:
# Preview the DataFrame re-read with the explicit schema.
loans_def_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2.0,0.0,0.0,0.0,0.0,0.0,11.0,
0dd2bbc517e3c8f9e...,0.0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0.0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0.0,0.0,0.0,0.0,0.0,0.0,,
aac68850fdac09fd0...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
3a423e4589e89f429...,0.0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0.0,0.0,0.0,0.0,1.0,0.0,,
c89986155a070db2e...,1.0,0.0,0.0,0.0,1.0,15.0,5.0,
118dc629b6e134419...,0.0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0.0,0.0,0.0,0.0,0.0,0.0,,


In [11]:
# Replace the "loans_defaulters" temp view so it now points at the explicitly-typed DataFrame.
loans_def_raw_df.createOrReplaceTempView("loans_defaulters")

In [12]:
# Same delinq_2yrs frequency check as before, now against the typed view.
spark.sql(
    "select delinq_2yrs,count(*) as total from loans_defaulters group by delinq_2yrs order by total desc"
)

delinq_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


In [20]:
# Count the rows whose delinq_2yrs is null under the explicit schema.
rows_with_null_delinq = loans_def_raw_df.where("delinq_2yrs is null")
rows_with_null_delinq.count()

261

In [21]:
from pyspark.sql.functions import col

### Cast `delinq_2yrs` to integer and replace nulls with 0

In [22]:
# Overwrite delinq_2yrs with an integer-typed version of itself.
delinq_2yrs_as_int = col("delinq_2yrs").cast("integer")
loans_def_processed_df = loans_def_raw_df.withColumn("delinq_2yrs", delinq_2yrs_as_int)

In [23]:
# Replace nulls with 0, restricted to the delinq_2yrs column.
loans_def_new_processed_df = loans_def_processed_df.na.fill(0, subset=["delinq_2yrs"])

In [35]:
# Sanity check: after the fill there should be no null delinq_2yrs rows left.
remaining_null_delinq = loans_def_new_processed_df.where("delinq_2yrs is null")
remaining_null_delinq.count()

0

In [26]:
# Replace the "loans_defaulters" temp view with the DataFrame whose delinq_2yrs is integer and null-free.
loans_def_new_processed_df.createOrReplaceTempView("loans_defaulters")

In [27]:
# Preview the processed DataFrame (delinq_2yrs now shown as integers).
loans_def_new_processed_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2,0.0,0.0,0.0,0.0,0.0,11.0,
0dd2bbc517e3c8f9e...,0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0,0.0,0.0,0.0,0.0,0.0,,
aac68850fdac09fd0...,1,0.0,0.0,0.0,0.0,0.0,21.0,
3a423e4589e89f429...,0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0,0.0,0.0,0.0,1.0,0.0,,
c89986155a070db2e...,1,0.0,0.0,0.0,1.0,15.0,5.0,
118dc629b6e134419...,0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0,0.0,0.0,0.0,0.0,0.0,,


In [28]:
# Show the rows whose delinq_2yrs equals 0.
spark.sql(
    "select * from loans_defaulters where delinq_2yrs==0"
)

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
0dd2bbc517e3c8f9e...,0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0,0.0,0.0,0.0,0.0,0.0,,
3a423e4589e89f429...,0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0,0.0,0.0,0.0,1.0,0.0,,
118dc629b6e134419...,0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0,0.0,0.0,0.0,0.0,0.0,,
6e8d94bf446e97025...,0,0.0,0.0,0.0,0.0,0.0,36.0,
3de585156dc6b73f6...,0,0.0,0.0,0.0,0.0,0.0,,
e88945f86a96f8d71...,0,0.0,0.0,0.0,1.0,0.0,,


### Split the data into two datasets: delinquencies, and public records / enquiries

In [29]:
# Delinquency dataset: keep rows where delinq_2yrs>0 or mths_since_last_delinq>0.
# The int(...) cast is aliased explicitly so the output column is always named
# mths_since_last_delinq, rather than depending on the name Spark generates for
# an unaliased expression (that name ends up in the CSV header and parquet schema).
loans_def_delinq_df=spark.sql(
"""
    select member_id,delinq_2yrs,delinq_amnt,int(mths_since_last_delinq) as mths_since_last_delinq
    from loans_defaulters where delinq_2yrs>0 or mths_since_last_delinq>0
""")

In [30]:
# Number of rows in the delinquency dataset.
loans_def_delinq_df.count()

1106163

In [31]:
# Member ids with any positive value in pub_rec, pub_rec_bankruptcies or inq_last_6mths.
records_enq_query = """
    select member_id
    from loans_defaulters where 
    pub_rec>0.0 or pub_rec_bankruptcies>0.0 or inq_last_6mths>0.0
"""
loans_def_records_enq_df = spark.sql(records_enq_query)

In [32]:
# Number of member ids in the public-records / enquiries dataset.
loans_def_records_enq_df.count()

1070125

In [33]:
# Write the delinquency dataset as CSV with a header row, overwriting any existing output.
(
    loans_def_delinq_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_delinq_csv")
    .save()
)

In [34]:
# Write the delinquency dataset as parquet, overwriting any existing output.
(
    loans_def_delinq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_delinq_parquet")
    .save()
)

In [39]:
# Write the public-records / enquiries member ids as CSV with a header row, overwriting any existing output.
(
    loans_def_records_enq_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_records_enq_csv")
    .save()
)

In [40]:
# Write the public-records / enquiries member ids as parquet, overwriting any existing output.
(
    loans_def_records_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_records_enq_parquet")
    .save()
)

In [36]:
# Cast pub_rec to integer, then replace the nulls in that column with 0.
pub_rec_as_int = col("pub_rec").cast("integer")
loans_def_p_pub_rec_df = (
    loans_def_new_processed_df
    .withColumn("pub_rec", pub_rec_as_int)
    .na.fill(0, subset=["pub_rec"])
)

In [39]:
# Cast pub_rec_bankruptcies to integer, then replace the nulls in that column with 0.
pub_rec_bankruptcies_as_int = col("pub_rec_bankruptcies").cast("integer")
loans_def_p_pub_rec_bankruptcies_df = (
    loans_def_p_pub_rec_df
    .withColumn("pub_rec_bankruptcies", pub_rec_bankruptcies_as_int)
    .na.fill(0, subset=["pub_rec_bankruptcies"])
)

In [40]:
# Cast inq_last_6mths to integer, then replace the nulls in that column with 0.
inq_last_6mths_as_int = col("inq_last_6mths").cast("integer")
loans_def_p_inq_last_6mths_df = (
    loans_def_p_pub_rec_bankruptcies_df
    .withColumn("inq_last_6mths", inq_last_6mths_as_int)
    .na.fill(0, subset=["inq_last_6mths"])
)

In [41]:
# Register the fully cast/filled DataFrame under the view name "loan_defaulters".
# NOTE: this name is singular, so it is a separate view from the "loans_defaulters"
# view registered earlier; the earlier view is left unchanged.
loans_def_p_inq_last_6mths_df.createOrReplaceTempView("loan_defaulters")

In [42]:
# Per-member public-record and enquiry columns, read from the "loan_defaulters" view.
loans_def_detail_records_enq_df = spark.sql(
    "select member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths from loan_defaulters"
)

In [43]:
# Preview the detailed public-records / enquiries dataset.
loans_def_detail_records_enq_df

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths
9cb79aa7323e81be1...,0,0,0
0dd2bbc517e3c8f9e...,1,1,3
458458599d3df3bfc...,1,1,1
05ea141ec28b5c7f7...,0,0,0
aac68850fdac09fd0...,0,0,0
3a423e4589e89f429...,0,0,0
f1efcf7dfbfef21be...,0,0,1
c89986155a070db2e...,0,0,1
118dc629b6e134419...,0,0,0
a86fa4b7493708333...,0,0,0


In [44]:
# Write the detailed public-records / enquiries dataset as CSV with a header row, overwriting any existing output.
(
    loans_def_detail_records_enq_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_detail_records_enq_csv")
    .save()
)

In [45]:
# Write the detailed public-records / enquiries dataset as parquet, overwriting any existing output.
(
    loans_def_detail_records_enq_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv007136/lendingclubproject/cleaned/loans_defaulters_detail_records_enq_parquet")
    .save()
)