In [21]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("LendingClubProject_DC_loan_defaulters") \
    .master("local[*]") \
    .getOrCreate()

In [22]:
loan_repayment_raw_df = spark.read.format("csv").option("header","true").option("inferSchema","true")\
    .load("Lending_club_project/raw/loans_defaulters_data_csv")

In [24]:
loan_repayment_raw_df.show(2)

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|6d5091b3fcaaeb4ea...|        0.0|        0.0|    0.0|                 0.0|           1.0|               0.0|                  30.0|                  null|
|b5e7938b0a2da4cea...|        1.0|        0.0|    0.0|                 0.0|           4.0|               0.0|                   6.0|                  null|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
only showing top 2 rows



In [25]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

defaulters_schema = StructType([
    StructField("member_id", StringType(), True),
    StructField("delinq_2yrs", FloatType(), True),
    StructField("delinq_amnt", FloatType(), True),
    StructField("pub_rec", FloatType(), True),
    StructField("pub_rec_bankruptcies", FloatType(), True),
    StructField("inq_last_6mths", FloatType(), True),
    StructField("total_rec_late_fee", FloatType(), True),
    StructField("mths_since_last_delinq", FloatType(), True),
    StructField("mths_since_last_record", FloatType(), True)
])

In [26]:
loan_repayment_raw_df = spark.read.format("csv").option("header","true").option("inferSchema","true").schema(defaulters_schema)\
    .load("Lending_club_project/raw/loans_defaulters_data_csv")

In [27]:
loan_repayment_raw_df.printSchema()


root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [28]:
from pyspark.sql.functions import col
#delinq_2yrs as this the count but we not in the float terms it should be the integer count and nulls with the value 0 (person who is not defaulted)
delinq_df = loan_repayment_raw_df.withColumn("delinq_2yrs",col("delinq_2yrs").cast("integer")).fillna(0,subset=["delinq_2yrs"])

In [29]:
delinq_df.createOrReplaceTempView("delinq_df")

In [30]:
spark.sql("select count(*) from delinq_df where delinq_2yrs != 0").show()

+--------+
|count(1)|
+--------+
|  141945|
+--------+



In [31]:
#storing only the data of customers who has delinq records 
loans_defaulters_df = spark.sql("SELECT member_id,delinq_2yrs,delinq_amnt,int(mths_since_last_delinq)FROM delinq_df WHERE delinq_2yrs > 0 OR mths_since_last_delinq > 0")

In [32]:
# loans_defaulters_df.select("*").distinct().show()
loans_defaulters_df.select("*").filter('mths_since_last_delinq=0 AND delinq_2yrs = 0').show()

+---------+-----------+-----------+----------------------+
|member_id|delinq_2yrs|delinq_amnt|mths_since_last_delinq|
+---------+-----------+-----------+----------------------+
+---------+-----------+-----------+----------------------+



In [12]:
# loans_defaulters_df has members id and months data where they are defaulted
mem_id_loans_defaulters_df = spark.sql("SELECT member_id FROM delinq_df WHERE delinq_2yrs > 0 OR mths_since_last_delinq > 0")

In [None]:
loans_defaulters_df.write.format("parquet").mode("overwrite")\
.option("path","Lending_club_project/cleaned/loans_defaulters_delinq_parquet").save()

In [35]:
df = spark.read.format("parquet").load("Lending_club_project/cleaned/loans_defaulters_delinq_parquet")
df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- mths_since_last_delinq: integer (nullable = true)



In [30]:
loans_defaulters_df.repartition(1).write.format("csv").mode("overwrite").option("header","True")\
.option("path","Lending_club_project/cleaned/csv/loans_defaulters_delinq_csv").save()

In [31]:
mem_id_loans_defaulters_df.write.format("parquet").mode("overwrite")\
.option("path","Lending_club_project/cleaned/loans_defaulters_delinq_records_parquet").save()

In [32]:
mem_id_loans_defaulters_df.repartition(1).write.format("csv").mode("overwrite").option("header","True")\
.option("path","Lending_club_project/cleaned/csv/lloans_defaulters_delinq_records_csv").save()

In [13]:
pubrec_df = loan_repayment_raw_df.withColumn("pub_rec",col("pub_rec").cast("integer")).fillna(0,subset=["pub_rec"])

In [14]:
pub_rec_bankruptcies_df = pubrec_df.withColumn("pub_rec_bankruptcies",col("pub_rec_bankruptcies").cast("integer")).fillna(0,subset=["pub_rec_bankruptcies"])

In [15]:
inq_last_6mths_df = pub_rec_bankruptcies_df.withColumn("inq_last_6mths",col("inq_last_6mths").cast("integer")).fillna(0,subset=["inq_last_6mths"])

In [16]:
inq_last_6mths_df.createOrReplaceTempView("loandefaulters")

In [17]:
detailed_loan_defaulters = spark.sql("SELECT member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths FROM loandefaulters")

In [18]:
detailed_loan_defaulters.show()

+--------------------+-------+--------------------+--------------+
|           member_id|pub_rec|pub_rec_bankruptcies|inq_last_6mths|
+--------------------+-------+--------------------+--------------+
|6d5091b3fcaaeb4ea...|      0|                   0|             1|
|b5e7938b0a2da4cea...|      0|                   0|             4|
|91060b858433e8a61...|      0|                   0|             0|
|cab1fa9f533688b0a...|      0|                   0|             0|
|f74e401c1ab0adf78...|      0|                   0|             3|
|8aef4bb29d609d8d6...|      0|                   0|             0|
|538b4653da3b1e814...|      0|                   0|             0|
|b24d55f21390533c5...|      0|                   0|             0|
|1035c5401b0ca76d0...|      1|                   1|             1|
|cb0f1777593e77909...|      0|                   0|             0|
|a962f4d59caec5fa1...|      0|                   0|             0|
|e7592ab57b3afd9f1...|      0|                   0|           

In [19]:
detailed_loan_defaulters.write.format("parquet").mode("overwrite")\
.option("path","Lending_club_project/cleaned/loans_defaulters_detailed_records_parquet").save()

In [20]:
detailed_loan_defaulters.repartition(1).write.format("csv").mode("overwrite").option("header","True")\
.option("path","Lending_club_project/cleaned/csv/loans_defaulters_detailed_csv").save()

In [37]:
df2 = spark.read.format("parquet").load("Lending_club_project/cleaned/loans_defaulters_detailed_records_parquet")
df2.printSchema

<bound method DataFrame.printSchema of DataFrame[member_id: string, pub_rec: int, pub_rec_bankruptcies: int, inq_last_6mths: int]>