In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("LendingClubProject_DC_loan_defaulters")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# First pass over the raw defaulters CSV: treat the first line as a header and
# let Spark infer the column types.
loan_repayment_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("Lending_club_project/raw/loans_defaulters_data_csv")
)

In [8]:
# Preview the first rows of the raw data to inspect the available columns.
loan_repayment_raw_df.show()

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|6d5091b3fcaaeb4ea...|        0.0|        0.0|    0.0|                 0.0|           1.0|               0.0|                  30.0|                  null|
|b5e7938b0a2da4cea...|        1.0|        0.0|    0.0|                 0.0|           4.0|               0.0|                   6.0|                  null|
|91060b858433e8a61...|        0.0|        0.0|    0.0|                 0.0|           0.0|               0.0|                  null|                  null|
|cab1fa9f533688b0a...|        0.0|        0.0|    0.0|          

In [10]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Explicit schema for the defaulters file: member_id is a nullable string and
# every remaining column is a nullable float, in the order listed here.
defaulters_schema = StructType(
    [StructField("member_id", StringType(), True)]
    + [
        StructField(float_column, FloatType(), True)
        for float_column in (
            "delinq_2yrs",
            "delinq_amnt",
            "pub_rec",
            "pub_rec_bankruptcies",
            "inq_last_6mths",
            "total_rec_late_fee",
            "mths_since_last_delinq",
            "mths_since_last_record",
        )
    ]
)

In [11]:
# Re-read the raw CSV with the explicit defaulters_schema.
# "inferSchema" is intentionally not set: a user-supplied schema takes
# precedence, so asking Spark to infer types as well is redundant and misleading.
# NOTE: this rebinds loan_repayment_raw_df, replacing the inferred-schema frame.
loan_repayment_raw_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(defaulters_schema)
    .load("Lending_club_project/raw/loans_defaulters_data_csv")
)

In [12]:
# Confirm that the explicit schema (string member_id, float metrics) was applied.
loan_repayment_raw_df.printSchema()


root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [13]:
from pyspark.sql.functions import col
# delinq_2yrs is a count, so it is stored as an integer instead of a float.
# Nulls are replaced with 0, i.e. the member is treated as not having defaulted.
delinq_df = (
    loan_repayment_raw_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .fillna(0, subset=["delinq_2yrs"])
)

In [14]:
# Register the cleaned frame as a temp view so it can be queried with spark.sql
# under the name "delinq_df".
delinq_df.createOrReplaceTempView("delinq_df")

In [17]:
# Count the rows whose delinq_2yrs value is non-zero.
nonzero_delinq_count_df = spark.sql("select count(*) from delinq_df where delinq_2yrs != 0")
nonzero_delinq_count_df.show()

+--------+
|count(1)|
+--------+
|  141945|
+--------+



In [18]:
# Keep only the members that have a delinquency record: a positive delinq_2yrs
# count or a positive mths_since_last_delinq value.
# mths_since_last_delinq is cast to an integer and aliased explicitly, so the
# output column name does not depend on Spark's auto-generated naming for
# cast expressions.
loans_defaulters_df = spark.sql(
    """
    SELECT member_id,
           delinq_2yrs,
           CAST(mths_since_last_delinq AS INT) AS mths_since_last_delinq
    FROM delinq_df
    WHERE delinq_2yrs > 0 OR mths_since_last_delinq > 0
    """
)

In [27]:
# Sanity check: show any kept rows where both mths_since_last_delinq and
# delinq_2yrs are 0.
loans_defaulters_df.filter('mths_since_last_delinq=0 AND delinq_2yrs = 0').show()

+---------+-----------+----------------------+
|member_id|delinq_2yrs|mths_since_last_delinq|
+---------+-----------+----------------------+
+---------+-----------+----------------------+



In [None]:
# Member ids only, for the same rows kept in loans_defaulters_df
# (delinq_2yrs > 0 OR mths_since_last_delinq > 0).
# Derived from loans_defaulters_df instead of repeating the SQL filter, so the
# two frames cannot drift apart if the filter is ever changed.
mem_id_loans_defaulters_df = loans_defaulters_df.select("member_id")

In [29]:
# Persist the delinquency details as Parquet, overwriting any existing output.
(
    loans_defaulters_df.write
    .format("parquet")
    .mode("overwrite")
    .save("Lending_club_project/cleaned/loans_defaulters_delinq_parquet")
)

In [30]:
# Write the same data as CSV with a header row, overwriting any existing output.
# repartition(1) collapses the frame to a single partition before writing.
(
    loans_defaulters_df
    .repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "True")
    .save("Lending_club_project/cleaned/csv/loans_defaulters_delinq_csv")
)

In [31]:
# Persist the member-id-only frame as Parquet, overwriting any existing output.
(
    mem_id_loans_defaulters_df.write
    .format("parquet")
    .mode("overwrite")
    .save("Lending_club_project/cleaned/loans_defaulters_delinq_records_parquet")
)

In [32]:
# Write the member-id-only frame as CSV with a header row, overwriting any
# existing output. repartition(1) collapses the frame to a single partition
# before writing.
# The directory name previously had a typo ("lloans_..."); it now matches the
# naming of the other cleaned outputs.
(
    mem_id_loans_defaulters_df
    .repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "True")
    .save("Lending_club_project/cleaned/csv/loans_defaulters_delinq_records_csv")
)

In [None]:
 # Local JupyterLab URL: 127.0.0.1:8888/lab?token=<redacted> (never commit the real access token)

In [None]:
# Local Jupyter server URL, kept as a comment: a bare URL is not valid Python
# and would raise a SyntaxError if this cell were executed.
# http://127.0.0.1:8888/