# Table of Content

* [#Read-20k-file](#Read-20k-file)
* [#EDA](#EDA)
* [#Correlation-Test](#Correlation-Test)

# Read 20k file

In [1]:
var df = spark.read.format("csv")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("Loan_2017_20k.csv")

Intitializing Scala interpreter ...

Spark Web UI available at http://221364ec0696:4040
SparkContext available as 'sc' (version = 2.4.3, master = local[*], app id = local-1558870758271)
SparkSession available as 'spark'


df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 84 more fields]


In [2]:
// cache in memory to speed up
df.cache
df.count



res0: Long = 20070


In [3]:
df.createOrReplaceTempView("df")

# EDA
* [#Update-Loan-Status](#Update-Loan-Status)
* [#Fill-NA](#Fill-NA)
* [#Check-for-String-type-column](#Check-for-String-type-column)
* [#Check-for-Integer-type-column](#Check-for-Integer-type-column)
* [#Check-for-Double-type-column](#Check-for-Double-type-column)

## Update Loan Status

Merge the loan status to 
- 0: indicates for good loan
- 1: indicates for bad loan

In [4]:
df.groupBy("loan_status").count().show()

+------------------+-----+
|       loan_status|count|
+------------------+-----+
|        Fully Paid| 5446|
|           Default|    1|
|   In Grace Period|  136|
|       Charged Off| 1660|
|Late (31-120 days)|  367|
|           Current|12402|
| Late (16-30 days)|   58|
+------------------+-----+



In [5]:
df.createOrReplaceTempView("df")

In [6]:
val newDF = spark.sql("""
    select a.*,
    case when loan_status in ('Fully Paid', 'In Grace Period', 'Current') then 1 else 0 end as new_loan_status
    from df a
""")

newDF: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 85 more fields]


In [7]:
df = newDF.drop("loan_status").withColumnRenamed("new_loan_status","loan_status")

df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 84 more fields]


In [8]:
df.groupBy("loan_status").count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1|17984|
|          0| 2086|
+-----------+-----+



## Fill NA
* for Integer -> 0
* for String -> ""
* for Double -> 0.0

In [9]:
// Generate a map for each field and it's filled value if na
val typeMap = df.dtypes.map(column => 
    column._2 match {
        case "IntegerType" => (column._1 -> 0)
        case "StringType" => (column._1 -> "")
        case "DoubleType" => (column._1 -> 0.0)
    }).toMap

typeMap: scala.collection.immutable.Map[String,Any] = Map(emp_title -> "", tot_coll_amt -> 0, zip_code -> "", home_ownership -> "", num_bc_sats -> 0, total_pymnt -> 0.0, pct_tl_nvr_dlq -> 0.0, avg_cur_bal -> 0, sub_grade -> "", chargeoff_within_12_mths -> 0.0, pymnt_plan -> "", collection_recovery_fee -> 0.0, inq_last_6mths -> 0, tot_hi_cred_lim -> 0, total_il_high_credit_limit -> 0, last_credit_pull_d -> "", recoveries -> 0.0, total_rec_late_fee -> 0.0, mo_sin_rcnt_tl -> 0, emp_length -> "", policy_code -> 0, num_actv_rev_tl -> 0, mo_sin_old_il_acct -> 0.0, mo_sin_rcnt_rev_tl_op -> 0, verification_status -> "", revol_bal -> 0, num_actv_bc_tl -> 0, num_tl_90g_dpd_24m -> 0, num_tl_op_past_12m -> 0, num_sats -> 0, term -> "", mort_acc -> 0, percent_bc_gt_75 -> 0.0, bc_open_to_buy -> 0, la...

In [10]:
df = df.na.fill(typeMap)

df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 84 more fields]


In [11]:
// Verify there's no NA in dataframe now
df.na.drop.count

res5: Long = 20070


## Check for String type column

In [12]:
val stringColumn = df.dtypes.filter(column => column._2 == "StringType").map(_._1)

stringColumn: Array[String] = Array(term, grade, sub_grade, emp_title, emp_length, home_ownership, verification_status, issue_d, pymnt_plan, purpose, title, zip_code, addr_state, earliest_cr_line, initial_list_status, last_pymnt_d, last_credit_pull_d, application_type, hardship_flag, disbursement_method, debt_settlement_flag)


In [13]:
val stringDF = df.stat.freqItems(stringColumn)

stringDF: org.apache.spark.sql.DataFrame = [term_freqItems: array<string>, grade_freqItems: array<string> ... 19 more fields]


In [14]:
stringColumn.foreach(l => stringDF.select(l+"_freqItems").show(false))

+----------------------+
|term_freqItems        |
+----------------------+
|[60 months, 36 months]|
+----------------------+

+---------------------+
|grade_freqItems      |
+---------------------+
|[D, G, A, C, F, E, B]|
+---------------------+

+--------------------------------------------------------------------------------------------------------------------------------------------+
|sub_grade_freqItems                                                                                                                         |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|[C4, A3, B2, G2, G5, E4, D2, F3, B1, D5, B4, E1, A2, C3, F2, A5, G1, D4, F5, E3, G4, C2, C5, A4, D1, B3, G3, A1, F1, E5, F4, D3, C1, B5, E2]|
+--------------------------------------------------------------------------------------------------------------------------------------------+

+------------------------------------


+----------------------------------------------------------------------------------------------------------+
|emp_length_freqItems                                                                                      |
+----------------------------------------------------------------------------------------------------------+
|[1 year, 3 years, < 1 year, 6 years, 9 years, 2 years, 10+ years, 5 years, 8 years, n/a, 4 years, 7 years]|
+----------------------------------------------------------------------------------------------------------+

+--------------------------+
|home_ownership_freqItems  |
+--------------------------+
|[OWN, RENT, MORTGAGE, ANY]|
+--------------------------+

+-----------------------------------------+
|verification_status_freqItems            |
+-----------------------------------------+
|[Verified, Not Verified, Source Verified]|
+-----------------------------------------+

+-------------------------------------------------------------------------------------

+-----------------------------+
|initial_list_status_freqItems|
+-----------------------------+
|[w, f]                       |
+-----------------------------+

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|last_pymnt_d_freqItems                                                                                                                                                                                                                                                |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Apr-2017, Feb-2017, Aug-2018, Mar-2018, Ju

Below columns with only a few possible values can be dropped:
* term
* pymnt_plan
* initial_list_status
* application_type
* hardship_flag
* disbursement_method
* debt_settlement_flag

In [15]:
df = df.drop("term")
.drop("pymnt_plan")
.drop("initial_list_status")
.drop("application_type")
.drop("hardship_flag")
.drop("disbursement_method")
.drop("debt_settlement_flag")

df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 77 more fields]


## Check for Integer type column

In [16]:
val integerColumn = df.dtypes.filter(column => column._2 == "IntegerType").map(_._1)

integerColumn: Array[String] = Array(loan_amnt, funded_amnt, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, total_acc, collections_12_mths_ex_med, policy_code, acc_now_delinq, tot_coll_amt, tot_cur_bal, total_rev_hi_lim, acc_open_past_24mths, avg_cur_bal, bc_open_to_buy, delinq_amnt, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, num_accts_ever_120_pd, num_actv_bc_tl, num_actv_rev_tl, num_bc_sats, num_bc_tl, num_il_tl, num_op_rev_tl, num_rev_accts, num_rev_tl_bal_gt_0, num_sats, num_tl_120dpd_2m, num_tl_30dpd, num_tl_90g_dpd_24m, num_tl_op_past_12m, pub_rec_bankruptcies, tax_liens, tot_hi_cred_lim, total_bal_ex_mort, total_bc_limit, total_il_high_credit_limit, loan_status)


In [17]:
integerColumn.foreach( l => {
    println(l)
    df.describe(l).show(false)
})

loan_amnt
+-------+-----------------+
|summary|loan_amnt        |
+-------+-----------------+
|count  |20070            |
|mean   |14727.77279521674|
|stddev |9605.943924842913|
|min    |1000             |
|max    |40000            |
+-------+-----------------+

funded_amnt
+-------+-----------------+
|summary|funded_amnt      |
+-------+-----------------+
|count  |20070            |
|mean   |14727.77279521674|
|stddev |9605.943924842913|
|min    |1000             |
|max    |40000            |
+-------+-----------------+

delinq_2yrs
+-------+------------------+
|summary|delinq_2yrs       |
+-------+------------------+
|count  |20070             |
|mean   |0.3164424514200299|
|stddev |0.8923656245867851|
|min    |0                 |
|max    |25                |
+-------+------------------+

inq_last_6mths
+-------+------------------+
|summary|inq_last_6mths    |
+-------+------------------+
|count  |20070             |
|mean   |0.5067762830094669|
|stddev |0.799514602653405 |
|min    |


num_rev_accts
+-------+------------------+
|summary|num_rev_accts     |
+-------+------------------+
|count  |20070             |
|mean   |13.237717987045341|
|stddev |7.871839693276913 |
|min    |2                 |
|max    |107               |
+-------+------------------+

num_rev_tl_bal_gt_0
+-------+-------------------+
|summary|num_rev_tl_bal_gt_0|
+-------+-------------------+
|count  |20070              |
|mean   |5.401395117090185  |
|stddev |3.271756089403912  |
|min    |0                  |
|max    |28                 |
+-------+-------------------+

num_sats
+-------+------------------+
|summary|num_sats          |
+-------+------------------+
|count  |20070             |
|mean   |11.582012954658694|
|stddev |5.717695147820476 |
|min    |1                 |
|max    |63                |
+-------+------------------+

num_tl_120dpd_2m
+-------+--------------------+
|summary|num_tl_120dpd_2m    |
+-------+--------------------+
|count  |20070               |
|mean   |4.982561036

## Check for Double type column

In [18]:
val doubleColumn = df.dtypes.filter(column => column._2 == "DoubleType").map(_._1)

doubleColumn: Array[String] = Array(funded_amnt_inv, int_rate, installment, annual_inc, dti, revol_util, out_prncp, out_prncp_inv, total_pymnt, total_pymnt_inv, total_rec_prncp, total_rec_int, total_rec_late_fee, recoveries, collection_recovery_fee, last_pymnt_amnt, bc_util, chargeoff_within_12_mths, mo_sin_old_il_acct, pct_tl_nvr_dlq, percent_bc_gt_75)


In [19]:
doubleColumn.foreach( l => {
    println(l)
    df.describe(l).show(false)
})

funded_amnt_inv
+-------+-----------------+
|summary|funded_amnt_inv  |
+-------+-----------------+
|count  |20070            |
|mean   |14723.55879422023|
|stddev |9602.98512801013 |
|min    |1000.0           |
|max    |40000.0          |
+-------+-----------------+

int_rate
+-------+------------------+
|summary|int_rate          |
+-------+------------------+
|count  |20070             |
|mean   |13.181286995515938|
|stddev |5.160479853400301 |
|min    |5.32              |
|max    |30.99             |
+-------+------------------+

installment
+-------+------------------+
|summary|installment       |
+-------+------------------+
|count  |20070             |
|mean   |438.5379506726432 |
|stddev |282.00868318404855|
|min    |30.12             |
|max    |1566.8            |
+-------+------------------+

annual_inc
+-------+-----------------+
|summary|annual_inc       |
+-------+-----------------+
|count  |20070            |
|mean   |80446.58286148479|
|stddev |91653.89088551658|
|min   

# Correlation-Test

In [30]:
var corrColumn = df.dtypes.filter(column => column._2 == "IntegerType" || column._2 == "DoubleType").map(_._1)

corrColumn: Array[String] = Array(loan_amnt, funded_amnt, funded_amnt_inv, int_rate, installment, annual_inc, dti, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, revol_util, total_acc, out_prncp, out_prncp_inv, total_pymnt, total_pymnt_inv, total_rec_prncp, total_rec_int, total_rec_late_fee, recoveries, collection_recovery_fee, last_pymnt_amnt, collections_12_mths_ex_med, policy_code, acc_now_delinq, tot_coll_amt, tot_cur_bal, total_rev_hi_lim, acc_open_past_24mths, avg_cur_bal, bc_open_to_buy, bc_util, chargeoff_within_12_mths, delinq_amnt, mo_sin_old_il_acct, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, num_accts_ever_120_pd, num_actv_bc_tl, num_actv_rev_tl, num_bc_sats, num_bc_tl, num_il_tl, num_op_rev_tl, num_rev_accts, num...

In [31]:
var pairColumn = for (i <- corrColumn;
                      j <- corrColumn if i != j) 
                      yield (i,j)

pairColumn: Array[(String, String)] = Array((loan_amnt,funded_amnt), (loan_amnt,funded_amnt_inv), (loan_amnt,int_rate), (loan_amnt,installment), (loan_amnt,annual_inc), (loan_amnt,dti), (loan_amnt,delinq_2yrs), (loan_amnt,inq_last_6mths), (loan_amnt,open_acc), (loan_amnt,pub_rec), (loan_amnt,revol_bal), (loan_amnt,revol_util), (loan_amnt,total_acc), (loan_amnt,out_prncp), (loan_amnt,out_prncp_inv), (loan_amnt,total_pymnt), (loan_amnt,total_pymnt_inv), (loan_amnt,total_rec_prncp), (loan_amnt,total_rec_int), (loan_amnt,total_rec_late_fee), (loan_amnt,recoveries), (loan_amnt,collection_recovery_fee), (loan_amnt,last_pymnt_amnt), (loan_amnt,collections_12_mths_ex_med), (loan_amnt,policy_code), (loan_amnt,acc_now_delinq), (loan_amnt,tot_coll_amt), (loan_amnt,tot_cur_bal), (loan_amnt,total_re...

In [32]:
pairColumn.slice(1,3)

res17: Array[(String, String)] = Array((loan_amnt,funded_amnt_inv), (loan_amnt,int_rate))


In [46]:
import spark.implicits._

import spark.implicits._


In [54]:
// This could take a while
var corrMaxtrix = pairColumn.map{ case (x,y) => {
    var corr = df.stat.corr(x,y)
    (x,y,corr)
}}.toSeq

corrMaxtrix: Seq[(String, String, Double)] = WrappedArray((loan_amnt,funded_amnt,1.0), (loan_amnt,funded_amnt_inv,0.9999945720567341), (loan_amnt,int_rate,0.10640651992579327), (loan_amnt,installment,0.9468955766694314), (loan_amnt,annual_inc,0.26583466116252075), (loan_amnt,dti,0.037033414975937325), (loan_amnt,delinq_2yrs,-0.01344946497246236), (loan_amnt,inq_last_6mths,-0.015938323536782782), (loan_amnt,open_acc,0.1659593558273838), (loan_amnt,pub_rec,-0.02637627622449272), (loan_amnt,revol_bal,0.3202576055020392), (loan_amnt,revol_util,0.10673464280216499), (loan_amnt,total_acc,0.19088752917165852), (loan_amnt,out_prncp,0.6218975082209893), (loan_amnt,out_prncp_inv,0.6218744366203187), (loan_amnt,total_pymnt,0.7571617972179704), (loan_amnt,total_pymnt_inv,0.7571532331034232), (loan_...

In [55]:
var corrMatrixDF = corrMaxtrix.toDF("colA","colB","corr")

corrMatrixDF: org.apache.spark.sql.DataFrame = [colA: string, colB: string ... 1 more field]


In [75]:
corrMatrixDF.na.drop.orderBy($"corr".desc).show(100,false)

+--------------------------+--------------------------+------------------+
|colA                      |colB                      |corr              |
+--------------------------+--------------------------+------------------+
|funded_amnt               |loan_amnt                 |1.0               |
|loan_amnt                 |funded_amnt               |1.0               |
|out_prncp                 |out_prncp_inv             |0.9999980595265819|
|out_prncp_inv             |out_prncp                 |0.9999980595265819|
|total_pymnt               |total_pymnt_inv           |0.9999957370804106|
|total_pymnt_inv           |total_pymnt               |0.9999957370804106|
|loan_amnt                 |funded_amnt_inv           |0.9999945720567341|
|funded_amnt_inv           |funded_amnt               |0.9999945720567339|
|funded_amnt_inv           |loan_amnt                 |0.9999945720567339|
|funded_amnt               |funded_amnt_inv           |0.9999945720567339|
|num_sats                

Column pairs with very high corr (>= 0.8), keep only the first one.
```
loan_amnt
out_prncp_inv
total_pymnt_inv
funded_amnt_inv
open_acc
tot_cur_bal
recoveries
num_rev_tl_bal_gt_0
total_pymnt
total_pymnt_inv
installment
funded_amnt_inv
total_bal_ex_mort
bc_open_to_buy
bc_util
num_op_rev_tl
num_sats
num_bc_tl
num_actv_bc_tl
acc_now_delinq
num_actv_bc_tl
avg_cur_bal
total_bc_limit
num_op_rev_tl
```

In [106]:
val colToDrop = List("loan_amnt","out_prncp_inv","total_pymnt_inv","funded_amnt_inv","open_acc","tot_cur_bal","recoveries","num_rev_tl_bal_gt_0","total_pymnt","total_pymnt_inv","installment","funded_amnt_inv","total_bal_ex_mort","bc_open_to_buy","bc_util","num_op_rev_tl","num_sats","num_bc_tl","num_actv_bc_tl","acc_now_delinq","num_actv_bc_tl","avg_cur_bal","total_bc_limit","num_op_rev_tl")

colToDrop: List[String] = List(loan_amnt, out_prncp_inv, total_pymnt_inv, funded_amnt_inv, open_acc, tot_cur_bal, recoveries, num_rev_tl_bal_gt_0, total_pymnt, total_pymnt_inv, installment, funded_amnt_inv, total_bal_ex_mort, bc_open_to_buy, bc_util, num_op_rev_tl, num_sats, num_bc_tl, num_actv_bc_tl, acc_now_delinq, num_actv_bc_tl, avg_cur_bal, total_bc_limit, num_op_rev_tl)


In [107]:
colToDrop.foreach(l => df = df.drop(l))

In [109]:
// reduce to 59
df.columns.toArray.size

res58: Int = 59


In [110]:
df.printSchema

root
 |-- funded_amnt: integer (nullable = false)
 |-- int_rate: double (nullable = false)
 |-- grade: string (nullable = false)
 |-- sub_grade: string (nullable = false)
 |-- emp_title: string (nullable = false)
 |-- emp_length: string (nullable = false)
 |-- home_ownership: string (nullable = false)
 |-- annual_inc: double (nullable = false)
 |-- verification_status: string (nullable = false)
 |-- issue_d: string (nullable = false)
 |-- purpose: string (nullable = false)
 |-- title: string (nullable = false)
 |-- zip_code: string (nullable = false)
 |-- addr_state: string (nullable = false)
 |-- dti: double (nullable = false)
 |-- delinq_2yrs: integer (nullable = false)
 |-- earliest_cr_line: string (nullable = false)
 |-- inq_last_6mths: integer (nullable = false)
 |-- pub_rec: integer (nullable = false)
 |-- revol_bal: integer (nullable = false)
 |-- revol_util: double (nullable = false)
 |-- total_acc: integer (nullable = false)
 |-- out_prncp: double (nullable = false)
 |-- total

In [114]:
df.repartition(1).write.mode("overwrite").csv("Loan_2017_20k_alex.csv")