# Numerical Features

In [44]:
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql._

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql._


In [55]:
def addNumericFeatures ( df:DataFrame ) : DataFrame = {
    
    //get columns that are integers or doubles
    val integerColumns = df.dtypes.filter(column => column._2 == "IntegerType").map(_._1)
    val doubleColumn = df.dtypes.filter(column => column._2 == "DoubleType").map(_._1)
    
    //add the numeric columns together into a single array
    var numberColumns = integerColumn ++ doubleColumn
    
    //remove the target variable from the numberColumns
    numberColumns = numberColumns diff Array("loan_status")
    
    //define Vector Assembler for the numeric columns
    val numericVectorAssembler = new VectorAssembler()
      .setInputCols(numberColumns)
      .setOutputCol("numerical_features")
      .setHandleInvalid("skip") 
    
    //Pipepline to turn numeric columns into
    val pipelineNumericVectorAssembler = new Pipeline()
      .setStages(Array(numericVectorAssembler))
        
    //Assemble the numeric features vector
    val numericVector_df = pipelineNumericVectorAssembler
      .fit(df)
      .transform(df)
    
    //numeric scaler, this one is a  minmax scaler
    val scaler = new MinMaxScaler()
      .setInputCol("numerical_features")
      .setOutputCol("scaledFeatures")
    
    // Normalize each feature to have unit standard deviation.
    val scaled_df = scaler
      .fit(numericVector_df)
      .transform(numericVector_df)
    
    return scaled_df
}

addNumericFeatures: (df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [57]:
val testfunctionDF = addNumericFeatures(df) 

testfunctionDF: org.apache.spark.sql.DataFrame = [loan_amnt: int, term: string ... 65 more fields]


In [58]:
testfunctionDF.select("numerical_features","scaledFeatures").show()

+--------------------+--------------------+
|  numerical_features|      scaledFeatures|
+--------------------+--------------------+
|(47,[0,3,5,6,8,11...|[0.33333333333333...|
|[10000.0,0.0,1.0,...|[0.23076923076923...|
|[5000.0,0.0,4.0,1...|[0.10256410256410...|
|[4000.0,1.0,0.0,1...|[0.07692307692307...|
|(47,[0,3,5,6,8,11...|[0.33333333333333...|
|[15000.0,0.0,1.0,...|[0.35897435897435...|
|[15750.0,0.0,1.0,...|[0.37820512820512...|
|[7800.0,0.0,0.0,1...|[0.17435897435897...|
|[28000.0,0.0,1.0,...|[0.69230769230769...|
|[10000.0,1.0,0.0,...|[0.23076923076923...|
|[16000.0,0.0,1.0,...|[0.38461538461538...|
|(47,[0,3,5,6,8,11...|[0.05128205128205...|
|[4000.0,1.0,1.0,6...|[0.07692307692307...|
|[24000.0,3.0,1.0,...|[0.58974358974358...|
|[12000.0,0.0,1.0,...|[0.28205128205128...|
|[13000.0,0.0,3.0,...|[0.30769230769230...|
|[21000.0,4.0,0.0,...|[0.51282051282051...|
|[30800.0,1.0,0.0,...|[0.76410256410256...|
|[30000.0,0.0,2.0,...|[0.74358974358974...|
|(47,[0,3,5,6,8,11...|[0.1538461

# code testing below here

In [2]:
val df = spark.read.format("csv")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("LCLoan_Wrangled.csv")

print((df.count(), df.columns.length))

(20070,65)

df: org.apache.spark.sql.DataFrame = [loan_amnt: int, term: string ... 63 more fields]


In [3]:
//get columns that are integer and double type
val integerColumn = df.dtypes.filter(column => column._2 == "IntegerType").map(_._1)
val doubleColumn = df.dtypes.filter(column => column._2 == "DoubleType").map(_._1)


integerColumn: Array[String] = Array(loan_amnt, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, total_acc, collections_12_mths_ex_med, policy_code, acc_now_delinq, tot_coll_amt, tot_cur_bal, total_rev_hi_lim, acc_open_past_24mths, delinq_amnt, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, num_accts_ever_120_pd, num_actv_rev_tl, num_bc_sats, num_il_tl, num_rev_accts, num_sats, num_tl_120dpd_2m, num_tl_90g_dpd_24m, num_tl_op_past_12m, pub_rec_bankruptcies, tax_liens, tot_hi_cred_lim, total_il_high_credit_limit, loan_status)
doubleColumn: Array[String] = Array(int_rate, annual_inc, dti, revol_util, out_prncp, total_pymnt, total_rec_int, total_rec_late_fee, collection_recovery_fee, last_pymnt_amnt, chargeoff_within_12_mths, mo_sin_ol...

In [9]:
//add the number columns together into single array
var numberColumn = integerColumn ++ doubleColumn

numberColumn: Array[String] = Array(loan_amnt, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, total_acc, collections_12_mths_ex_med, policy_code, acc_now_delinq, tot_coll_amt, tot_cur_bal, total_rev_hi_lim, acc_open_past_24mths, delinq_amnt, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, num_accts_ever_120_pd, num_actv_rev_tl, num_bc_sats, num_il_tl, num_rev_accts, num_sats, num_tl_120dpd_2m, num_tl_90g_dpd_24m, num_tl_op_past_12m, pub_rec_bankruptcies, tax_liens, tot_hi_cred_lim, total_il_high_credit_limit, loan_status, int_rate, annual_inc, dti, revol_util, out_prncp, total_pymnt, total_rec_int, total_rec_late_fee, collection_recovery_fee, last_pymnt_amnt, chargeoff_within_12_mths, mo_sin_old_il_acct, pct_tl_nvr_dlq, percent_bc...

In [17]:
//remove target variable, any other columns here may need to be removed in an updated phase 2 notebook.
var removeColumns = Array("loan_status", "total_il_high_credit_limit")

removeColumns: Array[String] = Array(loan_status, total_il_high_credit_limit)


In [54]:
//remove the target variable
numberColumn = numberColumn diff Array("loan_status")

numberColumn: Array[String] = [Ljava.lang.String;@5eb9d2c0


In [21]:
// check columns
numberColumn

res8: Array[String] = Array(loan_amnt, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, revol_bal, total_acc, collections_12_mths_ex_med, policy_code, acc_now_delinq, tot_coll_amt, tot_cur_bal, total_rev_hi_lim, acc_open_past_24mths, delinq_amnt, mo_sin_old_rev_tl_op, mo_sin_rcnt_rev_tl_op, mo_sin_rcnt_tl, mort_acc, mths_since_recent_bc, num_accts_ever_120_pd, num_actv_rev_tl, num_bc_sats, num_il_tl, num_rev_accts, num_sats, num_tl_120dpd_2m, num_tl_90g_dpd_24m, num_tl_op_past_12m, pub_rec_bankruptcies, tax_liens, tot_hi_cred_lim, int_rate, annual_inc, dti, revol_util, out_prncp, total_pymnt, total_rec_int, total_rec_late_fee, collection_recovery_fee, last_pymnt_amnt, chargeoff_within_12_mths, mo_sin_old_il_acct, pct_tl_nvr_dlq, percent_bc_gt_75)


In [22]:
//Vector Assembler
val numericVectorAssembler = new VectorAssembler()
  .setInputCols(numberColumn)
  .setOutputCol("numerical_features")
  .setHandleInvalid("skip") 

numericVectorAssembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_8228bf2542e7


In [30]:
val pipelineNumericVectorAssembler = new Pipeline()
  .setStages(Array(numericVectorAssembler))

val numericVector_df = pipelineNumericVectorAssembler
  .fit(df)
  .transform(df)

pipelineNumericVectorAssembler: org.apache.spark.ml.Pipeline = pipeline_27d6f1d1c75f
numericVector_df: org.apache.spark.sql.DataFrame = [loan_amnt: int, term: string ... 64 more fields]


In [33]:
numericVector_df.select("numerical_features").show()

+--------------------+
|  numerical_features|
+--------------------+
|(46,[0,3,5,6,8,11...|
|[10000.0,0.0,1.0,...|
|[5000.0,0.0,4.0,1...|
|[4000.0,1.0,0.0,1...|
|(46,[0,3,5,6,8,11...|
|[15000.0,0.0,1.0,...|
|[15750.0,0.0,1.0,...|
|[7800.0,0.0,0.0,1...|
|[28000.0,0.0,1.0,...|
|[10000.0,1.0,0.0,...|
|[16000.0,0.0,1.0,...|
|(46,[0,3,5,6,8,11...|
|[4000.0,1.0,1.0,6...|
|[24000.0,3.0,1.0,...|
|[12000.0,0.0,1.0,...|
|[13000.0,0.0,3.0,...|
|[21000.0,4.0,0.0,...|
|[30800.0,1.0,0.0,...|
|[30000.0,0.0,2.0,...|
|(46,[0,3,5,6,8,11...|
+--------------------+
only showing top 20 rows



In [47]:
//numeric scaler
val scaler = new MinMaxScaler()
  .setInputCol("numerical_features")
  .setOutputCol("scaledFeatures")

scaler: org.apache.spark.ml.feature.MinMaxScaler = minMaxScal_c8bfe8fbf891


In [48]:

// Normalize each feature to have unit standard deviation.
val scaledData = scaler
  .fit(numericVector_df)
  .transform(numericVector_df)

scaledData: org.apache.spark.sql.DataFrame = [loan_amnt: int, term: string ... 65 more fields]


In [49]:
scaledData.select("numerical_features","scaledFeatures").show()

+--------------------+--------------------+
|  numerical_features|      scaledFeatures|
+--------------------+--------------------+
|(46,[0,3,5,6,8,11...|[0.33333333333333...|
|[10000.0,0.0,1.0,...|[0.23076923076923...|
|[5000.0,0.0,4.0,1...|[0.10256410256410...|
|[4000.0,1.0,0.0,1...|[0.07692307692307...|
|(46,[0,3,5,6,8,11...|[0.33333333333333...|
|[15000.0,0.0,1.0,...|[0.35897435897435...|
|[15750.0,0.0,1.0,...|[0.37820512820512...|
|[7800.0,0.0,0.0,1...|[0.17435897435897...|
|[28000.0,0.0,1.0,...|[0.69230769230769...|
|[10000.0,1.0,0.0,...|[0.23076923076923...|
|[16000.0,0.0,1.0,...|[0.38461538461538...|
|(46,[0,3,5,6,8,11...|[0.05128205128205...|
|[4000.0,1.0,1.0,6...|[0.07692307692307...|
|[24000.0,3.0,1.0,...|[0.58974358974358...|
|[12000.0,0.0,1.0,...|[0.28205128205128...|
|[13000.0,0.0,3.0,...|[0.30769230769230...|
|[21000.0,4.0,0.0,...|[0.51282051282051...|
|[30800.0,1.0,0.0,...|[0.76410256410256...|
|[30000.0,0.0,2.0,...|[0.74358974358974...|
|(46,[0,3,5,6,8,11...|[0.1538461