In [2]:
import org.apache.spark.sql.SparkSession
import spark.implicits._

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.linalg.{Vector, Vectors}

import org.apache.spark.sql.SparkSession
import spark.implicits._
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.mllib.linalg.{Vector, Vectors}


In [3]:
val df = spark.read.format("csv")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("../Loan_2017_20k.csv")

print((df.count(), df.columns.length))

(20070,86)

df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 84 more fields]


In [4]:
// https://towardsdatascience.com/feature-encoding-with-spark-2-3-0-part-1-9ede45562740

//val categorical_features = df.columns.filter(_.contains("status"))

val categorical_features = Array(
    "verification_status", 
    "loan_status", 
    "initial_list_status",
    "home_ownership",
    "acc_now_delinq",
)

val encodedFeatures = categorical_features.flatMap{ name =>
    
    val stringIndexer = new StringIndexer()
      .setInputCol(name)
      .setOutputCol(name + "_index")
    
    val oneHotEncoder = new OneHotEncoderEstimator()
      .setInputCols(Array(name + "_index"))
      .setOutputCols(Array(name + "_vec"))
      .setDropLast(false)
    
    Array(stringIndexer, oneHotEncoder)
}

val pipeline = new Pipeline()
  .setStages(encodedFeatures)

val df_transformed = pipeline
  .fit(df)
  .transform(df)

//df_transformed.columns.foreach(println)

categorical_features: Array[String] = Array(verification_status, loan_status, initial_list_status, home_ownership, acc_now_delinq)
encodedFeatures: Array[org.apache.spark.ml.Estimator[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Model[_ >: org.apache.spark.ml.feature.OneHotEncoderModel with org.apache.spark.ml.feature.StringIndexerModel <: org.apache.spark.ml.Transformer with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.MLWritable] with org.apache.spark.ml.param.shared.HasHandleInvalid with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.para...


In [5]:
df.columns
  .filter(_.contains("status"))
  .toArray
  .foreach(println)

verification_status
loan_status
initial_list_status


In [6]:
df_transformed.columns
  .filter(_.contains("status"))
  .toArray
  .foreach(println)

verification_status
loan_status
initial_list_status
verification_status_index
verification_status_vec
loan_status_index
loan_status_vec
initial_list_status_index
initial_list_status_vec


In [7]:
df_transformed
  .groupBy("loan_status")
  .count()
  .sort("count")
  .show()

+------------------+-----+
|       loan_status|count|
+------------------+-----+
|           Default|    1|
| Late (16-30 days)|   58|
|   In Grace Period|  136|
|Late (31-120 days)|  367|
|       Charged Off| 1660|
|        Fully Paid| 5446|
|           Current|12402|
+------------------+-----+



In [8]:
df_transformed
  .groupBy("loan_status_index")
  .count()
  .sort("count")
  .show()

+-----------------+-----+
|loan_status_index|count|
+-----------------+-----+
|              6.0|    1|
|              5.0|   58|
|              4.0|  136|
|              3.0|  367|
|              2.0| 1660|
|              1.0| 5446|
|              0.0|12402|
+-----------------+-----+



In [9]:
df_transformed.groupBy("loan_status_vec")
  .count()
  .sort("count")
  .show()

+---------------+-----+
|loan_status_vec|count|
+---------------+-----+
|  (7,[6],[1.0])|    1|
|  (7,[5],[1.0])|   58|
|  (7,[4],[1.0])|  136|
|  (7,[3],[1.0])|  367|
|  (7,[2],[1.0])| 1660|
|  (7,[1],[1.0])| 5446|
|  (7,[0],[1.0])|12402|
+---------------+-----+



In [10]:
val vecFeatures = df_transformed
  .columns.filter(_.contains("vec")).toArray

val vectorAssembler = new VectorAssembler()
  .setInputCols(vecFeatures)
  .setOutputCol("categorical_features")

val pipelineVectorAssembler = new Pipeline()
  .setStages(Array(vectorAssembler))

val result_df = pipelineVectorAssembler
  .fit(df_transformed)
  .transform(df_transformed)

vecFeatures: Array[String] = Array(verification_status_vec, loan_status_vec, initial_list_status_vec, home_ownership_vec, acc_now_delinq_vec)
vectorAssembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_19e24e6a34bb
pipelineVectorAssembler: org.apache.spark.ml.Pipeline = pipeline_1fe7f0bcecd3
result_df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 95 more fields]


In [11]:
//https://towardsdatascience.com/feature-encoding-made-simple-with-spark-2-3-0-part-2-5bfc869a809a

val numerical_features = Array("annual_inc", "loan_amnt")

val vectorAssembler2 = new VectorAssembler()
  .setInputCols(numerical_features)
  .setOutputCol("numerical_features")

val pipelineVectorAssembler2 = new Pipeline()
  .setStages(Array(vectorAssembler, vectorAssembler2))

val result_df = pipelineVectorAssembler2
  .fit(df_transformed)
  .transform(df_transformed)

result_df.select("numerical_features", "categorical_features").show()

+------------------+--------------------+
|numerical_features|categorical_features|
+------------------+--------------------+
| [82000.0,14000.0]|(19,[0,4,10,12,16...|
| [69000.0,10000.0]|(19,[1,3,11,13,16...|
| [215000.0,5000.0]|(19,[0,3,11,12,16...|
|  [50000.0,4000.0]|(19,[0,5,10,13,16...|
| [89000.0,14000.0]|(19,[1,4,10,12,16...|
|[110000.0,15000.0]|(19,[0,5,10,12,16...|
| [70000.0,15750.0]|(19,[1,4,10,12,16...|
|  [73000.0,7800.0]|(19,[1,4,10,12,16...|
|[130000.0,28000.0]|(19,[0,3,11,13,16...|
| [38000.0,10000.0]|(19,[2,6,10,13,16...|
| [81000.0,16000.0]|(19,[0,5,10,12,16...|
|  [20832.0,3000.0]|(19,[2,3,11,13,16...|
|  [60000.0,4000.0]|(19,[0,3,10,13,16...|
| [76000.0,24000.0]|(19,[1,3,10,12,16...|
| [47000.0,12000.0]|(19,[1,3,10,12,16...|
| [98440.0,13000.0]|(19,[1,4,11,13,16...|
| [75000.0,21000.0]|(19,[1,3,11,12,16...|
| [70000.0,30800.0]|(19,[2,3,10,14,16...|
| [83000.0,30000.0]|(19,[0,3,10,12,16...|
|  [40000.0,7000.0]|(19,[1,3,10,13,16...|
+------------------+--------------

numerical_features: Array[String] = Array(annual_inc, loan_amnt)
vectorAssembler2: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_3c9034193868
pipelineVectorAssembler2: org.apache.spark.ml.Pipeline = pipeline_e9df290bec63
result_df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 96 more fields]


In [12]:
//convert the sparse vector to a dense vector as a fail safe

val sparseToDense = udf((v : Vector) => v.toDense)
val result_df_dense = result_df.withColumn("numerical_features", sparseToDense($"numerical_features"))

/**
# convert the data to dense vector
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1])]).toDF(['features'])
**/

org.apache.spark.sql.AnalysisException:  cannot resolve 'UDF(numerical_features)' due to data type mismatch: argument 1 requires vector type, however, '`numerical_features`' is of struct<type:tinyint,size:int,indices:array<int>,values:array<double>> type.;;

In [13]:
val scaler = new StandardScaler()
  .setInputCol("numerical_features")
  .setOutputCol("scaledFeatures")
  .setWithStd(true)
  .setWithMean(true)

// Normalize each feature to have unit standard deviation.
val scaledData = scaler
  .fit(result_df)
  .transform(result_df)

scaledData.select("numerical_features", "scaledFeatures", "categorical_features").show()

+------------------+--------------------+--------------------+
|numerical_features|      scaledFeatures|categorical_features|
+------------------+--------------------+--------------------+
| [82000.0,14000.0]|[0.01694873096501...|(19,[0,4,10,12,16...|
| [69000.0,10000.0]|[-0.1248892191143...|(19,[1,3,11,13,16...|
| [215000.0,5000.0]|[1.46806006639242...|(19,[0,3,11,12,16...|
|  [50000.0,4000.0]|[-0.3321908384611...|(19,[0,5,10,13,16...|
| [89000.0,14000.0]|[0.09332301177698...|(19,[1,4,10,12,16...|
|[110000.0,15000.0]|[0.32244585421288...|(19,[0,5,10,12,16...|
| [70000.0,15750.0]|[-0.1139786075697...|(19,[1,4,10,12,16...|
|  [73000.0,7800.0]|[-0.0812467729360...|(19,[1,4,10,12,16...|
|[130000.0,28000.0]|[0.54065808510422...|(19,[0,3,11,13,16...|
| [38000.0,10000.0]|[-0.4631181769959...|(19,[2,6,10,13,16...|
| [81000.0,16000.0]|[0.00603811942044...|(19,[0,5,10,12,16...|
|  [20832.0,3000.0]|[-0.6504315559930...|(19,[2,3,11,13,16...|
|  [60000.0,4000.0]|[-0.2230847230154...|(19,[0,3,10,13

scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_73180b1a09a2
scaledData: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 97 more fields]


In [14]:
val vectorAssembler3 = new VectorAssembler()
  .setInputCols(Array("categorical_features", "scaledFeatures"))
  .setOutputCol("features")

val pipelineVectorAssembler3 = new Pipeline()
  .setStages(Array(vectorAssembler3))

val result_df = pipelineVectorAssembler3
  .fit(scaledData)
  .transform(scaledData)

result_df.select("features").show()

+--------------------+
|            features|
+--------------------+
|(21,[0,4,10,12,16...|
|(21,[1,3,11,13,16...|
|(21,[0,3,11,12,16...|
|(21,[0,5,10,13,16...|
|(21,[1,4,10,12,16...|
|(21,[0,5,10,12,16...|
|(21,[1,4,10,12,16...|
|(21,[1,4,10,12,16...|
|(21,[0,3,11,13,16...|
|(21,[2,6,10,13,16...|
|(21,[0,5,10,12,16...|
|(21,[2,3,11,13,16...|
|(21,[0,3,10,13,16...|
|(21,[1,3,10,12,16...|
|(21,[1,3,10,12,16...|
|(21,[1,4,11,13,16...|
|(21,[1,3,11,12,16...|
|(21,[2,3,10,14,16...|
|(21,[0,3,10,12,16...|
|(21,[1,3,10,13,16...|
+--------------------+
only showing top 20 rows



vectorAssembler3: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_cbb67173b158
pipelineVectorAssembler3: org.apache.spark.ml.Pipeline = pipeline_d6ff0194bb0f
result_df: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 98 more fields]


In [19]:
// create the labels column
val labelIndexer = new StringIndexer()
  .setInputCol("term")
  .setOutputCol("label")

val df3 = labelIndexer
  .fit(result_df)
  .transform(result_df)

labelIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_d96d87ff5786
df3: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 99 more fields]


In [20]:
// https://www.bmc.com/blogs/using-logistic-regression-scala-spark/

val model = new LogisticRegression().fit(df3)

val predictions = model.transform(df3)

model: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_b34c6c1f9a72, numClasses = 2, numFeatures = 21
predictions: org.apache.spark.sql.DataFrame = [loan_amnt: int, funded_amnt: int ... 102 more fields]


In [21]:
predictions
  .select ("features", "label", "prediction")
  .show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(21,[0,4,10,12,16...|  0.0|       0.0|
|(21,[1,3,11,13,16...|  0.0|       0.0|
|(21,[0,3,11,12,16...|  0.0|       0.0|
|(21,[0,5,10,13,16...|  0.0|       0.0|
|(21,[1,4,10,12,16...|  0.0|       0.0|
|(21,[0,5,10,12,16...|  1.0|       0.0|
|(21,[1,4,10,12,16...|  1.0|       0.0|
|(21,[1,4,10,12,16...|  0.0|       0.0|
|(21,[0,3,11,13,16...|  0.0|       0.0|
|(21,[2,6,10,13,16...|  1.0|       0.0|
|(21,[0,5,10,12,16...|  1.0|       0.0|
|(21,[2,3,11,13,16...|  0.0|       0.0|
|(21,[0,3,10,13,16...|  0.0|       0.0|
|(21,[1,3,10,12,16...|  1.0|       1.0|
|(21,[1,3,10,12,16...|  1.0|       0.0|
|(21,[1,4,11,13,16...|  0.0|       0.0|
|(21,[1,3,11,12,16...|  0.0|       0.0|
|(21,[2,3,10,14,16...|  1.0|       1.0|
|(21,[0,3,10,12,16...|  1.0|       1.0|
|(21,[1,3,10,13,16...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 20 rows



In [22]:
//https://stackoverflow.com/questions/37566321/spark-random-forest-binary-classifier-metrics


val binaryClassificationEvaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("prediction")

def printlnMetric(metricName: String): Unit = {
  println(metricName + " = " + binaryClassificationEvaluator
                                 .setMetricName(metricName)
                                 .evaluate(predictions))
}

printlnMetric("areaUnderROC")
printlnMetric("areaUnderPR")

areaUnderROC = 0.6403947867776694
areaUnderPR = 0.5044704154926782


binaryClassificationEvaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_554a0f51f9a1
printlnMetric: (metricName: String)Unit
