Import Libraries

In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

Start Spark Session

In [12]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists rather than starting a second one.
spark = (
    SparkSession.builder
    .appName("Loan Default ML Process 1")
    .getOrCreate()
)

25/05/19 16:17:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Load Prepared Data

In [13]:
# Read the preprocessed CSV output: the first row supplies the column names and
# Spark infers each column's type from the data.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("preprocessed_data_csv")
)

# Sanity-check what was loaded: column types first, then the first five rows.
data.printSchema()
data.show(n=5)

                                                                                

root
 |-- loan_amnt: double (nullable = true)
 |-- term_index: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- grade_index: double (nullable = true)
 |-- sub_grade_index: double (nullable = true)
 |-- home_ownership_index: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- verification_status_index: double (nullable = true)
 |-- purpose_index: double (nullable = true)
 |-- dti: double (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- open_acc: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- revol_util: double (nullable = true)
 |-- emp_length_index: double (nullable = true)
 |-- label: double (nullable = true)

+---------+----------+--------+-----------+---------------+--------------------+----------+-------------------------+-------------+-----+-----------+--------------+--------+-------+----------+----------------+-----+
|loan_amnt|term_index|int_rate|grade_in

In [15]:
# Every column except the target ("label") and the raw "default_ind" flag is
# used as a model input.
input_cols = [
    name
    for name in data.columns
    if name != "default_ind" and name != "label"
]

# Pack the input columns into a single "features" vector column and keep only
# what the classifiers need: the feature vector and the label.
assembler = VectorAssembler(outputCol="features", inputCols=input_cols)
assembled_df = (
    assembler
    .transform(data)
    .select("features", "label")
)

Split Data

In [16]:
# 70/30 train/test split; the fixed seed keeps the partition repeatable.
train, test = assembled_df.randomSplit(weights=[0.7, 0.3], seed=42)

1st ML model: Decision Tree

In [18]:
print("\n Training Decision Tree...")

# Fit a decision tree on the training split (all other hyperparameters are
# left at their library defaults), then score the held-out test split.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)
dt_model = dt.fit(train)
dt_preds = dt_model.transform(test)



 Training Decision Tree...


                                                                                

2nd ML model: Random Forest

In [19]:
print("\n Training Random Forest...")

# A random forest fit is stochastic, so pin the seed explicitly. Without it the
# estimator falls back to PySpark's default seed, which is not guaranteed to be
# the same after a kernel restart. 42 matches the seed used for the train/test
# split, so a Restart & Run All reproduces the reported metrics.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=50,
    seed=42,
)
rf_model = rf.fit(train)

# Score the held-out test split with the fitted forest.
rf_preds = rf_model.transform(test)


 Training Random Forest...


                                                                                

3rd ML model: Naive Bayes

In [20]:
print("\n Training Naive Bayes...")

# Fit Naive Bayes on the training split and score the held-out test split.
# NOTE(review): no modelType is passed, so the library default variant is used
# (presumably multinomial, which models features as counts). Confirm that suits
# continuous inputs such as loan_amnt and annual_inc.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
)
nb_model = nb.fit(train)
nb_preds = nb_model.transform(test)


 Training Naive Bayes...


                                                                                

Define Evaluation Function

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def evaluate(predictions):
    """Print accuracy, precision, recall and ROC AUC for a predictions DataFrame.

    Args:
        predictions: Output of a fitted classifier's ``transform``; it must
            carry the "label", "prediction" and "probability" columns that the
            evaluators below read.

    Returns:
        None. The four metrics are printed, one per line.

    Note:
        The lines labelled "Precision" and "Recall" report the
        ``weightedPrecision`` and ``weightedRecall`` metrics, not the
        positive-class precision/recall.
    """
    # (printed caption, MulticlassClassificationEvaluator metric name)
    multiclass_metrics = (
        ("Accuracy:", "accuracy"),
        ("Precision:", "weightedPrecision"),
        ("Recall:", "weightedRecall"),
    )

    # Build every evaluator up front, then evaluate them in display order.
    scorers = [
        (
            caption,
            MulticlassClassificationEvaluator(
                labelCol="label",
                predictionCol="prediction",
                metricName=metric,
            ),
        )
        for caption, metric in multiclass_metrics
    ]
    scorers.append(
        (
            "ROC AUC:",
            BinaryClassificationEvaluator(
                labelCol="label",
                rawPredictionCol="probability",
                metricName="areaUnderROC",
            ),
        )
    )

    for caption, scorer in scorers:
        print(caption, scorer.evaluate(predictions))


Print Results for Each Model

In [26]:
# Report the same metric set for each fitted model, in training order.
for banner, model_preds in (
    ("\n Decision Tree Performance:", dt_preds),
    ("\n Random Forest Performance:", rf_preds),
    ("\n Naive Bayes Performance:", nb_preds),
):
    print(banner)
    evaluate(model_preds)


 Decision Tree Performance:


                                                                                

Accuracy: 0.9459806096234586


                                                                                

Precision: 0.8948793137835704


                                                                                

Recall: 0.9459806096234586


                                                                                

ROC AUC: 0.5

 Random Forest Performance:


                                                                                

Accuracy: 0.9459806096234586


                                                                                

Precision: 0.8948793137835704


                                                                                

Recall: 0.9459806096234586


                                                                                

ROC AUC: 0.7018771820971957

 Naive Bayes Performance:


25/05/19 16:23:34 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/19 16:23:34 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Accuracy: 0.5108900315376249


                                                                                

Precision: 0.9078485640141597


                                                                                

Recall: 0.5108900315376249


                                                                                

ROC AUC: 0.5617901125426205
