# Initial procedures

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.ml import feature, classification, evaluation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

train_set_path = "data/train_set.csv"
test_set_path = "data/test_set.csv"
entire_set_path = "data/ready_set.csv"

spark = SparkSession.builder.appName("Loan_Approval").getOrCreate()

schema = StructType([
    StructField(name="person_age", dataType=FloatType(), nullable=True),
    StructField(name="person_gender", dataType=IntegerType(), nullable=True),
    StructField(name="person_education", dataType=IntegerType(), nullable=True),
    StructField(name="person_income", dataType=FloatType(), nullable=True),
    StructField(name="person_home_ownership", dataType=IntegerType(), nullable=True),
    StructField(name="loan_amnt", dataType=FloatType(), nullable=True),
    StructField(name="loan_intent", dataType=IntegerType(), nullable=True),
    StructField(name="loan_int_rate", dataType=FloatType(), nullable=True),
    StructField(name="loan_percent_income", dataType=FloatType(), nullable=True),
    StructField(name="credit_score", dataType=IntegerType(), nullable=True),
    StructField(name="previous_loan_defaults_on_file", dataType=IntegerType(), nullable=True),
    StructField(name="loan_status", dataType=IntegerType(), nullable=True),
  ])

In [17]:
df = spark.read.csv(path=entire_set_path, header=True, schema=schema)
df_train = spark.read.csv(path=train_set_path, header=True, schema=schema)
df_test = spark.read.csv(path=test_set_path, header=True, schema=schema)
df_train.show(10)

+----------+-------------+----------------+-------------+---------------------+---------+-----------+-------------+-------------------+------------+------------------------------+-----------+
|person_age|person_gender|person_education|person_income|person_home_ownership|loan_amnt|loan_intent|loan_int_rate|loan_percent_income|credit_score|previous_loan_defaults_on_file|loan_status|
+----------+-------------+----------------+-------------+---------------------+---------+-----------+-------------+-------------------+------------+------------------------------+-----------+
|      21.0|            0|               1|      24999.0|                    0|   2000.0|          2|        11.01|               0.08|         527|                             1|          0|
|      24.0|            1|               2|      99084.0|                    3|  14700.0|          0|         9.63|               0.15|         662|                             0|          0|
|      24.0|            1|              

# Standarization

In [18]:
vect = feature.VectorAssembler(inputCols=df_train.columns[:-1], outputCol="feat")
df_train = vect.transform(df_train)
df_train = df_train.select("loan_status", "feat")
df_train.printSchema()


scaler = feature.StandardScaler(inputCol="feat", outputCol="features")
scaler_t = scaler.fit(df_train)
df_train = scaler_t.transform(df_train)

df_train.show(n=10, truncate=False)

root
 |-- loan_status: integer (nullable = true)
 |-- feat: vector (nullable = true)

+-----------+----------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|loan_status|feat                                                                                    |features                                                                                                                                                                                                    |
+-----------+----------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Random Forest

In [19]:
forest = classification.RandomForestClassifier(labelCol="loan_status", featuresCol="features", maxDepth=8,
                                               minInstancesPerNode=4, seed=42)
forest_t = forest.fit(df_train)
pred_train = forest_t.transform(df_train)
pred_train.show(10)

# Evaluate the model on training data
evaluator = evaluation.MulticlassClassificationEvaluator(labelCol="loan_status", predictionCol="prediction",
                                                         metricName="accuracy")
accuracy = evaluator.evaluate(pred_train)
print(f"Random Forest Training Accuracy {accuracy}")

# Transform evaluation dataset using the pipeline
df_test = vect.transform(df_test)
df_test = df_test.select("loan_status", "feat")
df_test = scaler_t.transform(df_test)


# Evaluate the predictions on test data
pred_test = forest_t.transform(df_test)
eval_accuracy = evaluator.evaluate(pred_test)
print(f"Random Forest Evaluation Accuracy: {eval_accuracy}")

+-----------+--------------------+--------------------+--------------------+--------------------+----------+
|loan_status|                feat|            features|       rawPrediction|         probability|prediction|
+-----------+--------------------+--------------------+--------------------+--------------------+----------+
|          0|[21.0,0.0,1.0,249...|[3.67165383042514...|[19.5472901168969...|[0.97736450584484...|       0.0|
|          0|[24.0,1.0,2.0,990...|[4.19617580620016...|[15.7877875079139...|[0.78938937539569...|       0.0|
|          0|[24.0,1.0,1.0,121...|[4.19617580620016...|          [20.0,0.0]|           [1.0,0.0]|       0.0|
|          0|[25.0,0.0,2.0,493...|[4.37101646479184...|[19.6687542675771...|[0.98343771337885...|       0.0|
|          0|[22.0,1.0,3.0,811...|[3.84649448901682...|[19.9081209247184...|[0.99540604623592...|       0.0|
|          0|[23.0,1.0,2.0,152...|[4.02133514760849...|[6.76045857121904...|[0.33802292856095...|       1.0|
|          0|[23.0,

# Hyperparameter tuning for Random Forest

In [24]:
rf = classification.RandomForestClassifier(labelCol="loan_status", featuresCol="features")

# Hyperparameter grid
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [10, 50, 80])
             .addGrid(rf.maxDepth, [5, 8, 10])
             .addGrid(rf.minInstancesPerNode, [1, 2, 4])
             .build())

# Evaluator
evaluator = evaluation.MulticlassClassificationEvaluator(labelCol="loan_status", predictionCol="prediction", metricName="accuracy")

# Cross-validation
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

cvModel = cv.fit(df_train)

# Best hyperparameters
bestModel = cvModel.bestModel
print("Number of trees:", bestModel.getNumTrees)
print("Maximum depth:", bestModel.getMaxDepth())
print("Min instances per node", bestModel.getOrDefault(rf.minInstancesPerNode))

Number of trees: 50
Maximum depth: 10
Min instances per node 2


Accuracy for training and test data after hyperparameter tuning

In [27]:
forest = classification.RandomForestClassifier(labelCol="loan_status", featuresCol="features", numTrees=50, maxDepth=10,
                                               minInstancesPerNode=2, seed=42)

evaluator = evaluation.MulticlassClassificationEvaluator(labelCol="loan_status", predictionCol="prediction",
                                                         metricName="accuracy")

# Prediction for training data
forest_train = forest.fit(df_train)
pred_train = forest_t.transform(df_train)

accuracy = evaluator.evaluate(pred_train)
print(f"Random Forest Training Accuracy {accuracy}")

# Prediction for test data
pred_test = forest_t.transform(df_test)
eval_accuracy = evaluator.evaluate(pred_test)
print(f"Random Forest Evaluation Accuracy: {eval_accuracy}")

Random Forest Training Accuracy 0.8989435754647449
Random Forest Evaluation Accuracy: 0.8758334568084161
