In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=5cba9c891728fac904f57c129ef537fcb42f0ff6aad84b784cb365b8c97cf75c
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Loan approval").getOrCreate()

In [None]:
sc = spark.sparkContext

In [None]:
filepath = '/content/loan_approval_dataset.csv'

In [None]:
data = spark.read.csv(filepath,header=True,inferSchema=True)

In [None]:
data.show()

+-------+-----------------+-------------+--------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+------------+
|loan_id| no_of_dependents|    education| self_employed| income_annum| loan_amount| loan_term| cibil_score| residential_assets_value| commercial_assets_value| luxury_assets_value| bank_asset_value| loan_status|
+-------+-----------------+-------------+--------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+------------+
|      1|                2|     Graduate|            No|      9600000|    29900000|        12|         778|                  2400000|                17600000|            22700000|          8000000|    Approved|
|      2|                0| Not Graduate|           Yes|      4100000|    12200000|         8|         417|                  2700000|                 220000

In [None]:
data.printSchema()

root
 |-- loan_id: integer (nullable = true)
 |--  no_of_dependents: integer (nullable = true)
 |--  education: string (nullable = true)
 |--  self_employed: string (nullable = true)
 |--  income_annum: integer (nullable = true)
 |--  loan_amount: integer (nullable = true)
 |--  loan_term: integer (nullable = true)
 |--  cibil_score: integer (nullable = true)
 |--  residential_assets_value: integer (nullable = true)
 |--  commercial_assets_value: integer (nullable = true)
 |--  luxury_assets_value: integer (nullable = true)
 |--  bank_asset_value: integer (nullable = true)
 |--  loan_status: string (nullable = true)



In [None]:
data.count()

4269

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
indexer = StringIndexer(inputCol=' loan_status', outputCol='Approval_Status').fit(data)
data = indexer.transform(data)

In [None]:
data.show()

+-------+-----------------+-------------+--------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+------------+---------------+
|loan_id| no_of_dependents|    education| self_employed| income_annum| loan_amount| loan_term| cibil_score| residential_assets_value| commercial_assets_value| luxury_assets_value| bank_asset_value| loan_status|Approval_Status|
+-------+-----------------+-------------+--------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+------------+---------------+
|      1|                2|     Graduate|            No|      9600000|    29900000|        12|         778|                  2400000|                17600000|            22700000|          8000000|    Approved|            0.0|
|      2|                0| Not Graduate|           Yes|      4100000|    12200000|         

In [None]:
data = data.drop(' self_employed')
data = data.drop(' education')
data = data.drop(' loan_status')
data.show()

+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+
|loan_id| no_of_dependents| income_annum| loan_amount| loan_term| cibil_score| residential_assets_value| commercial_assets_value| luxury_assets_value| bank_asset_value|Approval_Status|
+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+
|      1|                2|      9600000|    29900000|        12|         778|                  2400000|                17600000|            22700000|          8000000|            0.0|
|      2|                0|      4100000|    12200000|         8|         417|                  2700000|                 2200000|             8800000|          3300000|            1.0|
|      3|                3|      9100000|    29700000|        20|         5

In [None]:
inputcolumns = [' no_of_dependents',' income_annum',' loan_amount',' loan_term',
                ' cibil_score',' residential_assets_value',' commercial_assets_value',
                ' luxury_assets_value',' bank_asset_value']

In [None]:
for i in inputcolumns:
  data = data.withColumn(i,data[i].cast(IntegerType()))

In [None]:
data.printSchema()

root
 |-- loan_id: integer (nullable = true)
 |--  no_of_dependents: integer (nullable = true)
 |--  income_annum: integer (nullable = true)
 |--  loan_amount: integer (nullable = true)
 |--  loan_term: integer (nullable = true)
 |--  cibil_score: integer (nullable = true)
 |--  residential_assets_value: integer (nullable = true)
 |--  commercial_assets_value: integer (nullable = true)
 |--  luxury_assets_value: integer (nullable = true)
 |--  bank_asset_value: integer (nullable = true)
 |-- Approval_Status: double (nullable = false)



In [None]:
data = data.na.fill(0.0)

# Replace infinite values with a large finite value (e.g., 1e10)
data = data.na.replace(float('inf'), 1.0e10)

In [None]:
d = VectorAssembler(inputCols = inputcolumns, outputCol = 'features')
data = d.transform(data)

In [None]:
data.show()

+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+--------------------+
|loan_id| no_of_dependents| income_annum| loan_amount| loan_term| cibil_score| residential_assets_value| commercial_assets_value| luxury_assets_value| bank_asset_value|Approval_Status|            features|
+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+--------------------+
|      1|                2|      9600000|    29900000|        12|         778|                  2400000|                17600000|            22700000|          8000000|            0.0|[2.0,9600000.0,2....|
|      2|                0|      4100000|    12200000|         8|         417|                  2700000|                 2200000|             8800000|          3300000|        

In [None]:
train, test = data.randomSplit([0.8,0.2], seed=11)

In [None]:
train.count()

3437

In [None]:
test.count()

832

***Logistic Regression***

In [None]:
from pyspark.ml.classification import LogisticRegression

log = LogisticRegression(featuresCol='features',
                             labelCol='Approval_Status')

In [None]:
pipe = Pipeline(stages = [log])

In [None]:
fit_model = pipe.fit(train)

In [None]:
results = fit_model.transform(test)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
res = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Approval_Status')
efficiency = res.evaluate(results)

In [None]:
print("Accuracy of the model is: ",efficiency*100,"%")

Accuracy of the model is:  91.45854145854146 %


In [None]:
results.show()

+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+--------------------+--------------------+--------------------+----------+
|loan_id| no_of_dependents| income_annum| loan_amount| loan_term| cibil_score| residential_assets_value| commercial_assets_value| luxury_assets_value| bank_asset_value|Approval_Status|            features|       rawPrediction|         probability|prediction|
+-------+-----------------+-------------+------------+----------+------------+-------------------------+------------------------+--------------------+-----------------+---------------+--------------------+--------------------+--------------------+----------+
|      6|                0|      4800000|    13500000|        10|         319|                  6800000|                 8300000|            13700000|          5100000|            1.0|[0.0,4800000.0,1....|[-5.1985686643290.

***random Forest***

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(labelCol="Approval_Status", featuresCol="features", numTrees=100)

In [None]:
model = rf.fit(train)

In [None]:
predictions = model.transform(test)

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="Approval_Status", rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy*100}")

Accuracy: 95.5211455211455


In [None]:
feature_importances = model.featureImportances
print("Feature Importances:")
for i in range(len(inputcolumns)):
    print(f"{inputcolumns[i]}: {feature_importances[i]}")
    print()

Feature Importances:
 no_of_dependents: 0.004427422251825267

 income_annum: 0.006839974195924856

 loan_amount: 0.009514650251533485

 loan_term: 0.058980913805201186

 cibil_score: 0.8947365981530596

 residential_assets_value: 0.0050742411518231376

 commercial_assets_value: 0.007304527111556599

 luxury_assets_value: 0.007224478356102984

 bank_asset_value: 0.005897194722972775



In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [10, 50, 100])
             .addGrid(rf.maxDepth, [5, 10, 15])
             .build())

In [None]:
evaluator_after_parameter = BinaryClassificationEvaluator(labelCol="Approval_Status", rawPredictionCol="prediction")

In [None]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator_after_parameter, numFolds=5)


In [None]:
cvModel = cv.fit(train)

In [None]:
best_rf_model = cvModel.bestModel

In [None]:
test_results = best_rf_model.transform(test)
accuracy_after_parameter = evaluator_after_parameter.evaluate(test_results)


In [None]:
best_num_trees = best_rf_model.getNumTrees
best_max_depth = best_rf_model.getMaxDepth()

print("Best Hyperparameters:")
print(f"Number of Trees: {best_num_trees}")
print(f"Max Depth: {best_max_depth}")
print()
print(f"AND THEY GIVE OUT ACCURACY AS: {accuracy_after_parameter*100}")

Best Hyperparameters:
Number of Trees: 100
Max Depth: 15

AND THEY GIVE OUT ACCURACY AS: 95.39627039627038


In [None]:
print('Difference in ACCURACY after HyperParameter Tuning is',(accuracy_after_parameter-accuracy)*100)

Difference in ACCURACY after HyperParameter Tuning is -0.12487512487511232


***K NEAREST NEIGHBOR***

In [None]:
from pyspark.ml.classification import GBTClassifier

In [None]:
gbt = GBTClassifier(labelCol="Approval_Status", featuresCol="features", maxIter=10)


In [None]:
pipeline = Pipeline(stages=[gbt])

In [None]:
model = pipeline.fit(train)

In [None]:
predictions_gbt = model.transform(test)


In [None]:
evaluator = BinaryClassificationEvaluator(labelCol="Approval_Status", rawPredictionCol="prediction")
accuracy_gbt = evaluator.evaluate(predictions_gbt)

In [None]:
accuracy_gbt

0.9444721944721944

In [None]:
paramGrid_gbt = (ParamGridBuilder()
    .addGrid(gbt.maxIter, [10, 20, 30])
    .addGrid(gbt.maxDepth, [3, 5, 7])
    .addGrid(gbt.stepSize, [0.1, 0.01])
    .build())

In [None]:
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_gbt,
                          evaluator=evaluator,
                          numFolds=5)

In [None]:
cvModel_gbt = crossval.fit(train)

In [None]:
 bestModel_gbt = cvModel_gbt.bestModel
bestMaxIter = bestModel_gbt.stages[1].getMaxIter()
bestMaxDepth = bestModel_gbt.stages[1].getMaxDepth()
bestStepSize = bestModel_gbt.stages[1].getStepSize()

In [None]:
predictions_gbt = bestModel_gbt.transform(test)

In [None]:
bestaccuracy_gbt = evaluator.evaluate(predictions_gbt)
print(f"Best Max Iter: {bestMaxIter}")
print(f"Best Max Depth: {bestMaxDepth}")
print(f"Best Step Size: {bestStepSize}")
print()
print(f"AND THEY GIVE OUT ACCURACY AS: {bestaccuracy_gbt*100}")