# Project 5: Loan Approval Prediction

## Problem Statement: Develop a machine learning model to predict the likelihood of loan approval based on applicants' information.
### Student Name:- SUMAN RAKSHIT 
### CSI ID:- CT-CSI23/DS0605

#### Objectives of this project
- By Observing the featurs of a applicant the model can predict whether he/she is eligible for Loan or Not.

# Data Loading

In [1]:
import pandas as pd 
import numpy as np
df = pd.read_csv('loan_applicants.csv')

In [2]:
df.head(500)

Unnamed: 0,age,income,employment_status,loan_amount,loan_purpose,years_to_return,loan_approval_status
0,45,41565,Self-Employed,200000,Home,2.0,Not Approved
1,49,48541,Self-Employed,100000,Personal,1.0,Not Approved
2,57,50746,Employed,100000,Car,1.0,Approved
3,42,40411,Employed,450000,Car,4.5,Approved
4,44,20523,Self-Employed,250000,Car,2.5,Not Approved
...,...,...,...,...,...,...,...
495,38,43390,Employed,150000,Car,1.5,Not Approved
496,59,28515,Unemployed,500000,other,5.0,Not Approved
497,32,45749,Unemployed,100000,other,1.0,Not Approved
498,50,40824,Employed,400000,Car,4.0,Approved


In [3]:
df.shape

(1000, 7)

# Data Preprocessing

In [4]:
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1000 non-null   int64  
 1   income                1000 non-null   int64  
 2   employment_status     1000 non-null   object 
 3   loan_amount           1000 non-null   int64  
 4   loan_purpose          1000 non-null   object 
 5   years_to_return       1000 non-null   float64
 6   loan_approval_status  1000 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 54.8+ KB


In [5]:
df.isnull().sum()

age                     0
income                  0
employment_status       0
loan_amount             0
loan_purpose            0
years_to_return         0
loan_approval_status    0
dtype: int64

# Feature Engineering

In [6]:
df['employment_status']=df['employment_status'].map({'Unemployed': 0, 'Employed': 1, 'Self-Employed': 2})
df['loan_approval_status']=df['loan_approval_status'].map({'Not Approved': 0, 'Approved': 1})

In [7]:
df=df.drop('loan_purpose',axis=1)

In [8]:
df.head()

Unnamed: 0,age,income,employment_status,loan_amount,years_to_return,loan_approval_status
0,45,41565,2,200000,2.0,0
1,49,48541,2,100000,1.0,0
2,57,50746,1,100000,1.0,1
3,42,40411,1,450000,4.5,1
4,44,20523,2,250000,2.5,0


# Splitting the Data and trying different models 
# Model Traning 

In [9]:
#importaing all libraries for model preparing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix



In [10]:

X = df.drop('loan_approval_status', axis=1)
y = df['loan_approval_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)


In [12]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)


In [13]:
# SVM
svm_model = SVC()
svm_model.fit(X_train, y_train)


# Model Evaluation

In [14]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("ROC-AUC:", roc_auc)

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

# Example usage:
print("Logistic Regression:")
evaluate_model(logistic_model, X_test, y_test)

print("Random Forest:")
evaluate_model(rf_model, X_test, y_test)

print("SVM:")
evaluate_model(svm_model, X_test, y_test)


Logistic Regression:
Accuracy: 0.53
Precision: 0.5
Recall: 0.010638297872340425
F1-score: 0.020833333333333336
ROC-AUC: 0.5006021678040948
Confusion Matrix:
[[105   1]
 [ 93   1]]
Random Forest:
Accuracy: 0.525
Precision: 0.49473684210526314
Recall: 0.5
F1-score: 0.4973544973544973
ROC-AUC: 0.5235849056603773
Confusion Matrix:
[[58 48]
 [47 47]]
SVM:
Accuracy: 0.54
Precision: 0.5131578947368421
Recall: 0.4148936170212766
F1-score: 0.45882352941176474
ROC-AUC: 0.5329185066238458
Confusion Matrix:
[[69 37]
 [55 39]]


# Selecting Efficent Model and Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

# Model Training with Hyperparameter Tuning (Random Forest)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],      # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],     # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],     # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]        # Minimum number of samples required to be at a leaf node
}

# Create a Random Forest model
rf_model = RandomForestClassifier()

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the Random Forest model with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params)
best_rf_model.fit(X_train, y_train)

# Model Evaluation with Hyperparameter Tuning
print("Random Forest with Best Hyperparameters:")
evaluate_model(best_rf_model, X_test, y_test)


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Random Forest with Best Hyperparameters:
Accuracy: 0.505
Precision: 0.47572815533980584
Recall: 0.5212765957446809
F1-score: 0.4974619289340102
ROC-AUC: 0.505921316740265
Confusion Matrix:
[[52 54]
 [45 49]]


In [16]:
y_pred = best_rf_model.predict(X_test)

# Evaluate the model's performance on the test set
print("Random Forest with Best Hyperparameters (Test Set):")
evaluate_model(best_rf_model, X_test, y_test)

Random Forest with Best Hyperparameters (Test Set):
Accuracy: 0.505
Precision: 0.47572815533980584
Recall: 0.5212765957446809
F1-score: 0.4974619289340102
ROC-AUC: 0.505921316740265
Confusion Matrix:
[[52 54]
 [45 49]]


# Saving The Model for further use

In [17]:
import joblib
# Save the model to a file
model_filename = 'best_random_forest_model.pkl'
joblib.dump(best_rf_model, model_filename)

print("Model saved successfully.")


Model saved successfully.


In [18]:
# Load the model from the file
loaded_model = joblib.load(model_filename)


# Testing the Model

In [19]:
import pandas as pd

# Create the DataFrame with the given data
df = pd.DataFrame({
    'age': [45],  # Age of the applicant
    'income': [48541],  # Applicant's income
    'employment_status': [1],  # Employment status (e.g., 0 - Unemployed, 1 - Employed, 2 - Self-employed, etc.)
    'loan_amount': [500000],  # Loan amount requested
    'years_to_return': [5.0]  # How many year it takes to return the Loan 
})


In [20]:
df_predictions = loaded_model.predict(df)

In [21]:
if df_predictions==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Approved
