<a href="https://colab.research.google.com/github/SurekhaBerlin/Loan-Status-Prediction-Using-Machine-Learning/blob/main/Loan_Status_Prediction_Using_Machine_Learning_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loan Status Prediction Using Machine Learning

## STEP 1: Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

## STEP 2: Reading the file

In [2]:
df = pd.read_csv('/content/loan_approval_data.csv')

## STEP 3: Checking the shape of the dataset

In [3]:
df.shape

(614, 13)

## STEP 4 : Display Top 5 Rows of The Dataset

In [4]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12041,12.0,360.0,360.0,360.0,480.0
Credit_History,564.0,0.842199,0.364878,0.0,1.0,1.0,1.0,1.0


## STEP 5 : Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## STEP 6 : Check Null Values In The Dataset

In [8]:
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [9]:
df.isnull().sum()*100 / len(df)

Unnamed: 0,0
Loan_ID,0.0
Gender,2.117264
Married,0.488599
Dependents,2.442997
Education,0.0
Self_Employed,5.211726
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,3.583062
Loan_Amount_Term,2.28013


## STEP 7 : Handling the missisng values

In [10]:
df.dropna(subset=['LoanAmount', 'Gender', 'Dependents', 'Loan_Amount_Term'], inplace=True)

In [11]:
df.isnull().sum()*100 / len(df)

Unnamed: 0,0
Loan_ID,0.0
Gender,0.0
Married,0.0
Dependents,0.0
Education,0.0
Self_Employed,5.424955
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,0.0
Loan_Amount_Term,0.0


In [12]:
df['Self_Employed'].mode()[0]

'No'

In [13]:
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [14]:
df.isnull().sum()*100 / len(df)

Unnamed: 0,0
Loan_ID,0.0
Gender,0.0
Married,0.0
Dependents,0.0
Education,0.0
Self_Employed,0.0
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,0.0
Loan_Amount_Term,0.0


In [15]:
df['Credit_History'].mode()[0]

1.0

In [16]:
df['Credit_History'] =df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [17]:
df['Credit_History'].unique()

array([1., 0.])

In [18]:
df.isnull().sum()*100 / len(df)

Unnamed: 0,0
Loan_ID,0.0
Gender,0.0
Married,0.0
Dependents,0.0
Education,0.0
Self_Employed,0.0
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,0.0
Loan_Amount_Term,0.0


## STEP 8 : Handling categorical Columns


In [19]:
df.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
249,LP001825,Male,Yes,0,Graduate,No,1809,1868.0,90.0,360.0,1.0,Urban,Y
427,LP002368,Male,Yes,2,Graduate,No,5935,0.0,133.0,360.0,1.0,Semiurban,Y
166,LP001577,Female,Yes,0,Graduate,No,4583,0.0,112.0,360.0,1.0,Rural,N
316,LP002035,Male,Yes,2,Graduate,No,3717,0.0,120.0,360.0,1.0,Semiurban,Y
280,LP001910,Male,No,1,Not Graduate,Yes,4053,2426.0,158.0,360.0,0.0,Urban,N


In [20]:
df['Dependents'].unique()

array(['1', '0', '2', '3+'], dtype=object)

In [21]:
df['Dependents'] = df['Dependents'].replace(to_replace="3+", value='4')

In [22]:
df['Dependents'].unique()

array(['1', '0', '2', '4'], dtype=object)

In [23]:
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0}).astype('int')

In [24]:
df['Loan_Status']= df['Loan_Status'].map({'Y':1, 'N':0}).astype('int')

In [25]:
df['Married'] = df['Married'].map({'Yes':1, 'No':0}).astype('int')

In [26]:
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1, 'No':0}).astype('int')

In [27]:
df['Education'] = df['Education'].map({'Graduate':1, 'Not Graduate':0}).astype('int')

In [28]:
df['Property_Area'] = df['Property_Area'].map({'Urban':2, 'Rural':0, 'Semiurban':1}).astype('int')

In [29]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


## STEP 9 : Store feature Matrix in X & response(Target) in y

In [30]:
df.drop(['Loan_ID'], axis=1, inplace=True)

In [31]:
X = df.drop(['Loan_Status'], axis=1)

In [32]:
y = df['Loan_Status']

## STEP 10 : Feature Scaling

In [33]:
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [34]:
st = StandardScaler()
X[cols]=st.fit_transform(X[cols])

In [35]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [36]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,2
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,2
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,2
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,2
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,2


## STEP 11 : Splitting the dataset to training set and testing set & applying K-Fold Crossvalidation

In [37]:
model_df={}
def model_selection(X, y, model):
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the model with the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    print(f"{model} Accuracy Score:  {accuracy_score(y_test,y_pred)}")
    score = cross_val_score(model,X,y,cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100,2)
    # Print the confusion matrix
    print(f"{model} Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")

    # Print the AUC score
    print(f"{model} AUC Score: {roc_auc_score(y_test, y_pred)}")

    # Perform cross-validation and print the scores
    cv_scores = cross_val_score(model, X, y, cv=5)  # Renamed the variable
    print(f"{model} Cross Val Score: {cv_scores}")

    # Calculate and print the average cross-validation score
    average_val_score = cv_scores.mean()
    print(f"{model} Average Val Score: {average_val_score}")


In [38]:
model_df

{}

## 11.1 Logistic Regression

In [39]:
model = LogisticRegression()
model_selection(X, y, LogisticRegression())

LogisticRegression() Accuracy Score:  0.7927927927927928
LogisticRegression() Avg cross val score is 0.802964782964783
LogisticRegression() Confusion Matrix: 
[[12 22]
 [ 1 76]]
LogisticRegression() AUC Score: 0.6699770817417876
LogisticRegression() Cross Val Score: [0.8018018  0.78378378 0.78378378 0.83636364 0.80909091]
LogisticRegression() Average Val Score: 0.802964782964783


## 11.2 SVC

In [40]:
model = SVC()
model_selection(X, y, SVC())

SVC() Accuracy Score:  0.8018018018018018
SVC() Avg cross val score is 0.7938902538902539
SVC() Confusion Matrix: 
[[12 22]
 [ 0 77]]
SVC() AUC Score: 0.6764705882352942
SVC() Cross Val Score: [0.8018018  0.78378378 0.77477477 0.81818182 0.79090909]
SVC() Average Val Score: 0.7938902538902539


## 11.3 Decision Tree Classifier

In [41]:
model = DecisionTreeClassifier()
model_selection(X, y, DecisionTreeClassifier())

DecisionTreeClassifier() Accuracy Score:  0.7567567567567568
DecisionTreeClassifier() Avg cross val score is 0.7052252252252252
DecisionTreeClassifier() Confusion Matrix: 
[[21 13]
 [14 63]]
DecisionTreeClassifier() AUC Score: 0.7179144385026738
DecisionTreeClassifier() Cross Val Score: [0.75675676 0.7027027  0.66666667 0.69090909 0.65454545]
DecisionTreeClassifier() Average Val Score: 0.6943161343161343


## 11.4 Random Forest

In [42]:
model = RandomForestClassifier()
model_selection(X, y, RandomForestClassifier())


RandomForestClassifier() Accuracy Score:  0.7657657657657657
RandomForestClassifier() Avg cross val score is 0.7903030303030303
RandomForestClassifier() Confusion Matrix: 
[[13 21]
 [ 5 72]]
RandomForestClassifier() AUC Score: 0.6587089381207027
RandomForestClassifier() Cross Val Score: [0.79279279 0.75675676 0.8018018  0.82727273 0.8       ]
RandomForestClassifier() Average Val Score: 0.7957248157248158


## 11.5 Gradient Boost

In [43]:
model = GradientBoostingClassifier()
model_selection(X, y, GradientBoostingClassifier())

GradientBoostingClassifier() Accuracy Score:  0.7747747747747747
GradientBoostingClassifier() Avg cross val score is 0.7721703521703521
GradientBoostingClassifier() Confusion Matrix: 
[[13 21]
 [ 4 73]]
GradientBoostingClassifier() AUC Score: 0.6652024446142093
GradientBoostingClassifier() Cross Val Score: [0.78378378 0.73873874 0.78378378 0.78181818 0.77272727]
GradientBoostingClassifier() Average Val Score: 0.7721703521703521


## 11.6 KNN

In [44]:
model = KNeighborsClassifier()
model_selection(X, y, KNeighborsClassifier())

KNeighborsClassifier() Accuracy Score:  0.7027027027027027
KNeighborsClassifier() Avg cross val score is 0.7269123669123669
KNeighborsClassifier() Confusion Matrix: 
[[ 7 27]
 [ 6 71]]
KNeighborsClassifier() AUC Score: 0.5639801375095493
KNeighborsClassifier() Cross Val Score: [0.73873874 0.72072072 0.73873874 0.72727273 0.70909091]
KNeighborsClassifier() Average Val Score: 0.7269123669123669


## 11.7 Gaussian Naive Bayes

In [45]:
model = GaussianNB()
model_selection(X, y, GaussianNB())

GaussianNB() Accuracy Score:  0.8288288288288288
GaussianNB() Avg cross val score is 0.7866830466830466
GaussianNB() Confusion Matrix: 
[[15 19]
 [ 0 77]]
GaussianNB() AUC Score: 0.7205882352941176
GaussianNB() Cross Val Score: [0.78378378 0.76576577 0.77477477 0.82727273 0.78181818]
GaussianNB() Average Val Score: 0.7866830466830466


In [46]:
model_df

{LogisticRegression(): 80.3,
 SVC(): 79.39,
 DecisionTreeClassifier(): 70.52,
 RandomForestClassifier(): 79.03,
 GradientBoostingClassifier(): 77.22,
 KNeighborsClassifier(): 72.69,
 GaussianNB(): 78.67}

## STEP 12: Hyperparameter Tuning