In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load the loan-approval dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='loan_approval_dataset.csv')
df.head(5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
# Exploratory data analysis
# Print the column dtypes and non-null counts; a quick way to spot missing
# values and to see which columns are text (object) rather than numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the output shows a minimum of -100000 for
# residential_assets_value — confirm whether negative asset values are valid.
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [5]:
# Inspect the raw column labels: every label after 'loan_id' is read in with
# a leading space, which is why the labels are stripped in the next step.
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [6]:
# Strip leading/trailing whitespace from the column labels (every label after
# 'loan_id' is read from the CSV with a leading space).
df.columns = df.columns.str.strip()

# Apply the same clean-up to the values of the text (object-dtype) columns, in
# case they carry the same padding as the headers.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated since pandas 2.1; non-string cells are passed through unchanged.
string_columns = df.select_dtypes(include=['object']).columns
df[string_columns] = df[string_columns].apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

In [7]:
# Drop the unnecessary identifier column; 'loan_id' is only a row counter.
df = df.drop(columns=['loan_id'])

In [8]:
# Data preprocessing
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Use one encoder per column. Refitting a single LabelEncoder on a second
# column discards the classes learned for the first one, so the 'education'
# encoding could not be reproduced later from the fitted object.
education_encoder = LabelEncoder()
df['education'] = education_encoder.fit_transform(df['education'])

self_employed_encoder = LabelEncoder()
df['self_employed'] = self_employed_encoder.fit_transform(df['self_employed'])

# Backward compatibility: `le` is used further down the notebook and has
# always ended up fitted on 'self_employed' (the last column encoded).
le = self_employed_encoder

# Target encoding: the keys must match the (stripped) labels exactly.
x_map = {'Approved': 1, 'Rejected': 0}
loan_status_encoded = df['loan_status'].map(x_map)

# Series.map turns every label missing from x_map into NaN without complaint
# (this also happens if the cell is run a second time on an already-encoded
# column), so fail loudly instead of training on a corrupted target.
unmapped_labels = df.loc[loan_status_encoded.isna(), 'loan_status'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected loan_status values (expected {list(x_map)}): "
        f"{list(unmapped_labels)!r}"
    )

df['loan_status'] = loan_status_encoded

In [9]:
# Preview the frame after encoding: 'education', 'self_employed' and
# 'loan_status' should now hold integer codes instead of text labels.
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [10]:
# Feature separation: 'loan_status' is the target (kept as a Series),
# every remaining column is a feature (converted to a NumPy array).
y = df['loan_status']
x = df.drop(columns=['loan_status']).values

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Feature scaling: the scaler statistics are learned from the training split
# only and then reused, unchanged, for the test split.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [12]:
# Model 1: logistic regression with default hyper-parameters.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)

# Share of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.905152224824356

In [13]:
# Logistic regression: confusion matrix, then the per-class
# precision / recall / F1 report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

[[275  43]
 [ 38 498]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       318
           1       0.92      0.93      0.92       536

    accuracy                           0.91       854
   macro avg       0.90      0.90      0.90       854
weighted avg       0.90      0.91      0.90       854



In [14]:
# Model 2: support vector classifier with an RBF kernel.
from sklearn.svm import SVC

svc_params = {'kernel': 'rbf', 'random_state': 0}
svc_model = SVC(**svc_params).fit(x_train, y_train)
y_predict = svc_model.predict(x_test)

# Share of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9238875878220141

In [15]:
# SVM: confusion matrix, then the per-class precision / recall / F1 report.
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep='\n',
)

[[292  26]
 [ 39 497]]
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       318
           1       0.95      0.93      0.94       536

    accuracy                           0.92       854
   macro avg       0.92      0.92      0.92       854
weighted avg       0.92      0.92      0.92       854



In [16]:
# Model 3: k-nearest neighbours, k = 5, Minkowski metric with p = 2.
from sklearn.neighbors import KNeighborsClassifier

knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
knn_model = KNeighborsClassifier(**knn_params).fit(x_train, y_train)
y_pre = knn_model.predict(x_test)

# Share of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pre)

0.892271662763466

In [17]:
# KNN: confusion matrix, then the per-class precision / recall / F1 report.
knn_confusion = confusion_matrix(y_test, y_pre)
print(knn_confusion)
knn_report = classification_report(y_test, y_pre)
print(knn_report)

[[279  39]
 [ 53 483]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       318
           1       0.93      0.90      0.91       536

    accuracy                           0.89       854
   macro avg       0.88      0.89      0.89       854
weighted avg       0.89      0.89      0.89       854



In [18]:
# Model 4: Gaussian naive Bayes with default settings.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
nb_model = GaussianNB().fit(x_train, y_train)
y_pr = nb_model.predict(x_test)

# Share of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pr)

0.936768149882904

In [19]:
# Naive Bayes: confusion matrix, then the per-class
# precision / recall / F1 report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pr))

[[298  20]
 [ 34 502]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       318
           1       0.96      0.94      0.95       536

    accuracy                           0.94       854
   macro avg       0.93      0.94      0.93       854
weighted avg       0.94      0.94      0.94       854



In [20]:
# Model 5: random forest of 20 trees, split quality measured by entropy;
# the seed is fixed so the fitted forest is reproducible.
from sklearn.ensemble import RandomForestClassifier

rf_params = {'n_estimators': 20, 'criterion': 'entropy', 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params).fit(x_train, y_train)
y_prdt = rf_model.predict(x_test)

# Share of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_prdt)

0.9765807962529274

In [21]:
# Random forest: confusion matrix, then the per-class
# precision / recall / F1 report.
print(
    confusion_matrix(y_test, y_prdt),
    classification_report(y_test, y_prdt),
    sep='\n',
)

[[306  12]
 [  8 528]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       318
           1       0.98      0.99      0.98       536

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854



In [22]:
# Saving our random forest model.
import pickle

# Context managers guarantee the file handles are flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed
# explicitly.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Saving our encoder.
# NOTE(review): `le` was last fitted on 'self_employed', so the pickled
# encoder cannot be used to encode/decode 'education' — confirm that whatever
# loads encoder.pkl only applies it to 'self_employed'.
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)


In [23]:
# Saving our scaler (the StandardScaler fitted on the training split) so the
# same standardization can be applied to new inputs.
# The context manager closes the file handle deterministically; a bare
# open(...) passed to pickle.dump is never closed explicitly.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)