**DATA COLLECTION AND PREPARATION**

In [None]:
### Importing Libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

In [None]:
# Load the loan-application records from a CSV in the working directory and display them.
data = pd.read_csv('loan_prediction.csv')
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [None]:
# Column dtypes and non-null counts; shows which columns need imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [None]:
# Number of missing values per column.
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
# Drop the Loan_ID identifier column: it is a row key, not a predictive feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError on the missing column.
data = data.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
# Strip the "+" from the "3+" dependents category so the column holds plain digits.
# regex=False forces a literal match: "+" is a regex metacharacter, and the default
# value of `regex` differs between pandas versions.
data['Dependents'] = data['Dependents'].str.replace('+', '', regex=False)

# Fill the missing values of each affected column with that column's most frequent value.
columns_with_missing = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
for col in columns_with_missing:
    data[col] = data[col].fillna(data[col].mode()[0])

In [None]:
# Inspect the frame after imputation.
data

In [None]:
# Encode every remaining text (object-dtype) column as integer codes.
from sklearn import preprocessing

# A single encoder is re-fitted for each column, so after the loop it only remembers
# the classes of the last column it processed.
label_encoder = preprocessing.LabelEncoder()
object_columns = data.columns[data.dtypes == 'object']
for col in object_columns:
    data[col] = label_encoder.fit_transform(data[col])


In [None]:
# Inspect the frame after label encoding.
data

In [None]:
# Cast the encoded categorical columns and the float-valued columns to int64.
# Note: astype('int64') on a float column discards any fractional part.
int_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
for col in int_columns:
    data[col] = data[col].astype('int64')

In [None]:
# Balance the classes with SMOTETomek (SMOTE over-sampling combined with Tomek-link cleaning).
from imblearn.combine import SMOTETomek

# Fixed seed so the resampled dataset (and every metric derived from it) is reproducible;
# 42 matches the random_state used for train_test_split.
smote = SMOTETomek(random_state=42)

In [None]:
# Separate the features (x) from the target column (y, Loan_Status).
x, y = data.drop(columns=['Loan_Status']), data['Loan_Status']

In [None]:
# Create the balanced feature matrix and target by resampling x and y.
# NOTE(review): resampling runs on the full dataset, before the train/test split further
# down, so synthetic rows derived from future test rows can end up in the training set.
# Consider resampling the training split only.
x_bal,y_bal = smote.fit_resample(x,y)

In [None]:
# Class counts of the target before and after balancing.
for target in (y, y_bal):
    print(target.value_counts())

In [None]:
# Inspect the resampled feature matrix.
x_bal

**EXPLORATORY DATA ANALYSIS**

In [None]:
# Summary statistics of the prepared frame.
data.describe()

In [None]:
# Distribution of applicant income and of credit history, side by side.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement for a density histogram with a KDE overlay.
plt.figure(figsize=(12,5))
plt.subplot(121)
sns.histplot(data['ApplicantIncome'], kde=True, stat='density', color='r')
plt.subplot(122)
sns.histplot(data['Credit_History'], kde=True, stat='density')
plt.show()

In [None]:
# Count plots for Gender and Education in the first two slots of a 1x4 grid.
plt.figure(figsize=(18,4))
for position, column in enumerate(('Gender', 'Education'), start=1):
    plt.subplot(1, 4, position)
    sns.countplot(x=column, data=data)
plt.show()

In [None]:
# Count plots of one column split by another, one panel per (x, hue) pair.
plt.figure(figsize=(20,5))
panels = [
    ('Married', 'Gender'),
    ('Self_Employed', 'Education'),
    ('Property_Area', 'Loan_Amount_Term'),
]
for position, (x_col, hue_col) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    sns.countplot(x=x_col, hue=hue_col, data=data)
# Render explicitly, like the other plotting cells, instead of echoing the Axes repr.
plt.show()

In [None]:
# Applicant income by gender, coloured by loan status.
sns.swarmplot(x='Gender', y='ApplicantIncome', hue = 'Loan_Status', data = data)
# Render explicitly, like the other plotting cells, instead of echoing the Axes repr.
plt.show()

In [None]:
# Standardize the balanced feature matrix because the columns are on very different scales.
# The fitted scaler `sc` is kept so new samples can be transformed the same way later.
# Note: the scaler is fitted on all balanced rows, i.e. before the train/test split, and
# re-running this cell re-fits it on already-scaled data.
sc = StandardScaler()
x_bal = pd.DataFrame(sc.fit_transform(x_bal))

In [None]:
# Split the balanced, scaled dataset into train and test sets.
# 33% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split( x_bal, y_bal, test_size=0.33, random_state=42)

**MODEL BUILDING**

In [None]:
# One shared estimator per model family; the functions below fit and evaluate them.
# The stochastic estimators get a fixed seed so their reports are reproducible across runs
# (42 matches the random_state used for train_test_split). KNN takes no seed here.
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()
# Despite the name, `xg` is scikit-learn's GradientBoostingClassifier, not the xgboost library.
xg = GradientBoostingClassifier(random_state=42)

**Decision tree model**

In [None]:
def decisionTree():
    """Fit the shared decision tree ``dt`` on the training split and print its test report.

    Reads the notebook-level ``dt``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints a confusion matrix and a classification report; returns nothing.
    """
    test_predictions = dt.fit(X_train, y_train).predict(X_test)
    print(
        '****DecisionTreeClassifier****',
        'Confusion matrix',
        confusion_matrix(y_test, test_predictions),
        'Classification report',
        classification_report(y_test, test_predictions),
        sep='\n',
    )

**Random forest model**

In [None]:
def randomForest():
    """Fit the shared random forest ``rf`` on the training split and print its test report.

    Reads the notebook-level ``rf``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints a confusion matrix and a classification report; returns nothing.
    """
    test_predictions = rf.fit(X_train, y_train).predict(X_test)
    print(
        '***RandomForestClassifier***',
        'Confusion matrix',
        confusion_matrix(y_test, test_predictions),
        'Classification report',
        classification_report(y_test, test_predictions),
        sep='\n',
    )

In [None]:
def KNN():
    """Fit the shared k-nearest-neighbours model ``knn`` and print its test report.

    Reads the notebook-level ``knn``, ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints a confusion matrix and a classification report; returns nothing.
    """
    test_predictions = knn.fit(X_train, y_train).predict(X_test)
    print(
        '***KNeighborsClassifier***',
        'Confusion matrix',
        confusion_matrix(y_test, test_predictions),
        'Classification report',
        classification_report(y_test, test_predictions),
        sep='\n',
    )

**Gradient boosting model (scikit-learn `GradientBoostingClassifier`)**

In [None]:
def xgboost():
    """Fit the shared gradient-boosting model ``xg`` and print its test report.

    ``xg`` is scikit-learn's GradientBoostingClassifier. Reads the notebook-level ``xg``,
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``. Prints a confusion matrix and a
    classification report; returns nothing.
    """
    test_predictions = xg.fit(X_train, y_train).predict(X_test)
    print(
        '****Gradient BoostingClassifier***',
        'Confusion matrix',
        confusion_matrix(y_test, test_predictions),
        'Classification report',
        classification_report(y_test, test_predictions),
        sep='\n',
    )

**ANN model**

In [None]:
# Importing the Keras libraries and packages
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Binary classifier: two ReLU hidden layers (100 and 50 units) and one sigmoid output unit.
classifier = Sequential()
# Input layer and first hidden layer. The input width is taken from the training matrix
# instead of a hard-coded 11, so the network stays valid if the feature set changes.
classifier.add(Dense(units=100, activation='relu', input_dim=X_train.shape[1]))
# Second hidden layer
classifier.add(Dense(units=50, activation='relu'))
# Output layer: a single sigmoid unit, i.e. one value in (0, 1) per sample
classifier.add(Dense(units=1, activation='sigmoid'))
# Compile with Adam and binary cross-entropy, tracking accuracy
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

**TRAINING**



In [None]:
# Fit each scikit-learn model in turn and print its evaluation report.
for train_and_report in (decisionTree, randomForest, KNN, xgboost):
    train_and_report()

In [None]:
# Fit the ANN on the training split: 100 epochs, batches of 100 rows, with 20% of the
# training rows held out for validation. The returned history is kept in model_history.
model_history = classifier.fit(X_train, y_train, batch_size=100, validation_split=0.2, epochs=100)

In [None]:
# Save the trained network to "loan.h5" in the working directory.
classifier.save("loan.h5")

In [None]:
# Raw network outputs for the test split (values of the sigmoid output unit).
y_pred = classifier.predict(X_test)

In [None]:
# Inspect the raw outputs.
y_pred

In [None]:
# Threshold the outputs at 0.5 to obtain boolean class predictions.
y_pred = (y_pred > 0.5)
y_pred

**Testing the Model**

In [None]:
# Gender Married Dependents Education Self_Employed Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term Credit_History Property_Area
# The model was trained on standardized features, so the raw sample has to go through the
# same fitted scaler (`sc`) first; predicting on unscaled values gives meaningless results.
raw_sample = pd.DataFrame([[1, 1, 0, 1, 1, 4276, 1542, 145, 248, 0, 1]], columns=x.columns)
dt.predict(sc.transform(raw_sample))

In [None]:
# Gender Married Dependents Education Self_Employed Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term Credit_History Property_Area
# The model was trained on standardized features, so the raw sample has to go through the
# same fitted scaler (`sc`) first; predicting on unscaled values gives meaningless results.
raw_sample = pd.DataFrame([[1, 1, 0, 1, 1, 4276, 1542, 145, 248, 0, 1]], columns=x.columns)
rf.predict(sc.transform(raw_sample))

In [None]:
# Gender Married Dependents Education Self_Employed Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term Credit_History Property_Area
# The model was trained on standardized features, so the raw sample has to go through the
# same fitted scaler (`sc`) first; predicting on unscaled values gives meaningless results.
raw_sample = pd.DataFrame([[1, 1, 0, 1, 1, 4276, 1542, 145, 248, 0, 1]], columns=x.columns)
knn.predict(sc.transform(raw_sample))

In [None]:
# Gender Married Dependents Education Self_Employed Applicant_Income Coapplicant_Income Loan_Amount Loan_Amount_Term Credit_History Property_Area
# The model was trained on standardized features, so the raw sample has to go through the
# same fitted scaler (`sc`) first; predicting on unscaled values gives meaningless results.
raw_sample = pd.DataFrame([[1, 1, 0, 1, 1, 4276, 1542, 145, 248, 0, 1]], columns=x.columns)
xg.predict(sc.transform(raw_sample))

In [None]:
def predict_exit(sample_value):
    """Return the ANN's output for one applicant record.

    sample_value: the feature values of a single record, in training column order.
    The values are reshaped to a single row, standardized with the fitted scaler ``sc``
    and passed to ``classifier.predict``, whose result is returned unchanged.
    """
    single_row = np.array(sample_value).reshape(1, -1)
    return classifier.predict(sc.transform(single_row))

In [None]:
# Predictions
# Value order Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
sample_value = [[1, 1, 0, 1, 1, 4276, 1542,145, 240, 0, 1]]

# output is the network's raw sigmoid value for this record, not a thresholded class label.
output = predict_exit(sample_value)
output

In [None]:
# Predictions
# Value order Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
sample_value = [[1, 0, 1, 1, 1, 45, 14,45, 240, 1, 1]]

# output is the network's raw sigmoid value for this record, not a thresholded class label.
output = predict_exit(sample_value)
output

**PERFORMANCE TESTING & HYPERPARAMETER TUNING**

**Testing Model With Multiple Evaluation Metrics**

In [None]:
def compareModel():
    """Fit and report all four scikit-learn models, printing a 100-dash rule after each."""
    separator = '-' * 100
    for train_and_report in (decisionTree, randomForest, KNN, xgboost):
        train_and_report()
        print(separator)


In [None]:
# Refit and report all four scikit-learn models side by side.
compareModel()

In [None]:
# Evaluate the ANN on the test split.
# The predictions are derived here rather than taken from an earlier cell, so this cell
# does not depend on leftover kernel state. The sigmoid outputs are thresholded at 0.5
# and flattened to a 1-D 0/1 vector that matches the shape and dtype of y_test.
yPred = classifier.predict(X_test)
y_pred = (yPred > 0.5).astype(int).ravel()
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_test, y_pred))
print("ANN Model")
print("Confusion_Matrix")
print(confusion_matrix(y_test,y_pred))
print("Classification Report")
print(classification_report(y_test,y_pred))

**Comparing Model Accuracy Before & After Applying Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import cross_val_score

# Random forest is the selected model: refit a fresh instance on the training split.
# A fixed seed makes the model that is pickled at the end of the notebook reproducible
# (42 matches the random_state used for train_test_split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
yPred = rf.predict(X_test)

In [None]:
# Weighted F1 of the random forest on the test split.
# f1_score takes (y_true, y_pred). The weighted average uses the class support of y_true,
# so passing the arguments the other way round changes the result.
f1_score(y_test, yPred, average='weighted')

In [None]:
# Mean 5-fold cross-validation score of the random forest (the estimator's default scoring).
# NOTE(review): this runs on the original x / y (not resampled, not scaled), whereas rf
# above was fitted on the balanced, scaled X_train. Confirm that this is the intended comparison.
cv = cross_val_score(rf,x,y,cv=5)
np.mean(cv)

In [None]:
# Confusion matrix and classification report of the selected random forest on the test split.
print(
    'Confusion matrix',
    confusion_matrix(y_test, yPred),
    'Classification report',
    classification_report(y_test, yPred),
    sep='\n',
)

In [None]:
# Save the selected random forest with pickle. The context manager guarantees that the
# file is flushed and closed even if pickling fails.
# Note: rf was trained on features standardized by `sc`, so anything that loads this
# model also needs that fitted scaler to prepare its inputs.
with open('rdf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)