## **Loan Prediction**

In [0]:
# Import required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.tree import DecisionTreeClassifier 

import warnings
warnings.filterwarnings("ignore")

In [0]:
# path to data
data_path = './loan.csv'

In [0]:
# load the data
data_df = pd.read_csv(data_path)

In [21]:
# look at the data
data_df.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,0.0,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1


In [22]:
# dimension of `data_df`
data_df.shape

(614, 6)

The **loan** dataset is having 614 samples and 6 features set (including target field).

In [23]:
# column names of the dataset
data_df.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status'],
      dtype='object')

In [24]:
# know more about data
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 6 columns):
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           614 non-null float64
Loan_Amount_Term     614 non-null float64
Credit_History       614 non-null float64
Loan_Status          614 non-null int64
dtypes: float64(4), int64(2)
memory usage: 28.9 KB


In [25]:
data_df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
count,614.0,614.0,614.0,614.0,614.0,614.0
mean,5403.459283,1621.245798,141.166124,334.201954,0.773616,0.687296
std,6109.041673,2926.248369,88.34063,82.183884,0.418832,0.463973
min,150.0,0.0,0.0,0.0,0.0,0.0
25%,2877.5,0.0,98.0,360.0,1.0,0.0
50%,3812.5,1188.5,125.0,360.0,1.0,1.0
75%,5795.0,2297.25,164.75,360.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0


In [26]:
# Look on Target field 
data_df['Loan_Status'].value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In our **loan** dataset, we hare having 422 samples of positive target field and 192 samples of negative target field. It becomes a binary classification problem.

In [0]:
# Extract features and target fields from the data
X = data_df.drop(['Loan_Status'], axis = 1)
y = data_df['Loan_Status']

In [0]:
# Split the data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0, stratify = y)

In [29]:
# verify the splitting of data  
len(X_train) + len(X_test)

614

### **Build Baseline Model**

In [0]:
# build a baseline DecisionTreeClassifier model with default parameters
clf = DecisionTreeClassifier()

In [31]:
# DecisionTreeClassifier default parameters
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [32]:
# fit the DecisionTreeClassifier model on training data
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [33]:
# perform 5-fold-cross-validation on training dataset
scores = cross_val_score(clf, X_train, y_train, cv = 5, scoring = 'accuracy')
scores

array([0.68604651, 0.6627907 , 0.63953488, 0.6744186 , 0.67058824])

In [34]:
# average accuracy after performing 5-fold-cross-validation on training dataset
scores.mean()

0.6666757865937072

In [35]:
# make prediction
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)
print(f"Training accuracy = {accuracy_score(y_train, train_predictions)}")
print(f"Testing accuracy = {accuracy_score(y_test, test_predictions)}") 

Training accuracy = 1.0
Testing accuracy = 0.6864864864864865


The **DecisionTreeClassifier** model with default parameters gives us 68.64% accuracy on testing dataset.

### **GridSearchCV** 

Perform GridSearchCV algorithm to do hyperparameter tuning, and look for the best parameters for the DecisionTreeClassifier model. 

In [0]:
# choose a set of parameters for DecisionTreeClassifier
parameters = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3, 4, 5],
    'criterion': ['gini', 'entropy']
}

In [43]:
# find out the best hyperparameters for DecisionTreeClassifier using GridSearchCV
grid_search = GridSearchCV(clf, parameters, scoring=make_scorer(accuracy_score))
fit_grid_search = grid_search.fit(X_train, y_train)
best_clf_grid = fit_grid_search.best_estimator_
best_clf_grid

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

Best parameters obtain by **GridSearchCV** are:

*   max_depth = 1 
*   min_samples_leaf = 1
*   min_samples_split = 2
*   criterion = 'gini'




In [44]:
# perform 5-fold-cross-validation on GridSearchCV optimized model
grid_search_scores = cross_val_score(best_clf_grid, X_train, y_train, cv=5, scoring='accuracy')
grid_search_scores

array([0.74418605, 0.70930233, 0.72093023, 0.84883721, 0.78823529])

In [45]:
# average accuracy after performing 5-fold-cross-validation on GridSearchCV optimized DecisionTreeClassifier model
grid_search_scores.mean()

0.7622982216142271

After performing GridSearchCV optimization on DecisionTreeClassifier model we get an accuracy of 76.22% on training dataset.

In [47]:
# make prediction
best_clf_grid.fit(X_train, y_train)
grid_search_train_predictions = best_clf_grid.predict(X_train)
grid_search_test_predictions = best_clf_grid.predict(X_test)
print(f"Training Accuracy = {accuracy_score(y_train, grid_search_train_predictions)}")
print(f"Testing Accuracy = {accuracy_score(y_test, grid_search_test_predictions)}")

Training Accuracy = 0.7622377622377622
Testing Accuracy = 0.7891891891891892


The **DecisionTreeClassiifer** model with GridSearchCV optimization gives us an accuracy of 78.91% on Testing dataset.

### **RandomizedSearchCV**

Perform **RandomizedSearchCV** algorithm to do hyperparameter tuning, and look for the best parameters for the DecisionTreeClassifier model. 



In [0]:
# choose a set of parameters for DecisionTreeClassifier
parameters = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3, 4, 5],
    'criterion': ['gini', 'entropy']
}

In [49]:
# find out the best hyperparameters for DecisionTreeClassifier using RandomizedSearchCV
ran_search = RandomizedSearchCV(clf, parameters, scoring=make_scorer(accuracy_score))
fit_ran_search = ran_search.fit(X_train, y_train)
best_clf_ran = fit_ran_search.best_estimator_
best_clf_ran

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

Best parameters obtain by **RandomSearchCv** are:

*   max_depth = 1 
*   min_samples_leaf = 4
*   min_samples_split = 5
*   criterion = 'entropy'




In [50]:
# perform 5-fold-cross-validation on RandomizedSearchCV optimized model
ran_search_scores = cross_val_score(best_clf_ran, X_train, y_train, cv=5, scoring='accuracy')
ran_search_scores

array([0.74418605, 0.70930233, 0.72093023, 0.84883721, 0.78823529])

In [51]:
# average accuracy after performing 5-fold-cross-validation on RandomizedSearchCV optimized DecisionTreeClassifier model
ran_search_scores.mean()

0.7622982216142271

After performing RandomizedSearchCV optimization on DecisionTreeClassifier model we get an accuracy of 76.22% on training dataset.

In [52]:
# make prediction
best_clf_ran.fit(X_train, y_train)
ran_search_train_predictions = best_clf_ran.predict(X_train)
ran_search_test_predictions = best_clf_ran.predict(X_test)
print(f"Training Accuracy = {accuracy_score(y_train, ran_search_train_predictions)}")
print(f"Testing Accuracy = {accuracy_score(y_test, ran_search_test_predictions)}")

Training Accuracy = 0.7622377622377622
Testing Accuracy = 0.7891891891891892


The **DecisionTreeClassiifer** model with RandomizedSearchCV optimization gives us an accuracy of 78.91% on Testing dataset.