# Universalbank

In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

import pandasql as psql

# pip install pandasql

# import datetime class from datetime module

from datetime import datetime

In [2]:
# Load the Universal bank data

bankdata = pd.read_csv(r"D:\iiit notes\Programming\AI\Internship practice\39 Seasion 09-Jul-2021\Universalbank.csv", header=0) 
bankdata.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [3]:
bankdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [4]:
# Count the target or dependent variable by '0' & '1' and their proportion 
# (> 10 : 1, then the dataset is imbalance data)

CreditCard_count = bankdata.CreditCard.value_counts()
print('Class 0:', CreditCard_count[0])
print('Class 1:', CreditCard_count[1])
print('Proportion:', round(CreditCard_count[0] / CreditCard_count[1], 2), ': 1')
print('Total Bank records:', len(bankdata))

Class 0: 3530
Class 1: 1470
Proportion: 2.4 : 1
Total Bank records: 5000


In [5]:
# Delete the columns which are not influencing the target variable

del bankdata['ID']
del bankdata['ZIP Code']

In [6]:
bankdata.columns

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [7]:
# cols1 is variables - creating a dummy variables

cols1 = ['Family', 'Education']
print(cols1)

['Family', 'Education']


In [8]:
# cols2 variables - MinMaxScalar function

cols2 = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
print(cols2)

['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']


In [9]:
# Create dummy variable for all range values

bankdata = pd.get_dummies(bankdata, columns=cols1)
bankdata.head().T

Unnamed: 0,0,1,2,3,4
Age,25.0,45.0,39.0,35.0,35.0
Experience,1.0,19.0,15.0,9.0,8.0
Income,49.0,34.0,11.0,100.0,45.0
CCAvg,1.6,1.5,1.0,2.7,1.0
Mortgage,0.0,0.0,0.0,0.0,0.0
Personal Loan,0.0,0.0,0.0,0.0,0.0
Securities Account,1.0,1.0,0.0,0.0,0.0
CD Account,0.0,0.0,0.0,0.0,0.0
Online,0.0,0.0,0.0,0.0,0.0
CreditCard,0.0,0.0,0.0,0.0,1.0


In [10]:
# Identify the dependent and Target variables

IndepVar = []
for col in bankdata.columns:
    if col != 'CreditCard':
        IndepVar.append(col)

TargetVar = 'CreditCard'

x = bankdata[IndepVar]
y = bankdata[TargetVar]

In [11]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)
x_test_F1 = x_test.copy()

In [12]:
# Feature Scaling - Each independent variable is in different range. The process of transforming all the 
# features in the given data set to a fixed range is known as ‘Scaling’

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Conver x_train values

x_train[cols2] = sc.fit_transform(x_train[cols2])


# Conver x_test values

x_test[cols2] = sc.fit_transform(x_test[cols2])

# Convert to dataframes

x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

In [13]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols2] = mmscaler.fit_transform(x_train[cols2])
x_train = pd.DataFrame(x_train)

x_test[cols2] = mmscaler.fit_transform(x_test[cols2])
x_test = pd.DataFrame(x_test)

In [14]:
x_test.head()

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,Family_1,Family_2,Family_3,Family_4,Education_1,Education_2,Education_3
1501,0.159091,0.152174,0.137755,0.03,0.0,0,1,0,0,0,1,0,0,0,1,0
2586,0.545455,0.565217,0.719388,0.61,0.0,1,0,0,0,0,0,0,1,1,0,0
2653,0.159091,0.173913,0.576531,0.31,0.64252,0,0,0,1,0,1,0,0,1,0,0
1055,0.181818,0.195652,0.27551,0.1,0.0,0,1,0,1,1,0,0,0,1,0,0
705,0.886364,0.847826,0.112245,0.07,0.0,0,0,0,1,0,0,1,0,0,1,0


In [15]:
# Initialize an array that stores the Accuracy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.metrics import classification_report, confusion_matrix

accuracy = []

for a in range(1, 10, 1):
    k = a
    bankdataKNN = KNeighborsClassifier(n_neighbors=k)
    bankdataKNN.fit(x_train, y_train)
    y_pred = bankdataKNN.predict(x_test)
    print('KNN_K_value = ', a)
    # actual values

    actual = y_test

    # predicted values
    predicted = y_pred

    # confusion matrix

    matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
    print('Confusion matrix : \n', matrix)

    # outcome values order in sklearn

    tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

    print('Outcome Values : \n', tp, fn, fp, tn)

    # classification report for precision, recall f1-score and accuracy

    matrix = classification_report(actual,predicted,labels=[1,0])
    print('Classification report : \n', matrix)

    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3) 

    specificity = round(tn/(tn+fp), 3)

    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
    balanced_accuracy = round((sensitivity+specificity)/2, 3)
    precision = round(tp/(tp+fp), 3)
    f1Score = round((2*tp/(2*tp + fp +fn)), 3);

    # Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

    MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

    print('Accuracy :', round(accuracy*100, 2), '%')
    print('Precision :', round(precision*100, 2), '%')
    print('Recall :', round(sensitivity*100, 2), '%')
    print('F1 Score :', f1Score)
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
    print('MCC', MCC)

    # Area under ROC curve 
    print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))
    print('---------------------------------------------------------------------------------------------')

KNN_K_value =  1
Confusion matrix : 
 [[170 257]
 [287 786]]
Outcome Values : 
 170 257 287 786
Classification report : 
               precision    recall  f1-score   support

           1       0.37      0.40      0.38       427
           0       0.75      0.73      0.74      1073

    accuracy                           0.64      1500
   macro avg       0.56      0.57      0.56      1500
weighted avg       0.64      0.64      0.64      1500

Accuracy : 63.7 %
Precision : 37.2 %
Recall : 39.8 %
F1 Score : 0.385
Balanced Accuracy : 56.6 %
MCC 0.128
roc_auc_score: 0.565
---------------------------------------------------------------------------------------------
KNN_K_value =  2
Confusion matrix : 
 [[ 92 335]
 [ 88 985]]
Outcome Values : 
 92 335 88 985
Classification report : 
               precision    recall  f1-score   support

           1       0.51      0.22      0.30       427
           0       0.75      0.92      0.82      1073

    accuracy                           0.72  

# KNN with stratified sampling

In [16]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x1_train, x1_test, y1_train, y1_test = train_test_split(x, y, test_size = 0.30, random_state = 42, stratify=y)
x1_test_F1 = x_test.copy()

In [17]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x1_train[cols2] = mmscaler.fit_transform(x1_train[cols2])
x1_train = pd.DataFrame(x1_train)

x1_test[cols2] = mmscaler.fit_transform(x1_test[cols2])
x1_test = pd.DataFrame(x1_test)

In [18]:
# Initialize an array that stores the Accuracy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.metrics import classification_report, confusion_matrix

accuracy = []

for a in range(1, 10, 1):
    k = a
    bankdataKNN = KNeighborsClassifier(n_neighbors=k)
    bankdataKNN.fit(x1_train, y1_train)
    y2_pred = bankdataKNN.predict(x1_test)
    print('KNN_K_value = ', a)
    # actual values

    actual = y_test

    # predicted values
    predicted = y2_pred

    # confusion matrix

    matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
    print('Confusion matrix : \n', matrix)

    # outcome values order in sklearn

    tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

    print('Outcome Values : \n', tp, fn, fp, tn)

    # classification report for precision, recall f1-score and accuracy

    matrix = classification_report(actual,predicted,labels=[1,0])
    print('Classification report : \n', matrix)

    # calculating the metrics

    sensitivity = round(tp/(tp+fn), 3) 

    specificity = round(tn/(tn+fp), 3)

    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
    balanced_accuracy = round((sensitivity+specificity)/2, 3)
    precision = round(tp/(tp+fp), 3)
    f1Score = round((2*tp/(2*tp + fp +fn)), 3);

    # Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
    # A model with a score of +1 is a perfect model and -1 is a poor model

    from math import sqrt

    m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

    MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

    print('Accuracy :', round(accuracy*100, 2), '%')
    print('Precision :', round(precision*100, 2), '%')
    print('Recall :', round(sensitivity*100, 2), '%')
    print('F1 Score :', f1Score)
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
    print('MCC', MCC)

    # Area under ROC curve 
    print('roc_auc_score:', round(roc_auc_score(y_test, y2_pred), 3))
    print('---------------------------------------------------------------------------------------------')

KNN_K_value =  1
Confusion matrix : 
 [[112 315]
 [316 757]]
Outcome Values : 
 112 315 316 757
Classification report : 
               precision    recall  f1-score   support

           1       0.26      0.26      0.26       427
           0       0.71      0.71      0.71      1073

    accuracy                           0.58      1500
   macro avg       0.48      0.48      0.48      1500
weighted avg       0.58      0.58      0.58      1500

Accuracy : 57.9 %
Precision : 26.2 %
Recall : 26.2 %
F1 Score : 0.262
Balanced Accuracy : 48.4 %
MCC -0.032
roc_auc_score: 0.484
---------------------------------------------------------------------------------------------
KNN_K_value =  2
Confusion matrix : 
 [[ 48 379]
 [115 958]]
Outcome Values : 
 48 379 115 958
Classification report : 
               precision    recall  f1-score   support

           1       0.29      0.11      0.16       427
           0       0.72      0.89      0.80      1073

    accuracy                           0.67

# Random forest with random sampling

In [19]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

bankdataRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                    criterion='gini', max_depth=None, max_features='auto',
                                    max_leaf_nodes=None, max_samples=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    min_samples_leaf=1, min_samples_split=2,
                                    min_weight_fraction_leaf=0.0, n_estimators=500,
                                    n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                    warm_start=False)

bankdataRF = bankdataRF.fit(x_train, y_train)

# Predict the model with test data set

y3_pred = bankdataRF.predict(x_test)

# actual values

actual = y_test

# predicted values
predicted = y3_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y3_pred), 3))

Confusion matrix : 
 [[119 308]
 [109 964]]
Outcome Values : 
 119 308 109 964
Classification report : 
               precision    recall  f1-score   support

           1       0.52      0.28      0.36       427
           0       0.76      0.90      0.82      1073

    accuracy                           0.72      1500
   macro avg       0.64      0.59      0.59      1500
weighted avg       0.69      0.72      0.69      1500

Accuracy : 72.2 %
Precision : 52.2 %
Recall : 27.9 %
F1 Score : 0.363
Balanced Accuracy : 58.8 %
MCC 0.223
roc_auc_score: 0.589


# Random forest with stratified sampling

In [20]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

bankdataRF1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                     criterion='gini', max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0, n_estimators=500,
                                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                     warm_start=False)

bankdataRF1 = bankdataRF.fit(x1_train, y1_train)

# Predict the model with test data set

y4_pred = bankdataRF1.predict(x1_test)

# actual values

actual = y_test

# predicted values
predicted = y4_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y4_pred), 3))

Confusion matrix : 
 [[ 54 373]
 [133 940]]
Outcome Values : 
 54 373 133 940
Classification report : 
               precision    recall  f1-score   support

           1       0.29      0.13      0.18       427
           0       0.72      0.88      0.79      1073

    accuracy                           0.66      1500
   macro avg       0.50      0.50      0.48      1500
weighted avg       0.59      0.66      0.61      1500

Accuracy : 66.3 %
Precision : 28.9 %
Recall : 12.6 %
F1 Score : 0.176
Balanced Accuracy : 50.1 %
MCC 0.003
roc_auc_score: 0.501


# Decision Tree with Random Sampling

In [21]:
# To build the decision tree model with Over sampling 

from sklearn.tree import DecisionTreeClassifier 

bankdataDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                    max_depth=None, max_features=None, max_leaf_nodes=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                    random_state=None, splitter='best')

bankdataDT = bankdataDT.fit(x_train,y_train)

# Predict with test data

y5_pred = bankdataDT.predict(x_test)

# actual values

actual = y_test

# predicted values
predicted = y5_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y5_pred), 3))

Confusion matrix : 
 [[174 253]
 [289 784]]
Outcome Values : 
 174 253 289 784
Classification report : 
               precision    recall  f1-score   support

           1       0.38      0.41      0.39       427
           0       0.76      0.73      0.74      1073

    accuracy                           0.64      1500
   macro avg       0.57      0.57      0.57      1500
weighted avg       0.65      0.64      0.64      1500

Accuracy : 63.9 %
Precision : 37.6 %
Recall : 40.7 %
F1 Score : 0.391
Balanced Accuracy : 56.9 %
MCC 0.135
roc_auc_score: 0.569


# Decision Tree with stratified Sampling

In [22]:
# To build the decision tree model with Over sampling 

from sklearn.tree import DecisionTreeClassifier 

bankdataDT1 = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                     max_depth=None, max_features=None, max_leaf_nodes=None,
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                     random_state=None, splitter='best')

bankdataDT1 = bankdataDT1.fit(x1_train,y1_train)

# Predict with test data

y6_pred = bankdataDT.predict(x1_test)

# actual values

actual = y_test

# predicted values
predicted = y6_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y6_pred), 3))

Confusion matrix : 
 [[135 292]
 [300 773]]
Outcome Values : 
 135 292 300 773
Classification report : 
               precision    recall  f1-score   support

           1       0.31      0.32      0.31       427
           0       0.73      0.72      0.72      1073

    accuracy                           0.61      1500
   macro avg       0.52      0.52      0.52      1500
weighted avg       0.61      0.61      0.61      1500

Accuracy : 60.5 %
Precision : 31.0 %
Recall : 31.6 %
F1 Score : 0.313
Balanced Accuracy : 51.8 %
MCC 0.036
roc_auc_score: 0.518


# Logistic Regression with Random Sampling

In [23]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

bankdataLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                                intercept_scaling=1, max_iter=100, multi_class='auto', 
                                n_jobs=None, penalty='l2', random_state=None,
                                solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

bankdataLR = bankdataLR.fit(x_train,y_train)

# Predict the model with test data set

y7_pred = bankdataLR.predict(x_test)

# actual values

actual = y_test

# predicted values
predicted = y7_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y7_pred), 3))

Confusion matrix : 
 [[  71  356]
 [  17 1056]]
Outcome Values : 
 71 356 17 1056
Classification report : 
               precision    recall  f1-score   support

           1       0.81      0.17      0.28       427
           0       0.75      0.98      0.85      1073

    accuracy                           0.75      1500
   macro avg       0.78      0.58      0.56      1500
weighted avg       0.76      0.75      0.69      1500

Accuracy : 75.1 %
Precision : 80.7 %
Recall : 16.6 %
F1 Score : 0.276
Balanced Accuracy : 57.5 %
MCC 0.289
roc_auc_score: 0.575


# Logistic Regression with Stratified Sampling

In [24]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

bankdataLR1 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                                 intercept_scaling=1, max_iter=100, multi_class='auto', 
                                 n_jobs=None, penalty='l2', random_state=None,
                                 solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

bankdataLR1 = bankdataLR1.fit(x1_train,y1_train)

# Predict the model with test data set

y8_pred = bankdataLR.predict(x1_test)

# actual values

actual = y_test

# predicted values
predicted = y8_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y8_pred), 3))

Confusion matrix : 
 [[  21  406]
 [  57 1016]]
Outcome Values : 
 21 406 57 1016
Classification report : 
               precision    recall  f1-score   support

           1       0.27      0.05      0.08       427
           0       0.71      0.95      0.81      1073

    accuracy                           0.69      1500
   macro avg       0.49      0.50      0.45      1500
weighted avg       0.59      0.69      0.61      1500

Accuracy : 69.1 %
Precision : 26.9 %
Recall : 4.9 %
F1 Score : 0.083
Balanced Accuracy : 49.8 %
MCC -0.008
roc_auc_score: 0.498
