# loan_data

In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

import pandasql as psql

# pip install pandasql

# import datetime class from datetime module

from datetime import datetime

In [2]:
# Load the loan bank data

loans = pd.read_csv(r"D:\iiit notes\Programming\AI\Internship practice\44 season 19-jul-2021\loan_data.csv", header=0) 
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [3]:
# Change the name of variable

loans =loans.rename(columns = {'not.fully.paid': 'NFPaid'}, inplace = False)
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
# Identify the category variables

cat_cols = ['purpose']

# Convert the catagory variable into dummy variables

loans = pd.get_dummies(loans,columns=cat_cols)
loans=pd.DataFrame(loans)
loans.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [5]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   credit.policy               9578 non-null   int64  
 1   int.rate                    9578 non-null   float64
 2   installment                 9578 non-null   float64
 3   log.annual.inc              9578 non-null   float64
 4   dti                         9578 non-null   float64
 5   fico                        9578 non-null   int64  
 6   days.with.cr.line           9578 non-null   float64
 7   revol.bal                   9578 non-null   int64  
 8   revol.util                  9578 non-null   float64
 9   inq.last.6mths              9578 non-null   int64  
 10  delinq.2yrs                 9578 non-null   int64  
 11  pub.rec                     9578 non-null   int64  
 12  NFPaid                      9578 non-null   int64  
 13  purpose_all_other           9578 

In [6]:
loans['NFPaid'].value_counts()

0    8045
1    1533
Name: NFPaid, dtype: int64

In [7]:
# Identify the dependent and Target variables

IndepVar = []
for col in loans.columns:
    if col != 'NFPaid':
        IndepVar.append(col)

TargetVar = 'NFPaid'

x = loans[IndepVar]
y = loans[TargetVar]

In [8]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 12)
x_test_F1 = x_test.copy()

In [9]:
# Identify variables which are fit for scaling

cols = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util']

# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols] = mmscaler.fit_transform(x_train[cols])
x_train = pd.DataFrame(x_train)

x_test[cols] = mmscaler.fit_transform(x_test[cols])
x_test = pd.DataFrame(x_test)

# Random Forest without LDA & PCA

In [10]:
# Build Random Forest Classification model and train model using the training dataset

from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=2, min_samples_split=2, min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, 
                                 n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, 
                                 ccp_alpha=0.0, max_samples=None)

modelRF = modelRF.fit(x_train, y_train)

# Predict the model with the test data set

y_pred = modelRF.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

Confusion matrix : 
 [[   0  488]
 [   0 2386]]
Outcome Values : 
 0 488 0 2386
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       488
           0       0.83      1.00      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.42      0.50      0.45      2874
weighted avg       0.69      0.83      0.75      2874

Accuracy : 83.0 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Balanced Accuracy : 50.0 %
MCC nan
roc_auc_score: 0.5


# Logistic Regression without LDA & PCA

In [11]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

modelLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                            intercept_scaling=1, max_iter=100, multi_class='auto',
                            n_jobs=None, penalty='l2', random_state=None,
                            solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

modelLR = modelLR.fit(x_train,y_train)

# Predict the model with test data set

y1_pred = modelLR.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y1_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))

Confusion matrix : 
 [[  10  478]
 [  11 2375]]
Outcome Values : 
 10 478 11 2375
Classification report : 
               precision    recall  f1-score   support

           1       0.48      0.02      0.04       488
           0       0.83      1.00      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.65      0.51      0.47      2874
weighted avg       0.77      0.83      0.76      2874

Accuracy : 83.0 %
Precision : 47.6 %
Recall : 2.0 %
F1 Score : 0.039
Balanced Accuracy : 50.7 %
MCC 0.07
roc_auc_score: 0.508


# Decision Tree without PCA & LCA

In [12]:
# To build the decision tree model with Over sampling

from sklearn.tree import DecisionTreeClassifier

modelDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                max_depth=None, max_features=None, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                random_state=None, splitter='best')

modelDT = modelDT.fit(x_train,y_train)

# Predict with test data

y2_pred = modelDT.predict(x_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y2_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y2_pred), 3))

Confusion matrix : 
 [[ 124  364]
 [ 457 1929]]
Outcome Values : 
 124 364 457 1929
Classification report : 
               precision    recall  f1-score   support

           1       0.21      0.25      0.23       488
           0       0.84      0.81      0.82      2386

    accuracy                           0.71      2874
   macro avg       0.53      0.53      0.53      2874
weighted avg       0.73      0.71      0.72      2874

Accuracy : 71.4 %
Precision : 21.3 %
Recall : 25.4 %
F1 Score : 0.232
Balanced Accuracy : 53.1 %
MCC 0.058
roc_auc_score: 0.531


# SVM without PCA & LCA

In [13]:
# Training the SVM algorithm

from sklearn.svm import SVC

modelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced')
modelSVMGaussian.fit(x_train, y_train)

# Predicting the values

y3_pred = modelSVMGaussian.predict(x_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y3_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y3_pred), 3))

Confusion matrix : 
 [[ 261  227]
 [ 701 1685]]
Outcome Values : 
 261 227 701 1685
Classification report : 
               precision    recall  f1-score   support

           1       0.27      0.53      0.36       488
           0       0.88      0.71      0.78      2386

    accuracy                           0.68      2874
   macro avg       0.58      0.62      0.57      2874
weighted avg       0.78      0.68      0.71      2874

Accuracy : 67.7 %
Precision : 27.1 %
Recall : 53.5 %
F1 Score : 0.36
Balanced Accuracy : 62.0 %
MCC 0.192
roc_auc_score: 0.621


# KNN Without PCA & LCA

In [14]:
# Build the algorithm with KNN

from sklearn import neighbors

modelKNN = neighbors.KNeighborsClassifier(n_neighbors=5,weights='uniform', algorithm='auto', leaf_size=30, p=2, 
                                          metric='minkowski', metric_params=None, n_jobs=None)
modelKNN.fit(x_train, y_train)

# Predict the model with test dataset

y4_pred = modelKNN.predict(x_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y4_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y4_pred), 3))

Confusion matrix : 
 [[  41  447]
 [  81 2305]]
Outcome Values : 
 41 447 81 2305
Classification report : 
               precision    recall  f1-score   support

           1       0.34      0.08      0.13       488
           0       0.84      0.97      0.90      2386

    accuracy                           0.82      2874
   macro avg       0.59      0.53      0.52      2874
weighted avg       0.75      0.82      0.77      2874

Accuracy : 81.6 %
Precision : 33.6 %
Recall : 8.4 %
F1 Score : 0.134
Balanced Accuracy : 52.5 %
MCC 0.093
roc_auc_score: 0.525


# Train the Principal Component Analysis (PCA) with train data and test data

In [15]:
# Principal component analysis (PCA) is a statistical technique to convert high dimensional data to low dimensional data
# by selecting the most important features that capture maximum information about the dataset. The features are selected
# on the basis of variance that they cause in the output. The feature that causes highest variance is the first principal
# component. The feature that is responsible for second highest variance is considered the second principal component,
# and so on. It is important to mention that principal components do not have any correlation with each other.

from sklearn.decomposition import PCA

applyPCA = PCA()

x1_train = applyPCA.fit_transform(x_train)
x1_test = applyPCA.transform(x_test)
explained_variance = applyPCA.explained_variance_ratio_
print(explained_variance)

[7.70350903e-01 5.26343855e-02 4.51284661e-02 2.74139052e-02
 2.01639802e-02 1.49645574e-02 1.19621976e-02 1.13944105e-02
 9.91289478e-03 8.95880118e-03 7.05676282e-03 6.44225078e-03
 5.78360411e-03 3.74839448e-03 2.19124563e-03 9.69680833e-04
 8.28188722e-04 9.53711941e-05 1.39077347e-33]


# Random Forest with PCA

In [16]:
# Build Random Forest Classification model and train model using the training dataset

from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=2, min_samples_split=2, min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, 
                                 n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, 
                                 ccp_alpha=0.0, max_samples=None)

modelRF = modelRF.fit(x1_train, y_train)

# Predict the model with the test data set

y5_pred = modelRF.predict(x1_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y5_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y5_pred), 3))

Confusion matrix : 
 [[   0  488]
 [   0 2386]]
Outcome Values : 
 0 488 0 2386
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       488
           0       0.83      1.00      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.42      0.50      0.45      2874
weighted avg       0.69      0.83      0.75      2874

Accuracy : 83.0 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Balanced Accuracy : 50.0 %
MCC nan
roc_auc_score: 0.5


# Logistic Regression with PCA

In [17]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

modelLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                            intercept_scaling=1, max_iter=100, multi_class='auto',
                            n_jobs=None, penalty='l2', random_state=None,
                            solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

modelLR = modelLR.fit(x1_train,y_train)

# Predict the model with test data set

y6_pred = modelLR.predict(x1_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y6_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y6_pred), 3))

Confusion matrix : 
 [[  10  478]
 [  11 2375]]
Outcome Values : 
 10 478 11 2375
Classification report : 
               precision    recall  f1-score   support

           1       0.48      0.02      0.04       488
           0       0.83      1.00      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.65      0.51      0.47      2874
weighted avg       0.77      0.83      0.76      2874

Accuracy : 83.0 %
Precision : 47.6 %
Recall : 2.0 %
F1 Score : 0.039
Balanced Accuracy : 50.7 %
MCC 0.07
roc_auc_score: 0.508


# Decision Tree with PCA

In [18]:
# To build the decision tree model with Over sampling

from sklearn.tree import DecisionTreeClassifier

modelDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                max_depth=None, max_features=None, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                random_state=None, splitter='best')

modelDT = modelDT.fit(x1_train,y_train)

# Predict with test data

y7_pred = modelDT.predict(x1_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y7_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y7_pred), 3))

Confusion matrix : 
 [[ 123  365]
 [ 466 1920]]
Outcome Values : 
 123 365 466 1920
Classification report : 
               precision    recall  f1-score   support

           1       0.21      0.25      0.23       488
           0       0.84      0.80      0.82      2386

    accuracy                           0.71      2874
   macro avg       0.52      0.53      0.53      2874
weighted avg       0.73      0.71      0.72      2874

Accuracy : 71.1 %
Precision : 20.9 %
Recall : 25.2 %
F1 Score : 0.228
Balanced Accuracy : 52.8 %
MCC 0.053
roc_auc_score: 0.528


# SVM with PCA

In [19]:
# Training the SVM algorithm

from sklearn.svm import SVC

modelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced')

modelSVMGaussian.fit(x1_train, y_train)

# Predicting the values

y8_pred = modelSVMGaussian.predict(x1_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y8_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y8_pred), 3))

Confusion matrix : 
 [[ 257  231]
 [ 695 1691]]
Outcome Values : 
 257 231 695 1691
Classification report : 
               precision    recall  f1-score   support

           1       0.27      0.53      0.36       488
           0       0.88      0.71      0.79      2386

    accuracy                           0.68      2874
   macro avg       0.57      0.62      0.57      2874
weighted avg       0.78      0.68      0.71      2874

Accuracy : 67.8 %
Precision : 27.0 %
Recall : 52.7 %
F1 Score : 0.357
Balanced Accuracy : 61.8 %
MCC 0.188
roc_auc_score: 0.618


# KNN with PCA

In [20]:
# Build the algorithm with KNN

from sklearn import neighbors

modelKNN = neighbors.KNeighborsClassifier(n_neighbors=5,weights='uniform', algorithm='auto', leaf_size=30, p=2, 
                                          metric='minkowski', metric_params=None, n_jobs=None)
modelKNN.fit(x1_train, y_train)

# Predict the model with test dataset

y9_pred = modelKNN.predict(x1_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y9_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y9_pred), 3))

Confusion matrix : 
 [[  41  447]
 [  81 2305]]
Outcome Values : 
 41 447 81 2305
Classification report : 
               precision    recall  f1-score   support

           1       0.34      0.08      0.13       488
           0       0.84      0.97      0.90      2386

    accuracy                           0.82      2874
   macro avg       0.59      0.53      0.52      2874
weighted avg       0.75      0.82      0.77      2874

Accuracy : 81.6 %
Precision : 33.6 %
Recall : 8.4 %
F1 Score : 0.134
Balanced Accuracy : 52.5 %
MCC 0.093
roc_auc_score: 0.525


# Train the Linear Discriminant Analysis (LDA) with train data and test data

In [21]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

applyLDA = LinearDiscriminantAnalysis()

x2_train = applyLDA.fit_transform(x_train, y_train)
x2_test = applyLDA.transform(x_test)

explained_variance = applyLDA.explained_variance_ratio_
print(explained_variance)

[1.]


# Random Forest with LDA

In [22]:
# Build Random Forest Classification model and train model using the training dataset

from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=2, min_samples_split=2, min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
                                 min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, 
                                 n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, 
                                 ccp_alpha=0.0, max_samples=None)

modelRF = modelRF.fit(x2_train, y_train)

# Predict the model with the test data set

y10_pred = modelRF.predict(x2_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y10_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y10_pred), 3))

Confusion matrix : 
 [[   0  488]
 [   0 2386]]
Outcome Values : 
 0 488 0 2386
Classification report : 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       488
           0       0.83      1.00      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.42      0.50      0.45      2874
weighted avg       0.69      0.83      0.75      2874

Accuracy : 83.0 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Balanced Accuracy : 50.0 %
MCC nan
roc_auc_score: 0.5


# Logistic Regression with LDA

In [23]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

modelLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                            intercept_scaling=1, max_iter=100, multi_class='auto',
                            n_jobs=None, penalty='l2', random_state=None,
                            solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

modelLR = modelLR.fit(x2_train,y_train)

# Predict the model with test data set

y11_pred = modelLR.predict(x2_test)


from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y11_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y11_pred), 3))

Confusion matrix : 
 [[  19  469]
 [  18 2368]]
Outcome Values : 
 19 469 18 2368
Classification report : 
               precision    recall  f1-score   support

           1       0.51      0.04      0.07       488
           0       0.83      0.99      0.91      2386

    accuracy                           0.83      2874
   macro avg       0.67      0.52      0.49      2874
weighted avg       0.78      0.83      0.77      2874

Accuracy : 83.1 %
Precision : 51.4 %
Recall : 3.9 %
F1 Score : 0.072
Balanced Accuracy : 51.6 %
MCC 0.105
roc_auc_score: 0.516


# Decision Tree with LDA

In [24]:
# To build the decision tree model with Over sampling

from sklearn.tree import DecisionTreeClassifier

modelDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                max_depth=None, max_features=None, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                random_state=None, splitter='best')

modelDT = modelDT.fit(x2_train,y_train)

# Predict with test data

y12_pred = modelDT.predict(x2_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y12_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y12_pred), 3))

Confusion matrix : 
 [[ 102  386]
 [ 281 2105]]
Outcome Values : 
 102 386 281 2105
Classification report : 
               precision    recall  f1-score   support

           1       0.27      0.21      0.23       488
           0       0.85      0.88      0.86      2386

    accuracy                           0.77      2874
   macro avg       0.56      0.55      0.55      2874
weighted avg       0.75      0.77      0.76      2874

Accuracy : 76.8 %
Precision : 26.6 %
Recall : 20.9 %
F1 Score : 0.234
Balanced Accuracy : 54.6 %
MCC 0.101
roc_auc_score: 0.546


# SVM with LDA

In [25]:
# Training the SVM algorithm

from sklearn.svm import SVC

modelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced')

modelSVMGaussian.fit(x2_train, y_train)

# Predicting the values

y13_pred = modelSVMGaussian.predict(x2_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y13_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y13_pred), 3))

Confusion matrix : 
 [[ 250  238]
 [ 671 1715]]
Outcome Values : 
 250 238 671 1715
Classification report : 
               precision    recall  f1-score   support

           1       0.27      0.51      0.35       488
           0       0.88      0.72      0.79      2386

    accuracy                           0.68      2874
   macro avg       0.57      0.62      0.57      2874
weighted avg       0.78      0.68      0.72      2874

Accuracy : 68.4 %
Precision : 27.1 %
Recall : 51.2 %
F1 Score : 0.355
Balanced Accuracy : 61.5 %
MCC 0.186
roc_auc_score: 0.616


# KNN with LDA

In [26]:
# Build the algorithm with KNN

from sklearn import neighbors

modelKNN = neighbors.KNeighborsClassifier(n_neighbors=5,weights='uniform', algorithm='auto', leaf_size=30, p=2, 
                                          metric='minkowski', metric_params=None, n_jobs=None)
modelKNN.fit(x2_train, y_train)

# Predict the model with test dataset

y14_pred = modelKNN.predict(x2_test)

from sklearn.metrics import classification_report, confusion_matrix

# actual values

actual = y_test

# predicted values
predicted = y14_pred

# confusion matrix

matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None,)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp,fn, fp, tn = confusion_matrix(actual, predicted, labels=[1,0]).reshape(-1)

print('Outcome Values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n', matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3) 

specificity = round(tn/(tn+fp), 3)

accuracy = round((tp+tn)/(tp+fp+tn+fn), 3)
balanced_accuracy = round((sensitivity+specificity)/2, 3)
precision = round(tp/(tp+fp), 3)
f1Score = round((2*tp/(2*tp + fp +fn)), 3);

# Mathews Correlatin coefficient (MCC). Range of values of MCC lie between -1 to +1
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

m = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)

MCC = round(((tp* tn) - (fp * fn)) / sqrt(m), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC', MCC)

from sklearn.metrics import roc_curve, roc_auc_score

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y14_pred), 3))

Confusion matrix : 
 [[  47  441]
 [  81 2305]]
Outcome Values : 
 47 441 81 2305
Classification report : 
               precision    recall  f1-score   support

           1       0.37      0.10      0.15       488
           0       0.84      0.97      0.90      2386

    accuracy                           0.82      2874
   macro avg       0.60      0.53      0.53      2874
weighted avg       0.76      0.82      0.77      2874

Accuracy : 81.8 %
Precision : 36.7 %
Recall : 9.6 %
F1 Score : 0.153
Balanced Accuracy : 53.1 %
MCC 0.114
roc_auc_score: 0.531
