In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", 50)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the cancer dataset 

cancer = pd.read_csv(r"D:\00 Datasets\Others\Data-08\breast_cancer.csv", header=0) 
cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,TumorType
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
# Display the informationof dataset

cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [4]:
# Display the unique variables count

cancer.nunique()

mean radius                456
mean texture               479
mean perimeter             522
mean area                  539
mean smoothness            474
mean compactness           537
mean concavity             537
mean concave points        542
mean symmetry              432
mean fractal dimension     499
radius error               540
texture error              519
perimeter error            533
area error                 528
smoothness error           547
compactness error          541
concavity error            533
concave points error       507
symmetry error             498
fractal dimension error    545
worst radius               457
worst texture              511
worst perimeter            514
worst area                 544
worst smoothness           411
worst compactness          529
worst concavity            539
worst concave points       492
worst symmetry             500
worst fractal dimension    535
TumorType                    2
dtype: int64

In [5]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in cancer.columns:
    if col != 'TumorType':
        IndepVar.append(col)

TargetVar = 'TumorType'

x = cancer[IndepVar]
y = cancer[TargetVar]

In [6]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, stratify=y, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

# GradientBoostingClassifier

In [8]:
# GradientBoostingClassifier is used for classification problems

from sklearn.ensemble import GradientBoostingClassifier

modelGBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, 
                                      subsample=1.0, criterion='friedman_mse', min_samples_split=2, 
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, 
                                      min_impurity_decrease=0.0, min_impurity_split=None, init=None, 
                                      random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, 
                                      warm_start=False, validation_fraction=0.1, n_iter_no_change=None, 
                                      tol=0.0001, ccp_alpha=0.0)
# fit the model with train data

modelGBC.fit(x_train, y_train)

GradientBoostingClassifier()

In [9]:
# Predict the model with test data

y_pred = modelGBC.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

Confusion matrix : 
 [[72  0]
 [ 4 38]]
Outcome values : 
 72 0 4 38
Classification report : 
               precision    recall  f1-score   support

           1       0.95      1.00      0.97        72
           0       1.00      0.90      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114

Accuracy : 96.5 %
Precision : 94.7 %
Recall : 100.0 %
F1 Score : 0.973
Balanced Accuracy : 95.2 %
MCC : 0.926
roc_auc_score: 0.952


# Random Forest classification

In [10]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

modelRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                 criterion='entropy', max_depth=None, max_features='auto',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=100,
                                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                 warm_start=False)

modelRF = modelRF.fit(x_train, y_train)

# Predict the model with test data set

y1_pred = modelRF.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y1_pred 

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))
print('-----------------------------------------------------------------------')

Confusion matrix : 
 [[71  1]
 [ 4 38]]
Outcome values : 
 71 1 4 38
Classification report : 
               precision    recall  f1-score   support

           1       0.95      0.99      0.97        72
           0       0.97      0.90      0.94        42

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

Accuracy : 95.6 %
Precision : 94.7 %
Recall : 98.6 %
F1 Score : 0.966
Balanced Accuracy : 94.6 %
MCC : 0.906
roc_auc_score: 0.945
-----------------------------------------------------------------------


# GradientBoostingClassifier (Multi-Class Classification- Wine)

In [11]:
# load the power dataset 

wine = pd.read_csv(r"D:\00 Datasets\Others\Data-08\wine.csv", header=0) 
wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,WineType
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0


In [12]:
# display the dataset information

wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    int64  
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    int64  
 13  WineT

In [13]:
# Display the number of unique values count in each variable

wine.nunique()

alcohol                         126
malic_acid                      133
ash                              79
alcalinity_of_ash                63
magnesium                        53
total_phenols                    97
flavanoids                      132
nonflavanoid_phenols             39
proanthocyanins                 101
color_intensity                 132
hue                              78
od280/od315_of_diluted_wines    122
proline                         121
WineType                          3
dtype: int64

In [14]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in wine.columns:
    if col != 'WineType':
        IndepVar.append(col)

TargetVar = 'WineType'

x = wine[IndepVar]
y = wine[TargetVar]

In [15]:
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, stratify=y, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((142, 13), (36, 13), (142,), (36,))

In [16]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

In [17]:
# GradientBoostingClassifier is used for classification problems

from sklearn.ensemble import GradientBoostingClassifier

modelGBMC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, 
                                       subsample=1.0, criterion='friedman_mse', min_samples_split=2, 
                                       min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, 
                                       min_impurity_decrease=0.0, min_impurity_split=None, init=None, 
                                       random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, 
                                       warm_start=False, validation_fraction=0.1, n_iter_no_change=None, 
                                       tol=0.0001, ccp_alpha=0.0)
# fit the model with train data

modelGBMC.fit(x_train, y_train)

GradientBoostingClassifier()

In [18]:
# Predict model with test data 

y_pred = modelGBMC.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

print(confusion_matrix(y_pred, y_test)) # Verticle is actual values & horizontal is predicted values

# Actual and predicted classes

lst_actual_class = y_test
lst_predicted_class = y_pred

# Class = Label 0-3

lst_classes = [0, 1, 2]

# Compute multi-class confusion matrix

arr_out_matrix = multilabel_confusion_matrix(lst_actual_class, lst_predicted_class, labels=lst_classes)

# Temp store results

model_acc = [];
model_recall = [];
model_prec = [];
model_fscore = [];
model_spec = [];
model_bal_acc = [];
model_mcc = [];
for no_class in range(len(lst_classes)):
    arr_data = arr_out_matrix[no_class];
    print("Print Class: {0}".format(no_class));

    tp = arr_data[1][1]
    fp = arr_data[0][1]
    tn = arr_data[0][0]
    fn = arr_data[1][0]
    
    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);
    
    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    model_acc.append(accuracy);
    model_prec.append(precision);
    model_recall.append(sensitivity);
    model_fscore.append(f1Score);
    model_spec.append(specificity);
    model_bal_acc.append(balanced_accuracy);
    model_mcc.append(MCC);
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy: {0}".format(accuracy));    # Accuracy score
    print("Precision: {0}".format(precision)); # Precision score
    print("Sensitivity: {0}".format(sensitivity)); # Recall score
    print("F1-Score: {0}".format(f1Score)); # F1 score
    print("Specificity: {0}".format(specificity)); # True Nagative Rate
    print("Balanced Accuracy: {0}".format(balanced_accuracy)); # Balance accuracy score
    print("MCC: {0}\n".format(MCC)); # Matthews Correlation Coefficient

[[10  1  0]
 [ 2 13  0]
 [ 0  0 10]]
Print Class: 0
TP=10, FP=1, TN=23, FN=2
Accuracy: 0.917
Precision: 0.909
Sensitivity: 0.833
F1-Score: 0.87
Specificity: 0.958
Balanced Accuracy: 0.896
MCC: 0.81

Print Class: 1
TP=13, FP=2, TN=20, FN=1
Accuracy: 0.917
Precision: 0.867
Sensitivity: 0.929
F1-Score: 0.897
Specificity: 0.909
Balanced Accuracy: 0.919
MCC: 0.828

Print Class: 2
TP=10, FP=0, TN=26, FN=0
Accuracy: 1.0
Precision: 1.0
Sensitivity: 1.0
F1-Score: 1.0
Specificity: 1.0
Balanced Accuracy: 1.0
MCC: 1.0



In [19]:
# OVERALL - FINAL PREDICTION PERFORMANCE

# importing mean() 

from statistics import mean
import math

print("Overall Performance Prediction:");
print("Accuracy: {0}%".format(round(mean(model_acc)*100, 4)));
print("Precision: {0}%".format(round(mean(model_prec)*100, 4)));
print("Recall or Sensitivity: {0}%".format(round(mean(model_recall)*100, 4)));
print("F1-Score: {0}".format(round(mean(model_fscore), 4)));
print("Specificity or True Nagative Rate: {0}%".format(round(mean(model_spec)*100, 4)));
print("Balanced Accuracy: {0}%\n".format(round(mean(model_bal_acc)*100, 4)));
print("MCC: {0}\n".format(round(mean(model_mcc), 4)));

Overall Performance Prediction:
Accuracy: 94.4667%
Precision: 92.5333%
Recall or Sensitivity: 92.0667%
F1-Score: 0.9223
Specificity or True Nagative Rate: 95.5667%
Balanced Accuracy: 93.8333%

MCC: 0.8793



In [20]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

modelRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                 criterion='entropy', max_depth=None, max_features='auto',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=100,
                                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                 warm_start=False)
# Fit the model with train data

modelRF = modelRF.fit(x_train, y_train)

In [21]:
# Predict model with test data 

y_pred = modelRF.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

print(confusion_matrix(y_pred, y_test)) # Verticle is actual values & horizontal is predicted values

# Actual and predicted classes

lst_actual_class = y_test
lst_predicted_class = y_pred

# Class = Label 0-3

lst_classes = [0, 1, 2]

# Compute multi-class confusion matrix

arr_out_matrix = multilabel_confusion_matrix(lst_actual_class, lst_predicted_class, labels=lst_classes)

# Temp store results

model_acc = [];
model_recall = [];
model_prec = [];
model_fscore = [];
model_spec = [];
model_bal_acc = [];
model_mcc = [];
for no_class in range(len(lst_classes)):
    arr_data = arr_out_matrix[no_class];
    print("Print Class: {0}".format(no_class));

    tp = arr_data[1][1]
    fp = arr_data[0][1]
    tn = arr_data[0][0]
    fn = arr_data[1][0]
    
    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);
    
    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    model_acc.append(accuracy);
    model_prec.append(precision);
    model_recall.append(sensitivity);
    model_fscore.append(f1Score);
    model_spec.append(specificity);
    model_bal_acc.append(balanced_accuracy);
    model_mcc.append(MCC);
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy: {0}".format(accuracy));    # Accuracy score
    print("Precision: {0}".format(precision)); # Precision score
    print("Sensitivity: {0}".format(sensitivity)); # Recall score
    print("F1-Score: {0}".format(f1Score)); # F1 score
    print("Specificity: {0}".format(specificity)); # True Nagative Rate
    print("Balanced Accuracy: {0}".format(balanced_accuracy)); # Balance accuracy score
    print("MCC: {0}\n".format(MCC)); # Matthews Correlation Coefficient

[[11  0  0]
 [ 1 14  0]
 [ 0  0 10]]
Print Class: 0
TP=11, FP=0, TN=24, FN=1
Accuracy: 0.972
Precision: 1.0
Sensitivity: 0.917
F1-Score: 0.957
Specificity: 1.0
Balanced Accuracy: 0.958
MCC: 0.938

Print Class: 1
TP=14, FP=1, TN=21, FN=0
Accuracy: 0.972
Precision: 0.933
Sensitivity: 1.0
F1-Score: 0.966
Specificity: 0.955
Balanced Accuracy: 0.978
MCC: 0.944

Print Class: 2
TP=10, FP=0, TN=26, FN=0
Accuracy: 1.0
Precision: 1.0
Sensitivity: 1.0
F1-Score: 1.0
Specificity: 1.0
Balanced Accuracy: 1.0
MCC: 1.0



In [22]:
# OVERALL - FINAL PREDICTION PERFORMANCE

# importing mean() 

from statistics import mean
import math

print("Overall Performance Prediction:");
print("Accuracy: {0}%".format(round(mean(model_acc)*100, 4)));
print("Precision: {0}%".format(round(mean(model_prec)*100, 4)));
print("Recall or Sensitivity: {0}%".format(round(mean(model_recall)*100, 4)));
print("F1-Score: {0}".format(round(mean(model_fscore), 4)));
print("Specificity or True Nagative Rate: {0}%".format(round(mean(model_spec)*100, 4)));
print("Balanced Accuracy: {0}%\n".format(round(mean(model_bal_acc)*100, 4)));
print("MCC: {0}\n".format(round(mean(model_mcc), 4)));

Overall Performance Prediction:
Accuracy: 98.1333%
Precision: 97.7667%
Recall or Sensitivity: 97.2333%
F1-Score: 0.9743
Specificity or True Nagative Rate: 98.5%
Balanced Accuracy: 97.8667%

MCC: 0.9607



In [23]:
# Set the parameters for GradientBoostingClassifier

parameters = {'learning_rate': [0.01,0.1,0.3,1],
              'n_estimators' : [5,50,250, 500],
              'max_depth'    : [1,3,5,7,9]}

In [24]:
# GradientBoostingClassifier model and Fine tune hyper parametrs

from sklearn.ensemble import GradientBoostingClassifier

modelGBMC = GradientBoostingClassifier()

# Perform grid search - perform 2 fold cross-validation

from sklearn.model_selection import GridSearchCV

modelGBMC_GS = GridSearchCV(estimator = modelGBMC, param_grid = parameters, cv = 2, n_jobs=-1)

In [25]:
# Fit the model with train data (longer time)

modelGBMC_GS.fit(x_train, y_train)

GridSearchCV(cv=2, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.3, 1],
                         'max_depth': [1, 3, 5, 7, 9],
                         'n_estimators': [5, 50, 250, 500]})

In [26]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", modelGBMC_GS.best_estimator_)
print("\n The best score across ALL searched params:\n", modelGBMC_GS.best_score_)
print("\n The best parameters across ALL searched params:\n", modelGBMC_GS.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingClassifier(learning_rate=0.3, max_depth=1, n_estimators=50)

 The best score across ALL searched params:
 0.9577464788732395

 The best parameters across ALL searched params:
 {'learning_rate': 0.3, 'max_depth': 1, 'n_estimators': 50}


In [27]:
# Predict model with test data 

y_pred = modelGBMC_GS.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

print(confusion_matrix(y_pred, y_test)) # Verticle is actual values & horizontal is predicted values

# Actual and predicted classes

lst_actual_class = y_test
lst_predicted_class = y_pred

# Class = Label 0-3

lst_classes = [0, 1, 2]

# Compute multi-class confusion matrix

arr_out_matrix = multilabel_confusion_matrix(lst_actual_class, lst_predicted_class, labels=lst_classes)

# Temp store results

model_acc = [];
model_recall = [];
model_prec = [];
model_fscore = [];
model_spec = [];
model_bal_acc = [];
model_mcc = [];
for no_class in range(len(lst_classes)):
    arr_data = arr_out_matrix[no_class];
    print("Print Class: {0}".format(no_class));

    tp = arr_data[1][1]
    fp = arr_data[0][1]
    tn = arr_data[0][0]
    fn = arr_data[1][0]
    
    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);
    
    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    model_acc.append(accuracy);
    model_prec.append(precision);
    model_recall.append(sensitivity);
    model_fscore.append(f1Score);
    model_spec.append(specificity);
    model_bal_acc.append(balanced_accuracy);
    model_mcc.append(MCC);
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy: {0}".format(accuracy));    # Accuracy score
    print("Precision: {0}".format(precision)); # Precision score
    print("Sensitivity: {0}".format(sensitivity)); # Recall score
    print("F1-Score: {0}".format(f1Score)); # F1 score
    print("Specificity: {0}".format(specificity)); # True Nagative Rate
    print("Balanced Accuracy: {0}".format(balanced_accuracy)); # Balance accuracy score
    print("MCC: {0}\n".format(MCC)); # Matthews Correlation Coefficient

[[11  0  0]
 [ 1 14  0]
 [ 0  0 10]]
Print Class: 0
TP=11, FP=0, TN=24, FN=1
Accuracy: 0.972
Precision: 1.0
Sensitivity: 0.917
F1-Score: 0.957
Specificity: 1.0
Balanced Accuracy: 0.958
MCC: 0.938

Print Class: 1
TP=14, FP=1, TN=21, FN=0
Accuracy: 0.972
Precision: 0.933
Sensitivity: 1.0
F1-Score: 0.966
Specificity: 0.955
Balanced Accuracy: 0.978
MCC: 0.944

Print Class: 2
TP=10, FP=0, TN=26, FN=0
Accuracy: 1.0
Precision: 1.0
Sensitivity: 1.0
F1-Score: 1.0
Specificity: 1.0
Balanced Accuracy: 1.0
MCC: 1.0



In [28]:
# OVERALL - FINAL PREDICTION PERFORMANCE

# importing mean() 

from statistics import mean
import math

print("Overall Performance Prediction:");
print("Accuracy: {0}%".format(round(mean(model_acc)*100, 4)));
print("Precision: {0}%".format(round(mean(model_prec)*100, 4)));
print("Recall or Sensitivity: {0}%".format(round(mean(model_recall)*100, 4)));
print("F1-Score: {0}".format(round(mean(model_fscore), 4)));
print("Specificity or True Nagative Rate: {0}%".format(round(mean(model_spec)*100, 4)));
print("Balanced Accuracy: {0}%\n".format(round(mean(model_bal_acc)*100, 4)));
print("MCC: {0}\n".format(round(mean(model_mcc), 4)));

Overall Performance Prediction:
Accuracy: 98.1333%
Precision: 97.7667%
Recall or Sensitivity: 97.2333%
F1-Score: 0.9743
Specificity or True Nagative Rate: 98.5%
Balanced Accuracy: 97.8667%

MCC: 0.9607



In [29]:
# Implementation of Model using RandomizedSearchCV 

import numpy as np

from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 500, num = 10)] # returns 10 numbers 

learning_rate = [float(x) for x in np.linspace(start = 0.01 , stop = 1, num = 20)]

max_depth = [int(x) for x in np.linspace(1, 9, num = 9)] 

max_depth.append(None)

random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate,
          'max_depth': max_depth}

print(random_grid)

{'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'learning_rate': [0.01, 0.06210526315789474, 0.11421052631578947, 0.16631578947368422, 0.21842105263157896, 0.2705263157894737, 0.32263157894736844, 0.37473684210526315, 0.4268421052631579, 0.4789473684210527, 0.5310526315789474, 0.5831578947368421, 0.6352631578947369, 0.6873684210526316, 0.7394736842105263, 0.791578947368421, 0.8436842105263158, 0.8957894736842106, 0.9478947368421053, 1.0], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, None]}


In [30]:
from sklearn.ensemble import GradientBoostingClassifier

modelGBMC1 = GradientBoostingClassifier()

# Perform grid search - perform 2 fold cross-validation

from sklearn.model_selection import RandomizedSearchCV

modelGBMC1_GS = RandomizedSearchCV(estimator = modelGBMC1, param_distributions=random_grid, cv = 5, n_jobs=-1)

modelGBMC1_GS.fit(x_train, y_train)

print(modelGBMC1_GS.best_params_)

{'n_estimators': 450, 'max_depth': 1, 'learning_rate': 0.9478947368421053}


In [31]:
# Predict model with test data 

y_pred = modelGBMC1_GS.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from math import sqrt

print(confusion_matrix(y_pred, y_test)) # Verticle is actual values & horizontal is predicted values

# Actual and predicted classes

lst_actual_class = y_test
lst_predicted_class = y_pred

# Class = Label 0-3

lst_classes = [0, 1, 2]

# Compute multi-class confusion matrix

arr_out_matrix = multilabel_confusion_matrix(lst_actual_class, lst_predicted_class, labels=lst_classes)

# Temp store results

model_acc = [];
model_recall = [];
model_prec = [];
model_fscore = [];
model_spec = [];
model_bal_acc = [];
model_mcc = [];
for no_class in range(len(lst_classes)):
    arr_data = arr_out_matrix[no_class];
    print("Print Class: {0}".format(no_class));

    tp = arr_data[1][1]
    fp = arr_data[0][1]
    tn = arr_data[0][0]
    fn = arr_data[1][0]
    
    sensitivity = round(tp/(tp+fn), 3);
    specificity = round(tn/(tn+fp), 3);
    accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
    balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
    precision = round(tp/(tp+fp), 3);
    f1Score = round((2*tp/(2*tp + fp + fn)), 3);
    
    mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    model_acc.append(accuracy);
    model_prec.append(precision);
    model_recall.append(sensitivity);
    model_fscore.append(f1Score);
    model_spec.append(specificity);
    model_bal_acc.append(balanced_accuracy);
    model_mcc.append(MCC);
    
    print("TP={0}, FP={1}, TN={2}, FN={3}".format(tp, fp, tn, fn));
    print("Accuracy: {0}".format(accuracy));    # Accuracy score
    print("Precision: {0}".format(precision)); # Precision score
    print("Sensitivity: {0}".format(sensitivity)); # Recall score
    print("F1-Score: {0}".format(f1Score)); # F1 score
    print("Specificity: {0}".format(specificity)); # True Nagative Rate
    print("Balanced Accuracy: {0}".format(balanced_accuracy)); # Balance accuracy score
    print("MCC: {0}\n".format(MCC)); # Matthews Correlation Coefficient

[[11  0  0]
 [ 1 14  0]
 [ 0  0 10]]
Print Class: 0
TP=11, FP=0, TN=24, FN=1
Accuracy: 0.972
Precision: 1.0
Sensitivity: 0.917
F1-Score: 0.957
Specificity: 1.0
Balanced Accuracy: 0.958
MCC: 0.938

Print Class: 1
TP=14, FP=1, TN=21, FN=0
Accuracy: 0.972
Precision: 0.933
Sensitivity: 1.0
F1-Score: 0.966
Specificity: 0.955
Balanced Accuracy: 0.978
MCC: 0.944

Print Class: 2
TP=10, FP=0, TN=26, FN=0
Accuracy: 1.0
Precision: 1.0
Sensitivity: 1.0
F1-Score: 1.0
Specificity: 1.0
Balanced Accuracy: 1.0
MCC: 1.0



In [32]:
# OVERALL - FINAL PREDICTION PERFORMANCE

# importing mean() 

from statistics import mean
import math

print("Overall Performance Prediction:");
print("Accuracy: {0}%".format(round(mean(model_acc)*100, 4)));
print("Precision: {0}%".format(round(mean(model_prec)*100, 4)));
print("Recall or Sensitivity: {0}%".format(round(mean(model_recall)*100, 4)));
print("F1-Score: {0}".format(round(mean(model_fscore), 4)));
print("Specificity or True Nagative Rate: {0}%".format(round(mean(model_spec)*100, 4)));
print("Balanced Accuracy: {0}%\n".format(round(mean(model_bal_acc)*100, 4)));
print("MCC: {0}\n".format(round(mean(model_mcc), 4)));

Overall Performance Prediction:
Accuracy: 98.1333%
Precision: 97.7667%
Recall or Sensitivity: 97.2333%
F1-Score: 0.9743
Specificity or True Nagative Rate: 98.5%
Balanced Accuracy: 97.8667%

MCC: 0.9607



# Binary Classification for Universal Bank data

In [33]:
# Load the Universal bank data

bankdata = pd.read_csv(r"D:\00 Datasets\Others\Data-06\Universalbank.csv", header=0) 
bankdata.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [34]:
# Delete the columns which are not influencing the target variable

del bankdata['ID']
del bankdata['ZIP Code']

In [35]:
# cols1 is variables - crating a dummy variables & cols2 variables - MinMaxScalar function

cols1 = ['Family', 'Education']
cols2 = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']

In [36]:
# Create dummy variable for all range values

bankdata = pd.get_dummies(bankdata, columns=cols1)
bankdata.head().T

Unnamed: 0,0,1,2,3,4
Age,25.0,45.0,39.0,35.0,35.0
Experience,1.0,19.0,15.0,9.0,8.0
Income,49.0,34.0,11.0,100.0,45.0
CCAvg,1.6,1.5,1.0,2.7,1.0
Mortgage,0.0,0.0,0.0,0.0,0.0
Personal Loan,0.0,0.0,0.0,0.0,0.0
Securities Account,1.0,1.0,0.0,0.0,0.0
CD Account,0.0,0.0,0.0,0.0,0.0
Online,0.0,0.0,0.0,0.0,0.0
CreditCard,0.0,0.0,0.0,0.0,1.0


In [37]:
# Identify the dependent and Target variables

IndepVar = []
for col in bankdata.columns:
    if col != 'CreditCard':
        IndepVar.append(col)

TargetVar = 'CreditCard'

x = bankdata[IndepVar]
y = bankdata[TargetVar]

In [38]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, stratify=y, random_state=142)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3500, 16), (1500, 16), (3500,), (1500,))

In [39]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols2] = mmscaler.fit_transform(x_train[cols2])
x_train = pd.DataFrame(x_train)

x_test[cols2] = mmscaler.fit_transform(x_test[cols2])
x_test = pd.DataFrame(x_test)

In [40]:
# GradientBoostingClassifier is used for classification problems

from sklearn.ensemble import GradientBoostingClassifier

modelGBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, 
                                      subsample=1.0, criterion='friedman_mse', min_samples_split=2, 
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, 
                                      min_impurity_decrease=0.0, min_impurity_split=None, init=None, 
                                      random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, 
                                      warm_start=False, validation_fraction=0.1, n_iter_no_change=None, 
                                      tol=0.0001, ccp_alpha=0.0)
# Fit the model with train data

modelGBC.fit(x_train, y_train)

# Predict the model with test data

y_pred = modelGBC.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

Confusion matrix : 
 [[  79  362]
 [  21 1038]]
Outcome values : 
 79 362 21 1038
Classification report : 
               precision    recall  f1-score   support

           1       0.79      0.18      0.29       441
           0       0.74      0.98      0.84      1059

    accuracy                           0.74      1500
   macro avg       0.77      0.58      0.57      1500
weighted avg       0.76      0.74      0.68      1500

Accuracy : 74.5 %
Precision : 79.0 %
Recall : 17.9 %
F1 Score : 0.292
Balanced Accuracy : 58.0 %
MCC : 0.291
roc_auc_score: 0.58


In [41]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

modelRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                 criterion='entropy', max_depth=None, max_features='auto',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=100,
                                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                 warm_start=False)

modelRF = modelRF.fit(x_train, y_train)

# Predict the model with test data set

y1_pred = modelRF.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y1_pred 

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))
print('-----------------------------------------------------------------------')

Confusion matrix : 
 [[116 325]
 [108 951]]
Outcome values : 
 116 325 108 951
Classification report : 
               precision    recall  f1-score   support

           1       0.52      0.26      0.35       441
           0       0.75      0.90      0.81      1059

    accuracy                           0.71      1500
   macro avg       0.63      0.58      0.58      1500
weighted avg       0.68      0.71      0.68      1500

Accuracy : 71.1 %
Precision : 51.8 %
Recall : 26.3 %
F1 Score : 0.349
Balanced Accuracy : 58.0 %
MCC : 0.206
roc_auc_score: 0.581
-----------------------------------------------------------------------


In [42]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.linear_model import LogisticRegression  

modelLR = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, 
                             intercept_scaling=1, class_weight=None, random_state=None, 
                             solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, 
                             warm_start=False, n_jobs=None, l1_ratio=None)

modelLR = modelRF.fit(x_train, y_train)

# Predict the model with test data set

y1_pred = modelLR.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y1_pred 

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))
print('-----------------------------------------------------------------------')

Confusion matrix : 
 [[116 325]
 [108 951]]
Outcome values : 
 116 325 108 951
Classification report : 
               precision    recall  f1-score   support

           1       0.52      0.26      0.35       441
           0       0.75      0.90      0.81      1059

    accuracy                           0.71      1500
   macro avg       0.63      0.58      0.58      1500
weighted avg       0.68      0.71      0.68      1500

Accuracy : 71.1 %
Precision : 51.8 %
Recall : 26.3 %
F1 Score : 0.349
Balanced Accuracy : 58.0 %
MCC : 0.206
roc_auc_score: 0.581
-----------------------------------------------------------------------


In [43]:
# Implementation of Model using RandomizedSearchCV 

import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 500, num = 10)] # returns 10 numbers 

learning_rate = [float(x) for x in np.linspace(start = 0.01 , stop = 1, num = 20)]

max_depth = [int(x) for x in np.linspace(1, 9, num = 9)] 

max_depth.append(None)

random_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}

print(random_grid)

{'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'learning_rate': [0.01, 0.06210526315789474, 0.11421052631578947, 0.16631578947368422, 0.21842105263157896, 0.2705263157894737, 0.32263157894736844, 0.37473684210526315, 0.4268421052631579, 0.4789473684210527, 0.5310526315789474, 0.5831578947368421, 0.6352631578947369, 0.6873684210526316, 0.7394736842105263, 0.791578947368421, 0.8436842105263158, 0.8957894736842106, 0.9478947368421053, 1.0], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, None]}


In [44]:
from sklearn.ensemble import GradientBoostingClassifier

modelGBC1 = GradientBoostingClassifier()

# Perform grid search - perform 2 fold cross-validation

from sklearn.model_selection import RandomizedSearchCV

modelGBC1_GS = RandomizedSearchCV(estimator = modelGBC1, param_distributions=random_grid, cv = 5, n_jobs=-1)

modelGBC1_GS.fit(x_train, y_train)

print(modelGBMC1_GS.best_params_)

{'n_estimators': 450, 'max_depth': 1, 'learning_rate': 0.9478947368421053}


In [45]:
# Predict the model with test data

y1_pred = modelGBC1_GS.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y1_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))

Confusion matrix : 
 [[  83  358]
 [  27 1032]]
Outcome values : 
 83 358 27 1032
Classification report : 
               precision    recall  f1-score   support

           1       0.75      0.19      0.30       441
           0       0.74      0.97      0.84      1059

    accuracy                           0.74      1500
   macro avg       0.75      0.58      0.57      1500
weighted avg       0.75      0.74      0.68      1500

Accuracy : 74.3 %
Precision : 75.5 %
Recall : 18.8 %
F1 Score : 0.301
Balanced Accuracy : 58.2 %
MCC : 0.284
roc_auc_score: 0.581


In [46]:
# Set the parameters for GradientBoostingClassifier

parameters = {'learning_rate': [0.01,0.1,0.3,1,10],
              'n_estimators' : [5,50,250, 500],
              'max_depth'    : [1,3,5,7,9]}

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

modelGBC = GradientBoostingClassifier()

# Perform grid search - perform 2 fold cross-validation

from sklearn.model_selection import GridSearchCV

modelGBC_GS = GridSearchCV(estimator = modelGBC, param_grid = parameters, cv = 5, n_jobs=-1)

In [49]:
# Fit the model with train data (longer time)

modelGBC_GS.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.3, 1, 10],
                         'max_depth': [1, 3, 5, 7, 9],
                         'n_estimators': [5, 50, 250, 500]})

In [51]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", modelGBC_GS.best_estimator_)
print("\n The best score across ALL searched params:\n", modelGBC_GS.best_score_)
print("\n The best parameters across ALL searched params:\n", modelGBC_GS.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingClassifier(n_estimators=50)

 The best score across ALL searched params:
 0.7417142857142857

 The best parameters across ALL searched params:
 {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [52]:
# Predict the model with test data

y_pred = modelGBC_GS.predict(x_test)

# confusion matrix in sklearn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values

actual = y_test

# predicted values

predicted = y_pred

# confusion matrix

matrix = confusion_matrix(actual,predicted, labels=[1,0],sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

# outcome values order in sklearn

tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy

matrix = classification_report(actual,predicted,labels=[1,0])

print('Classification report : \n',matrix)

# calculating the metrics

sensitivity = round(tp/(tp+fn), 3)
specificity = round(tn/(tn+fp), 3);
accuracy = round((tp+tn)/(tp+fp+tn+fn), 3);
balanced_accuracy = round((sensitivity+specificity)/2, 3);
    
precision = round(tp/(tp+fp), 3);
f1Score = round((2*tp/(2*tp + fp + fn)), 3);

# Matthews Correlation Coefficient (MCC). Range of values of MCC lie between -1 to +1. 
# A model with a score of +1 is a perfect model and -1 is a poor model

from math import sqrt

mx = (tp+fp) * (tp+fn) * (tn+fp) * (tn+fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under ROC curve 

from sklearn.metrics import roc_curve, roc_auc_score

print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))

Confusion matrix : 
 [[  77  364]
 [  17 1042]]
Outcome values : 
 77 364 17 1042
Classification report : 
               precision    recall  f1-score   support

           1       0.82      0.17      0.29       441
           0       0.74      0.98      0.85      1059

    accuracy                           0.75      1500
   macro avg       0.78      0.58      0.57      1500
weighted avg       0.76      0.75      0.68      1500

Accuracy : 74.6 %
Precision : 81.9 %
Recall : 17.5 %
F1 Score : 0.288
Balanced Accuracy : 58.0 %
MCC : 0.298
roc_auc_score: 0.579
