In [48]:
#Importing Libraries
import numpy as np #To handle Mathematical calculations
import matplotlib.pyplot as plt #To plot charts 
import pandas as pd #TO import and manage datasets
import glob
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [49]:
#Importing datasets

training_path = 'Datasets/Training Data.csv'
training_data = pd.read_csv(training_path)

test_path = 'Datasets/Test Data.csv'
test_data = pd.read_csv(test_path)

In [50]:
#Listing Data types

training_data.dtypes

ID        int64
A1        int64
A2        int64
A3        int64
A4        int64
A5        int64
A6       object
A7        int64
A8        int64
A9        int64
Class     int64
dtype: object

In [87]:
#Removing rows with '?' (Data cleaning)

training_data = training_data[training_data['A6'] != '?']
test_data = test_data[test_data['A6'] != '?']

In [88]:
#Splitting datasets into test and training data

y_train = training_data.iloc[:,10]
X_train = training_data.iloc[:,1:10]

y_test = test_data.iloc[:,10]
X_test = test_data.iloc[:,1:10]

In [89]:
from sklearn.model_selection import cross_val_score

In [90]:
#Trying out with Logistic Regression

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(cross_val_score(logreg, X_train, y_train, cv = 10, scoring='accuracy').mean()))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(cross_val_score(logreg, X_test, y_test, cv = 10, scoring='accuracy').mean()))

Accuracy of Logistic regression classifier on training set: 0.96
Accuracy of Logistic regression classifier on test set: 0.96


In [91]:
#Trying out with Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(cross_val_score(dtc, X_train, y_train, cv = 10, scoring='accuracy').mean()))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(cross_val_score(dtc, X_train, y_train, cv = 10, scoring='accuracy').mean()))

Accuracy of Decision Tree classifier on training set: 0.94
Accuracy of Decision Tree classifier on test set: 0.94


In [92]:
#Hence Logistic Regression is performing extremely well

In [93]:
#Confusion Matrix

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
y_pred = logreg.predict(X_test)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[299   8]
 [  7 166]]


In [94]:
#Testing accuracy using k-folds cross validation

accuracy = cross_val_score(logreg, X_test, y_test, cv = 10, scoring='accuracy').mean()
print('Accuracy: ' + str(accuracy))

Accuracy: 0.960668150238819


In [95]:
#Testing precision

from sklearn.metrics import precision_score
print('Precision: ' + str(precision_score(y_test, y_pred, average='macro')))

Precision: 0.9655735857561415


In [96]:
#More traditional approach of using Confusion matrix to find all the needed parameters

TP = confusion_matrix(y_test, y_pred)[1][1]
TN = confusion_matrix(y_test, y_pred)[0][0]
FP = confusion_matrix(y_test, y_pred)[0][1]
FN = confusion_matrix(y_test, y_pred)[1][0]

print('True Positives:', TP)
print('True Negatives:', TN)
print('False Positives:', FP)
print('False Negatives:', FN)

# calculate accuracy
conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))

# calculate the sensitivity
conf_sensitivity = (TP / float(TP + FN))

# calculate the specificity
conf_specificity = (TN / float(TN + FP))

# calculate precision
conf_precision = (TN / float(TN + FP))

print('-'*50)

print(f'Accuracy: {round(conf_accuracy,2)}') 
print(f'Sensitivity: {round(conf_sensitivity,2)}') 
print(f'Specificity: {round(conf_specificity,2)}') 
print(f'Precision: {round(conf_precision,2)}')

True Positives: 166
True Negatives: 299
False Positives: 8
False Negatives: 7
--------------------------------------------------
Accuracy: 0.97
Sensitivity: 0.96
Specificity: 0.97
Precision: 0.97
