In [2]:
# Importing all packages required to perform various classifications

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

clevelanddata = pd.read_csv('cleveland_cleaned.csv') 
statlogdata = pd.read_csv('statlog_cleaned.csv')
combineddata = pd.read_csv('combined.csv')

***CLEVELAND:***

In [3]:
# Cleveland Dataset: checking the distribution of heart disease variable where 1 means absence and 2 means presence of heart disease
clevelanddata['absence or presence of heart disease'].value_counts()

1    160
2    137
Name: absence or presence of heart disease, dtype: int64

In [4]:
# splitting the features and the targets for cleveland
X = clevelanddata.drop(columns='absence or presence of heart disease', axis=1)
Y = clevelanddata['absence or presence of heart disease']

In [5]:
# check
print(Y)

0      1
1      2
2      2
3      1
4      1
      ..
292    2
293    2
294    2
295    2
296    2
Name: absence or presence of heart disease, Length: 297, dtype: int64


In [6]:
print(X)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
292  57.0  0.0  4.0     140.0  241.0  0.0      0.0    123.0    1.0      0.2   
293  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
294  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
295  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
296  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   

     slope   ca  thal  
0      3.0  0.0   6.0  
1  

In [7]:
# split to training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(X.shape, X_train.shape, X_test.shape)

(297, 13) (237, 13) (60, 13)


In [8]:
# LOGISTIC REGRESSION
model = LogisticRegression(max_iter=1000)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("LR - Accuracy on Training data: ", training_data_accuracy)
print("LR - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("LR - Accuracy on Testing data: ", testing_data_accuracy)
print("LR - F1 score on Testing data: ", testing_data_f1_score)

LR - Accuracy on Training data:  0.8270042194092827
LR - F1 score on Training data:  0.8487084870848707
LR - Accuracy on Testing data:  0.9166666666666666
LR - F1 score on Testing data:  0.912280701754386


In [9]:
# SVM
# define the SVM model with best parameters found via GridSearch
svm_model = SVC(kernel='linear', C=10, class_weight='balanced')

# training the model with training data
svm_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = svm_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("SVM - Accuracy on Training data: ", training_data_accuracy)
print("SVM - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = svm_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("SVM - Accuracy on Testing data: ", testing_data_accuracy)
print("SVM - F1 score on Testing data: ", testing_data_f1_score)

SVM - Accuracy on Training data:  0.8312236286919831
SVM - F1 score on Training data:  0.849624060150376
SVM - Accuracy on Testing data:  0.8833333333333333
SVM - F1 score on Testing data:  0.8771929824561403


In [10]:
# NAIVE BAYES
# define the Naive Bayes model with best hyperparameters found via GridSearch
nb_model = GaussianNB(var_smoothing=1e-09)

# training the model with training data
nb_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = nb_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("NB - Accuracy on Training data: ", training_data_accuracy)
print("NB - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = nb_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("NB - Accuracy on Testing data: ", testing_data_accuracy)
print("NB - F1 score on Testing data: ", testing_data_f1_score)

NB - Accuracy on Training data:  0.8438818565400844
NB - F1 score on Training data:  0.8634686346863468
NB - Accuracy on Testing data:  0.9166666666666666
NB - F1 score on Testing data:  0.912280701754386


In [11]:
# ANN
# define the ANN model with best hyperparameters found via GridSearch
model = MLPClassifier(max_iter=10000,hidden_layer_sizes=(15, 10), activation='logistic', alpha=0.01)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("ANN - Accuracy on Training data: ", training_data_accuracy)
print("ANN - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("ANN - Accuracy on Testing data: ", testing_data_accuracy)
print("ANN - F1 score on Testing data: ", testing_data_f1_score)

ANN - Accuracy on Training data:  0.8607594936708861
ANN - F1 score on Training data:  0.8754716981132077
ANN - Accuracy on Testing data:  0.8833333333333333
ANN - F1 score on Testing data:  0.8813559322033899


In [12]:
# RANDOM FOREST
# define the RF model with best hyperparameters
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("RF - Accuracy on Training data: ", training_data_accuracy)
print("RF - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("RF - Accuracy on Testing data: ", testing_data_accuracy)
print("RF - F1 score on Testing data: ", testing_data_f1_score)

RF - Accuracy on Training data:  0.9873417721518988
RF - F1 score on Training data:  0.9886792452830188
RF - Accuracy on Testing data:  0.9166666666666666
RF - F1 score on Testing data:  0.9152542372881356


In [13]:
# KNN
# Performing k-Nearest Neighbors model best hyperparameters found via GridSearch
model = KNeighborsClassifier(n_neighbors=11, p=2, weights='uniform')

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)

training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print("k-NN - Accuracy on Training data: ", training_data_accuracy)

training_f1_score = f1_score(Y_train, X_train_prediction, average='weighted')
print("k-NN - F1 score on Training data: ", training_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)

testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print("k-NN - Accuracy on Testing data: ", testing_data_accuracy)

testing_f1_score = f1_score(Y_test, X_test_prediction, average='weighted')
print("k-NN - F1 score on Testing data: ", testing_f1_score)

k-NN - Accuracy on Training data:  0.7130801687763713
k-NN - F1 score on Training data:  0.7106230160715576
k-NN - Accuracy on Testing data:  0.65
k-NN - F1 score on Testing data:  0.6495137538205057


***STATLOG:***

In [14]:
# Statlog Dataset: checking the distribution of heart disease variable where 1 means absence and 2 means presence of heart disease
statlogdata['absence or presence of heart disease'].value_counts()

1    150
2    120
Name: absence or presence of heart disease, dtype: int64

In [15]:
# splitting the features and the targets for statlog
X = statlogdata.drop(columns='absence or presence of heart disease', axis=1)
Y = statlogdata['absence or presence of heart disease']

In [16]:
# check
print(Y)
print(X)

0      2
1      1
2      2
3      1
4      1
      ..
265    1
266    1
267    1
268    1
269    2
Name: absence or presence of heart disease, Length: 270, dtype: int64
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1    67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0      1.6   
2    57.0  1.0  2.0     124.0  261.0  0.0      0.0    141.0    0.0      0.3   
3    64.0  1.0  4.0     128.0  263.0  0.0      0.0    105.0    1.0      0.2   
4    74.0  0.0  2.0     120.0  269.0  0.0      2.0    121.0    1.0      0.2   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
265  52.0  1.0  3.0     172.0  199.0  1.0      0.0    162.0    0.0      0.5   
266  44.0  1.0  2.0     120.0  263.0  0.0      0.0    173.0    0.0      0.0   
267  56.0  0.0  2.0     140.0  294.0  0.0      2.0    153.0    0.0      1.3   
268  57.0  1.0  4.0     140.0  192.0  0.0

In [17]:
# split to training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(X.shape, X_train.shape, X_test.shape)

(270, 13) (216, 13) (54, 13)


In [18]:
# LOGISTIC REGRESSION
model = LogisticRegression(max_iter=1000)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("LR - Accuracy on Training data: ", training_data_accuracy)
print("LR - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("LR - Accuracy on Testing data: ", testing_data_accuracy)
print("LR - F1 score on Testing data: ", testing_data_f1_score)

LR - Accuracy on Training data:  0.8564814814814815
LR - F1 score on Training data:  0.8724279835390946
LR - Accuracy on Testing data:  0.8518518518518519
LR - F1 score on Testing data:  0.8823529411764706


In [19]:
# SVM
# define the SVM model with best parameters found via GridSearch
svm_model = SVC(kernel='linear', C=10, class_weight='balanced')

# training the model with training data
svm_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = svm_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("SVM - Accuracy on Training data: ", training_data_accuracy)
print("SVM - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = svm_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("SVM - Accuracy on Testing data: ", testing_data_accuracy)
print("SVM - F1 score on Testing data: ", testing_data_f1_score)

SVM - Accuracy on Training data:  0.8703703703703703
SVM - F1 score on Training data:  0.8823529411764707
SVM - Accuracy on Testing data:  0.7962962962962963
SVM - F1 score on Testing data:  0.8358208955223881


In [20]:
# NAIVE BAYES
# define the Naive Bayes model with best hyperparameters found via GridSearch
nb_model = GaussianNB(var_smoothing=1e-09)

# training the model with training data
nb_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = nb_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("NB - Accuracy on Training data: ", training_data_accuracy)
print("NB - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = nb_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("NB - Accuracy on Testing data: ", testing_data_accuracy)
print("NB - F1 score on Testing data: ", testing_data_f1_score)

NB - Accuracy on Training data:  0.8564814814814815
NB - F1 score on Training data:  0.8680851063829789
NB - Accuracy on Testing data:  0.8703703703703703
NB - F1 score on Testing data:  0.8955223880597014


In [21]:
# ANN
# define the ANN model with best hyperparameters found via GridSearch
model = MLPClassifier(max_iter=10000,hidden_layer_sizes=(15, 10), activation='logistic', alpha=0.01)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("ANN - Accuracy on Training data: ", training_data_accuracy)
print("ANN - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("ANN - Accuracy on Testing data: ", testing_data_accuracy)
print("ANN - F1 score on Testing data: ", testing_data_f1_score)

ANN - Accuracy on Training data:  0.8472222222222222
ANN - F1 score on Training data:  0.8663967611336033
ANN - Accuracy on Testing data:  0.8518518518518519
ANN - F1 score on Testing data:  0.8787878787878787


In [22]:
# RANDOM FOREST
# define the RF model with best hyperparameters
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("RF - Accuracy on Training data: ", training_data_accuracy)
print("RF - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("RF - Accuracy on Testing data: ", testing_data_accuracy)
print("RF - F1 score on Testing data: ", testing_data_f1_score)

RF - Accuracy on Training data:  0.9861111111111112
RF - F1 score on Training data:  0.9873417721518987
RF - Accuracy on Testing data:  0.8333333333333334
RF - F1 score on Testing data:  0.8615384615384615


In [23]:
# KNN
# Performing k-Nearest Neighbors model best hyperparameters found via GridSearch
model = KNeighborsClassifier(n_neighbors=11, p=2, weights='uniform')

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)

training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print("k-NN - Accuracy on Training data: ", training_data_accuracy)

training_f1_score = f1_score(Y_train, X_train_prediction, average='weighted')
print("k-NN - F1 score on Training data: ", training_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)

testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print("k-NN - Accuracy on Testing data: ", testing_data_accuracy)

testing_f1_score = f1_score(Y_test, X_test_prediction, average='weighted')
print("k-NN - F1 score on Testing data: ", testing_f1_score)

k-NN - Accuracy on Training data:  0.7222222222222222
k-NN - F1 score on Training data:  0.7204009674505348
k-NN - Accuracy on Testing data:  0.6481481481481481
k-NN - F1 score on Testing data:  0.6507253678512837


***COMBINED:***

In [24]:
# Combined Dataset: checking the distribution of heart disease variable where 1 means absence and 2 means presence of heart disease
combineddata['absence or presence of heart disease'].value_counts()

1    310
2    257
Name: absence or presence of heart disease, dtype: int64

In [25]:
# splitting the features and the targets
X = combineddata.drop(columns='absence or presence of heart disease', axis=1)
Y = combineddata['absence or presence of heart disease']

In [26]:
# check
print(Y)
print(X)

0      2
1      1
2      2
3      1
4      2
      ..
562    2
563    2
564    2
565    1
566    1
Name: absence or presence of heart disease, Length: 567, dtype: int64
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    62.0  0.0  4.0     140.0  268.0  0.0      2.0    160.0    0.0      3.6   
1    58.0  0.0  1.0     150.0  283.0  1.0      2.0    162.0    0.0      1.0   
2    46.0  1.0  4.0     120.0  249.0  0.0      2.0    144.0    0.0      0.8   
3    52.0  1.0  1.0     118.0  186.0  0.0      2.0    190.0    0.0      0.0   
4    35.0  1.0  4.0     126.0  282.0  0.0      2.0    156.0    1.0      0.0   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
562  67.0  1.0  4.0     125.0  254.0  1.0      0.0    163.0    0.0      0.2   
563  57.0  1.0  3.0     128.0  229.0  0.0      2.0    150.0    0.0      0.4   
564  59.0  1.0  1.0     134.0  204.0  0.0      0.0    162.0    0.0      0.8   
565  34.0  0.0  2.0     118.0  210.0  0.0

In [27]:
# split to training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(X.shape, X_train.shape, X_test.shape)

(567, 13) (453, 13) (114, 13)


In [28]:
# LOGISTIC REGRESSION
model = LogisticRegression(max_iter=1000)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("LR - Accuracy on Training data: ", training_data_accuracy)
print("LR - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("LR - Accuracy on Testing data: ", testing_data_accuracy)
print("LR - F1 score on Testing data: ", testing_data_f1_score)

LR - Accuracy on Training data:  0.8432671081677704
LR - F1 score on Training data:  0.8637236084452976
LR - Accuracy on Testing data:  0.8771929824561403
LR - F1 score on Testing data:  0.8703703703703703


In [29]:
# SVM
# define the SVM model with best parameters found via GridSearch
svm_model = SVC(kernel='linear', C=10, class_weight='balanced')

# training the model with training data
svm_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = svm_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("SVM - Accuracy on Training data: ", training_data_accuracy)
print("SVM - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = svm_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("SVM - Accuracy on Testing data: ", testing_data_accuracy)
print("SVM - F1 score on Testing data: ", testing_data_f1_score)

SVM - Accuracy on Training data:  0.8543046357615894
SVM - F1 score on Training data:  0.8715953307392996
SVM - Accuracy on Testing data:  0.8771929824561403
SVM - F1 score on Testing data:  0.8653846153846154


In [30]:
# NAIVE BAYES
# define the Naive Bayes model with best hyperparameters found via GridSearch
nb_model = GaussianNB(var_smoothing=1e-09)

# training the model with training data
nb_model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = nb_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("NB - Accuracy on Training data: ", training_data_accuracy)
print("NB - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = nb_model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("NB - Accuracy on Testing data: ", testing_data_accuracy)
print("NB - F1 score on Testing data: ", testing_data_f1_score)

NB - Accuracy on Training data:  0.8653421633554084
NB - F1 score on Training data:  0.8820116054158608
NB - Accuracy on Testing data:  0.8596491228070176
NB - F1 score on Testing data:  0.8571428571428571


In [32]:
# ANN
# define the ANN model with best hyperparameters found via GridSearch
model = MLPClassifier(max_iter=10000,hidden_layer_sizes=(15, 10), activation='logistic', alpha=0.01)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("ANN - Accuracy on Training data: ", training_data_accuracy)
print("ANN - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("ANN - Accuracy on Testing data: ", testing_data_accuracy)
print("ANN - F1 score on Testing data: ", testing_data_f1_score)

ANN - Accuracy on Training data:  0.8631346578366446
ANN - F1 score on Training data:  0.8812260536398469
ANN - Accuracy on Testing data:  0.8596491228070176
ANN - F1 score on Testing data:  0.8518518518518519


In [34]:
# RANDOM FOREST
# define the RF model with best hyperparameters
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5)

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_f1_score = f1_score(Y_train, X_train_prediction)

print("RF - Accuracy on Training data: ", training_data_accuracy)
print("RF - F1 score on Training data: ", training_data_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_f1_score = f1_score(Y_test, X_test_prediction)

print("RF - Accuracy on Testing data: ", testing_data_accuracy)
print("RF - F1 score on Testing data: ", testing_data_f1_score)

RF - Accuracy on Training data:  0.9977924944812362
RF - F1 score on Training data:  0.9980353634577603
RF - Accuracy on Testing data:  0.9210526315789473
RF - F1 score on Testing data:  0.9174311926605504


In [37]:
# KNN
# Performing k-Nearest Neighbors model best hyperparameters found via GridSearch
model = KNeighborsClassifier(n_neighbors=11, p=2, weights='uniform')

# training the model with training data
model.fit(X_train, Y_train)

# accuracy and F1 score on training data
X_train_prediction = model.predict(X_train)

training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print("k-NN - Accuracy on Training data: ", training_data_accuracy)

training_f1_score = f1_score(Y_train, X_train_prediction, average='weighted')
print("k-NN - F1 score on Training data: ", training_f1_score)

# accuracy and F1 score on testing data
X_test_prediction = model.predict(X_test)

testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print("k-NN - Accuracy on Testing data: ", testing_data_accuracy)

testing_f1_score = f1_score(Y_test, X_test_prediction, average='weighted')
print("k-NN - F1 score on Testing data: ", testing_f1_score)

k-NN - Accuracy on Training data:  0.7527593818984547
k-NN - F1 score on Training data:  0.7500690568991449
k-NN - Accuracy on Testing data:  0.7280701754385965
k-NN - F1 score on Testing data:  0.7279655468027514
