## The notebook explores and compares different classification models for a heart-disease dataset.

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from mlxtend.plotting import plot_decision_regions
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [34]:
# Load the heart-disease dataset from the working directory and preview its first rows.
data = pd.read_csv("./heart.csv")
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [35]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [36]:
# Number of missing values in each column.
data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


No column in the dataset contains missing values.

In [37]:
# Predictor columns fed to the models, and the name of the label column.
# NOTE(review): 'sex' is a column of the dataset but is not listed here — confirm the omission is intentional.
features = ['age', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
target = 'target'

In [38]:
# Feature matrix (selected predictor columns) and label vector taken from the loaded frame.
X = data[features]
y = data[target]

In [39]:
def get_k_fold_splits(k, random_state, X, y):
  """Produce shuffled K-fold index splits over the rows of ``X``.

  Args:
    k: Number of folds.
    random_state: Seed passed to ``KFold`` so the shuffle is reproducible.
    X: Samples to split.
    y: Accepted for signature symmetry; not used by this function.

  Returns:
    The iterable returned by ``KFold.split(X)``; each item unpacks into a
    ``(train_index, test_index)`` pair.
  """
  return KFold(n_splits=k, shuffle=True, random_state=random_state).split(X)

def get_k_fold_cv(X, y, k, random_state):
  """Run K-fold cross-validation for every classifier and print the results.

  For each fold the data is split by position, all classifiers are fitted
  through ``build_classifier``, and ``get_results`` prints the confusion
  matrix and accuracy of each model. After the last fold the mean accuracy
  of every model across the folds is printed.

  Args:
    X: Feature DataFrame (indexed positionally with ``.iloc``).
    y: Label Series aligned with ``X``.
    k: Number of folds.
    random_state: Seed forwarded to the fold splitter and ``build_classifier``.

  Returns:
    None. All results are written to stdout.
  """
  # One row per model: (per-fold heading, results column, summary-line prefix).
  model_table = (
    ("Logistic Regression Results: ", "logistic_pred",
     "Mean logistic regression classifier accuracy score: "),
    ("KNN Results: ", "knn_pred",
     "Mean KNN classifier accuracy score: "),
    ("SVM Results: ", "svm_pred",
     "Mean SVM classifier accuracy score: "),
    ("Naive Bayes Results: ", "nb_pred",
     "Mean Naive Bayes classifier accuracy score: "),
    ("Decision Tree Results: ", "dt_pred",
     "Mean Decision Tree classifier accuracy score: "),
    ("Random Forest Results: ", "rf_pred",
     "Mean Random Forest classifier accuracy score: "),
  )
  folds = get_k_fold_splits(k, random_state, X, y)
  scores_by_model = {column: [] for _, column, _ in model_table}

  for fold_number, (train_idx, test_idx) in enumerate(folds):
    print(f"Fold: {fold_number}")
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    fold_results = build_classifier(X_train, X_test, y_train, y_test, random_state)
    for heading, column, _ in model_table:
      print(heading)
      fold_accuracy = get_results(fold_results["actual"], fold_results[column])
      scores_by_model[column].append(fold_accuracy)
    print("\n")

  # Plain arithmetic mean of the per-fold accuracies for each model.
  mean_by_model = {
    column: sum(fold_scores) / len(fold_scores)
    for column, fold_scores in scores_by_model.items()
  }

  print("\n")
  for _, column, summary_prefix in model_table:
    print(summary_prefix + str(mean_by_model[column]))

def build_classifier(X_train, X_test, y_train, y_test, random_state):
    """Scale the continuous features, fit every classifier, and collect predictions.

    The scaler is fitted on the training fold only and then applied to the
    test fold, so no test-fold statistics leak into the transformation.

    Args:
        X_train: Training-fold feature DataFrame.
        X_test: Test-fold feature DataFrame.
        y_train: Training-fold labels.
        y_test: Test-fold labels; returned as the ``actual`` column.
        random_state: Accepted for interface compatibility; currently not
            forwarded to the individual model builders.

    Returns:
        A DataFrame with the column ``actual`` plus one prediction column per
        model: ``logistic_pred``, ``knn_pred``, ``svm_pred``, ``nb_pred``,
        ``dt_pred`` and ``rf_pred``.
    """
    scaled_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

    # Scale private copies: the inputs are slices handed in by the caller, and
    # writing into them would mutate the caller's frames (and is what raises
    # pandas' SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    sc = StandardScaler()
    X_train[scaled_columns] = sc.fit_transform(X_train[scaled_columns])
    X_test[scaled_columns] = sc.transform(X_test[scaled_columns])

    logistic_predictions = build_logistic_regression(X_train, X_test, y_train)
    knn_predictions = build_knn_classifier(X_train, X_test, y_train)
    svm_predictions = build_svm_classifier(X_train, X_test, y_train)
    nb_predictions = build_nb_classifier(X_train, X_test, y_train)
    dt_predictions = build_decision_tree_classifier(X_train, X_test, y_train)
    rf_predictions = build_random_forest_classifier(X_train, X_test, y_train)

    results_columns = {'actual': y_test, 'logistic_pred': logistic_predictions, 'knn_pred': knn_predictions, 'svm_pred': svm_predictions,
                      'nb_pred': nb_predictions, 'dt_pred': dt_predictions, 'rf_pred': rf_predictions}
    return pd.DataFrame(results_columns)

def build_logistic_regression(X_train, X_test, y_train):
    """Fit a logistic-regression model and predict labels for ``X_test``.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = LogisticRegression(random_state=0)
    return model.fit(X_train, y_train).predict(X_test)

def build_knn_classifier(X_train, X_test, y_train):
    """Fit a 5-nearest-neighbours classifier and predict labels for ``X_test``.

    The Minkowski metric with ``p = 2`` is used.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    return model.fit(X_train, y_train).predict(X_test)

def build_svm_classifier(X_train, X_test, y_train):
    """Fit a linear-kernel support vector classifier and predict labels for ``X_test``.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = SVC(kernel='linear', random_state=0)
    return model.fit(X_train, y_train).predict(X_test)

def build_nb_classifier(X_train, X_test, y_train):
    """Fit a Gaussian naive Bayes classifier and predict labels for ``X_test``.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = GaussianNB()
    return model.fit(X_train, y_train).predict(X_test)

def build_decision_tree_classifier(X_train, X_test, y_train):
    """Fit an entropy-criterion decision tree and predict labels for ``X_test``.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    return model.fit(X_train, y_train).predict(X_test)

def build_random_forest_classifier(X_train, X_test, y_train):
    """Fit a 10-tree, entropy-criterion random forest and predict labels for ``X_test``.

    Args:
        X_train: Training features.
        X_test: Features to predict on.
        y_train: Training labels.

    Returns:
        The predicted labels for ``X_test``.
    """
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    return model.fit(X_train, y_train).predict(X_test)

def get_results(y_test, y_pred):
    """Print the confusion matrix and accuracy for one set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        The accuracy score computed by ``accuracy_score(y_test, y_pred)``.
    """
    matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    # Both metrics are computed before anything is printed; each item goes on its own line.
    for item in ("Confusion Matrix: ", matrix, "Accuracy Score: ", accuracy, "\n"):
        print(item)
    return accuracy

In [40]:
# Run the cross-validated comparison of all classifiers: 5 folds, shuffle seeded with 42.
number_of_folds = 5
state = 42
get_k_fold_cv(X, y, number_of_folds, state)


Fold: 0
Logistic Regression Results: 
Confusion Matrix: 
[[70 32]
 [12 91]]
Accuracy Score: 
0.7853658536585366


KNN Results: 
Confusion Matrix: 
[[76 26]
 [18 85]]
Accuracy Score: 
0.7853658536585366


SVM Results: 
Confusion Matrix: 
[[70 32]
 [12 91]]
Accuracy Score: 
0.7853658536585366


Naive Bayes Results: 
Confusion Matrix: 
[[71 31]
 [19 84]]
Accuracy Score: 
0.7560975609756098


Decision Tree Results: 
Confusion Matrix: 
[[102   0]
 [  3 100]]
Accuracy Score: 
0.9853658536585366


Random Forest Results: 
Confusion Matrix: 
[[ 99   3]
 [  3 100]]
Accuracy Score: 
0.9707317073170731




Fold: 1
Logistic Regression Results: 
Confusion Matrix: 
[[86 14]
 [12 93]]
Accuracy Score: 
0.8731707317073171


KNN Results: 
Confusion Matrix: 
[[93  7]
 [16 89]]
Accuracy Score: 
0.8878048780487805


SVM Results: 
Confusion Matrix: 
[[84 16]
 [12 93]]
Accuracy Score: 
0.8634146341463415


Naive Bayes Results: 
Confusion Matrix: 
[[83 17]
 [15 90]]
Accuracy Score: 
0.8439024390243902


Decisi

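### The Decision Tree and Random Forest classifiers achieve higher accuracy scores than the other classification models

Caveat: near-perfect accuracy from tree-based models can be a sign that duplicate rows are shared between the training and test folds; checking `data.duplicated().sum()` is advisable before relying on this ranking.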