In [705]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Reading Cleveland data input

In [706]:
df = pd.read_csv('cleveland_data.csv', header = None, skiprows = 1)
df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope','ca', 'thal', 'condition']

# Helper functions

In [707]:
model_accuracies = []

In [708]:
def preprocessing(df,test_split):
    X = df.iloc[:,:-1].values
    y = df.iloc[:,-1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_split, random_state = 0)

    sc = StandardScaler()

    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    return X_train, X_test, y_train, y_test

In [709]:
def print_accuracy(model_name, cm_train, cm_test, y_train, y_test, model_accuracies):
    training_accuracy = (cm_train[0][0] + cm_train[1][1])/len(y_train)
    test_accuracy = (cm_test[0][0] + cm_test[1][1])/len(y_test)
    print(f"{model_name} training accuracy = {training_accuracy}")
    print(f"{model_name} test accuracy = {test_accuracy}")
    
    model_accuracies.append([model_name,test_accuracy])
    return model_accuracies

# Logistic Regression

In [710]:
X_train, X_test, y_train, y_test = preprocessing(df,0.3)

model = LogisticRegression()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("Logistic Regression",cm_train, cm_test, y_train, y_test, model_accuracies)

Logistic Regression training accuracy = 0.855072463768116
Logistic Regression test accuracy = 0.8555555555555555


# Random Forest

In [711]:
X_train, X_test, y_train, y_test = preprocessing(df,0.4)

model = RandomForestClassifier(n_estimators = 7)
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("Random Forest",cm_train, cm_test, y_train, y_test, model_accuracies)

Random Forest training accuracy = 0.9662921348314607
Random Forest test accuracy = 0.8151260504201681


# SVM

In [712]:
X_train, X_test, y_train, y_test = preprocessing(df,0.2)

model = SVC()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("SVM",cm_train, cm_test, y_train, y_test, model_accuracies)

SVM training accuracy = 0.9071729957805907
SVM test accuracy = 0.85


# Native Bayes

In [713]:
X_train, X_test, y_train, y_test = preprocessing(df,0.2)

model = GaussianNB()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("Native Bayes",cm_train, cm_test, y_train, y_test, model_accuracies)

Native Bayes training accuracy = 0.8565400843881856
Native Bayes test accuracy = 0.8333333333333334


# Decision Tree

In [714]:
X_train, X_test, y_train, y_test = preprocessing(df,0.2)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("Decision Tree",cm_train, cm_test, y_train, y_test, model_accuracies)

Decision Tree training accuracy = 1.0
Decision Tree test accuracy = 0.7


# XGBoost

In [715]:
X_train, X_test, y_train, y_test = preprocessing(df,0.3)

model = XGBClassifier()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)

pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("XGBoost",cm_train, cm_test, y_train, y_test, model_accuracies)

XGBoost training accuracy = 1.0
XGBoost test accuracy = 0.8111111111111111




# CatBoost

In [716]:
X_train, X_test, y_train, y_test = preprocessing(df,0.3)

model = CatBoostClassifier()
model.fit(X_train, y_train)
pred_test = model.predict(X_test)
cm_test = confusion_matrix(pred_test, y_test)

pred_train = model.predict(X_train)
cm_train = confusion_matrix(pred_train, y_train)

model_accuracies = print_accuracy("CatBoost",cm_train, cm_test, y_train, y_test, model_accuracies)

Learning rate set to 0.005258
0:	learn: 0.6885992	total: 490us	remaining: 490ms
1:	learn: 0.6840467	total: 1.02ms	remaining: 510ms
2:	learn: 0.6806249	total: 1.68ms	remaining: 560ms
3:	learn: 0.6768973	total: 2.42ms	remaining: 603ms
4:	learn: 0.6744085	total: 2.78ms	remaining: 553ms
5:	learn: 0.6705872	total: 3.13ms	remaining: 519ms
6:	learn: 0.6676589	total: 3.88ms	remaining: 551ms
7:	learn: 0.6639000	total: 4.28ms	remaining: 531ms
8:	learn: 0.6600509	total: 4.66ms	remaining: 513ms
9:	learn: 0.6561451	total: 5.07ms	remaining: 502ms
10:	learn: 0.6528145	total: 5.39ms	remaining: 485ms
11:	learn: 0.6486841	total: 5.67ms	remaining: 467ms
12:	learn: 0.6452873	total: 6.15ms	remaining: 467ms
13:	learn: 0.6422106	total: 6.97ms	remaining: 491ms
14:	learn: 0.6386898	total: 7.36ms	remaining: 483ms
15:	learn: 0.6344875	total: 7.85ms	remaining: 483ms
16:	learn: 0.6311988	total: 8.14ms	remaining: 471ms
17:	learn: 0.6283264	total: 8.51ms	remaining: 464ms
18:	learn: 0.6254759	total: 8.99ms	remaining:

# K-Nearest Neighbors

# Convolutional Neural Network

# Model Accuracy Summary

In [717]:
model_accuracies.sort(key=lambda x: x[1],reverse=True)
print(model_accuracies)

[['CatBoost', 0.8666666666666667], ['Logistic Regression', 0.8555555555555555], ['SVM', 0.85], ['Native Bayes', 0.8333333333333334], ['Random Forest', 0.8151260504201681], ['XGBoost', 0.8111111111111111], ['Decision Tree', 0.7]]
