In [400]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


# Reading Cleveland data input

In [401]:
# Load the Cleveland data set. The first line of the file is skipped and no
# line is interpreted as a header (presumably the file ships its own header
# row -- confirm), so the column labels are assigned explicitly below.
df = pd.read_csv('cleveland_data.csv', skiprows=1, header=None)

# The last column ('condition') is the target; preprocessing() relies on the
# label being the final column.
df.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'condition',
]

# Helper functions

In [402]:
# Running log of [model name, test accuracy] pairs; print_accuracy() appends
# one entry per evaluated model.
model_accuracies = list()

In [403]:
def preprocessing(df, test_split, random_state=0):
    """Split a data frame into standardized train/test features and labels.

    Every column except the last is used as a feature and the last column is
    used as the label. The scaler is fitted on the training features only and
    then applied to the test features, so test-set statistics never influence
    the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose final column holds the target.
    test_split : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 0
        Seed forwarded to ``train_test_split``. The default reproduces the
        previously hard-coded value, so existing callers get the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature arrays
        have been standardized.
    """
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_split, random_state=random_state
    )

    # Fit on the training portion only; reuse those statistics for the test portion.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [404]:
def print_accuracy(model_name, cm_train, cm_test, y_train, y_test, model_accuracies):
    """Print train/test accuracy derived from confusion matrices and record the test score.

    Accuracy is the sum of the confusion matrix's main diagonal divided by the
    number of samples. Summing the whole diagonal (rather than only the first
    two entries) keeps the result correct for any number of classes, including
    a 1x1 matrix when only a single class is present.

    Parameters
    ----------
    model_name : str
        Label used in the printed lines and in the recorded entry.
    cm_train, cm_test : array-like, square 2-D
        Confusion matrices for the training and test predictions.
    y_train, y_test : sequence
        Label collections; only their lengths are used (as the sample counts).
    model_accuracies : list
        List that receives ``[model_name, test_accuracy]``; mutated in place.

    Returns
    -------
    list
        The same ``model_accuracies`` list that was passed in.
    """
    # Correctly classified samples sit on the main diagonal regardless of
    # how many classes the matrix covers.
    training_accuracy = np.trace(np.asarray(cm_train)) / len(y_train)
    test_accuracy = np.trace(np.asarray(cm_test)) / len(y_test)

    print(f"{model_name} training accuracy = {training_accuracy}")
    print(f"{model_name} test accuracy = {test_accuracy}")

    model_accuracies.append([model_name, test_accuracy])
    return model_accuracies

# Logistic Regression

In [405]:
# Hold out 30% of the rows for testing.
# NOTE(review): not every model section in this notebook uses the same
# hold-out fraction, so the test accuracies are not measured on identical sets.
X_train, X_test, y_train, y_test = preprocessing(df, 0.3)

model = LogisticRegression()
model.fit(X_train, y_train)

# confusion_matrix expects (y_true, y_pred): rows are true labels, columns
# are predictions. The diagonal (and so the accuracy) is the same either way,
# but the off-diagonal cells are only meaningful in this order.
pred_test = model.predict(X_test)
cm_test = confusion_matrix(y_test, pred_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(y_train, pred_train)

model_accuracies = print_accuracy("Logistic Regression", cm_train, cm_test, y_train, y_test, model_accuracies)

Logistic Regression training accuracy = 0.855072463768116
Logistic Regression test accuracy = 0.8555555555555555


# Random Forest

In [406]:
# Hold out 40% of the rows for testing.
# NOTE(review): not every model section in this notebook uses the same
# hold-out fraction, so the test accuracies are not measured on identical sets.
X_train, X_test, y_train, y_test = preprocessing(df, 0.4)

# A fixed random_state makes the forest (bootstrap samples and feature
# choices) reproducible, so re-running the cell yields the same accuracies.
model = RandomForestClassifier(n_estimators=9, random_state=0)
model.fit(X_train, y_train)

# confusion_matrix expects (y_true, y_pred): rows are true labels, columns
# are predictions. The diagonal (and so the accuracy) is the same either way,
# but the off-diagonal cells are only meaningful in this order.
pred_test = model.predict(X_test)
cm_test = confusion_matrix(y_test, pred_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(y_train, pred_train)

model_accuracies = print_accuracy("Random Forest", cm_train, cm_test, y_train, y_test, model_accuracies)

Random Forest training accuracy = 0.9775280898876404
Random Forest test accuracy = 0.7899159663865546


# SVM

In [407]:
# Hold out 20% of the rows for testing.
# NOTE(review): not every model section in this notebook uses the same
# hold-out fraction, so the test accuracies are not measured on identical sets.
X_train, X_test, y_train, y_test = preprocessing(df, 0.2)

model = SVC()
model.fit(X_train, y_train)

# confusion_matrix expects (y_true, y_pred): rows are true labels, columns
# are predictions. The diagonal (and so the accuracy) is the same either way,
# but the off-diagonal cells are only meaningful in this order.
pred_test = model.predict(X_test)
cm_test = confusion_matrix(y_test, pred_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(y_train, pred_train)

model_accuracies = print_accuracy("SVM", cm_train, cm_test, y_train, y_test, model_accuracies)

SVM training accuracy = 0.9071729957805907
SVM test accuracy = 0.85


# Naive Bayes

In [408]:
# Hold out 20% of the rows for testing.
# NOTE(review): not every model section in this notebook uses the same
# hold-out fraction, so the test accuracies are not measured on identical sets.
X_train, X_test, y_train, y_test = preprocessing(df, 0.2)

model = GaussianNB()
model.fit(X_train, y_train)

# confusion_matrix expects (y_true, y_pred): rows are true labels, columns
# are predictions. The diagonal (and so the accuracy) is the same either way,
# but the off-diagonal cells are only meaningful in this order.
pred_test = model.predict(X_test)
cm_test = confusion_matrix(y_test, pred_test)
pred_train = model.predict(X_train)
cm_train = confusion_matrix(y_train, pred_train)

# The model label is spelled "Naive Bayes" (it was previously misspelled).
model_accuracies = print_accuracy("Naive Bayes", cm_train, cm_test, y_train, y_test, model_accuracies)

Native Bayes training accuracy = 0.8565400843881856
Native Bayes test accuracy = 0.8333333333333334


# Decision Tree (TODO: not yet implemented)

# Model Accuracy Summary

In [409]:
# Dump the collected [model name, test accuracy] pairs in evaluation order.
print(f"{model_accuracies}")

[['Logistic Regression', 0.8555555555555555], ['Random Forest', 0.7899159663865546], ['SVM', 0.85], ['Native Bayes', 0.8333333333333334]]
