Data from: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

## Predicting what influences heart disease. 

**Features:**

age: age in years 

sex: sex (1 = male; 0 = female) 

chest pain type 
- Value 1: typical angina 
- Value 2: atypical angina 
- Value 3: non-anginal pain 
- Value 4: asymptomatic 

trestbps: resting blood pressure (in mm Hg on admission to the hospital) 

chol: serum cholestoral in mg/dl 

fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 

restecg: resting electrocardiographic results 
- Value 0: normal 
- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 

thalach: maximum heart rate achieved 

exang: exercise induced angina (1 = yes; 0 = no) 

oldpeak = ST depression induced by exercise relative to rest 

slope: the slope of the peak exercise ST segment 
- Value 1: upsloping 
- Value 2: flat 
- Value 3: downsloping 

ca: number of major vessels (0-3) colored by flourosopy 

thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

num: diagnosis of heart disease (angiographic disease status) 
- Value 0: < 50% diameter narrowing 
- Value 1: > 50% diameter narrowing 


In [1]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')


# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Get Data

df = pd.read_csv('heart.csv')

train, test = train_test_split(df, test_size=0.2)

y_train = train['target']
X_train = train[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']]
y_test = test['target']
X_test = test[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']]

In [3]:
# Function that runs the requested algorithm and returns the accuracy metrics
def fit_ml_algo(algo, X_train, y_train, cv):
    
    # One Pass
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train) * 100, 2)
    
    # Cross Validation 
    train_pred = model_selection.cross_val_predict(algo, 
                                                  X_train, 
                                                  y_train, 
                                                  cv=cv, 
                                                  n_jobs = -1)
    # Cross-validation accuracy metric
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    
    return train_pred, acc, acc_cv


# Random Forest Classifier ----------------------------------------------------------------

train_pred_log, acc_log, acc_cv_rfor = fit_ml_algo(RandomForestClassifier(), 
                                                               X_train, 
                                                               y_train, 
                                                                    10)

# Logistic Regression ---------------------------------------------------------------------

train_pred_log, acc_log, acc_cv_log = fit_ml_algo(LogisticRegression(), 
                                                               X_train, 
                                                               y_train, 
                                                                    10)


# K-Nearest Neighbours --------------------------------------------------------------------

train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(KNeighborsClassifier(), 
                                                  X_train, 
                                                  y_train, 
                                                  10)


# Gaussian Naive Bayes --------------------------------------------------------------------

train_pred_gaussian, acc_gaussian, acc_cv_gaussian = fit_ml_algo(GaussianNB(), 
                                                                      X_train, 
                                                                      y_train, 
                                                                           10)

# Linear SVC --------------------------------------------------------------------------------

train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(LinearSVC(),
                                                                X_train, 
                                                                y_train, 
                                                                10)

# Stochastic Gradient Descent ----------------------------------------------------------------

train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(), 
                                                  X_train, 
                                                  y_train,
                                                  10)

# Decision Tree Classifier ------------------------------------------------------------------

train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(DecisionTreeClassifier(), 
                                                                X_train, 
                                                                y_train,
                                                                10)

# Gradient Boosting Trees -------------------------------------------------------------------

train_pred_gbt, acc_gbt, acc_cv_gbt = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       X_train, 
                                                                       y_train,
                                                                       10)

In [4]:
# Feature Importance

feat_labels = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

('age', 0.08614291532827466)
('sex', 0.037749812472126935)
('cp', 0.1252193974019389)
('trestbps', 0.07969008033166816)
('chol', 0.08624944148100525)
('fbs', 0.009507971129407771)
('restecg', 0.021624204458213617)
('thalach', 0.10698081672356102)
('exang', 0.0378923180386257)
('oldpeak', 0.12408554923993322)
('slope', 0.0495576402495387)
('ca', 0.12810065420957886)
('thal', 0.10719919893612637)


In [5]:
# Accuracy Scores --------------------------------------------------------------------------

models = pd.DataFrame({
    'Model': ['RandomForestClassifier', 'KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_cv_rfor,
        acc_knn, 
        acc_log,  
        acc_gaussian, 
        acc_sgd, 
        acc_linear_svc, 
        acc_dt,
        acc_gbt
    ]})
print("---Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

---Accuracy Scores---


Unnamed: 0,Model,Score
6,Decision Tree,100.0
7,Gradient Boosting Trees,99.59
2,Logistic Regression,85.54
3,Naive Bayes,83.06
0,RandomForestClassifier,82.64
1,KNN,78.1
5,Linear SVC,76.86
4,Stochastic Gradient Decent,66.94


In [6]:
# Cross-Validation Accuracy Scores ------------------------------------------------------------------

cv_models = pd.DataFrame({
    'Model': ['RandomForestClassifier', 'KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_cv_rfor,
        acc_cv_knn, 
        acc_cv_log,      
        acc_cv_gaussian, 
        acc_cv_sgd, 
        acc_cv_linear_svc, 
        acc_cv_dt,
        acc_cv_gbt
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='Score', ascending=False)

---Cross-validation Accuracy Scores---


Unnamed: 0,Model,Score
2,Logistic Regression,83.88
0,RandomForestClassifier,82.64
3,Naive Bayes,81.4
7,Gradient Boosting Trees,80.99
5,Linear SVC,74.38
6,Decision Tree,72.31
1,KNN,62.81
4,Stochastic Gradient Decent,59.09
