# Using Python and Machine Learning Algorithms within Tableau: Heart Disease

UCI Machine Learning Repository link:

https://archive.ics.uci.edu/ml/datasets/Heart+Disease
    
Reference code used for deploying functions to Tableau via TabPy:

https://www.tableau.com/about/blog/2017/1/building-advanced-analytics-applications-tabpy-64916

In [None]:
# Import necessary packages into Python
import math
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold 
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
import tabpy_client
%pylab inline

### Data pre-processing: drop nulls, examine class attribute

In [None]:
# Load the headerless data file, naming the 14 columns explicitly.
# The file marks missing values with '?'; declaring that to the parser turns
# them into NaN while reading, so the affected columns are parsed as numbers
# instead of being left as strings.
hd = pd.read_csv('./processed.cleveland.data.csv',
                 names=["age", "sex", "chest_pain", "resting_bp", "chol", "fbs", "restecg",
                        "thalach", "exang", "oldpeak", "slope", "ca", "thal", "diagnosis"],
                 na_values='?')
# Function-call form of print with %-formatting, as used in the rest of this
# file; the output text is identical to the former print statement.
print('The size of the file, before nulls dropped, is:  %s' % (hd.shape,))

# Drop every row that contains a missing value
hd = hd.dropna()
def diagnosis(row):
    """Return the two-class label for one record.

    A 'diagnosis' value above 0 (i.e. 1-4) is labelled 'risk'; any other
    value is labelled 'healthy'.
    """
    return 'risk' if row['diagnosis'] > 0 else 'healthy'
# Replace the numeric diagnosis with its 'healthy' / 'risk' label, row by row
hd['diagnosis'] = hd.apply(diagnosis, axis=1)
# Function-call form of print with %-formatting, as used in the rest of this
# file; the output text is identical to the former print statement.
print('The size of the file, after nulls dropped, is:  %s' % (hd.shape,))
#hd.to_csv('./cleveland_data_for tableau.csv', index=False)
hd.head()

In [None]:
# Summary statistics of the columns, computed separately for each diagnosis label
by_diagnosis = hd.groupby('diagnosis')
by_diagnosis.describe()

In [None]:
# The class attribute holds the text labels 'healthy' / 'risk', so convert it
# to numeric codes with a label encoder. LabelEncoder numbers the sorted
# labels, i.e. 'healthy' -> 0 and 'risk' -> 1. The fitted encoder is kept so
# predictions can be mapped back to text later (see HDDiagnosis).
encoder = preprocessing.LabelEncoder()
encoder.fit(hd['diagnosis'])
hd['diagnosis'] = encoder.transform(hd['diagnosis'])
hd.head()

In [None]:
# Split data into X (every column except the class attribute) and y (the class attribute)
# `axis` is passed by keyword: the positional form of DataFrame.drop is
# deprecated, and the keyword makes it explicit that a column is dropped.
X = np.array(hd.drop(['diagnosis'], axis=1))
y = np.array(hd['diagnosis'])

Support Vector Machine reference:

http://scikit-learn.org/stable/modules/svm.html

To choose which evaluation metric to optimize via the `scoring` parameter:

http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [None]:
# Standardize the features. The estimators used below assume that all features
# are centered around 0 and have variance of the same order; a feature whose
# variance is orders of magnitude larger than the others might dominate the
# objective function and keep the estimator from learning from the other
# features as expected.
# StandardScaler requires that any nulls were removed from the data set beforehand.
# NOTE(review): the scaler is fitted on all of X before cross-validation, so
# every fold sees statistics computed from its own held-out rows.
scalar = preprocessing.StandardScaler()
scalar.fit(X)
X = scalar.transform(X)

# 10 fold stratified cross validation, with shuffling
kf = StratifiedKFold(y, shuffle=True, n_folds=10, random_state=None)

# Parameter grid for tuning the Support Vector Machine: one sub-grid per kernel
parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

# Performance measures to tune for
scoringmethods = ['f1', 'accuracy', 'precision', 'recall', 'roc_auc']
# scoringmethods = ['f1_weighted', 'accuracy', 'precision_weighted', 'recall_weighted']

In [None]:
# Iterate through different metrics looking for best parameter set
for score in scoringmethods:
    print("~~~ Hyper-parameter tuning for best %s ~~~" % score)

    # Setup for grid search with cross-validation for Support Vector Machine
    # n_jobs=-1 for parallel execution using all available cores
    svmclf = GridSearchCV(svm.SVC(C=1), parameters, cv=kf, scoring=score, n_jobs=-1)
    svmclf.fit(X, y)

    # Show each result from grid search
    print("Scores for different parameter combinations in the grid:")
    for params, mean_score, scores in svmclf.grid_scores_:
        print("  %0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print("")

# NOTE: everything below runs once, after the loop, so `svmclf` is the search
# fitted for the last entry of `scoringmethods`.

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
y_pred = svmclf.predict(X)
print(classification_report(y, y_pred))

# Show the definition of the best model
print("Best model:")
print(svmclf.best_estimator_)

# Show accuracy
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
# Area under the ROC curve must be computed from a continuous score that ranks
# the samples; hard 0/1 predictions collapse the curve to a single operating
# point. np.ravel guards against a (n_samples, 1) shaped result.
y_score = np.ravel(svmclf.decision_function(X))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_score))
print("")

In [None]:
# Logistic regression with 10 fold stratified cross-validation using model specific cross-validation in scikit-learn
# Candidate regularization strengths: powers of ten from 1e-10 to 1e9
lgclf = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))), penalty='l2', scoring='roc_auc', cv=kf)
lgclf.fit(X, y)
y_pred = lgclf.predict(X)

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
print(classification_report(y, y_pred))

# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
# Area under the ROC curve must be computed from a continuous score that ranks
# the samples; hard 0/1 predictions collapse the curve to a single operating
# point. np.ravel guards against a (n_samples, 1) shaped result.
y_score = np.ravel(lgclf.decision_function(X))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_score))

In [None]:
# Naive Bayes with 10 fold stratified cross-validation
nbclf = GaussianNB()
scores = cross_val_score(nbclf, X, y, cv=kf, scoring='accuracy')
# Area under the ROC curve must be computed from continuous scores, not from
# hard 0/1 predictions. The 'roc_auc' scorer does that per fold, so the figure
# reported here is the mean over the folds, like the accuracy figure.
auc_scores = cross_val_score(nbclf, X, y, cv=kf, scoring='roc_auc')

print("Accuracy: %0.3f" % (scores.mean()))
print("Aucroc: %0.3f" % (auc_scores.mean()))

In [None]:
# Define the parameter grid to use for tuning the Gradient Boosting Classifier
gridparams = dict(learning_rate=[0.01, 0.1], loss=['deviance', 'exponential'])

# Parameters we're not tuning for this classifier
params = {'n_estimators': 100, 'max_depth': 4}

# Setup for grid search with cross-validation for Gradient Boosting Classifier
# n_jobs=-1 for parallel execution using all available cores
gbclf = GridSearchCV(ensemble.GradientBoostingClassifier(**params), gridparams, cv=kf, scoring='roc_auc', n_jobs=-1)
gbclf.fit(X, y)

# Show the definition of the best model
print("Best model:")
print(gbclf.best_estimator_)
print("")

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
y_pred = gbclf.predict(X)
print(classification_report(y, y_pred))

# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
# Area under the ROC curve must be computed from a continuous score that ranks
# the samples; hard 0/1 predictions collapse the curve to a single operating
# point. np.ravel guards against a (n_samples, 1) shaped result.
y_score = np.ravel(gbclf.decision_function(X))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_score))

In [None]:
# Create the client-library connection to the TabPy server at this address
tabpy_url = 'http://localhost:9004/'
connection = tabpy_client.Client(tabpy_url)

In [None]:
def HDDiagnosis(age, sex, chest_pain, resting_bp, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal):
    """Classify new data points with the Gradient Boosting Classifier.

    The thirteen arguments are stacked column-wise into one feature matrix,
    scaled with the already-fitted `scalar`, classified by `gbclf`, and the
    numeric predictions are mapped back to their text labels by `encoder`.
    Presumably each argument is a sequence holding one value per record --
    verify against the caller.

    Returns the decoded labels as a plain list.
    """
    columns = [age, sex, chest_pain, resting_bp, chol, fbs, restecg,
               thalach, exang, oldpeak, slope, ca, thal]
    features = np.column_stack(columns)
    scaled = scalar.transform(features)
    predicted = gbclf.predict(scaled)
    labels = encoder.inverse_transform(predicted)
    return labels.tolist()

In [None]:
# Publish the scoring function on the TabPy server under the name 'HDDiagnosis';
# override=True is passed so the call can be repeated for the same name.
deploy_description = 'Returns diagnosis suggestion (healthy or risk) based on ensemble model trained using Cleveland Heart Disease dataset'
connection.deploy('HDDiagnosis', HDDiagnosis, deploy_description, override=True)