In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler, LabelEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")


In [None]:
# TODO: load this table from the SQL database instead of a local CSV export.

# NOTE(review): absolute, machine-specific path -- the notebook will only run
# where this exact file exists until the location is made configurable.
HEART_DATA_CSV = "C:/Users/cdj3e/vu_bootcamp/Project_4/Resources/heart_data.csv"

df = pd.read_csv(HEART_DATA_CSV)
df.head()

In [None]:
# Replace the terse source column names with the readable labels that the
# plotting and modelling cells below refer to.
READABLE_COLUMN_NAMES = {
    "age": "Age",
    "sex": "Sex",
    "cp": "Chest Pain Type",
    "trestbps": "Resting BP",
    "chol": "Cholesterol",
    "fbs": "Fasting Blood Sugar",
    "restecg": "Resting ECG",
    "thalach": "Max Heart Rate",
    "exang": "Exercise Induced Angina",
    "oldpeak": "OldPeak",
    "slope": "ST Slope",
    "num": "Heart Disease",
}

df = df.rename(columns=READABLE_COLUMN_NAMES)

df.head()

In [None]:
#string_col = df.select_dtypes(include="int64").columns
#df[string_col]=df[string_col].astype("float64")

Data Exploration

In [None]:
# Heatmap of the correlation matrix computed across the frame's columns.

correlation_matrix = df.corr()
px.imshow(correlation_matrix, title="Heart Disease Prediction Correlation")

In [None]:
# Grouped histogram: counts of each "Heart Disease" value, one bar colour per "Sex".

fig = px.histogram(
    df,
    x="Heart Disease",
    color="Sex",
    barmode="group",
    hover_data=df.columns,
    title="Heart Disease by Sex",
)
fig.show()

In [None]:
# Histogram of the "Age" column.

fig = px.histogram(
    df,
    x="Age",
    title="Distribution of Age",
    hover_data=df.columns,
)
fig.show()

In [None]:
# Pairwise relationship grid over every column, coloured by the target.

sns.pairplot(df, hue="Heart Disease")
plt.tight_layout()
# plt.plot() with no data draws nothing and leaves a stray `[]` as the cell
# output; plt.show() is the call that renders the finished figure.
plt.show()

In [None]:
# Distribution shape of every column: histogram with a kernel density estimate overlaid.

n_plot_cols = 3
# Derive the row count from the data (ceiling division) instead of hard-coding
# a 4x3 grid, so the cell keeps working if the number of columns changes.
n_plot_rows = max(1, -(-len(df.columns) // n_plot_cols))

plt.figure(figsize=(15, 10))

for i, col in enumerate(df.columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.histplot(df[col], kde=True)

# Adjust the layout and render once, after every panel exists, rather than on
# each iteration.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of "Resting BP" for each "Heart Disease" value, split by "Sex".

# The title names the variables actually plotted (x="Heart Disease",
# color="Sex"); fasting blood sugar is not part of this chart.
fig = px.box(
    df,
    x="Heart Disease",
    y="Resting BP",
    color="Sex",
    title="Resting BP Distribution by Heart Disease and Sex",
)
fig.show()

Loop to determine best models

In [None]:
# Predictors are every column except the target; the target is "Heart Disease".
X = df.drop(columns=['Heart Disease'])
y = df.loc[:, 'Heart Disease']

In [None]:
# Hold-out split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

In [None]:
# Candidate classifiers, keyed by the label printed next to each score.
# The tree-based estimators are seeded so repeated runs give the same scores.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=1),
    "Random Forest Classifier": RandomForestClassifier(random_state=1),
}

In [None]:
# Fit every candidate model on the training split.
# Iterate the dict's values directly instead of rebuilding a list and indexing
# into it on each pass.
for model in models.values():
    model.fit(X_train, y_train)

In [None]:
# Fit each model and keep its test-set predictions, keyed by model label.
predictions = {}
for name, model in models.items():
    # fit must be *called*; a bare `model.fit` only looks the method up and
    # does nothing.
    model.fit(X_train, y_train)

    y_prediction = model.predict(X_test)
    predictions[name] = y_prediction

In [None]:
# Fit, predict and score every candidate model on the hold-out split, then
# chart the scores side by side.
model_accuracy = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    y_prediction = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_prediction)
    model_accuracy[name] = accuracy

    print(str(name) + " Score = ", accuracy)

# Setting a suptitle inside the loop only titled an empty figure; draw the
# collected scores once so the 'Accuracy' figure actually contains data.
fig, ax = plt.subplots()
ax.barh(list(model_accuracy.keys()), list(model_accuracy.values()))
ax.set_xlim(0, 1)
ax.set_xlabel("Accuracy")
fig.suptitle('Accuracy', color='blue')
plt.tight_layout()
plt.show()

Varieties of Scaling

In [None]:
# Apply three scalers to the whole frame and compare, side by side, how each
# one reshapes the distributions of two example columns.

# Column labels come from the frame itself rather than a hand-copied list, so
# the scaled frames can never be mislabelled if the columns change.
scaler = preprocessing.RobustScaler()
robust_scaling_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

scaler = preprocessing.StandardScaler()
standard_scaling_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

scaler = preprocessing.MinMaxScaler()
minmax_scaling_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

fig, (unscaled, robust, standard, minmax) = plt.subplots(ncols=4, figsize=(20, 5))

# (axes, panel title, frame to draw) for each of the four panels.
panels = [
    (unscaled, 'Unscaled', df),
    (robust, 'Robust Scaling', robust_scaling_df),
    (standard, 'Standard Scaling', standard_scaling_df),
    (minmax, 'Min/Max Scaling', minmax_scaling_df),
]

for ax, panel_title, frame in panels:
    ax.set_title(panel_title)
    sns.kdeplot(frame['Chest Pain Type'], ax=ax, color='red', label='Chest Pain Type')
    sns.kdeplot(frame['Heart Disease'], ax=ax, color='black', label='Heart Disease')
    # Two curves share each panel, so a legend is needed to tell them apart.
    ax.legend()

plt.show()


Non-Tree Based Algorithms

In [None]:
# Stratified K-Fold evaluation of logistic regression on min-max scaled features.

X = df.drop(columns='Heart Disease')
y = df['Heart Disease']

logistic_regression = LogisticRegression()

stratified_k_fold = model_selection.StratifiedKFold(n_splits=10)
accuracy = []

for train_index, test_index in stratified_k_fold.split(X, y):
    # Fit the scaler on the training rows of this fold only; fitting it on the
    # full data set would let the held-out rows influence the scaling.
    scaler = preprocessing.MinMaxScaler()

    # split() yields positional indices, so select rows with .iloc rather than
    # label-based [] indexing.
    X_train = scaler.fit_transform(X.iloc[train_index])
    y_train = y.iloc[train_index]

    X_test = scaler.transform(X.iloc[test_index])
    y_test = y.iloc[test_index]

    logistic_regression.fit(X_train, y_train)
    accuracy.append(logistic_regression.score(X_test, y_test))

# NOTE: X_train / X_test / y_train / y_test stay bound to the last fold after
# the loop, and later cells in this notebook read those names.

print("Maximum Stratified K-Fold accuracy: ", max(accuracy)*100, "%")
print("Minimum Stratified K-Fold accuracy: ", min(accuracy)*100, "%")

In [None]:
# naive bayes

NB_accuracy = []

naive_bayers = GaussianNB()

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
nb_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in nb_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    naive_bayers.fit(fold_X_train, y.iloc[fold_train_idx])
    NB_accuracy.append(naive_bayers.score(fold_X_test, y.iloc[fold_test_idx]))

print("Maximum Naive Bayes accuracy: ", max(NB_accuracy)*100, "%")
print("Minimum Naive Bayes accuracy: ", min(NB_accuracy)*100, "%")

In [None]:
# support vector machine linear kernel

# SVC is already imported in the notebook's import cell; no re-import needed here.
linear_SVC_accuracy = []

svc_linear = SVC(kernel="linear")

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
svc_linear_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in svc_linear_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    svc_linear.fit(fold_X_train, y.iloc[fold_train_idx])
    linear_SVC_accuracy.append(svc_linear.score(fold_X_test, y.iloc[fold_test_idx]))

print("Maximum SVC (linear kernel) accuracy: ", max(linear_SVC_accuracy)*100, "%")
print("Minimum SVC (linear kernel) accuracy: ", min(linear_SVC_accuracy)*100, "%")

In [None]:
# support vector machine sigmoid kernel

sigmoid_SVC_accuracy = []

svc_sigmoid = SVC(kernel="sigmoid")

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
svc_sigmoid_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in svc_sigmoid_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    svc_sigmoid.fit(fold_X_train, y.iloc[fold_train_idx])
    sigmoid_SVC_accuracy.append(svc_sigmoid.score(fold_X_test, y.iloc[fold_test_idx]))

print("Maximum SVC (sigmoid kernel) accuracy: ", max(sigmoid_SVC_accuracy)*100, "%")
print("Minimum SVC (sigmoid kernel) accuracy: ", min(sigmoid_SVC_accuracy)*100, "%")

In [None]:
# support vector machine rbf kernel

rbf_svc_accuracy = []

svc_rbf = SVC(kernel="rbf")

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
svc_rbf_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in svc_rbf_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    svc_rbf.fit(fold_X_train, y.iloc[fold_train_idx])
    rbf_svc_accuracy.append(svc_rbf.score(fold_X_test, y.iloc[fold_test_idx]))

print("Maximum SVC (rbf kernel) accuracy: ", max(rbf_svc_accuracy)*100, "%")
print("Minimum SVC (rbf kernel) accuracy: ", min(rbf_svc_accuracy)*100, "%")

In [None]:
# support vector machine poly kernel

poly_svc_accuracy = []

svc_poly = SVC(kernel="poly")

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
svc_poly_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in svc_poly_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    svc_poly.fit(fold_X_train, y.iloc[fold_train_idx])
    poly_svc_accuracy.append(svc_poly.score(fold_X_test, y.iloc[fold_test_idx]))

print("Maximum SVC (poly kernel) accuracy: ", max(poly_svc_accuracy)*100, "%")
print("Minimum SVC (poly kernel) accuracy: ", min(poly_svc_accuracy)*100, "%")

In [None]:
# k-nearest neighbors

k_nearest_neighbors_accuracy = []

k_nearest_neighbors = KNeighborsClassifier(n_neighbors=32)

# Score the model on every fold of a 10-fold stratified split so the
# maximum/minimum printed below summarise ten results rather than one split.
# Fold-local names are used so the notebook-level X_train/X_test/y_train/y_test
# are left untouched.
knn_folds = model_selection.StratifiedKFold(n_splits=10)
for fold_train_idx, fold_test_idx in knn_folds.split(X, y):
    # Scaler is fit on the fold's training rows only.
    fold_scaler = preprocessing.MinMaxScaler().fit(X.iloc[fold_train_idx])
    fold_X_train = fold_scaler.transform(X.iloc[fold_train_idx])
    fold_X_test = fold_scaler.transform(X.iloc[fold_test_idx])

    k_nearest_neighbors.fit(fold_X_train, y.iloc[fold_train_idx])
    k_nearest_neighbors_accuracy.append(
        k_nearest_neighbors.score(fold_X_test, y.iloc[fold_test_idx])
    )

print("Maximum K-Nearest Neighbors accuracy: ", max(k_nearest_neighbors_accuracy)*100, "%")
print("Minimum K-Nearest Neighbors accuracy: ", min(k_nearest_neighbors_accuracy)*100, "%")

Tree-Based Algorithms

In [None]:
# Label-encode every column of the frame, one column at a time.
# NOTE(review): the cells that follow train on X_train / X_test and read X / y,
# none of which are rebuilt from this encoded frame -- confirm whether the
# encoding was meant to feed the tree models.
df = df.apply(lambda column: LabelEncoder().fit_transform(column))
df.head()

In [None]:
# decision tree classifier

# DecisionTreeClassifier is already imported in the notebook's import cell.
decision_tree_accuracy = []

# Seeded so the fitted tree, its score and the visualisation built from it are
# the same on every run.
decision_tree = DecisionTreeClassifier(criterion="entropy", random_state=1)
decision_tree.fit(X_train, y_train)
decision_tree_accuracy.append(decision_tree.score(X_test, y_test))

print("Decision Tree accuracy: ", max(decision_tree_accuracy)*100, "%")

In [None]:
# decision tree classifier visualization

import graphviz
from sklearn import tree

# export_graphviz expects one name per feature and one name per class, not the
# data itself: use the predictor column labels and the fitted tree's class
# labels (classes_ is in ascending order, as class_names requires).
visual = tree.export_graphviz(
    decision_tree,
    out_file=None,
    feature_names=list(X.columns),
    class_names=[str(label) for label in decision_tree.classes_],
    filled=True,
)

graph = graphviz.Source(visual, format="png")
graph

In [None]:
# random forest classifier

random_forest_accuracy = []

# Seeded so the score and the feature importances read from this forest are
# the same on every run.
random_forest = RandomForestClassifier(
    n_estimators=200, criterion="entropy", random_state=1
)
random_forest.fit(X_train, y_train)
random_forest_accuracy.append(random_forest.score(X_test, y_test))

print("Random Forest accuracy: ", max(random_forest_accuracy)*100, "%")

In [None]:
# Horizontal bar chart of the random forest's feature importances, sorted
# ascending so the most important feature is drawn at the top.
plt.figure(figsize=(20, 15))
importance = random_forest.feature_importances_
idxs = np.argsort(importance)
# Look the names up positionally on X.columns; X[i] would be a column-label
# lookup and fails for integer positions on string-named columns.
feature_labels = [X.columns[i] for i in idxs]
plt.title("Feature Importance")
plt.barh(range(len(idxs)), importance[idxs], align="center")
plt.yticks(range(len(idxs)), feature_labels)
plt.xlabel("Random Forest Feature Importance")
plt.tight_layout()
plt.show()