In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
import matplotlib.pyplot as plt 

!pip install pandas-bokeh
import pandas_bokeh
pandas_bokeh.output_notebook()
pd.set_option('plotting.backend', 'pandas_bokeh')
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, TableColumn

from sklearn.preprocessing import OneHotEncoder 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, auc, plot_roc_curve

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents
* [Analyzing the Data](#analyze)
* [Visualization](#viz)
    - [Correlation Between Variables](#corr)
    - [Pandas Bokeh](#pandas_bokeh)
* [Data Preparation](#prep)
    - [One Hot Encoding](#1hot)
    - [Standardization](#standardize)
    - [Feature Selection](#feature_select)
* [Modeling](#model)
    - [Building the models](#bob_the_builder)
    - [Model Selection](#americas_next_top_model)
* [Conclusion](#fin)

<a id="analyze"></a>
# Analyzing the Data
First taking a look at some rows of the data and looking at the variable terms:
1. age - age in years

2. sex - sex <br>
    1 = male <br>
    0 = female

3. cp - chest pain type <br>
    0 = typical angina <br>
    1 = atypical angina <br>
    2 = non-anginal pain <br>
    3 = asymptomatic

4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholesterol in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl  <br>
    1 = true <br>
    0 = false

7. restecg - resting electrocardiographic results  <br>
    0 = normal <br>
    1 = having ST-T <br>
    2 = hypertrophy

8. thalach - maximum heart rate achieved

9. exng - exercise induced angina <br>
    1 = yes <br>
    0 = no

10. oldpeak - ST depression induced by exercise relative to rest

11. slp - the slope of the peak exercise ST segment <br>
    1 = upsloping<br>
    2 = flat<br>
    3 = downsloping

12. caa - number of major vessels (0-3) colored by fluoroscopy

13. thall <br>
    1 = fixed defect<br>
    2 = normal<br>
    3 = reversible defect
    
14. output - the predicted attribute - diagnosis of heart disease (angiographic disease status) <br>
    Value 0 = < 50% diameter narrowing<br>
    Value 1 = > 50% diameter narrowing

In [None]:
# Load the dataset, then preview the first three rows followed by the summary statistics.
heart = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')
display(heart.head(3), heart.describe())

There is a slight class imbalance: more data points have a heart attack diagnosis than not.

In [None]:
# Class balance: number of rows for each value of the target column.
output_counts = heart.groupby('output')[['age']].count()
output_counts.rename(columns={'age': 'count of output values'})

There are fewer females in the dataset, and they are significantly more likely to have a heart attack diagnosis.

In [None]:
# Row count and mean of the target (i.e. the diagnosis rate) for each sex.
grouped_by_sex = heart[["sex", "output"]].groupby(['sex'], as_index=False)
grouped_by_sex.agg(['count', 'mean'])

Correlation between the output and the independent variables:

In [None]:
# Correlation of every column with the target, rendered as one row of percentages.
output_corr_pct = heart.corr()[['output']] * 100
output_corr_pct.T.applymap('{:.2f}%'.format)

<a id="viz"></a>
# Visualization

<a id="corr"></a>
## Correlation Between Variables

In [None]:
# Draw the correlation matrix as an image with one labelled tick per numeric column.
numeric_cols = heart.select_dtypes(['number']).columns
tick_positions = range(len(numeric_cols))

f = plt.figure(figsize=(12, 12))
plt.matshow(heart.corr(), fignum=f.number)
plt.xticks(tick_positions, numeric_cols, fontsize=14, rotation=45)
plt.yticks(tick_positions, numeric_cols, fontsize=14)

# Enlarge the colour-bar tick labels to match the axis labels.
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html

In [None]:
# Colour-coded correlation table with values shown to two decimals.
corr = heart.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an explicit
# format string renders the float cells the same way on every pandas version.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

From the heatmaps we can see that there is some multicollinearity between the variables, i.e. correlation between the independent variables. It is worth making note of so we can handle it with regularization in our models below. 

In [None]:
# Pairwise scatter/distribution grid of all columns, coloured by the target value.
sns.pairplot(data=heart, hue="output")

The pairplot shows so much information that it can be hard to process it all. Something that stood out to me is that, instead of missing data, we have 0's in some rows where that isn't a valid value for the column, such as 'slp' and 'thall'. 

<a id="pandas_bokeh"></a>
## Pandas Bokeh
I'm a big fan of interactive plots so I love bokeh. To make things easier, there is a pandas-bokeh library available. See the documentation here: https://github.com/PatrikHlobil/Pandas-Bokeh

In [None]:
def thall_rename(thall):
    """Translate a numeric `thall` code into its display label.

    Codes 0, 1 and 2 map to 'Unknown', 'Fixed Defect' and 'Normal';
    every other value falls through to 'Reversable Defect'.
    """
    known_labels = ((0, 'Unknown'), (1, 'Fixed Defect'), (2, 'Normal'))
    for code, label in known_labels:
        if thall == code:
            return label
    return 'Reversable Defect'

def output_rename(output):
    """Return the display label for a target value: 0 is non-critical, anything else is a diagnosis."""
    return '0 - Non-critical Patient' if output == 0 else '1 - Heart Attack Diagnosis'

###############
# Data Prep
###############
# Work on a copy so the coded columns can be replaced by readable labels
# without modifying `heart`.
df = heart.copy()
df.thall = df.thall.apply(thall_rename)
df.output = df.output.apply(output_rename)

# Keep only the columns shown in the table and the scatter plot.
df = df.filter(['thalachh', 'age', 'thall', 'output'])

###############
# Making the Plot
###############
# One table column per dataframe column, titled with the column name.
table_columns = [TableColumn(field=col, title=col) for col in df.columns]
data_table = DataTable(
    columns=table_columns,
    source=ColumnDataSource(df),
    width=300,
    height=300,
)

# show_figure=False returns the figure so it can be placed in the grid below.
scatter_options = dict(
    x="age",
    y="thalachh",
    category="output",
    title="Correlation between Age/Thalachh values to Heart Attack Diagnosis",
    show_figure=False,
)
p_scatter = df.plot_bokeh.scatter(**scatter_options)

pandas_bokeh.plot_grid([[data_table, p_scatter]], plot_width=350, plot_height=450)

In [None]:
# Count, for every thall category, how many patients fall into each outcome.
df = heart.copy()
df = df.filter(['thall', 'output'])
df.thall = df.thall.apply(thall_rename)

# Build one 0/1 indicator column per outcome so that summing a column gives
# the number of patients with that outcome.
df['1 - Heart Attack Diagnosis'] = df.output
df['0 - Non-critical Patient'] = df.output.replace({0: 1, 1: 0})

# sum() (not count()) is required here: count() returns the number of non-null
# rows, which is the same group size for both columns regardless of outcome.
df = (
    df.groupby('thall')[['0 - Non-critical Patient', '1 - Heart Attack Diagnosis']]
    .sum()
)

p_stacked_bar = df.plot_bokeh.bar(
    ylabel="Number of patients",
    title="Heart attack diagnosis by thall result",
    stacked=True,
    alpha=0.6)

<a id="prep"></a>
# Data Preparation for Modeling
<a id="1hot"></a>
## One Hot Encoding
Since there are a lot of predictors that have numeric values but are truly categories (e.g. Female [0] vs Male [1]), I'm going to use one hot encoding so that the machine learning algorithms don't try to use the order of the numbers as an attribute of significance (i.e. treat a higher number as more significant than a lower number). 

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

Variables that are categories: 'sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall'

In [None]:
# Columns whose integer codes are category labels rather than magnitudes.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

enc = OneHotEncoder()
onehotlabels = enc.fit_transform(heart[categorical_cols]).toarray()

# enc.inverse_transform(onehotlabels) converts the indicators back to the original codes.

# get_feature_names was replaced by get_feature_names_out (added in scikit-learn 1.0,
# old name removed in 1.2); use whichever this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    columns = enc.get_feature_names_out(categorical_cols)
else:
    columns = enc.get_feature_names(categorical_cols)

# Reuse heart's index so the join below aligns row-for-row even if the index
# is not the default 0..n-1 range.
heart_categories = pd.DataFrame(onehotlabels, columns=columns, index=heart.index)

# Swap the coded columns for their indicator columns.
heart = heart.drop(categorical_cols, axis=1).join(heart_categories)

In [None]:
# Preview the first five rows of the encoded frame.
heart.head(n=5)

<a id="feature_select"></a>
## Feature Selection

There is one duplicate row so I removed that:

In [None]:
# Drop repeated rows, keeping the first occurrence of each.
# (Inspect the dropped rows with: display(heart[heart.duplicated()]))
heart = heart[~heart.duplicated()]

Keeping the k=23 best predictors to reduce noise.

In [None]:
def _abs_corr_pct(frame, target):
    """Absolute correlation of each column with `target`, sorted ascending, as one row of percentages."""
    pct = frame.corr()[[target]].multiply(100).abs()
    return pct.sort_values(by=[target]).applymap('{:.2f}%'.format).T


# Separate the target from the predictors.
y = heart['output'].to_numpy()
X = heart.drop(columns='output')

# Keep the 23 predictors with the highest chi-squared score against the target.
selector = SelectKBest(chi2, k=23)
X = selector.fit_transform(X, y)

# Correlations before selection (all columns of heart).
display(_abs_corr_pct(heart, 'output'))

# Correlations after selection. X is now an unlabeled array, so the merged frame
# has positional column names; the target ends up as '0_y' because both frames
# contain a column named 0.
temp = pd.DataFrame(X).merge(pd.DataFrame(y), left_index=True, right_index=True)
display(_abs_corr_pct(temp, '0_y'))

<a id="split"></a>
## Splitting the data
Going with the classic kfold cross validation to split the data.

Knowing nothing about the data collection process, it is not really safe to assume i.i.d. though so a future test could be using special techniques for grouped data - https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data

<a id="standardize"></a>
## Standardizing the Data
Using sklearn's StandardScaler to scale the data <br>
https://scikit-learn.org/stable/modules/preprocessing.html

In [None]:
# Split first, then learn the scaling parameters from the training rows only.
# Fitting the scaler on all rows before splitting would let the test set's mean
# and variance leak into the features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

<a id="model"></a>
# Modeling
<a id="bob_the_builder"></a>
## Building the models

I tried several combinations of the following sklearn models: <br>
    <li>RandomForestClassifier
    <li>LogisticRegression
    <li>SVC
    <li>GradientBoostingClassifier

In [None]:
# Seed shared by every estimator that uses randomness, so that the comparison
# below gives the same ranking on every run (same value as the train/test split).
RANDOM_STATE = 0

# Random forests: vary the number of trees and the split criterion.
rfc1 = RandomForestClassifier(random_state=RANDOM_STATE)
rfc2 = RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE)
rfc3 = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE)
rfc4 = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc5 = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=RANDOM_STATE)

# Logistic regression: vary the penalty and the solver.
# NOTE: penalty='none' is the spelling for older scikit-learn; releases >= 1.2
# spell it penalty=None.
lgc1 = LogisticRegression()
lgc2 = LogisticRegression(penalty='none')
lgc3 = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
lgc4 = LogisticRegression(solver='newton-cg')
lgc5 = LogisticRegression(solver='saga', max_iter=500, random_state=RANDOM_STATE)
lgc6 = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=.1, max_iter=500, random_state=RANDOM_STATE)
lgc7 = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=.5, max_iter=500, random_state=RANDOM_STATE)
lgc8 = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=.9, max_iter=500, random_state=RANDOM_STATE)

# Support vector machines: vary the regularization strength C and the kernel.
svm1 = svm.SVC()
svm2 = svm.SVC(C=.1)
svm3 = svm.SVC(C=.5)
svm4 = svm.SVC(C=2)
svm5 = svm.SVC(kernel = 'linear')
svm6 = svm.SVC(kernel = 'poly')
svm7 = svm.SVC(kernel = 'sigmoid')

# Gradient boosting: vary the loss, learning rate, tree depth and number of stages.
gbc1 = GradientBoostingClassifier(random_state=RANDOM_STATE)
gbc2 = GradientBoostingClassifier(learning_rate = 0.05, max_depth=2, n_estimators=50, random_state=RANDOM_STATE)
gbc3 = GradientBoostingClassifier(learning_rate = 0.05, max_depth=2, n_estimators=60, random_state=RANDOM_STATE)
gbc4 = GradientBoostingClassifier(loss = 'exponential', random_state=RANDOM_STATE)
gbc5 = GradientBoostingClassifier(loss = 'exponential', learning_rate = .05, max_depth=2, n_estimators = 50, random_state=RANDOM_STATE)
gbc6 = GradientBoostingClassifier(loss = 'exponential', learning_rate = .05, max_depth=2, n_estimators = 60, random_state=RANDOM_STATE)

# Every candidate, in the order they are compared.
models = [rfc1, rfc2, rfc3, rfc4, rfc5,
          lgc1, lgc2, lgc3, lgc4, lgc5, lgc6, lgc7, lgc8,
          svm1, svm2, svm3, svm4, svm5, svm6, svm7,
          gbc1, gbc2, gbc3, gbc4, gbc5, gbc6]

<a id="americas_next_top_model"></a>
## Model Selection
There are _**a lot**_ of ways to measure how well a model is performing. I'm going to focus on the following:
<li>Accuracy = # of Correct Predictions / Total Predictions <br>
<li>Sensitivity (True Positive Rate) = TP / (FN + TP)  <br>
<li>Specificity (True Negative Rate) = TN / (FP + TN) <br>
<li>F1 Score = 2 * ( 1 / ((1/precision) + (1/recall)) )  <br>
    
<br>
I used KFold cross validation with 5 folds to help account for the randomness in the data.
   
   

In [None]:
def compare_models(model_arr, X_train, y_train, num_of_folds = 5):
    '''
    Compare several models with KFold cross validation to see which performs the best.

    The target is expected to be binary and coded as 0 (negative) / 1 (positive).

    RETURNS:
        model_df - pandas dataframe with one row per model (in the order given) and the
                   columns 'model', 'accuracy', 'sensitivity', 'specificity', 'f1'.
                   Each score is the mean over the folds, expressed as a percentage.
                   A rate is NaN if some fold contained no sample of the relevant class.

    PARAMETERS:
        model_arr (REQ) - list of models to compare. Each model is fitted in place, so
                          after the call it is left fitted on the last fold's training part.
        X_train (REQ) - independent variables for the training data (numpy array)
        y_train (REQ) - dependent variable for the training data (numpy array)
        num_of_folds (OPT) - number of folds used by the KFold cross validation. Default is 5.

    RAISES:
        AssertionError - if model_arr is None or X_train / y_train is not a numpy array.
    '''
    assert (model_arr is not None),"Must provide a list of models to test."
    assert (X_train is not None and isinstance(X_train,(np.ndarray))),"Must provide X_train as a numpy array."
    assert (y_train is not None and isinstance(y_train,(np.ndarray))),"Must provide y_train as a numpy array."

    def _rate(hits, total):
        # A fold may contain no sample of a class; report NaN instead of dividing by zero.
        return hits / total if total else float('nan')

    metric_names = ['accuracy', 'sensitivity', 'specificity', 'f1']

    # One list of per-fold scores per metric, per model (indexed by position in model_arr).
    fold_scores = [{name: [] for name in metric_names} for _ in model_arr]

    kf = KFold(n_splits = num_of_folds)
    for train_index, test_index in kf.split(X_train):

        k_X_train = X_train[train_index]
        k_y_train = y_train[train_index]
        k_X_test = X_train[test_index]
        k_y_test = y_train[test_index]

        for scores, model in zip(fold_scores, model_arr):

            model.fit(k_X_train, k_y_train)
            y_pred = model.predict(k_X_test)

            # Fix the label order so the matrix is always 2x2, even when a fold
            # happens to contain (or predict) only one of the two classes.
            cm = confusion_matrix(k_y_test, y_pred, labels=[0, 1])
            tn, fp = cm[0][0], cm[0][1]
            fn, tp = cm[1][0], cm[1][1]

            scores['accuracy'].append(accuracy_score(k_y_test, y_pred))
            scores['sensitivity'].append(_rate(tp, fn + tp))
            scores['specificity'].append(_rate(tn, fp + tn))
            scores['f1'].append(f1_score(k_y_test, y_pred))

    # Compile the mean scores into a pandas dataframe. All rows are built first and the
    # frame is created once, instead of appending to it row by row.
    rows = []
    for scores, model in zip(fold_scores, model_arr):
        row = {'model' : model}
        for name in metric_names:
            row[name] = np.mean(scores[name]) * 100
        rows.append(row)

    return pd.DataFrame(rows, columns = ['model'] + metric_names)


# Score every candidate with cross validation and rank them by mean accuracy.
model_df = compare_models(model_arr = models, X_train=X_train, y_train=y_train).sort_values('accuracy', ascending=False)

# Use the full option name: the short form "precision" matches more than one option
# on newer pandas versions and is rejected there.
pd.set_option("display.precision", 2)
display(model_df)
model_df.describe()

Plotting the ROC for the top 3 models:

In [None]:
# model_df is sorted by accuracy, so the first entries are the best models.
model_list = model_df['model'].tolist()
top_x_models = 3

# Slicing (rather than indexing up to top_x_models) also works when fewer models were compared.
for model in model_list[:top_x_models]:
    # compare_models leaves each model fitted on the last cross-validation fold only;
    # refit on the whole training split before evaluating on the held-out test set.
    model.fit(X_train, y_train)
    # NOTE: plot_roc_curve exists only in scikit-learn < 1.2; newer releases provide
    # RocCurveDisplay.from_estimator instead.
    plot_roc_curve(model, X_test, y_test)
    plt.show()

Finally, let's combine the best models and have them vote to make a prediction: 

In [None]:
# Hard-voting ensemble: each member predicts a label and the majority label wins.
voters = [('svm', svm2), ('rf', rfc4), ('lr', lgc1), ('gb', gbc5)]
vc = VotingClassifier(estimators=voters, voting='hard')

# Train on the training split, then report the accuracy on the held-out test split.
y_pred = vc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_pred, y_test)

In [None]:
# Confusion matrix of the ensemble's test-set predictions, labelled with its class values.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=vc.classes_)
disp.plot()

<a id="fin"></a>
## Fin! 
Thank you for reading my notebook, feedback is much appreciated!