# Detecting Heart Disease Machine Learning Project

#### Citation:
This project uses data from the UCI Machine Learning Repository


Janosi,Andras, Steinbrunn,William, Pfisterer,Matthias, and Detrano,Robert. (1988). Heart Disease. UCI Machine Learning Repository. https://doi.org/10.24432/C52P4X.

### Imports

In [1]:
# Importing Libraries

# Custom Functions
from heart_ml_utils import *

# Data Exploration and Plotting
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math

# Pipelining
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Scoring
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

In [2]:
# Column names passed explicitly to read_csv, in file order.
# Tags in parentheses give the feature type used later when building the preprocessing pipeline.
feat_names = [
    'age',       # (numerical)
    'sex',       # (binary)
    'cp',        # (categorical) [1-4] chest pain
    'trestbps',  # (numerical) resting blood pressure
    'chol',      # (numerical) serum cholesterol in mg/dl
    'fbs',       # (binary) fasting blood sugar > 120 mg/dl
    'restecg',   # (ordinal)(pre-encoded) [0-2] resting electrocardiographic results
    'thalach',   # (numerical) maximum heart rate achieved
    'exang',     # (binary) exercise induced angina
    'oldpeak',   # (numerical) ST depression induced by exercise relative to rest
    'slope',     # (ordinal)(pre-encoded) [1-3] slope of the peak exercise ST segment
    'ca',        # (numerical) number of major vessels colored by fluoroscopy
    'thal',      # (ordinal)(pre-encoded) "3 = normal; 6 = fixed defect; 7 = reversible defect"
    'num',       # (categorical) [0-4] diagnosis
]

# Load the raw data; every later cleaning step starts from heart_df
heart_df = pd.read_csv('cleveland_heart_data', names=feat_names)

## Data Cleaning and Feature Engineering

In [3]:
# Drop observations with problematic values (see heart_ml_data_explor.ipynb)

# Start from a copy so the raw frame is never mutated by later in-place edits
# and this cell can be re-run safely.
cleaned_df = heart_df.copy()

# Create a dictionary of column-value pairs for flexible alteration
drop_dict = {'restecg':1,
             'ca':'?',
             'thal':'?'
}

# Drop data according to dictionary
# (drop_data is a custom function from heart_ml_utils.py; presumably it removes
#  the rows where the column equals the given value -- confirm there)
for drop_col, drop_val in drop_dict.items():
    cleaned_df = drop_data(cleaned_df, drop_col, drop_val)

# Report how much of the raw data was removed
print(f'dropped {round((heart_df.shape[0] - cleaned_df.shape[0])*100 / heart_df.shape[0],2)}% of data')

dropped 3.3% of data


In [3]:
# ALTERNATIVELY: Replace ?s with None to be imputed later
# NOTE: this cell rebuilds cleaned_df from heart_df, so run it INSTEAD OF
# (not after) the row-dropping approach -- it discards that cell's result.

# Pass a copy so the raw frame stays untouched even if the helper or a later
# cell modifies its input in place.
cleaned_df = string_to_Na(heart_df.copy(), '?') #using custom function (see heart_ml_utils.py)

In [4]:
# Now that the ?s have been dealt with the columns must be converted from object to float

# Create a dictionary of column-type pairs for flexible alteration
dtype_dict = {'ca':'float',
              'thal':'float'
}

# Convert all listed columns in one call. astype returns a new frame, so this
# rebinds cleaned_df instead of writing into a frame that may be shared with
# (or a slice of) heart_df, and the cell stays idempotent on re-run.
cleaned_df = cleaned_df.astype(dtype_dict)

In [5]:
# Rather than predicting specific diseases, this model will classify between diseased and healthy
# 0 means healthy, 1 means diseased

# Collapse the diagnosis codes into a binary target: any value above 0 becomes 1.
# The comparison must read the ORIGINAL codes -- zeroing the column first would
# leave every row labelled healthy.
cleaned_df = cleaned_df.assign(num=(cleaned_df['num'] > 0).astype(int))

In [6]:
# OPTIONAL: Consider using this binary version later (depending on the type of model)

# Copy first: a plain assignment would alias cleaned_df, so the extra column
# would leak into the default feature set and stop being optional.
extra_feature_df = cleaned_df.copy()

# Flag is 1 where oldpeak is above 0, otherwise 0.
# NOTE(review): the column name says "no change" but 1 marks rows WITH a
# positive oldpeak -- confirm whether the name or the condition is intended.
extra_feature_df['st_no_change'] = (extra_feature_df['oldpeak'] > 0).astype(int)

## Data Splitting

In [7]:
# Separate predictor features (X) and target feature (y)

source_df = cleaned_df # OPTIONAL: switch with extra_feature_df

# drop() returns a new frame, leaving source_df intact. DataFrame.pop would
# remove 'num' from the cleaned frame itself and raise KeyError on a re-run.
X = source_df.drop(columns='num')
y = source_df['num']

In [8]:
# Separate test data from training data
# 20% of the rows are held out for testing; the fixed random_state keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing Pipeline

In [10]:
# Separate features by data type for preprocessing pipeline

# Features that get one-hot encoded by the categorical encoder
onehot_features = [
    'cp',
]

In [36]:
# OPTIONAL: Identify ordinally pre-encoded features to test onehot encoding instead

ordinal_to_onehot = ['restecg', 'slope', 'thal']

# Merge without duplicates (dict.fromkeys keeps first-seen order) so that
# re-running this cell does not append the same column names again.
onehot_features = list(dict.fromkeys(onehot_features + ordinal_to_onehot))

In [None]:
# UNUSED: Create ordinal encoder
#(potentially useful when switching to the 76 feature dataset or reusing code in another project)

# Create dictionary of ordinal features and their ordered values.
# Each key must be a distinct column name of X (a repeated key in a dict literal
# silently overwrites the earlier entry). Left empty while unused; fill in like:
#     'feature_a': ['ordered', 'list', 'of', 'values'],
#     'feature_b': ['ordered', 'list', 'of', 'values'],
ordinal_codes_dict = {}

ordinal_features = list(ordinal_codes_dict.keys())

# Get column indexes of ordinal features
# (get_indexer lives on the column Index, not on the DataFrame itself)
ordinal_ids = X.columns.get_indexer(ordinal_features)

# get_indexer marks unknown labels with -1, which would otherwise silently
# address the last column -- fail loudly instead.
missing_features = [feat for feat, col_id in zip(ordinal_features, ordinal_ids) if col_id == -1]
if missing_features:
    raise KeyError(f'ordinal features not found in X: {missing_features}')

# Create list of ordered lists, in the same order as ordinal_ids so that each
# category list lines up with the column handed to the encoder
ordinal_codes_list = [ordinal_codes_dict[X.columns[col_id]] for col_id in ordinal_ids]

# Create tuple to be used for ordinal encoder in a column transformer
ordinal_transformer = ("cat_ordinal",
                       OrdinalEncoder(categories=ordinal_codes_list),
                       ordinal_ids)

In [11]:
# Create categorical encoder
# One-hot encodes the columns listed in onehot_features; the ordinal
# transformer entry stays commented out while it is unused.

cat_encoder = ColumnTransformer(
    transformers=[
        #ordinal_transformer,
        (
            "cat_onehot",
            OneHotEncoder(drop='first', sparse_output=False, handle_unknown="ignore"),
            onehot_features,
        ),
    ]
)

In [12]:
# Create categorical pipeline

# cat_encoder selects its columns by NAME, which only works on DataFrame input.
# SimpleImputer returns a plain ndarray by default, so ask it to keep pandas
# output; otherwise fitting fails when the encoder looks up string column names.
cat_imputer = SimpleImputer(strategy='most_frequent').set_output(transform='pandas') #can be varied with hyperparameter tuning

cat_pipe = make_pipeline(cat_imputer,
                         cat_encoder)

In [13]:
# Create numerical pipeline
# Currently a single step: fill missing numerical values with the column mean.

num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'), #can be varied with hyperparameter tuning
)

In [14]:
# Create full preprocessor with categorical and numerical columns

cat_features = onehot_features #+ list(ordinal_codes_dict.keys())

# Everything that is not categorical is treated as numerical. Keep the original
# column order of X: a set difference yields an order that can change between
# kernel restarts, which would reshuffle the preprocessor's output columns.
num_features = [col for col in X.columns if col not in cat_features]

# Route each group of columns through its own pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, num_features),
    ('cat_pipe', cat_pipe, cat_features)
])

In [15]:
# Display the assembled preprocessor (bare last expression, shown via the notebook's repr)
preprocessor

**Choices to try adjusting when testing the models**
* do not remove as much data
* use binary version of oldpeak
* onehot encode features currently ordinally pre-encoded