In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

### TASK: Heart Disease Prediction
* STEP 1: Data Preprocessing
* STEP 2: Model Application
* STEP 3: Model Selection

##### Data Preprocessing
* Dataset: 2020 annual CDC survey data of 400k adults related to their health status[319795*18]
    * prediction (y): HeartDisease (YES or NO) [heavily unbalanced]
    * features (X): 17 variables (13 categories and 4 numericals)

In [2]:
df = pd.read_csv('heart_2020.csv')
display(df)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [4]:
cat_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 
             'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
num_cols = ['BMI','PhysicalHealth','MentalHealth','SleepTime']

# rescale the data
df1 = df[num_cols]
df1 = MinMaxScaler().fit_transform(df1)
df1 = pd.DataFrame(df1, columns=['BMI','PhysicalHealth','MentalHealth','SleepTime'])

# binarize the data
df2 = df[cat_cols]
cat_cols_encoded = []
for col in cat_cols:
    cat_cols_encoded += [f"{col}_{cat}" for cat in list(df2[col].unique())]
    
oh_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_cols = oh_encoder.fit_transform(df2)
df2 = pd.DataFrame(encoded_cols, columns=cat_cols_encoded)

# standardize the data
X = pd.concat([df1, df2], axis=1)
y = pd.DataFrame(df['HeartDisease'].apply(lambda x:0 if x=='No' else 1))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# before training
X_train, X_test, y_train, y_test = train_test_split(X_scaled, np.array(y).reshape(-1), test_size=0.1)

# calculate test error
def err(preds,lbl):
    n = len(lbl)
    return sum((lbl-preds)**2) / n

##### Model Application

In [5]:
# Logistic Regression
clf1 = LogisticRegression().fit(X_train, y_train)
score1 = clf1.score(X_train, y_train)
y_test_pred1 = clf1.predict(X_test)
print('LR -- Score: {}, Test Error: {}'.format(score1,err(y_test_pred1,y_test)))

# Random Forest
clf2 = RandomForestClassifier().fit(X_train, y_train)
score2 = clf2.score(X_train, y_train)
y_test_pred2 = clf2.predict(X_test)
print('RF -- Score: {}, Test Error: {}'.format(score2,err(y_test_pred2,y_test)))

LR -- Score: 0.9158139777287494, Test Error: 0.083270794246404
RF -- Score: 0.9964908013828327, Test Error: 0.09699812382739212


In [8]:
# Artificial Neural Network
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(1,input_shape=(num_input,) ,activation='sigmoid'))
model.compile(optimizer='sgd', metrics=['accuracy'], loss='binary_crossentropy')
history1 = model.fit(X_train, y_train, epochs=100,validation_split=0.1)
_, model_accuracy_train = model.evaluate(X_train,y_train)
y_test_pred3 = model.predict(X_test).reshape(-1)
print('NN -- Score: {}, Test Error: {}'.format(model_accuracy_train,err(y_test_pred3,y_test)))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
NN -- Score: 0.9152545928955078, Test Error: 0.06590414533740957


##### Model Selection

In [7]:
# Logistic Regression
params_lr = {'C': np.logspace(-3,3,7), 'penalty': ['l1','l2']}
lr = LogisticRegression()
clf1 = GridSearchCV(lr, params_lr, cv=3).fit(X_train, y_train)
y_test_pred1 = clf1.predict(X_test)
print('After GridSearchCV LR with best parameters {}: \n    Score: {}, Test Error: {}'.format(clf1.best_estimator_, clf1.best_score_, err(y_test_pred1,y_test)))

# Random Forest
params_rf = {'n_estimators':[20, 40, 60, 80, 100], 'max_depth': [2,3,4]}
rf = RandomForestClassifier()
clf2 = GridSearchCV(rf, params_rf, cv=3).fit(X_train, y_train)
y_test_pred2 = clf2.predict(X_test)
print('After GridSearchCV RF with best parameters {}: \n    Score: {}, Test Error: {}'.format(clf2.best_estimator_, clf2.best_score_, err(y_test_pred2,y_test)))

# # Support Vector Machine
# params_svm = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# svc = SVC()
# clf3 = GridSearchCV(svc, params_svm, cv=3).fit(X_train, y_train)
# y_test_pred3 = clf3.predict(X_test)
# print('After GridSearchCV SVM with best parameters {}: \n    Score: {}, Test Error: {}'.format(clf3.best_estimator_, clf3.best_score_,err(y_test_pred3,y_test)))

After GridSearchCV LR with best parameters LogisticRegression(C=0.01): 
    Score: 0.9158139798754953, Test Error: 0.08323952470293934
After GridSearchCV RF with best parameters RandomForestClassifier(max_depth=4, n_estimators=40): 
    Score: 0.9146187656934103, Test Error: 0.0841776110068793


In [9]:
# Artificial Neural Network
num_input = X_train.shape[1]
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(15,input_shape=(num_input,) ,activation='tanh'))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='sgd', metrics=['accuracy'], loss='binary_crossentropy')
history2 = model.fit(X_train, y_train, epochs=100,validation_split=0.1)
_, model_accuracy_train = model.evaluate(X_train,y_train)
y_test_pred3 = model.predict(X_test).reshape(-1)
print('NN -- Score: {}, Test Error: {}'.format(model_accuracy_train,err(y_test_pred3,y_test)))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
NN -- Score: 0.9166026711463928, Test Error: 0.06528286521497087


In [6]:
# Artificial Neural Network
num_input = X_train.shape[1]
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(25,input_shape=(num_input,) ,activation='tanh'))
model.add(tf.keras.layers.Dense(15,activation='tanh'))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='sgd', metrics=['accuracy'], loss='binary_crossentropy')
history = model.fit(X_train, y_train, epochs=100,validation_split=0.1)
_, model_accuracy_train = model.evaluate(X_train,y_train)
y_test_pred3 = model.predict(X_test).reshape(-1)
print('NN -- Score: {}, Test Error: {}'.format(model_accuracy_train,err(y_test_pred3,y_test)))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
NN -- Score: 0.9166791439056396, Test Error: 0.06528908880428741
