In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation or convergence warnings
# raised by the libraries used below would also be hidden - consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Read the heart-disease survey CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='Heart_disease_dataset.csv')
data.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# Per-column count of missing values (isna is the same check as isnull).
data.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [4]:
# Encode the categorical survey answers as integers.
#
# Each column is reassigned (data[col] = ...) instead of calling
# data.col.replace(..., inplace=True): the chained in-place form mutates an
# intermediate Series, which pandas warns about and which does not update
# `data` at all when Copy-on-Write is enabled.
yes_no = {'Yes': 1, 'No': 0}
for col in ('Smoking', 'HeartDisease', 'AlcoholDrinking', 'Stroke', 'PhysicalActivity'):
    data[col] = data[col].replace(yes_no)

data['Sex'] = data['Sex'].replace({'Male': 1, 'Female': 0})

# Only a plain 'Yes' is coded as diabetic; the borderline and
# pregnancy-only answers are folded into 0.
data['Diabetic'] = data['Diabetic'].replace({
    'Yes': 1,
    'No': 0,
    'No, borderline diabetes': 0,
    'Yes (during pregnancy)': 0,
})

# Binarize age: brackets up to 40-44 become 0, brackets from 45-49 upward become 1.
age_codes = {
    '18-24': 0, '25-29': 0, '30-34': 0, '35-39': 0, '40-44': 0,
    '45-49': 1, '50-54': 1, '55-59': 1, '60-64': 1, '65-69': 1,
    '70-74': 1, '75-79': 1, '80 or older': 1,
}
data['AgeCategory'] = data['AgeCategory'].replace(age_codes)

In [7]:
# Display the DataFrame's column labels.
data.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [21]:
# Scratch list of columns (presumably those still to inspect/encode - TODO confirm):
# PhysicalHealth, MentalHealth, DiffWalking, Race, Diabetic, GenHealth, SleepTime, Asthma, KidneyDisease, SkinCancer
# NOTE(review): this cell's execution count (21) is later than the cell that encodes
# SkinCancer (20), which is why the stored output shows 0/1. On a top-to-bottom run
# SkinCancer has not been encoded yet at this position.

data.SkinCancer.value_counts()

0    289976
1     29819
Name: SkinCancer, dtype: int64

In [9]:
# Ordinal-encode self-reported general health (5 = Excellent ... 1 = Poor).
# Reassign the column rather than using a chained inplace replace, which acts on an
# intermediate Series and does not update `data` under pandas Copy-on-Write.
data['GenHealth'] = data['GenHealth'].replace(
    {'Excellent': 5, 'Very good': 4, 'Good': 3, 'Fair': 2, 'Poor': 1}
)

In [12]:
# Encode DiffWalking as 1/0. Reassign the column rather than using a chained inplace
# replace, which does not update `data` under pandas Copy-on-Write.
data['DiffWalking'] = data['DiffWalking'].replace({'Yes': 1, 'No': 0})

In [15]:
# Encode Asthma as 1/0. Reassign the column rather than using a chained inplace
# replace, which does not update `data` under pandas Copy-on-Write.
data['Asthma'] = data['Asthma'].replace({'Yes': 1, 'No': 0})

In [18]:
# Encode KidneyDisease as 1/0. Reassign the column rather than using a chained inplace
# replace, which does not update `data` under pandas Copy-on-Write.
data['KidneyDisease'] = data['KidneyDisease'].replace({'Yes': 1, 'No': 0})

In [20]:
# Encode SkinCancer as 1/0. Reassign the column rather than using a chained inplace
# replace, which does not update `data` under pandas Copy-on-Write.
data['SkinCancer'] = data['SkinCancer'].replace({'Yes': 1, 'No': 0})

In [22]:
# Preview the first five rows after encoding; Race is the only column left as text
# in the stored output.
data.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,1,White,1,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,1,White,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,1,White,1,1,2,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,1,White,0,0,3,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,0,White,0,1,4,8.0,0,0,0


In [23]:
# Feature matrix: every column except the target and the still-unencoded Race column.
X = data.drop(columns=['HeartDisease', 'Race'])
# Target vector: the binary HeartDisease label.
Y = data['HeartDisease']

In [24]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both classes have the same number of rows.
# fit_resample() fits the sampler itself, so a separate sm.fit(X, Y) call is redundant.
#
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows interpolated from samples that later land in the test set can end up
# in the training set; held-out scores are likely optimistic. Consider splitting first
# and resampling only the training portion.
# NOTE(review): SMOTE interpolates between neighbours, so the 0/1-encoded columns may
# receive non-integer values in synthetic rows - confirm this is acceptable.
sm = SMOTE(random_state=0)
x_resem, y_resem = sm.fit_resample(X, Y)

In [25]:
# Class counts of the resampled target; the stored output shows both classes
# at the same number of rows.
y_resem.value_counts()

0    292422
1    292422
Name: HeartDisease, dtype: int64

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for evaluation; the fixed seed keeps the
# partition identical across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_resem,
    y_resem,
    test_size=0.2,
    random_state=0,
)

# Model Building

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

In [45]:
# 80% of the resampled row count, truncated to an int. Despite its name this is a
# number of rows (it is passed as `max_samples` to the bagging ensembles below),
# not a number of features.
max_features = int(len(x_resem) * 0.8)
max_features

467875

In [53]:
# Bagging ensemble of 50 logistic-regression models, each trained on a bootstrap
# sample of `max_features` rows (a row count, despite the variable's name).
# random_state is fixed, matching the seeded SMOTE and train/test split, so the
# bootstrap draws and the scores below are reproducible between runs.
bag_clf = BaggingClassifier(
    LogisticRegression(),
    n_estimators=50,
    max_samples=max_features,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
bag_clf.fit(xtrain, ytrain)
# Out-of-bag accuracy estimate computed from the rows each estimator did not see.
bag_clf.oob_score_

0.73540368688218

In [54]:
# Score the bagged logistic-regression ensemble on the held-out split.
ypred = bag_clf.predict(xtest)
for label, metric in (
    ("Accuracy Score : ", accuracy_score),
    ("Precision Score : ", precision_score),
    ("Recall Score : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(label, metric(ytest, ypred))

Accuracy Score :  0.7341859809009225
Precision Score :  0.7287869684224582
Recall Score :  0.7463252888493881
F1 Score :  0.7374518678646221


In [55]:
# Bagging ensemble of 50 decision trees, each trained on a bootstrap sample of
# `max_features` rows (a row count, despite the variable's name). This rebinds
# `bag_clf`, replacing the previously fitted ensemble.
# random_state is fixed, matching the seeded SMOTE and train/test split, so the
# bootstrap draws, the trees and the scores below are reproducible between runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=50,
    max_samples=max_features,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
bag_clf.fit(xtrain, ytrain)
# Out-of-bag accuracy estimate computed from the rows each estimator did not see.
bag_clf.oob_score_

0.8794229227892065

In [56]:
# Score the bagged decision-tree ensemble on the held-out split.
ypred = bag_clf.predict(xtest)
for label, metric in (
    ("Accuracy Score : ", accuracy_score),
    ("Precision Score : ", precision_score),
    ("Recall Score : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(label, metric(ytest, ypred))

Accuracy Score :  0.8813018834050047
Precision Score :  0.8793138621604162
Recall Score :  0.8840329527585972
F1 Score :  0.8816670928151368


In [57]:
# Random forest with 50 trees. random_state is fixed, matching the seeded SMOTE and
# train/test split, so the forest and the scores below are reproducible between runs.
rnd_clf = RandomForestClassifier(
    n_estimators=50,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
rnd_clf.fit(xtrain, ytrain)
# Out-of-bag accuracy estimate computed from the rows each tree did not see.
rnd_clf.oob_score_

0.8773946032594175

In [58]:
# Score the random forest on the held-out split.
ypred = rnd_clf.predict(xtest)
for label, metric in (
    ("Accuracy Score : ", accuracy_score),
    ("Precision Score : ", precision_score),
    ("Recall Score : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(label, metric(ytest, ypred))

Accuracy Score :  0.8794552402773385
Precision Score :  0.8758802816901409
Recall Score :  0.884323511314692
F1 Score :  0.8800816465385269
