In [1]:
import pandas as pd

data = pd.read_csv("../input/heart-failure-prediction/heart.csv")

In [2]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## **Heart Disease = 1 Normal = 0**

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [5]:
data["ChestPainType"].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [6]:
data["RestingECG"].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [7]:
data["ExerciseAngina"].unique()

array(['N', 'Y'], dtype=object)

In [8]:
data["ST_Slope"].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [9]:
label_data = data[["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"]]

In [10]:
label_data.head()

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up


In [11]:
dummy_data = pd.get_dummies(label_data)

In [12]:
dummy_data.head()

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [13]:
rest_data = data.drop(label_data, axis = 1)
rest_data = rest_data.drop("HeartDisease", axis = 1)

In [14]:
rest_data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,40,140,289,0,172,0.0
1,49,160,180,0,156,1.0
2,37,130,283,0,98,0.0
3,48,138,214,0,108,1.5
4,54,150,195,0,122,0.0


In [15]:
data_concat = pd.concat([rest_data, dummy_data],axis = 1)

In [16]:
data_concat.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x = scaler.fit_transform(data_concat)

In [18]:
y = data["HeartDisease"].values

In [19]:
y.shape

(918,)

In [20]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42)

In [21]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((734, 20), (734,), (184, 20), (184,))

## **Logistic Regression**

In [22]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()


logistic_model.fit(x_train, y_train)


LogisticRegression()

In [23]:
print(f"Logistic Regression Training Accuracy: {logistic_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Logistic Regression Testing Accuracy: {logistic_model.score(x_test,y_test) * 100: 0.2f} %")

Logistic Regression Training Accuracy:  87.19 %
Logistic Regression Testing Accuracy:  85.33 %


## **Naive Bayes Model**

In [24]:
from sklearn.naive_bayes import GaussianNB

nav_model = GaussianNB()

nav_model.fit(x_train,y_train)

GaussianNB()

In [25]:
print(f"Naive Bayes Model Training Accuracy: {nav_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Naive Bayes Model Testing Accuracy: {nav_model.score(x_test,y_test) * 100: 0.2f} %")

Naive Bayes Model Training Accuracy:  85.83 %
Naive Bayes Model Testing Accuracy:  86.41 %


## **Support Vector Machine Model**

In [26]:
from sklearn.svm import SVC

svc_model = SVC()

svc_model.fit(x_train, y_train)


SVC()

In [27]:
print(f"SVC Model Training Accuracy: {svc_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"SVC Model Testing Accuracy: {svc_model.score(x_test,y_test) * 100: 0.2f} %")

SVC Model Training Accuracy:  90.19 %
SVC Model Testing Accuracy:  89.13 %


## **K-Nearest Neighbors Model**

In [28]:
from sklearn.neighbors import KNeighborsClassifier

kne_model = KNeighborsClassifier(n_neighbors= 5)

kne_model.fit(x_train,y_train)

KNeighborsClassifier()

In [29]:
print(f"K Neighbor Model Training Accuracy: {kne_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"K Neighbor Testing Accuracy: {kne_model.score(x_test,y_test) * 100: 0.2f} %")

K Neighbor Model Training Accuracy:  88.69 %
K Neighbor Testing Accuracy:  86.41 %


## **Decision Tree Model**

In [30]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()

tree_model.fit(x_train, y_train)

DecisionTreeClassifier()

In [31]:
print(f"Decision Tree Model Training Accuracy: {tree_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Decision Tree Testing Accuracy: {tree_model.score(x_test,y_test) * 100: 0.2f} %")

Decision Tree Model Training Accuracy:  100.00 %
Decision Tree Testing Accuracy:  80.43 %


## **Random Forest Model**

In [32]:
from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier(n_estimators= 100)

forest_model.fit(x_train,y_train)

RandomForestClassifier()

In [33]:
print(f"Random Forest Model Training Accuracy: {forest_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Random Forest Testing Accuracy: {forest_model.score(x_test,y_test) * 100: 0.2f} %")

Random Forest Model Training Accuracy:  100.00 %
Random Forest Testing Accuracy:  88.04 %


## **Neural Network Model**

In [34]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(max_iter= 200)

mlp_model.fit(x_train,y_train)



MLPClassifier()

In [35]:
print(f"Neural Model Training Accuracy: {mlp_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Neural Testing Accuracy: {mlp_model.score(x_test,y_test) * 100: 0.2f} %")

Neural Model Training Accuracy:  92.78 %
Neural Testing Accuracy:  91.30 %


## **Stack Classifier Model**

In [36]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

model_list = [("Logis",logistic_model), ("nav",nav_model), ("svc",svc_model), ("kne",kne_model),("tree",tree_model), ("forest",forest_model), ("mlp",mlp_model)]

stack_model = StackingClassifier(estimators= model_list, final_estimator= LogisticRegression())

stack_model.fit(x_train,y_train)



StackingClassifier(estimators=[('Logis', LogisticRegression()),
                               ('nav', GaussianNB()), ('svc', SVC()),
                               ('kne', KNeighborsClassifier()),
                               ('tree', DecisionTreeClassifier()),
                               ('forest', RandomForestClassifier()),
                               ('mlp', MLPClassifier())],
                   final_estimator=LogisticRegression())

In [37]:
print(f"Stack Model Training Accuracy: {stack_model.score(x_train,y_train) * 100: 0.2f} %")
print(f"Stack Model Testing Accuracy: {stack_model.score(x_test,y_test) * 100: 0.2f} %")

Stack Model Training Accuracy:  92.10 %
Stack Model Testing Accuracy:  88.04 %
