In [68]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [69]:
df=pd.read_csv('heart.csv')

In [70]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [71]:
dataset1=df.iloc[0:150,:]
dataset2=df.iloc[0:300,:]
dataset3=df.iloc[0:450,:]
dataset4=df.iloc[0:600,:]
dataset5=df.iloc[0:750,:]
dataset6=df.iloc[0:918,:]

In [72]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Encoding

In [73]:
attributes1=[dataset1.Sex,dataset1.ChestPainType,dataset1.RestingECG,dataset1.ExerciseAngina,dataset1.ST_Slope]
attributes2=[dataset2.Sex,dataset2.ChestPainType,dataset2.RestingECG,dataset2.ExerciseAngina,dataset2.ST_Slope]
attributes3=[dataset3.Sex,dataset3.ChestPainType,dataset3.RestingECG,dataset3.ExerciseAngina,dataset3.ST_Slope]
attributes4=[dataset4.Sex,dataset4.ChestPainType,dataset4.RestingECG,dataset4.ExerciseAngina,dataset4.ST_Slope]
attributes5=[dataset5.Sex,dataset5.ChestPainType,dataset5.RestingECG,dataset5.ExerciseAngina,dataset5.ST_Slope]
attributes6=[dataset6.Sex,dataset6.ChestPainType,dataset6.RestingECG,dataset6.ExerciseAngina,dataset6.ST_Slope]

In [74]:
for i in attributes1:
    dummies=pd.get_dummies(i)
    dataset1=pd.concat([dataset1,dummies],axis='columns')    

In [75]:
dataset1.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ASY,ATA,NAP,TA,Normal,ST,N,Y,Flat,Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,...,0,1,0,0,1,0,1,0,0,1
1,49,F,NAP,160,180,0,Normal,156,N,1.0,...,0,0,1,0,1,0,1,0,1,0
2,37,M,ATA,130,283,0,ST,98,N,0.0,...,0,1,0,0,0,1,1,0,0,1
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,...,1,0,0,0,1,0,0,1,1,0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,...,0,0,1,0,1,0,1,0,0,1


In [76]:
for i in attributes2:
    dummies=pd.get_dummies(i)
    dataset2=pd.concat([dataset2,dummies],axis='columns')    

In [77]:
for i in attributes3:
    dummies=pd.get_dummies(i)
    dataset3=pd.concat([dataset3,dummies],axis='columns')    

In [78]:
for i in attributes4:
    dummies=pd.get_dummies(i)
    dataset4=pd.concat([dataset4,dummies],axis='columns')    

In [79]:
for i in attributes5:
    dummies=pd.get_dummies(i)
    dataset5=pd.concat([dataset5,dummies],axis='columns')    

In [80]:
for i in attributes6:
    dummies=pd.get_dummies(i)
    dataset6=pd.concat([dataset6,dummies],axis='columns')    

In [81]:
datasets=[dataset1,dataset2,dataset3,dataset4,dataset5,dataset6]

In [82]:
dataset1=dataset1.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)
dataset2=dataset2.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)
dataset3=dataset3.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)
dataset4=dataset4.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)
dataset5=dataset5.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)
dataset6=dataset6.drop(columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'],axis=1)

In [83]:
dataset1.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,Normal,ST,N,Y,Flat,Up
0,40,140,289,0,172,0.0,0,0,1,0,1,0,0,1,0,1,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,0,1,0,1,0,1,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,1,0,0,0,1,1,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,0,0,0,1,0,0,1,1,0
4,54,150,195,0,122,0.0,0,0,1,0,0,1,0,1,0,1,0,0,1


## Standardizing the Data

In [84]:
dataset1[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset1[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])
dataset2[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset2[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])
dataset3[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset3[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])
dataset4[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset4[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])
dataset5[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset5[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])
dataset6[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']]=StandardScaler().fit_transform(dataset6[['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']])



In [85]:
dataset1.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,Normal,ST,N,Y,Flat,Up
0,-0.929023,0.530489,0.5522,0,1.343673,-0.664216,0,0,1,0,1,0,0,1,0,1,0,0,1
1,0.197824,1.751875,-0.894206,0,0.670434,0.455249,1,1,0,0,0,1,0,1,0,1,0,1,0
2,-1.304639,-0.080204,0.472581,0,-1.770057,-0.664216,0,0,1,0,1,0,0,0,1,1,0,0,1
3,0.072619,0.40835,-0.443034,0,-1.349283,1.014982,1,1,0,1,0,0,0,1,0,0,1,1,0
4,0.823851,1.141182,-0.695159,0,-0.760199,-0.664216,0,0,1,0,0,1,0,1,0,1,0,0,1


## Splitting the Features and the Target

In [86]:
X1=dataset1.drop(columns='HeartDisease', axis=1)
Y1=dataset1['HeartDisease']
X2=dataset2.drop(columns='HeartDisease', axis=1)
Y2=dataset2['HeartDisease']
X3=dataset3.drop(columns='HeartDisease', axis=1)
Y3=dataset3['HeartDisease']
X4=dataset4.drop(columns='HeartDisease', axis=1)
Y4=dataset4['HeartDisease']
X5=dataset5.drop(columns='HeartDisease', axis=1)
Y5=dataset5['HeartDisease']
X6=dataset6.drop(columns='HeartDisease', axis=1)
Y6=dataset6['HeartDisease']

## Train Test Split

In [87]:
X_train_1,X_test_1,Y_train_1,Y_test_1=train_test_split(X1,Y1,test_size=0.5,stratify=Y1,random_state=0)
X_train_2,X_test_2,Y_train_2,Y_test_2=train_test_split(X2,Y2,test_size=0.33,stratify=Y2,random_state=0)
X_train_3,X_test_3,Y_train_3,Y_test_3=train_test_split(X3,Y3,test_size=0.5,stratify=Y3,random_state=0)
X_train_4,X_test_4,Y_train_4,Y_test_4=train_test_split(X4,Y4,test_size=0.5,stratify=Y4,random_state=0)
X_train_5,X_test_5,Y_train_5,Y_test_5=train_test_split(X5,Y5,test_size=0.33,stratify=Y5,random_state=0)
X_train_6,X_test_6,Y_train_6,Y_test_6=train_test_split(X6,Y6,test_size=0.5,stratify=Y6,random_state=0)

In [88]:
train_test=[(X_train_1,Y_train_1),(X_train_2,Y_train_2),(X_train_3,Y_train_3),(X_train_4,Y_train_4),(X_train_5,Y_train_5),(X_train_6,Y_train_6)]
x_test=[X_test_1,X_test_2,X_test_3,X_test_4,X_test_5,X_test_6]
y_test=[Y_test_1,Y_test_2,Y_test_3,Y_test_4,Y_test_5,Y_test_6]

## Decision Tree Classifier

In [99]:
dt_classifier=DecisionTreeClassifier()

In [100]:
for i in range(6):
    dt_classifier.fit(train_test[i][0],train_test[i][1])
    y_pred= dt_classifier.predict(x_test[i])
    accuracy= accuracy_score(y_test[i], y_pred)
    precision=precision_score(y_test[i],y_pred)
    recall=recall_score(y_test[i],y_pred)
    f1=f1_score(y_test[i],y_pred)
    print("For dataset ",i+1," Accuracy = %0.4f" % accuracy," Precision = %0.4f " % precision," Recall = %0.4f" % recall," F1 score = %0.4f " % f1)
    



For dataset  1  Accuracy = 0.7867  Precision = 0.8000   Recall = 0.5714  F1 score = 0.6667 
For dataset  2  Accuracy = 0.9596  Precision = 0.9231   Recall = 0.9730  F1 score = 0.9474 
For dataset  3  Accuracy = 0.8844  Precision = 0.8819   Recall = 0.9106  F1 score = 0.8960 
For dataset  4  Accuracy = 0.8500  Precision = 0.8807   Recall = 0.8659  F1 score = 0.8732 
For dataset  5  Accuracy = 0.7903  Precision = 0.8473   Recall = 0.7762  F1 score = 0.8102 
For dataset  6  Accuracy = 0.7647  Precision = 0.7967   Recall = 0.7717  F1 score = 0.7840 


## Support Vector Machine

In [101]:
svm_classifier=svm.SVC(kernel='linear')

In [102]:
for i in range(6):
    svm_classifier.fit(train_test[i][0],train_test[i][1])
    y_pred= svm_classifier.predict(x_test[i])
    accuracy= accuracy_score(y_test[i], y_pred)
    precision=precision_score(y_test[i],y_pred)
    recall=recall_score(y_test[i],y_pred)
    f1=f1_score(y_test[i],y_pred)
    print("For dataset ",i+1," Accuracy = %0.4f" % accuracy," Precision = %0.4f " % precision," Recall = %0.4f" % recall," F1 score = %0.4f " % f1)


For dataset  1  Accuracy = 0.9333  Precision = 0.8485   Recall = 1.0000  F1 score = 0.9180 
For dataset  2  Accuracy = 0.9394  Precision = 0.8780   Recall = 0.9730  F1 score = 0.9231 
For dataset  3  Accuracy = 0.9156  Precision = 0.8881   Recall = 0.9675  F1 score = 0.9261 
For dataset  4  Accuracy = 0.9200  Precision = 0.9101   Recall = 0.9609  F1 score = 0.9348 
For dataset  5  Accuracy = 0.8871  Precision = 0.8966   Recall = 0.9091  F1 score = 0.9028 
For dataset  6  Accuracy = 0.8780  Precision = 0.8992   Recall = 0.8780  F1 score = 0.8884 


## Logistic Regression

In [103]:
lr_classifier=LogisticRegression()

In [104]:
for i in range(6):
    lr_classifier.fit(train_test[i][0],train_test[i][1])
    y_pred= lr_classifier.predict(x_test[i])
    accuracy= accuracy_score(y_test[i], y_pred)
    precision=precision_score(y_test[i],y_pred)
    recall=recall_score(y_test[i],y_pred)
    f1=f1_score(y_test[i],y_pred)
    print("For dataset ",i+1," Accuracy = %0.4f" % accuracy," Precision = %0.4f " % precision," Recall = %0.4f" % recall," F1 score = %0.4f " % f1)


For dataset  1  Accuracy = 0.9333  Precision = 0.9259   Recall = 0.8929  F1 score = 0.9091 
For dataset  2  Accuracy = 0.9495  Precision = 0.9000   Recall = 0.9730  F1 score = 0.9351 
For dataset  3  Accuracy = 0.9200  Precision = 0.8947   Recall = 0.9675  F1 score = 0.9297 
For dataset  4  Accuracy = 0.9267  Precision = 0.9153   Recall = 0.9665  F1 score = 0.9402 
For dataset  5  Accuracy = 0.8750  Precision = 0.8889   Recall = 0.8951  F1 score = 0.8920 
For dataset  6  Accuracy = 0.8845  Precision = 0.9102   Recall = 0.8780  F1 score = 0.8938 


## K Nearest Neighbor

In [105]:
knn_classifier=KNeighborsClassifier()

In [106]:
for i in range(6):
    knn_classifier.fit(train_test[i][0],train_test[i][1])
    y_pred= knn_classifier.predict(x_test[i])
    accuracy= accuracy_score(y_test[i], y_pred)
    precision=precision_score(y_test[i],y_pred)
    recall=recall_score(y_test[i],y_pred)
    f1=f1_score(y_test[i],y_pred)
    print("For dataset ",i+1," Accuracy = %0.4f" % accuracy," Precision = %0.4f " % precision," Recall = %0.4f" % recall," F1 score = %0.4f " % f1)


For dataset  1  Accuracy = 0.8933  Precision = 0.8846   Recall = 0.8214  F1 score = 0.8519 
For dataset  2  Accuracy = 0.8990  Precision = 0.8857   Recall = 0.8378  F1 score = 0.8611 
For dataset  3  Accuracy = 0.9156  Precision = 0.8939   Recall = 0.9593  F1 score = 0.9255 
For dataset  4  Accuracy = 0.9133  Precision = 0.9135   Recall = 0.9441  F1 score = 0.9286 
For dataset  5  Accuracy = 0.8831  Precision = 0.8800   Recall = 0.9231  F1 score = 0.9010 
For dataset  6  Accuracy = 0.8715  Precision = 0.8854   Recall = 0.8819  F1 score = 0.8836 


## Voting Classifier

In [97]:
voting_model=VotingClassifier(estimators=[('dtc',DecisionTreeClassifier()),('knn',KNeighborsClassifier()),('svc',svm.SVC(kernel='linear')),('lrc',LogisticRegression())],voting='hard')

In [98]:
for i in range(6):
    voting_model.fit(train_test[i][0],train_test[i][1])
    y_pred= voting_model.predict(x_test[i])
    accuracy= accuracy_score(y_test[i], y_pred)
    precision=precision_score(y_test[i],y_pred)
    recall=recall_score(y_test[i],y_pred)
    f1=f1_score(y_test[i],y_pred)
    print("For dataset ",i+1," Accuracy = %0.4f" % accuracy," Precision = %0.4f " % precision," Recall = %0.4f" % recall," F1 score = %0.4f " % f1)


For dataset  1  Accuracy = 0.9333  Precision = 0.9259   Recall = 0.8929  F1 score = 0.9091 
For dataset  2  Accuracy = 0.9495  Precision = 0.9000   Recall = 0.9730  F1 score = 0.9351 
For dataset  3  Accuracy = 0.9244  Precision = 0.9015   Recall = 0.9675  F1 score = 0.9333 
For dataset  4  Accuracy = 0.9167  Precision = 0.9140   Recall = 0.9497  F1 score = 0.9315 
For dataset  5  Accuracy = 0.8750  Precision = 0.8944   Recall = 0.8881  F1 score = 0.8912 
For dataset  6  Accuracy = 0.8867  Precision = 0.9208   Recall = 0.8701  F1 score = 0.8947 


| Testing interval | Accuracy | Precision | Recall | F1 score |
|-----------------------|----------|-----------|--------|----------|
| 1-153 | 0.9333 | 0.9259 | 0.8929 | 0.9091 |
| 1-306 | 0.9495 | 0.9000 | 0.9730 | 0.9351 |
| 1-459 | 0.9244 | 0.9015 | 0.9675 | 0.9333 |
| 1-612 | 0.9167 | 0.9140 | 0.9497 | 0.9315 |
| 1-765 | 0.8750 | 0.8944 | 0.8881 | 0.8912 |
| 1-918 | 0.8911 | 0.9250 | 0.8740 | 0.8988 |