# Heart disease prediction - Naive bayes classifier

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data = pd.read_csv(r"D:\Dataset\heart - Medical(kaggle).csv")
print("shape of the data : (rows,columns)")
print(np.shape(data))
print("Rank of the data : ")
print(np.ndim(data))
print("Size of data : ")
print(np.size(data))

shape of the data : (rows,columns)
(918, 12)
Rank of the data : 
2
Size of data : 
11016


In [3]:
#getting attribute names

for col in data.columns :
    print(col)

Age
Sex
ChestPainType
RestingBP
Cholesterol
FastingBS
RestingECG
MaxHR
ExerciseAngina
Oldpeak
ST_Slope
HeartDisease


In [4]:
print("=======================================================================")
print("Original dataset :")
print("=======================================================================")
print(data.head())
print("=======================================================================")
print("modified_dataset : ")
print("=======================================================================")

data = data.rename(columns = {"Sex" : "Gender"})

data.loc[data["Gender"] == "M","Gender"] = 1
data.loc[data["Gender"] == "F","Gender"] = 0

data.ChestPainType.unique()
data.loc[data["ChestPainType"] == "ATA","ChestPainType"] = 1
data.loc[data["ChestPainType"] == "NAP","ChestPainType"] = 2
data.loc[data["ChestPainType"] == "ASY","ChestPainType"] = 3
data.loc[data["ChestPainType"] == "TA","ChestPainType"] = 4

data.RestingECG.unique()
data.loc[data["RestingECG"] == "Normal","RestingECG"] = 1
data.loc[data["RestingECG"] == "ST","RestingECG"] = 2
data.loc[data["RestingECG"] == "LVH","RestingECG"] = 3

data.ExerciseAngina.unique()
data.loc[data["ExerciseAngina"] == "Y","ExerciseAngina"] = 1
data.loc[data["ExerciseAngina"] == "N","ExerciseAngina"] = 0

data.ST_Slope.unique()
data.loc[data["ST_Slope"] == "Up","ST_Slope"] = 1
data.loc[data["ST_Slope"] == "Flat","ST_Slope"] = 0
data.loc[data["ST_Slope"] == "Down","ST_Slope"] = -1

print(data.head())

Original dataset :
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
modified_dataset : 
   Age Gender ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0   40      1             1        140          289          0          1   
1   49      0 

In [5]:
X = data.drop("HeartDisease", axis = 1)
#axis parameter denotes the action where it is to be perform -(0: row, 1: column)

In [6]:
y = data.loc[:,"HeartDisease"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 1)
#random_state is to get the same sequence of random numbers(pseudo random number generator) for multiple time running of code

In [8]:
X_train.head()

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
41,54,0,2,130,294,0,2,100,1,0.0,0
891,66,0,2,146,278,0,3,152,0,0.0,0
265,54,1,1,160,305,0,1,175,0,0.0,1
177,59,1,1,140,287,0,1,150,0,0.0,1
241,54,1,3,200,198,0,1,142,1,2.0,0


In [9]:
y_train.head()

41     1
891    0
265    0
177    0
241    1
Name: HeartDisease, dtype: int64

In [10]:
X_test.head()

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
900,58,1,3,114,318,0,2,140,0,4.4,-1
570,56,1,3,128,223,0,2,119,1,2.0,-1
791,51,1,3,140,298,0,1,122,1,4.2,0
189,53,1,3,180,285,0,2,120,1,1.5,0
372,63,1,3,185,0,0,1,98,1,0.0,1


In [11]:
y_test

900    1
570    1
791    1
189    1
372    1
      ..
880    0
110    0
120    1
306    1
889    1
Name: HeartDisease, Length: 230, dtype: int64

In [12]:
gnb = GaussianNB()
print(gnb.get_params())

{'priors': None, 'var_smoothing': 1e-09}


In [13]:
model1 = gnb.fit(X_train,y_train)

In [14]:
y_pred1 = model1.predict(X_test)

In [15]:
print(classification_report(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))
print('actual count',Counter(y_test))
print('machine predicted count',Counter(y_pred1))

              precision    recall  f1-score   support

           0       0.80      0.88      0.83        89
           1       0.92      0.86      0.89       141

    accuracy                           0.87       230
   macro avg       0.86      0.87      0.86       230
weighted avg       0.87      0.87      0.87       230

[[ 78  11]
 [ 20 121]]
actual count Counter({1: 141, 0: 89})
machine predicted count Counter({1: 132, 0: 98})


# K means clustering - method 2

In [16]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [17]:
std = StandardScaler()
stdX_train = std.fit_transform(X_train)
stdX_test = std.fit_transform(X_test)

In [18]:
kmc = KMeans(n_clusters = 2)

In [19]:
model2 = kmc.fit(stdX_train)

In [20]:
y_pred2 = model2.predict(stdX_test)

In [21]:
centroid = kmc.cluster_centers_
centroid

array([[-0.48379884, -0.29195667, -0.54624713, -0.22249558,  0.29618077,
        -0.28916789, -0.06986483,  0.56578107, -0.67981726, -0.56014293,
         0.6949586 ],
       [ 0.43063413,  0.25987352,  0.48621998,  0.19804552, -0.26363344,
         0.2573912 ,  0.06218737, -0.50360733,  0.60511207,  0.49858876,
        -0.61858953]])

In [22]:
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(Counter(y_test))
print(Counter(y_pred2))

              precision    recall  f1-score   support

           0       0.75      0.87      0.80        89
           1       0.91      0.82      0.86       141

    accuracy                           0.83       230
   macro avg       0.83      0.84      0.83       230
weighted avg       0.84      0.83      0.84       230

[[ 77  12]
 [ 26 115]]
Counter({1: 141, 0: 89})
Counter({1: 127, 0: 103})


# support vector machine

In [23]:
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [24]:
svc = SVC()
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [25]:
model2 = svc.fit(X_train,y_train)

In [26]:
y_pred2 = model2.predict(X_test)

In [27]:
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(Counter(y_test))
print(Counter(y_pred2))

              precision    recall  f1-score   support

           0       0.60      0.73      0.66        89
           1       0.80      0.70      0.75       141

    accuracy                           0.71       230
   macro avg       0.70      0.71      0.70       230
weighted avg       0.73      0.71      0.71       230

[[65 24]
 [43 98]]
Counter({1: 141, 0: 89})
Counter({1: 122, 0: 108})


# Result :

After trying out these 3 algorithms, Naive Bayes classifier algorithm has better accuracy for this particular dataset. 

I recommend Naive Bayes classifier model for future prediction.