In [None]:
import numpy as np
import pandas as pd

# Data Preprocessing

In [None]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('heart.csv')

In [None]:
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# number of rows and columns in the dataset
heart_data.shape

(918, 12)

In [None]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# checking for missing values
heart_data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [None]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [None]:
# checking the distribution of Target Variable
heart_data['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

Handling Categorical Values

In [None]:
from sklearn.preprocessing import LabelEncoder
en = LabelEncoder()

In [None]:
cols = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']
for col in cols:
    heart_data[col] = en.fit_transform(heart_data[col])

Splitting the Features and Target

In [None]:
X = heart_data.drop(columns='HeartDisease', axis=1)
Y = heart_data['HeartDisease']

In [None]:
print(X)

     Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0     40    1              1        140          289          0           1   
1     49    0              2        160          180          0           1   
2     37    1              1        130          283          0           2   
3     48    0              0        138          214          0           1   
4     54    1              2        150          195          0           1   
..   ...  ...            ...        ...          ...        ...         ...   
913   45    1              3        110          264          0           1   
914   68    1              0        144          193          1           1   
915   57    1              0        130          131          0           1   
916   57    0              1        130          236          0           0   
917   38    1              2        138          175          0           1   

     MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0  

In [None]:
print(Y)

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64


Importing sklearn library

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

Splitting the Data into Training & Testing data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(918, 11) (734, 11) (184, 11)


Model Training

# Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
y_pred=model.predict(X_test)
acc=accuracy_score(y_pred,Y_test)

In [None]:
print('Accuracy on Test data : ', acc)

Accuracy on Test data :  0.8532608695652174


In [None]:
print(classification_report(y_pred,Y_test))

              precision    recall  f1-score   support

           0       0.83      0.84      0.83        81
           1       0.87      0.86      0.87       103

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test,y_pred))

[[68 14]
 [13 89]]


# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()

In [None]:
nb.fit(X_train, Y_train)

GaussianNB()

In [None]:
y_nb_pred=nb.predict(X_test)
print(accuracy_score(y_nb_pred,Y_test))

0.8260869565217391


In [None]:
print(classification_report(y_nb_pred,Y_test))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81        84
           1       0.83      0.85      0.84       100

    accuracy                           0.83       184
   macro avg       0.83      0.82      0.82       184
weighted avg       0.83      0.83      0.83       184



# Support Vector Machines

In [None]:
from sklearn import svm
m=svm.LinearSVC(random_state=0, tol=1e-5)

In [None]:
m.fit(X_train,Y_train)



LinearSVC(random_state=0, tol=1e-05)

In [None]:
y_svm_pred = m.predict(X_test)
print("Accuracy : ",accuracy_score(y_svm_pred,Y_test))

Accuracy :  0.6086956521739131


In [None]:
print(classification_report(y_svm_pred,Y_test))

              precision    recall  f1-score   support

           0       0.12      1.00      0.22        10
           1       1.00      0.59      0.74       174

    accuracy                           0.61       184
   macro avg       0.56      0.79      0.48       184
weighted avg       0.95      0.61      0.71       184



# SVM-HyperParameter Tuning

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
#Scaling the features using pipeline
pipeline = Pipeline([
            ('std_scaler',StandardScaler()),
                    ])
scaled_X_train = pipeline.fit_transform(X_train)
scaled_X_test = pipeline.transform(X_test)

In [None]:
from sklearn.svm import SVC

In [None]:
model_svc = SVC()
model_svc.fit(scaled_X_train,Y_train)

SVC()

In [None]:
model_svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [None]:
pred_svc = model_svc.predict(scaled_X_test)
print("Accuracy : ",accuracy_score(pred_svc,Y_test))

Accuracy :  0.8804347826086957


In [None]:
print(classification_report(Y_test,pred_svc))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86        82
           1       0.88      0.91      0.89       102

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



In [None]:
#Hyperparamter tuning using Gridsearch
from sklearn.model_selection import GridSearchCV
svm = SVC()
# param_grid = {'C':[0.01,0.05,0.1,1,10, 100, 1000],'kernel':['linear','rbf'], 'gamma':['scale','auto'] }
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
grid = GridSearchCV(svm,param_grid)

In [None]:
grid.fit(scaled_X_train,Y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']})

In [None]:
grid_svc = grid.predict(scaled_X_test)
print("Accuracy: ",accuracy_score(Y_test,grid_svc))

Accuracy:  0.8858695652173914


In [None]:
print(classification_report(Y_test,grid_svc))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        82
           1       0.89      0.91      0.90       102

    accuracy                           0.89       184
   macro avg       0.89      0.88      0.88       184
weighted avg       0.89      0.89      0.89       184



# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_DT = DecisionTreeClassifier()

In [None]:
model_DT.fit(X_train,Y_train)

DecisionTreeClassifier()

In [None]:
y_DT_pred=model_DT.predict(X_test)
print("Accuracy : ",accuracy_score(y_DT_pred,Y_test))

Accuracy :  0.8152173913043478


In [None]:
print(classification_report(y_DT_pred,Y_test))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80        88
           1       0.80      0.85      0.83        96

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.81       184
weighted avg       0.82      0.82      0.81       184



# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=200)

In [None]:
clf.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=200)

In [None]:
y_RF_pred = clf.predict(X_test)
print("Accuracy : ",accuracy_score(y_RF_pred,Y_test))

Accuracy :  0.9021739130434783


In [None]:
print(classification_report(y_RF_pred,Y_test))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89        78
           1       0.93      0.90      0.91       106

    accuracy                           0.90       184
   macro avg       0.90      0.90      0.90       184
weighted avg       0.90      0.90      0.90       184



# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
clf_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=0).fit(X_train, Y_train)

In [None]:
y_GB_pred=clf_GB.predict(X_test)
print("Accuracy : ",accuracy_score(y_GB_pred,Y_test))

Accuracy :  0.8586956521739131


In [None]:
print(classification_report(y_GB_pred,Y_test))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84        78
           1       0.89      0.86      0.88       106

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
clf_Knn = KNeighborsClassifier(n_neighbors=5)
clf_Knn.fit(X_train, Y_train)
y_Knn_pred=clf_Knn.predict(X_test)
print("Accuracy : ",accuracy_score(y_Knn_pred,Y_test))

Accuracy :  0.7119565217391305


# Deep learning

In [None]:
import tensorflow as tf
classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Dense(units = 128, activation = "relu"))
classifier.add(tf.keras.layers.Dropout(0.2))
classifier.add(tf.keras.layers.Dense(units = 32, activation = "relu"))
classifier.add(tf.keras.layers.Dropout(0.1))
classifier.add(tf.keras.layers.Dense(units = 1,activation = "sigmoid"))
classifier.compile(optimizer = "adam", loss = "binary_crossentropy" , metrics=["accuracy"])

In [None]:
classifier.fit(scaled_X_train,Y_train,epochs = 100,validation_data = (scaled_X_test,Y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2358d0d4fd0>

In [None]:
y_pred = classifier.predict(scaled_X_test)
y_pred = (y_pred > 0.3)
print("Accuracy : ",accuracy_score(y_pred,Y_test))

Accuracy :  0.8804347826086957


In [None]:
print(classification_report(y_pred,Y_test))

              precision    recall  f1-score   support

       False       0.82      0.91      0.86        74
        True       0.93      0.86      0.90       110

    accuracy                           0.88       184
   macro avg       0.87      0.88      0.88       184
weighted avg       0.89      0.88      0.88       184

