# Installing the libraries

In [None]:
! pip install tensorflow==2.4.0



# Importing the libraries

In [None]:
''' Importing the libraries '''

import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import sklearn
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Mount the Google Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive so files under
# /content/drive/MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the Dataset

In [None]:
# Reload the previously saved, pre-processed CSV from Google Drive.
# The first CSV column is used as the DataFrame index.
heart = pd.read_csv(
    '/content/drive/MyDrive/Heart Attack Dataset/heart_new_processed.csv',
    index_col=[0],
)
heart.head()

Unnamed: 0,age,sex,cp,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,150,0,2.3,0,0,1,1
1,37,1,2,187,0,3.5,0,0,2,1
2,41,0,1,172,0,1.4,2,0,2,1
3,56,1,1,178,0,0.8,2,0,2,1
4,57,0,0,163,1,0.6,2,0,2,1


# Splitting features and target

In [None]:
# Separate the predictor columns from the output column.
X = heart.iloc[:, :9].values                   # first nine columns -> feature matrix
Y = heart.iloc[:, -1].values.reshape(-1, 1)    # last column -> target as an (n_samples, 1) column

# Normalising the dataset

In [None]:
# Min-max scale the features to [0, 1].
# `scaler` is fitted on the feature matrix ONLY, so it keeps the per-feature
# min/max and can later be used to transform new feature rows. Re-fitting the
# same object on Y would overwrite those statistics with the label statistics.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# contribute to the min/max values; consider fitting on the training rows only.
scaler = MinMaxScaler()
x = scaler.fit_transform(X)

# The target needs no scaling (assumed to be coded 0/1 already - confirm);
# cast to float and keep the (n_samples, 1) shape expected downstream.
y = Y.astype(float)

# Train Test split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

# Applying appropriate ML/DL models to predict the Heart attacks

**DNN Model**

In [None]:
# Fully-connected binary classifier: three hidden ReLU layers of 200 units,
# with dropout (rate 0.5) after the second hidden layer.
model = Sequential()
model.add(tf.keras.layers.Dense(200, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))
model.add(tf.keras.layers.Dense(200, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(200, activation='relu'))

# A single-unit output must use sigmoid. Softmax normalises across the units of
# the layer, so with one unit it always outputs exactly 1.0 and the network
# cannot learn anything.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Callbacks that react to the validation loss. They only take effect when
# passed to model.fit(..., callbacks=[...]).

# Stop training after 10 epochs without improvement in val_loss and restore
# the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 when val_loss has not improved by at
# least 1e-4 for 3 epochs.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=3,
    verbose=1,
    min_delta=0.0001,
)

In [None]:
# Compile for binary classification with plain SGD.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    metrics=['accuracy'],
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train with 20% of the training rows held out for validation. The callbacks
# stop training early and lower the learning rate when val_loss stops improving.
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 200)               2000      
_________________________________________________________________
dense_5 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
Total params: 82,601
Trainable params: 82,601
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epo

# Tuning the model

In [None]:
''' Installing the keras tuner to tune the Deep Learning Model '''

! pip install -q -U keras-tuner

In [None]:
import keras_tuner as kt

In [None]:
def model_builder(hp):
  """Build and compile a binary-classification DNN for the Keras Tuner search.

  Two hyperparameters are tuned:
    * ``units`` - width shared by the three hidden Dense layers (32 to 512, step 32).
    * ``learning_rate`` - Adam learning rate, one of 1e-2, 1e-3 or 1e-4.

  Args:
    hp: hyperparameter container supplied by the tuner (provides ``Int`` and ``Choice``).

  Returns:
    A compiled ``keras.Sequential`` model whose input width is ``X_train.shape[1]``.
  """
  model = keras.Sequential()

  # Number of units used by every hidden Dense layer.
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)

  model.add(tf.keras.layers.Dense(units=hp_units, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.Dense(units=hp_units, activation='relu'))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(keras.layers.Dense(units=hp_units, activation='relu'))

  # Single-unit output: sigmoid gives a probability in (0, 1). Softmax over one
  # unit is constantly 1.0, which makes the model untrainable.
  model.add(keras.layers.Dense(1, activation='sigmoid'))

  # Learning rate for the Adam optimizer: 0.01, 0.001 or 0.0001.
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.BinaryCrossentropy(),
                metrics=['accuracy'])

  return model

In [None]:
# Instantiate the Hyperband tuner, which ranks trials by validation accuracy.
# overwrite=True discards any earlier results stored under my_dir/intro_to_kt.
# Without it, an existing project is silently reloaded and the search may exit
# immediately with stale hyperparameters from a previous model definition.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt',
                     overwrite=True)

# Early-stopping callback for the search: abort a trial when val_loss has not
# improved for 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

INFO:tensorflow:Reloading Oracle from existing project my_dir/intro_to_kt/oracle.json
INFO:tensorflow:Reloading Tuner from my_dir/intro_to_kt/tuner0.json


In [None]:
# Run the hyperparameter search on the training data, holding out 20% for validation.
tuner.search(
    X_train,
    y_train,
    epochs=200,
    validation_split=0.2,
    callbacks=[stop_early],
)

# Keep the top-ranked hyperparameter set.
best_hps = tuner.get_best_hyperparameters(num_trials=2)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

# Build a model from the best hyperparameters and train it for 200 epochs
# to find out at which epoch the validation accuracy peaks.
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    validation_split=0.2,
)

# 1-based index of the first epoch that reached the highest validation accuracy.
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = 1 + val_acc_per_epoch.index(max(val_acc_per_epoch))
print('Best epoch: %d' % (best_epoch,))


INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 96 and the optimal learning rate for the optimizer
is 0.001.

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62

In [None]:
# Build a fresh model from the best hyperparameters and retrain it for
# exactly `best_epoch` epochs.
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(
    X_train,
    y_train,
    epochs=best_epoch,
    validation_split=0.2,
)

# Evaluate the retrained model on the held-out test set and print the results.
eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.6843596696853638, 0.5409836173057556]


**Analysis of the model**

We can clearly see that the DNN performs poorly on both the training set and the test set; in other words, the model is underfitting. Since the DNN is not performing well, we will build and train other models instead.

**Different Machine Learning Models**

We build a pipeline to perform hyperparameter tuning (Grid Search CV) on different classification models and choose the model with the best accuracy after tuning. We then save the best-performing model.

In [None]:
# Grid-search utility used to tune every candidate model below.
from sklearn.model_selection import GridSearchCV

# Single-step pipeline; each grid below swaps in its own estimator for the
# "classifier" step.
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Candidate learning algorithms and their hyperparameter grids.
grid_param = [
    # Logistic regression with l1 or l2: only liblinear and saga support the l1 penalty.
    {"classifier": [LogisticRegression()],
     "classifier__penalty": ['l2', 'l1'],
     "classifier__solver": ['liblinear', 'saga'],
     "classifier__C": np.logspace(0, 4, 10)},
    # Logistic regression with the remaining solvers, which support l2 only.
    {"classifier": [LogisticRegression()],
     "classifier__penalty": ['l2'],
     "classifier__solver": ['newton-cg', 'sag', 'lbfgs'],
     "classifier__C": np.logspace(0, 4, 10)},
    # Random forest; seeded so repeated runs select the same model.
    {"classifier": [RandomForestClassifier(random_state=42)],
     "classifier__n_estimators": [10, 100, 1000],
     "classifier__max_depth": [5, 8, 15, 25, 30, None],
     "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
     "classifier__max_leaf_nodes": [2, 5, 10]},
    # Support vector classifier.
    {"classifier": [SVC()],
     "classifier__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
     "classifier__gamma": ['scale', 'auto'],
     "classifier__C": [0.1, 1, 10, 100, 1000]},
    # Decision tree. min_samples_split must be >= 2 and min_samples_leaf >= 1;
    # None is not a valid value for either, so only valid integers are listed.
    {"classifier": [DecisionTreeClassifier(random_state=42)],
     "classifier__criterion": ['gini', 'entropy'],
     "classifier__splitter": ['best', 'random'],
     "classifier__max_depth": [5, 8, 15, 25, 30, None],
     "classifier__min_samples_split": [2, 5, 10, 15, 20],
     "classifier__min_samples_leaf": [1, 5, 10, 15, 20]},
    # Gaussian naive Bayes.
    {"classifier": [GaussianNB()],
     "classifier__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-5]},
    # k-nearest neighbours.
    {"classifier": [KNeighborsClassifier()],
     "classifier__n_neighbors": [1, 5, 10, 15, 20, 25]},
]

# 5-fold cross-validated grid search over all candidates, using every CPU core.
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)

# scikit-learn expects a 1-D target; ravel() flattens the (n_samples, 1) column
# and avoids a DataConversionWarning on every fit.
best_model = gridsearch.fit(X_train, y_train.ravel())

  self._final_estimator.fit(Xt, y, **fit_params)


In [None]:
# Show the winning pipeline together with its selected hyperparameters.
print(best_model.best_estimator_)

# Accuracy of the best model on the held-out test set.
best_test_accuracy = best_model.score(X_test, y_test)
print("The mean accuracy of the model is:", best_test_accuracy)

Pipeline(memory=None,
         steps=[('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=25, max_features='auto',
                                        max_leaf_nodes=2, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=15,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)
The mean accuracy of the model is: 0.819672131147541


Based on the above hyper-parameters, we define the SVM model with the best parameters and train the model.

In [None]:
# SVM with a linear kernel and C=10. probability=True additionally enables
# predict_proba. All other parameters are left at their scikit-learn defaults.
svm_model = SVC(C=10, kernel='linear', gamma='scale', probability=True)

# Train the model. scikit-learn expects a 1-D target, so the (n_samples, 1)
# column is flattened to avoid a DataConversionWarning.
svm_model.fit(X_train, y_train.ravel())

# Accuracy on the training rows and on the held-out test rows.
train_score = svm_model.score(X_train, y_train)
test_score = svm_model.score(X_test, y_test)

print("test score: {} train score: {}".format(test_score, train_score), '\n')

# Class predictions for the test rows; used by the metric cells.
y_pred = svm_model.predict(X_test)

  y = column_or_1d(y, warn=True)


test score: 0.8032786885245902 train score: 0.8791666666666667 



**Analysis of the performance of the model**

Based on the above hyper-parameter tuning of the different classification models, we see that the SVM has the highest test accuracy, at 80%. Hence, this is the best-performing model and we will save it.

# Performance Metrics

**Performance Metrics of SVM Model**

In [None]:
# Confusion matrix of the SVM predictions against the true test labels.
svm_confusion = confusion_matrix(y_test, y_pred)
print(svm_confusion)

[[20  8]
 [ 4 29]]


In [None]:
# Per-class precision, recall and F1 for the SVM model's test-set predictions.
# y_pred comes from svm_model.predict, not from a logistic regression model.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.71      0.77        28
         1.0       0.78      0.88      0.83        33

    accuracy                           0.80        61
   macro avg       0.81      0.80      0.80        61
weighted avg       0.81      0.80      0.80        61



# Saving the model (either as pkl file or .h5 file)

In [None]:
# Serialise the trained SVM to a pickle file in the current working directory.
# NOTE(review): the feature scaler is not saved here; whoever loads this model
# must scale inputs the same way as during training.
Pkl_Filename = "svmbestmodel.pkl"

with open(Pkl_Filename, mode='wb') as model_file:
    pickle.dump(svm_model, model_file)

# Predicting the values

In [None]:
# One raw (unscaled) record with nine feature values.
# Assumes the values follow the column order of X - confirm against the dataset.
row = [57, 1, 0, 143, 1, 3, 1, 1, 3]

# Scale the record with the min/max statistics of the feature matrix X, i.e.
# the same scaling the training features received. Calling fit_transform on
# the single row would re-fit the scaler on that row alone (min == max for
# every feature), mapping every value to 0 and making the prediction
# independent of the input.
feature_scaler = MinMaxScaler().fit(X)
X1 = feature_scaler.transform([row])

pred = svm_model.predict(X1)

if pred[0] == 1.0:
  print("You have higher chance of getting heart attack")
else:
  print("You have lower chance of getiing heart attack")

You have higher chance of getting heart attack


In [None]:
# Display the raw prediction array returned by svm_model.predict for the sample row.
pred

array([1.])