In [49]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random 
from preprocessing_functions import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

# Import and undersample the dataset

In [50]:
heart_attack_2020 = pd.read_csv(r'C:\Users\Hp\Documents\Machine_learning_projects\heart_attack_predictions\0_datasets\2020\heart_2020_cleaned.csv')
heart_attack_2020 = undersample_data(heart_attack_2020, 'HeartDisease')
print(heart_attack_2020.value_counts('HeartDisease'))
heart_attack_2020.head()

HeartDisease
No     27373
Yes    27373
Name: count, dtype: int64


Unnamed: 0,HeartDisease,BMI,Smoking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
111390,No,25.02,No,No,No,Female,50-54,White,No,Yes,Good,6.0,Yes,No,Yes
250162,No,23.3,No,No,No,Female,50-54,Black,No,Yes,Good,7.0,No,No,No
230713,No,25.77,Yes,No,No,Male,45-49,White,No,Yes,Very good,4.0,Yes,No,No
153210,No,25.61,Yes,Yes,No,Female,25-29,White,Yes,No,Fair,6.0,No,No,No
149117,No,31.16,Yes,No,Yes,Female,65-69,Black,No,Yes,Fair,7.0,No,No,No


# Preprocessing

In [51]:
from preprocessing_functions import *

# numerical features
numerical_features = ['BMI','SleepTime']
# categorical features
categorical_features = ['Smoking', 'Stroke', 'DiffWalking', 'Sex','AgeCategory', 'Race', 'Diabetic', 
                        'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']

heart_attack_2020 = heart_attack_2020[numerical_features + categorical_features + ['HeartDisease']]

# Encoding, scaling and train test split
(Train_df_encoded, Train_labels_encoded), (Valid_df_encoded, 
        Valid_labels_encoded), (Test_df_encoded, Test_labels_encoded) = preprocess_features(heart_attack_2020, one_hot_encoding=False,
                categorical_columns=categorical_features, numerical_tranformer = 'log_min_max',
                numerical_columns=numerical_features, target='HeartDisease')

print(f' train dataset length: {len(Train_df_encoded)}')
print(f' validation dataset length: {len(Valid_df_encoded)}')
print(f' test dataset length: {len(Test_df_encoded)}')
Train_df_encoded.shape, Train_labels_encoded.shape

 train dataset length: 46839
 validation dataset length: 4074
 test dataset length: 3833




((46839, 46), TensorShape([46839]))

In [52]:
# Creating fast processing dataset
Dataset_train, Dataset_valid, Dataset_test = create_fast_preprocessing_odds(Train_df_encoded, Train_labels_encoded,
        Valid_df_encoded, Valid_labels_encoded, Test_df_encoded, Test_labels_encoded)
Dataset_train

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 46), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

# Creating checkpoint callback

In [53]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    filepath='Dense_model_best.h5',
    save_weights_only=False,  
    monitor='val_accuracy',  
    save_best_only=True, 
    verbose=1  
)

# Creating the model

## Using log_min_max for BMI

In [72]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(Train_df_encoded.shape[1],), name = 'input_layer')
x = layers.Dense(96, activation='relu')(inputs)
x = layers.Dense(96, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_dense = tf.keras.Model(inputs, outputs)

model_dense.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [73]:
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

dense_history = model_dense.fit(
                                Dataset_train,
                                epochs=45,
                                validation_data=Dataset_valid,
                                validation_steps=len(Dataset_valid),
                                callbacks=[checkpoint_callback]
                                )

Epoch 1/45


Epoch 1: val_accuracy did not improve from 0.77418
Epoch 2/45
Epoch 2: val_accuracy did not improve from 0.77418
Epoch 3/45
Epoch 3: val_accuracy did not improve from 0.77418
Epoch 4/45
Epoch 4: val_accuracy did not improve from 0.77418
Epoch 5/45
Epoch 5: val_accuracy did not improve from 0.77418
Epoch 6/45
Epoch 6: val_accuracy did not improve from 0.77418
Epoch 7/45
Epoch 7: val_accuracy did not improve from 0.77418
Epoch 8/45
Epoch 8: val_accuracy did not improve from 0.77418
Epoch 9/45
Epoch 9: val_accuracy did not improve from 0.77418
Epoch 10/45
Epoch 10: val_accuracy did not improve from 0.77418
Epoch 11/45
Epoch 11: val_accuracy did not improve from 0.77418
Epoch 12/45
Epoch 12: val_accuracy did not improve from 0.77418
Epoch 13/45
Epoch 13: val_accuracy did not improve from 0.77418
Epoch 14/45
Epoch 14: val_accuracy did not improve from 0.77418
Epoch 15/45
Epoch 15: val_accuracy did not improve from 0.77418
Epoch 16/45
Epoch 16: val_accuracy did not improve from 0.77418
Epoch