In [26]:
# Import Required Libraries and warnings

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

# keras
import keras
from keras.models import Sequential
from keras.layers import Dense

In [2]:
!pip install keras-tuner

Collecting keras-tuner
  Using cached keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Using cached kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Using cached keras_tuner-1.4.7-py3-none-any.whl (129 kB)
Using cached kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [51]:
# importing the tuner
import keras_tuner
from kerastuner.tuners import RandomSearch

In [4]:
# Fetch the Data

data=pd.read_csv('/content/drive/MyDrive/Data Science/Practice - ML/Deep Learning/healthcare-dataset-stroke-data.csv')
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [9]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [10]:
# Dropping the PII Information

data.drop(['id'],axis=1,inplace=True)

In [11]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [15]:
# Checking for Null Values

data.isnull().sum()

Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201
smoking_status,0


In [17]:
# BMI is of float data type and it should be replaced with Median Values

data['bmi'].fillna(data['bmi'].median(),inplace=True)

In [18]:
data.isnull().sum()

Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0
smoking_status,0


In [21]:
# Checking for Duplicated Values

data.duplicated().sum()

np.int64(0)

In [14]:
# Finding the unique Values of all Categorical Columns

for cols in data.columns:
  if data[cols].dtype =='O':
    print(f"Unique Value of {cols} is {np.unique(data[cols])}")

Unique Value of gender is ['Female' 'Male' 'Other']
Unique Value of ever_married is ['No' 'Yes']
Unique Value of work_type is ['Govt_job' 'Never_worked' 'Private' 'Self-employed' 'children']
Unique Value of Residence_type is ['Rural' 'Urban']
Unique Value of smoking_status is ['Unknown' 'formerly smoked' 'never smoked' 'smokes']


In [22]:
data['ever_married'] = data['ever_married'].map({'No':0,'Yes':1})
data['Residence_type'] = data['Residence_type'].map({'Rural':0,'Urban':1})

In [23]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,1,Self-employed,0,202.21,28.1,never smoked,1
2,Male,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
3,Female,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
4,Female,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1


In [33]:
# Use One hot Encoding to transform Data
encoded_data=pd.get_dummies(data,columns=['gender','smoking_status','work_type'],drop_first=True,dtype='int')
encoded_data.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,67.0,0,1,1,1,228.69,36.6,1,1,0,1,0,0,0,1,0,0
1,61.0,0,0,1,0,202.21,28.1,1,0,0,0,1,0,0,0,1,0
2,80.0,0,1,1,0,105.92,32.5,1,1,0,0,1,0,0,1,0,0
3,49.0,0,0,1,1,171.23,34.4,1,0,0,0,0,1,0,1,0,0
4,79.0,1,0,1,0,174.12,24.0,1,0,0,0,1,0,0,0,1,0


In [34]:
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             5110 non-null   float64
 1   hypertension                    5110 non-null   int64  
 2   heart_disease                   5110 non-null   int64  
 3   ever_married                    5110 non-null   int64  
 4   Residence_type                  5110 non-null   int64  
 5   avg_glucose_level               5110 non-null   float64
 6   bmi                             5110 non-null   float64
 7   stroke                          5110 non-null   int64  
 8   gender_Male                     5110 non-null   int64  
 9   gender_Other                    5110 non-null   int64  
 10  smoking_status_formerly smoked  5110 non-null   int64  
 11  smoking_status_never smoked     5110 non-null   int64  
 12  smoking_status_smokes           51

In [44]:
# Train Test Split
x=encoded_data.drop(['stroke'],axis=1)
y=encoded_data['stroke']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [45]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4088, 16)
(1022, 16)
(4088,)
(1022,)


In [54]:
# Standard Scaling

scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [49]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.584155,-0.321981,-0.236189,0.726005,-1.023265,0.135593,-0.054102,1.192995,0.0,2.177515,-0.764122,-0.429233,-0.070117,-1.153548,2.285218,-0.38971
1,0.829708,-0.321981,-0.236189,0.726005,0.977264,-0.397457,0.951155,-0.838227,0.0,-0.459239,-0.764122,-0.429233,-0.070117,0.866891,-0.437595,-0.38971
2,-0.989841,-0.321981,-0.236189,-1.377402,-1.023265,-1.028701,0.616069,-0.838227,0.0,-0.459239,1.308692,-0.429233,-0.070117,0.866891,-0.437595,-0.38971
3,-0.546049,-0.321981,-0.236189,0.726005,0.977264,-0.893246,0.190768,1.192995,0.0,2.177515,-0.764122,-0.429233,-0.070117,-1.153548,-0.437595,-0.38971
4,-0.546049,-0.321981,-0.236189,-1.377402,-1.023265,-1.02629,-1.162462,-0.838227,0.0,-0.459239,1.308692,-0.429233,-0.070117,0.866891,-0.437595,-0.38971


In [50]:
## Building a ANN Model ##

def hyper_model(hp):
  # Initialize the model
    model = Sequential()

    # -- FORWARD PROPAGATION -- #
    # Input Layer
    model.add(Dense(units=len(x_train[0]),
                    kernel_initializer='he_uniform',
                    input_dim=len(x_train[0])))

    # Hidden Layer
    for layer in range(hp.Int('num_layer', 2, 5)):
        model.add(Dense(units=hp.Int('units_'+str(layer),
                                     min_value=2,
                                     max_value=8),
                        kernel_initializer = 'he_uniform',
                        activation = hp.Choice('activation', ['relu', 'linear', 'tanh'])))
    # Ouput Layer
    model.add(Dense(units=1, activation='relu'))

    # -- BACK PROPAGATION -- #
    model.compile(optimizer=keras.optimizers.Adam(
        hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),     # 0.01, 0.001, 0.0001
        loss = 'BinaryCrossentropy',
        metrics = ['accuracy'])

    # model results
    return model

In [52]:
# Spining the tuner
tuner = RandomSearch(
    hypermodel=hyper_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=2,
    directory='my_dir',
    project_name='AQI'
)

In [55]:
# fitting the data onto the tuner
tuner.search(x_train, y_train, epochs=30, batch_size=20, validation_data=(x_test, y_test))

Trial 3 Complete [00h 01m 38s]
val_accuracy: 0.9315068423748016

Best val_accuracy So Far: 0.9315068423748016
Total elapsed time: 00h 01m 42s


In [57]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps.values

{'num_layer': 2,
 'units_0': 8,
 'activation': 'relu',
 'units_1': 8,
 'learning_rate': 0.0001,
 'units_2': 3,
 'units_3': 7}

In [58]:
# build and train the model
final_model = tuner.hypermodel.build(best_hps)
history = final_model.fit(x_train, y_train, initial_epoch=30, epochs=50, validation_data=(x_test, y_test))

Epoch 31/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9147 - loss: 0.9274 - val_accuracy: 0.9051 - val_loss: 1.0271
Epoch 32/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9297 - loss: 0.7620 - val_accuracy: 0.9119 - val_loss: 1.0258
Epoch 33/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9270 - loss: 0.8676 - val_accuracy: 0.9149 - val_loss: 1.0204
Epoch 34/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9267 - loss: 0.8988 - val_accuracy: 0.9139 - val_loss: 1.0191
Epoch 35/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9343 - loss: 0.8133 - val_accuracy: 0.9159 - val_loss: 1.0144
Epoch 36/50
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9326 - loss: 0.8189 - val_accuracy: 0.9178 - val_loss: 0.9983
Epoch 37/50
[1m128/12