# Importing Libraries

We start by importing the libraries we need: NumPy and pandas.

In [None]:
import numpy as np
import pandas as pd

# Loading the Dataset

We load the dataset from a CSV file and display its first rows.

In [None]:
# Read the raw CSV into a DataFrame, then preview its first rows.
df = pd.read_csv('your_dataset.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Exploratory Data Analysis

Now we start the exploratory data analysis.

### Shape of the data

First, we need to know the shape of our data (how many examples and features we have).

In [None]:
# DataFrame.shape is a (row count, column count) tuple, so unpack it in one step.
rows, columns = df.shape

print('there are {} rows and {} columns in the data'.format(rows, columns))

there are 5110 rows and 12 columns in the data


### Types of different Columns

Check the type of each feature and whether any column contains null values.

In [None]:
# Column dtypes and non-null counts.
df.info()
# Number of missing values per column (isna is the canonical name of isnull).
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_ty

### Dealing with categorical variables

Now we will walk through the categorical variables that we have to see the categories and the counts of each of them.

In [None]:
# Text-valued features whose category frequencies we want to inspect.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

for column in categorical_columns:
    print(f"Column: {column}")
    # Frequency of every distinct category, most common first.
    category_counts = df[column].value_counts()
    print(category_counts)
    print("\n")

Column: gender
Female    2994
Male      2115
Other        1
Name: gender, dtype: int64


Column: ever_married
Yes    3353
No     1757
Name: ever_married, dtype: int64


Column: work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64


Column: Residence_type
Urban    2596
Rural    2514
Name: Residence_type, dtype: int64


Column: smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64




# Preprocessing

Prepare the data so that it is ready to be used to train a DL model.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Drop the identifier column, which is not used as a feature.
# errors="ignore" keeps the cell re-runnable once "id" has already been removed.
df = df.drop(columns="id", errors="ignore")

# Fill missing BMI values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is not guaranteed to update `df`
# (and does not under pandas Copy-on-Write).
mean_bmi = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(mean_bmi)

# Encode categorical columns as integer codes.
le = LabelEncoder()
# .copy() gives an independent frame, so the assignments in the loop below do
# not write into a slice of `df` (which raises SettingWithCopyWarning).
cat_data = df[['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']].copy()
num_data = df[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']]

for i in cat_data:
    # The encoder is re-fitted for every column, so after the loop `le` only
    # holds the mapping of the last column processed.
    cat_data[i] = le.fit_transform(cat_data[i])

# Combine the encoded categorical data with the numerical data.
df = pd.concat([cat_data, num_data], axis=1)
df.head()

# Building the DL Model

Now it's time to build the actual model. Propose a DL architecture suitable for this problem and print its summary.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def create_model():
    """Build the baseline feed-forward binary classifier.

    The network takes 10 input features and stacks three ReLU dense layers
    (64, 128 and 64 units) followed by a single sigmoid output unit.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    layer_stack = [
        # First dense layer; also declares the 10-feature input.
        Dense(64, input_dim=10, activation='relu'),
        # Hidden layers.
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        # Sigmoid output for binary classification.
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack)

# Instantiate the network defined by create_model() and print its layer-by-layer summary.
model = create_model()
model.summary()


Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_113 (Dense)           (None, 64)                704       
                                                                 
 dense_114 (Dense)           (None, 128)               8320      
                                                                 
 dense_115 (Dense)           (None, 64)                8256      
                                                                 
 dense_116 (Dense)           (None, 1)                 65        
                                                                 
Total params: 17345 (67.75 KB)
Trainable params: 17345 (67.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Compiling the model

Now we need to compile the model.

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


### Fitting the model

We split our dataset into training and testing sets, fit the model on the training data (70%), and validate on the testing data (30%).

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Separate the target column from the predictors.
Y = df['stroke']
X = df.drop(columns='stroke')

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Train for 10 epochs, reporting metrics on the test split after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test),
)
scores = model.evaluate(x_test, y_test, verbose=0)

print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 94.19%


# Improving DL Models

**TIP: When tuning your model to obtain a better performance, make sure you use a validation set**

### Data Improvement

In [None]:
from imblearn.over_sampling import SMOTE
# Carve a validation set out of the real training data BEFORE oversampling.
# Splitting after SMOTE would put synthetic points (interpolated from training
# neighbours) into the validation set and leak information into the score.
x_train_part, x_val, y_train_part, y_val = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Apply SMOTE to balance the class distribution of the training portion only.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_part, y_train_part)

data_model = create_model()
# Compile the model
data_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit on the oversampled training data; validate on untouched (real) samples.
data_model.fit(
    x_train_resampled,
    y_train_resampled,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=32,
)

# Score on the original test split so the result is measured on real,
# non-synthetic data and can be compared with the other models.
scores = data_model.evaluate(x_test, y_test, verbose=0)

print("%s: %.2f%%" % (data_model.metrics_names[1], scores[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 92.24%


### Model Design

In [None]:
def create_regularized_model(input_dim=10, dropout_rate=0.2):
    """Build the feed-forward binary classifier with dropout regularization.

    Same dense stack as the baseline (64 -> 128 -> 64 -> 1), with a dropout
    layer after each of the three ReLU layers.

    Args:
        input_dim: Number of input features. Defaults to 10, the number of
            predictor columns left after removing ``stroke`` from the
            preprocessed frame (5 encoded categorical + 5 numerical).
        dropout_rate: Fraction of units dropped by every dropout layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))

    # Dropout after the first dense layer
    model.add(Dropout(dropout_rate))

    model.add(Dense(128, activation='relu'))

    # Dropout after the second hidden layer
    model.add(Dropout(dropout_rate))

    model.add(Dense(64, activation='relu'))

    # Dropout before the output layer
    model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='sigmoid'))

    return model

# Build the dropout-regularized network and configure it for training.
regularized_model = create_regularized_model()
regularized_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training split (x_train / y_train) and report validation
# metrics on the test split after every epoch.
regularized_model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test),
)
scores = regularized_model.evaluate(x_test, y_test, verbose=0)

print("%s: %.2f%%" % (regularized_model.metrics_names[1], scores[1]*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 94.19%


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from tensorflow import keras
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score
# Define a function to create a model with specific hyperparameters
def create_hp_model(num_hidden_units=32, learning_rate=0.01):
    """Build and compile a two-hidden-layer binary classifier.

    Args:
        num_hidden_units: Width of each of the two ReLU hidden layers.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss and
        the accuracy metric.
    """
    network = Sequential([
        # First hidden layer; also declares the 10-feature input.
        Dense(num_hidden_units, input_dim=10, activation='relu'),
        Dense(num_hidden_units, activation='relu'),
        # Sigmoid output for binary classification.
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return network

# Custom scoring function for GridSearchCV
def custom_scorer(estimator, X, y):
    """Return the accuracy of ``estimator`` on ``(X, y)``.

    The ``(estimator, X, y)`` signature is what scikit-learn expects of a
    callable passed directly as ``scoring=...``.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Feature matrix to predict on.
        y: True binary labels.

    Returns:
        The accuracy as computed by ``accuracy_score``.
    """
    y_pred = np.asarray(estimator.predict(X))
    # A sigmoid network's predict() yields probabilities (typically shaped
    # (n, 1)); accuracy_score rejects a mix of binary and continuous values,
    # so flatten and threshold at 0.5. Non-float predictions are treated as
    # class labels already and are left untouched.
    if np.issubdtype(y_pred.dtype, np.floating):
        y_pred = (y_pred.ravel() >= 0.5).astype(int)
    accuracy = accuracy_score(y, y_pred)
    return accuracy
# GridSearchCV clones its estimator for every candidate and fold, which
# requires the scikit-learn estimator API (get_params / set_params / fit /
# predict). A compiled Keras model does not implement it, so wrap the model
# factory in a minimal estimator.
from sklearn.base import BaseEstimator, ClassifierMixin


class KerasBinaryClassifier(ClassifierMixin, BaseEstimator):
    """Minimal scikit-learn wrapper around ``create_hp_model``.

    Args:
        num_hidden_units: Forwarded to ``create_hp_model``.
        learning_rate: Forwarded to ``create_hp_model``.
        epochs: Number of training epochs used by ``fit``.
        batch_size: Mini-batch size used by ``fit``.
    """

    def __init__(self, num_hidden_units=32, learning_rate=0.01, epochs=10, batch_size=32):
        self.num_hidden_units = num_hidden_units
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        """Build a fresh network and train it on ``(X, y)``; returns ``self``."""
        self.classes_ = np.unique(y)
        # A new model per fit() call guarantees no weights carry over between folds.
        self.model_ = create_hp_model(self.num_hidden_units, self.learning_rate)
        self.model_.fit(
            np.asarray(X),
            np.asarray(y),
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=0,
        )
        return self

    def predict(self, X):
        """Return hard 0/1 labels by thresholding the sigmoid output at 0.5."""
        probabilities = self.model_.predict(np.asarray(X), verbose=0)
        return (probabilities.ravel() >= 0.5).astype(int)


# Create a pipeline to include data preprocessing (scaling) followed by the model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KerasBinaryClassifier()),
])

# Define the hyperparameter grid to search: one dict per candidate, so that
# exactly these three (units, learning rate) pairs are evaluated.
param_grid = [
    {'model__num_hidden_units': [32], 'model__learning_rate': [0.001]},
    {'model__num_hidden_units': [64], 'model__learning_rate': [0.01]},
    {'model__num_hidden_units': [128], 'model__learning_rate': [0.1]},
]

# custom_scorer already has the (estimator, X, y) scorer signature, so it is
# passed directly; make_scorer is only for metric functions of (y_true, y_pred).
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring=custom_scorer)

# Perform the grid search on the training data
grid_result = grid.fit(x_train, y_train)
print("best params:", grid_result.best_params_)

# With the default refit=True, the best pipeline has already been re-trained
# on the whole training set, so no additional fit is needed here.
best_pipeline = grid_result.best_estimator_
best_model = best_pipeline.named_steps['model'].model_

# Evaluate the underlying Keras model on the test split, passing the data
# through the pipeline's own scaler so it sees inputs scaled as in training.
x_test_for_best = best_pipeline.named_steps['scaler'].transform(x_test)
scores = best_model.evaluate(x_test_for_best, y_test, verbose=0)

print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1]*100))