# Stroke Prediction Using Deep Learning

In [120]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Loading in the data and getting a simple overview of it

In [121]:
# Location of the raw stroke CSV. The default is the original local path;
# set the STROKE_DATA_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_CSV = os.environ.get(
    'STROKE_DATA_CSV',
    'C:\\Users\\Tyron\\OneDrive\\Desktop\\Machine Learning Projects\\Machine-Learning-Projects\\Stroke_Prediction\\healthcare-dataset-stroke-data.csv',
)
data = pd.read_csv(DATA_CSV)
# Preview the first ten records.
data.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [122]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [123]:
# Per-column flag: True where the column contains at least one missing value.
data.isna().any()

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool

In [124]:
# Per-column count of missing values.
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### Data Preprocessing

#### Balancing the classes and loading the test set

In [125]:
# Drop rows whose gender is 'Other' before sampling.
data = data[data['gender'] != 'Other']

# Undersample to a balanced data set: draw the same number of rows from each
# class. The size is derived from the data instead of a hard-coded 249, so the
# cell no longer raises when a class has fewer rows than that constant.
# NOTE(review): for the CSV loaded above the smaller class is presumably the
# 249 stroke rows, which reproduces the previous sample exactly - confirm.
stroke_rows = data[data['stroke'] == 1]
no_stroke_rows = data[data['stroke'] == 0]
n_per_class = min(len(stroke_rows), len(no_stroke_rows))

# Same seed for both draws, as before, so the sample is repeatable.
stroke_data = stroke_rows.sample(n_per_class, random_state=42)
no_stroke_data = no_stroke_rows.sample(n_per_class, random_state=42)

In [143]:
# Load the test records from 'test.csv' (resolved relative to the working directory).
test = pd.read_csv('test.csv')
# Keep the record ids aside.
# NOTE(review): this is captured before the row filters in the following
# cells, so it still includes the ids of any rows those filters remove.
submission = test['id']

In [144]:
# Keep only the rows whose work_type is something other than 'Never_worked'.
test = test.loc[test['work_type'].ne('Never_worked')]

In [145]:
# Keep only the rows whose gender is something other than 'Other'.
test = test.loc[test['gender'].ne('Other')]

In [146]:
# Capture the ids here, after the row filters above, so `submission` lines up
# row-for-row with the feature frame that is later passed to the model.
submission = test['id']
# Remove the id column so it is not fed to the preprocessing pipeline as a feature.
test = test.drop(columns=['id'])

In [130]:
# Stack the two equally sized class samples. reset_index() moves each frame's
# original row label into an 'index' column, which is dropped from the features.
balanced_parts = [stroke_data.reset_index(), no_stroke_data.reset_index()]
equal_sample = pd.concat(balanced_parts, axis=0)

# Target: the stroke label. Features: everything except bookkeeping columns and the label.
y = equal_sample['stroke']
X = equal_sample.drop(columns=['index', 'id', 'stroke'])

### Creating a Pipeline to automate data preprocessing

In [131]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

In [132]:
class Stroke_Pipeline():
    """Turn a frame of stroke records into a dense TensorFlow tensor.

    Columns of dtype int64/float64 are median-imputed and standardised;
    columns of dtype object are one-hot encoded. Columns of any other dtype
    are not selected and therefore dropped by the ColumnTransformer.

    The transformer is fitted on the frame given to the constructor.
    `transform()` with no argument returns that frame transformed (the
    original behaviour). `transform(other)` applies the already-fitted
    transformer to another frame, so new data is scaled and encoded with the
    statistics and category columns learned from the constructor frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame used to fit the preprocessing steps.
    """

    def __init__(self, data):
        self.data = data
        self.num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
        # handle_unknown='ignore': a category absent from the fitting frame is
        # encoded as all zeros instead of raising, which keeps the output
        # width fixed when the fitted transformer is applied to new data.
        self.cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
        # Fitted ColumnTransformer; created on first use.
        self.pipe = None

    def _build_transformer(self):
        """Create an unfitted ColumnTransformer for the columns of self.data."""
        num_attrib = self.data.select_dtypes(['int64', 'float64']).columns
        cat_attrib = self.data.select_dtypes('object').columns

        # sparse_threshold=0 forces a dense array; tf.convert_to_tensor cannot
        # take the SciPy sparse matrix that would otherwise be returned when
        # the one-hot block makes the output sparse enough.
        return ColumnTransformer(
            [
                ('num', self.num_pipeline, num_attrib),
                ('cat', self.cat_pipeline, cat_attrib)
            ],
            sparse_threshold=0
        )

    def transform(self, data=None):
        """Return the preprocessed data as a tensor.

        Parameters
        ----------
        data : pandas.DataFrame, optional
            Frame to transform with the transformer fitted on the constructor
            frame. It must contain the columns selected from that frame. When
            omitted, the transformer is (re)fitted on the constructor frame
            and that frame is transformed.

        Returns
        -------
        tf.Tensor
            Dense 2-D tensor of the scaled numeric and one-hot encoded columns.
        """
        if data is None:
            self.pipe = self._build_transformer()
            piped_data = self.pipe.fit_transform(self.data)
        else:
            if self.pipe is None:
                # Fit lazily so transform(other) works without a prior transform().
                self.pipe = self._build_transformer().fit(self.data)
            piped_data = self.pipe.transform(data)
        return tf.convert_to_tensor(piped_data)

In [133]:
# Preprocess the feature frame into a tensor; the pipeline object is kept under
# its own name instead of being discarded.
feature_pipeline = Stroke_Pipeline(X)
X = feature_pipeline.transform()

# Labels as a tensor as well, so both model inputs have the same container type.
y = tf.convert_to_tensor(y)

In [134]:
# Inspect the preprocessed feature tensor.
X

<tf.Tensor: shape=(498, 19), dtype=float64, numpy=
array([[ 0.94631846, -0.45687839, -0.36660108, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85486579,  2.18876624,  2.72776061, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.12324441, -0.45687839,  2.72776061, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.05966093, -0.45687839, -0.36660108, ...,  0.        ,
         0.        ,  1.        ],
       [-0.83700864, -0.45687839, -0.36660108, ...,  0.        ,
         1.        ,  0.        ],
       [-1.93444071, -0.45687839, -0.36660108, ...,  0.        ,
         0.        ,  0.        ]])>

In [135]:
# Shape of the label tensor.
y.shape

TensorShape([498])

### Model Creation

In [75]:
# Seed Python, NumPy and TensorFlow so weight initialisation and dropout masks
# are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Width of the input layer follows the preprocessed feature tensor instead of a
# hard-coded 19, so the model keeps matching X if the encoded columns change.
n_features = int(X.shape[-1])

# One LeakyReLU instance is reused as the activation of every hidden layer.
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)

# Hidden layer widths in order; each Dense layer is followed by dropout.
hidden_units = [498, 498, 249, 249, 125]
dropout_rate = 0.2

model_layers = [tf.keras.layers.Input(shape=(n_features,))]
for units in hidden_units:
    model_layers.append(tf.keras.layers.Dense(units, activation=leaky_relu))
    model_layers.append(tf.keras.layers.Dropout(dropout_rate))
# Single sigmoid unit: one score between 0 and 1 per input row.
model_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))

model = tf.keras.Sequential(model_layers)

In [76]:
# Adam optimiser with an explicit learning rate, binary cross-entropy loss for
# the single sigmoid output, and accuracy reported during training.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [80]:
# Train for 100 epochs on the balanced sample. No validation data is passed,
# so the metrics logged per epoch are computed on the training data only.
model.fit(X, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x296f1d52470>

In [136]:
# Preprocess the test frame into a tensor for prediction.
# NOTE(review): calling transform() with no argument fits the preprocessing on
# the frame given to the constructor, so the imputation medians, scaling
# statistics and one-hot columns here are learned from `test`, not from the
# training sample the model was fitted on - confirm this is intended.
test = Stroke_Pipeline(test).transform()

In [140]:
# Model outputs for the preprocessed test rows; the final layer is a single
# sigmoid unit, so these are scores between 0 and 1 rather than class labels.
y_pred = model.predict(test)



###  Model Evaluation

#### Evaluation on Training Data

### Saving the model