# Brain Stroke Prediction 

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
import time
import sys

In [3]:
# Read the CSV file without setting any column as the index
df = pd.read_csv("C:\\Users\\ssrut\\OneDrive\\Desktop\\Sem 5\\CP3 projects\\Iot project\\Data\\brain_stroke.csv")
df.head()
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [4]:
# Convert 'ever_married' to binary (0s and 1s)
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})

# Convert 'work_type' to binary, treating 'Private' as 1 and all other types as 0
df['work_type'] = df['work_type'].apply(lambda x: 1 if x == 'Private' else 0)

# Convert 'Residence_type' to binary (Urban as 1, Rural as 0)
df['Residence_type'] = df['Residence_type'].map({'Urban': 1, 'Rural': 0})

# Convert 'smoking_status' to binary, treating 'smokes' as 1 and all other types as 0
df['smoking_status'] = df['smoking_status'].apply(lambda x: 1 if x == 'smokes' else 0)

# Convert float columns 'avg_glucose_level' and 'bmi' to integers
df['avg_glucose_level'] = df['avg_glucose_level'].astype(int)
df['bmi'] = df['bmi'].astype(int)

# Display the final DataFrame
print(df.head())

   gender   age  hypertension  heart_disease  ever_married  work_type  \
0    Male  67.0             0              1             1          1   
1    Male  80.0             0              1             1          1   
2  Female  49.0             0              0             1          1   
3  Female  79.0             1              0             1          0   
4    Male  81.0             0              0             1          1   

   Residence_type  avg_glucose_level  bmi  smoking_status  stroke  
0               1                228   36               0       1  
1               0                105   32               0       1  
2               1                171   34               1       1  
3               0                174   24               0       1  
4               1                186   29               0       1  


In [5]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [6]:
print("Unique values in 'stroke' column:", df['stroke'].unique())

Unique values in 'stroke' column: [1 0]


In [7]:
# Get basic statistics for numerical columns
print("\nSummary statistics for numerical columns:")
df.describe()


Summary statistics for numerical columns:


Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,0.658502,0.574182,0.508332,105.454126,28.055611,0.155792,0.049789
std,22.662755,0.294848,0.228412,0.47426,0.494516,0.499981,45.06551,6.782413,0.362694,0.217531
min,0.08,0.0,0.0,0.0,0.0,0.0,55.0,14.0,0.0,0.0
25%,25.0,0.0,0.0,0.0,0.0,0.0,77.0,23.0,0.0,0.0
50%,45.0,0.0,0.0,1.0,1.0,1.0,91.0,28.0,0.0,0.0
75%,61.0,0.0,0.0,1.0,1.0,1.0,113.0,32.0,0.0,0.0
max,82.0,1.0,1.0,1.0,1.0,1.0,271.0,48.0,1.0,1.0


In [8]:
# Get the unique values in each column to see if there are categorical data
print("\nUnique values in each column:")
for column in df.columns:
    print(f"{column}: {df[column].nunique()} unique values")


Unique values in each column:
gender: 2 unique values
age: 104 unique values
hypertension: 2 unique values
heart_disease: 2 unique values
ever_married: 2 unique values
work_type: 2 unique values
Residence_type: 2 unique values
avg_glucose_level: 207 unique values
bmi: 35 unique values
smoking_status: 2 unique values
stroke: 2 unique values


### Encoding

In [9]:
# Encode categorical columns
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])
df['work_type'] = label_encoder.fit_transform(df['work_type'])
df['Residence_type'] = label_encoder.fit_transform(df['Residence_type'])
df['smoking_status'] = label_encoder.fit_transform(df['smoking_status'])

# Encode the target column (stroke)
df['stroke'] = label_encoder.fit_transform(df['stroke'])

In [10]:
# Separate features and target variable
X = df.drop(columns=['stroke'])
y = df['stroke']

# One-hot encode the target variable
y = to_categorical(y)

In [11]:
# Normalize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Split the data into training and testing sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.20, random_state=42)

### Build and train the model

In [13]:
# Model Parameters
DENSE1_SIZE = 32
DENSE2_SIZE = 16
DENSE3_SIZE = 8
NUM_OF_EPOCHS = 100
BATCH_SIZE = 16

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(DENSE1_SIZE, activation='relu'),
    tf.keras.layers.Dense(DENSE2_SIZE, activation='relu'),
    tf.keras.layers.Dense(DENSE3_SIZE, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 32)                352       
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 8)                 136       
                                                                 
 dense_3 (Dense)             (None, 2)                 18        
                                                                 
Total params: 1,034
Trainable params: 1,034
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Train the model
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_OF_EPOCHS,
                    verbose=1, validation_data=(X_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Evaluate the Model

In [15]:
# Evaluate the model
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

Training Accuracy: 96.99%
Validation Accuracy: 93.18%


### Model conversion to header file

In [16]:
# Save the model
model.save('BrainStrokeModel1.h5')

### Model Conversion to TensorFlow Lite

In [18]:
# Convert the model to TensorFlow Lite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Representative dataset for quantization
def representative_dataset():
    for i in range(100):
        yield [X_train.astype(np.float32)]

converter.representative_dataset = representative_dataset
tflite_model = converter.convert()

# Save the TFLite model
with open('BrainStrokeModel1.tflite', 'wb') as f:
    f.write(tflite_model)
print("Model has been converted and saved as 'BrainStrokeModel.tflite'.")



INFO:tensorflow:Assets written to: C:\Users\ssrut\AppData\Local\Temp\tmpam1kp8j5\assets


INFO:tensorflow:Assets written to: C:\Users\ssrut\AppData\Local\Temp\tmpam1kp8j5\assets


Model has been converted and saved as 'BrainStrokeModel.tflite'.


In [19]:
# Function to convert some hex values into an array for C programming
import time, sys

# Function to convert some hex values into an array for C programming
def hex_to_c_array(hex_data, var_name):
    c_str = ""

    # Create header guard
    c_str += '#ifndef ' + var_name.upper() + '_H\n'
    c_str += "#define " + var_name.upper() + '_H\n\n'

    c_str += "/*\n Author: Sruthika Sivakumar \n"
    c_str += " CAUTION: This is an auto generated file.\n DO NOT EDIT OR MAKE ANY CHANGES TO IT.\n"

# Time stamping of this model data in the generated file
    localtime = time.asctime( time.localtime(time.time()) )
    c_str += " This model data was generated on " + localtime+ '\n\n'
    print("This model data was generated on:", localtime)

# Add information about the verisons of tools and packages used in generating this header file
    c_str += " Tools used:\n Python:" + str(sys.version) + "\n Numpy:" + str(np.version.version) + \
          "\n TensorFlow:" + str(sys.version) + "\n Keras: "+ str(tf.keras.__version__) + "\n\n"
    print("Tools used: Python:", sys.version, "\n Numpy:", np.version.version, \
          "\n TensorFlow:", sys.version, "\n Keras: ", tf.keras.__version__, "\n\n")

# Training details of the model
    c_str += ' Model details are:\n'
    c_str += ' NUM_OF_EPOCHS = ' + str(NUM_OF_EPOCHS) + '\n'
    c_str += ' BATCH_SIZE    = ' + str(BATCH_SIZE) + '\n*/\n'
    
# Generate 'C' constants for the no. of nodes in each layer
    c_str += '\nconst int ' + 'DENSE1_SIZE' + ' = ' + str(DENSE1_SIZE) + ';\n'
    c_str +=   'const int ' + 'DENSE2_SIZE' + ' = ' + str(DENSE2_SIZE) + ';\n'      
    
    # Add array length at the top of the file
    c_str += '\nconst unsigned int ' + var_name + '_len = ' + str(len(hex_data)) + ';\n'

    # Declare C variable
    c_str += 'alignas(8) const unsigned char ' + var_name + '[] = {'
    hex_array = []
    for i, val in enumerate(hex_data):
        # Construct string from hex
        hex_str = format(val, '#04x')

        # Add formating so each line stays within 80 characters
        if (i + 1) < len(hex_data):
          hex_str += ','
        if (i + 1) % 12 == 0:
          hex_str += '\n'
        hex_array.append(hex_str)

    # Add closing brace
    c_str += '\n' + format(''.join(hex_array)) + '\n};\n\n'

    # Close out header guard
    c_str += '#endif //' + var_name.upper() + '_H'

    return c_str

In [20]:
# Write TFLite model to a C source (or header) file
with open("BrainStrokeModel1" + '.h', 'w') as file:
  file.write(hex_to_c_array(tflite_model, "BrainStrokeESP32"))

This model data was generated on: Tue Nov  5 15:25:02 2024
Tools used: Python: 3.7.16 (default, Jan 17 2023, 16:06:28) [MSC v.1916 64 bit (AMD64)] 
 Numpy: 1.21.5 
 TensorFlow: 3.7.16 (default, Jan 17 2023, 16:06:28) [MSC v.1916 64 bit (AMD64)] 
 Keras:  2.11.0 


