In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import datetime

In [2]:
# Read the stroke dataset from the CSV file next to the notebook, then preview it.
csv = pd.read_csv('./healthcare-dataset-stroke-data.csv')
csv.head()  # head() shows the first 5 rows by default

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# Looping over the GPUs that were actually detected avoids the IndexError that
# hard-coded indices 0 and 1 raise on machines with fewer than two GPUs
# (the loop simply does nothing on a CPU-only machine).
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
# Remove the 'id' column so it is not passed to the model as an input feature.
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the
# in-place version raised KeyError the second time it was executed.
csv = csv.drop(columns=['id'], errors='ignore')

In [5]:
def mapping(data, feature):
    """Replace the values of one column with ordinal float codes, in place.

    The distinct non-missing values of ``data[feature]`` are sorted in
    descending order and numbered 0.0, 1.0, 2.0, ... in that order; every cell
    of the column is then replaced by the code of its value. Missing values
    are left missing (they are excluded from the code table, so sorting a
    column that mixes strings and missing values no longer raises TypeError).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. It is modified in place.
    feature : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``feature`` converted to float codes
        (pandas nullable floating dtype via ``convert_dtypes``).
    """
    distinct_desc = sorted(data[feature].dropna().unique(), reverse=True)
    featureMap = {value: float(code) for code, value in enumerate(distinct_desc)}
    # convert_integer=False keeps whole-number codes as floats rather than
    # letting convert_dtypes turn them into a nullable integer column.
    data[feature] = data[feature].map(featureMap).convert_dtypes(
        infer_objects=False, convert_integer=False, convert_floating=True
    )
    return data

def normalizeCol(data, columnName, normRange):
    """Min-max scale one column into ``[normRange[0], normRange[1]]``.

    Fixes two defects of the previous formula: the lower bound was subtracted
    instead of added (only correct when ``normRange[0] == 0``), and a constant
    column caused a division by zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. The column is overwritten in place.
    columnName : str
        Name of the numeric column to scale.
    normRange : sequence of two numbers
        ``[low, high]`` target range.

    Returns
    -------
    pandas.DataFrame
        ``data.convert_dtypes(...)`` — a dtype-converted frame containing the
        scaled column (float columns become pandas nullable floats; integer
        columns are left as they are because ``convert_integer=False``).
    """
    low, high = normRange[0], normRange[1]
    column = data[columnName]
    col_min = column.min()
    span = column.max() - col_min
    if pd.isna(span) or span == 0:
        # Constant (or all-missing) column: there is no spread to scale, so
        # every non-missing value is placed at the lower bound.
        data[columnName] = column - col_min + low
    else:
        data[columnName] = (column - col_min) / span * (high - low) + low
    return data.convert_dtypes(infer_objects=False, convert_integer=False, convert_floating=True)

In [6]:
# Distinct raw gender labels, shown before encoding.
csv["gender"].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [7]:
# Encode gender as ordinal codes, then rescale those codes to [0, 1].
csv = normalizeCol(mapping(csv, "gender"), "gender", [0, 1])

In [8]:
# Distinct gender values after encoding and scaling.
# (To look at the rows encoded as 0.5: csv[csv['gender'] == 0.5])
csv["gender"].unique()

<FloatingArray>
[0.5, 1.0, 0.0]
Length: 3, dtype: Float64

In [9]:
# csv.age.unique()
# csv[csv['age']== (csv.age.max())] #validating data

In [10]:
# Rescale age to the [0, 1] interval.
csv = normalizeCol(csv, columnName="age", normRange=[0, 1])

In [11]:
# csv[csv['age']== 1.00000000e+00] #validation

In [12]:
# Distinct raw marital-status labels, shown before encoding.
csv["ever_married"].unique()

<StringArray>
['Yes', 'No']
Length: 2, dtype: string

In [13]:
# Encode ever_married as ordinal codes, then rescale those codes to [0, 1].
csv = normalizeCol(mapping(csv, "ever_married"), "ever_married", [0, 1])

In [14]:
# Encode work_type as ordinal codes, then rescale those codes to [0, 1].
csv = normalizeCol(mapping(csv, "work_type"), "work_type", [0, 1])

In [15]:
# Encode Residence_type as ordinal codes, then rescale those codes to [0, 1].
csv = normalizeCol(mapping(csv, "Residence_type"), "Residence_type", [0, 1])

In [16]:
# Rescale the average glucose level to the [0, 1] interval.
csv = normalizeCol(csv, columnName="avg_glucose_level", normRange=[0, 1])

In [17]:
# Impute missing BMI values with the column mean. The fill is restricted to
# the 'bmi' column: a frame-wide fillna() would also write the BMI mean into
# any other column that happened to contain a missing value.
csv = csv.assign(bmi=csv['bmi'].fillna(csv['bmi'].mean()))
# NOTE(review): mapping() replaces each BMI with its descending-order rank
# among the distinct values, so higher BMI ends up with a lower scaled value
# and the original spacing is lost. Confirm this is intended for a continuous
# feature; normalizeCol() alone would preserve the scale.
csv = mapping(csv, "bmi")
csv = normalizeCol(csv, "bmi", [0, 1])

In [18]:
# Encode smoking_status as ordinal codes, then rescale those codes to [0, 1].
csv = normalizeCol(mapping(csv, "smoking_status"), "smoking_status", [0, 1])

In [19]:
# Rescale the stroke target column to the [0, 1] interval.
csv = normalizeCol(csv, columnName="stroke", normRange=[0, 1])

In [20]:
# Preview the first ten rows of the fully preprocessed frame.
csv.head(n=10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0.5,0.816895,0,1,0.0,0.5,0.0,0.801265,0.425837,0.666667,1.0
1,1.0,0.743652,0,0,0.0,0.25,1.0,0.679023,0.61244,0.333333,1.0
2,0.5,0.975586,0,1,0.0,0.5,1.0,0.234512,0.523923,0.333333,1.0
3,1.0,0.597168,0,0,0.0,0.5,0.0,0.536008,0.478469,0.0,1.0
4,1.0,0.963379,1,0,0.0,0.25,1.0,0.549349,0.729665,0.333333,1.0
5,0.5,0.987793,0,0,0.0,0.5,0.0,0.605161,0.607656,0.666667,1.0
6,0.5,0.902344,1,1,0.0,0.5,1.0,0.069107,0.648325,0.333333,1.0
7,1.0,0.841309,0,0,1.0,0.5,0.0,0.181285,0.758373,0.333333,1.0
8,1.0,0.719238,0,0,0.0,0.5,1.0,0.097082,0.61244,1.0,1.0
9,1.0,0.951172,0,0,0.0,0.5,0.0,0.015927,0.72488,1.0,1.0


In [28]:
# Inputs are every column except the last; the target is the last column.
X, y = csv.values[:, :-1], csv.values[:, -1]
X = X.astype('float32')
# LabelEncoder turns the target values into integer class indices, as required
# by the sparse categorical cross-entropy loss used for training.
y = LabelEncoder().fit_transform(y)
# random_state makes the split reproducible across kernel restarts. This
# matters because the training cell resumes from a saved 'trained_model': with
# an unseeded split, rows in today's test set may have been training rows in
# an earlier run. stratify=y keeps the class proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
n_features = X_train.shape[1]  # number of input columns; sizes the first layer

(3423, 10) (1687, 10) (3423,) (1687,)


In [35]:
# Fully connected classifier: three ReLU hidden layers (n_features, 40 and 30
# units, He-normal initialised) followed by a 2-way softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        n_features,
        activation='relu',
        kernel_initializer='he_normal',
        input_shape=(n_features,),
    ),
    tf.keras.layers.Dense(40, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(30, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(2, activation='softmax'),
])
# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [36]:
# Resume from a previously saved model when one can be loaded; otherwise keep
# the model object that already exists in the kernel.
try:
    model = tf.keras.models.load_model('trained_model')
except Exception as load_error:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, and the reason is printed so that a corrupt or
    # incompatible saved model is not mistaken for a missing one.
    print("no model file, training from scratch")
    print("load_model failed with:", repr(load_error))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# One TensorBoard log directory per run, named by the start timestamp.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# NOTE(review): validation_data is the test split, so the validation metrics
# recorded here and the later model.evaluate() figures come from the same rows.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=0, validation_data=(X_test, y_test), callbacks=[tensorboard_callback]).history
model.save('trained_model')
# `with` closes the file handles deterministically; the previous bare open()
# calls left them open until garbage collection.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)
# Reload both artifacts so the rest of the notebook uses exactly what was
# written to disk. history.p is produced by this cell; do not point this at
# pickle files from untrusted sources.
model = tf.keras.models.load_model('trained_model')
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

INFO:tensorflow:Assets written to: trained_model\assets


In [26]:
# Score the trained model on the held-out split; returns the loss followed by
# the compiled metric (accuracy).
loss, acc = model.evaluate(x=X_test, y=y_test, verbose=0)

1687


In [32]:
# Report the held-out accuracy to three decimals.
# NOTE(review): accuracy alone can look high if the stroke classes are
# imbalanced — consider checking per-class metrics as well.
print('Test Accuracy: %.3f' % (acc,))

Test Accuracy: 0.959
