In [1]:
from ipywidgets import FileUpload
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
from collections import Counter

In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [3]:
uploader = FileUpload()
uploader

FileUpload(value={}, description='Upload')

In [4]:
file = list(uploader.value.values())[0]
content = file['content']
content = io.StringIO(content.decode('utf-8'))
df = pd.read_csv(content)

In [5]:
df = df.drop(columns = ['id'])

In [6]:
df[df['hypertension'] == 1]['heart_disease'].value_counts()

0    434
1     64
Name: heart_disease, dtype: int64

In [7]:
df['gender'] = (df['gender'] == 'Female').astype(int)

In [8]:
df['ever_married'] = (df['ever_married'] == 'Yes').astype(int)

In [9]:
df['Residence_type'] = (df['Residence_type'] == 'Urban').astype(int)

In [10]:
work_types = list(set(df['work_type']))

In [11]:
print(work_types)

['Govt_job', 'Self-employed', 'Never_worked', 'Private', 'children']


In [12]:
work_types_dict = {
                   'Self-employed' : 1,
                   'Govt_job' : 0.5,
                   'Private' : 0.5,
                   'Never_worked' : 0,
                   'children' : 0,
                  }

In [13]:
print(work_types_dict.keys())
print(work_types_dict.values())

dict_keys(['Self-employed', 'Govt_job', 'Private', 'Never_worked', 'children'])
dict_values([1, 0.5, 0.5, 0, 0])


In [14]:
df = df.replace(work_types_dict.keys(), work_types_dict.values())

In [15]:
smoking_status = list(set(df['smoking_status']))

In [16]:
smoking_status_dict = {
                          'formerly smoked' : 0.5,
                          'smokes' : 1,
                          'never smoked' : 0,
                          'Unknown' : np.nan,
                      }

In [17]:
df = df.replace(smoking_status_dict.keys(), smoking_status_dict.values())

In [18]:
smoking_status_mean = df['smoking_status'].mean()
smoking_status_std = df['smoking_status'].std()

for i in range(len(df['smoking_status'])):
    if np.isnan(df.loc[i, 'smoking_status']):
        df.loc[i, 'smoking_status'] = np.random.normal(smoking_status_mean, smoking_status_std)
df['smoking_status'] = df['smoking_status'].fillna(value = np.random.normal(smoking_status_mean, smoking_status_std))

In [19]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [20]:
df['bmi'].value_counts().max()

41

In [21]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [22]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [23]:
continuous_columns = ['age', 'avg_glucose_level', 'bmi']
for i in range(len(continuous_columns)):
    df[continuous_columns[i]] = (df[continuous_columns[i]] - df[continuous_columns[i]].mean())/df[continuous_columns[i]].mean()

In [24]:
bmi_mean = df['bmi'].mean()
bmi_std = df['bmi'].std()
for i in range(len(df['bmi'])):
    if np.isnan(df.loc[i, 'bmi']):
        df.loc[i, 'bmi'] = np.random.normal(bmi_mean, bmi_std)

In [25]:
df = df.sample(frac = 1.0, random_state = 32).reset_index().drop(columns = ['index'])

In [26]:
train_df = df[:3000]
test_df = df[3000:]

In [27]:
train_df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [28]:
X_train = train_df.drop(columns = ['stroke'])
y_train = train_df['stroke']

In [29]:
count = Counter(y_train)
count

Counter({0: 2861, 1: 139})

In [30]:
X_test = test_df.drop(columns = ['stroke'])
y_test = test_df['stroke']

In [31]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [32]:
model = Sequential()
model.add(Dense(len(train_df.columns)))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(16, activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(8, activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = 'accuracy')

In [33]:
for i in range(20):
  X_train = train_df.drop(columns = ['stroke'])
  y_train = train_df['stroke']
  oversample = SMOTE(sampling_strategy=0.1)
  undersample = RandomUnderSampler(sampling_strategy=0.8)
  steps = [('o', oversample), ('u', undersample)]
  pipeline = Pipeline(steps = steps)
  X_train, y_train = pipeline.fit_resample(X_train, y_train)
  model.fit(X_train, y_train, epochs = 10)

In [34]:
model.evaluate(X_test, y_test)



[0.39752358198165894, 0.7763032913208008]

In [35]:
model_json = model.to_json()

with open("model.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("model.h5")

print("Saved model to disk")

Saved model to disk
