In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the cleaned heart dataset from the project's data folder.
# The bare `df` on the last line renders a truncated preview of the frame.
df = pd.read_csv("../data/heart_2020_cleaned.csv")
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


## EDA

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [39]:
# Column dtypes and non-null counts.
# NOTE(review): the recorded output below comes from execution count 39, i.e. it
# was run after the preprocessing cell that drops columns, which is why it lists
# 14 columns rather than the 18 loaded from the CSV. Re-run in order to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   BMI               319795 non-null  float64
 1   Smoking           319795 non-null  object 
 2   AlcoholDrinking   319795 non-null  object 
 3   Stroke            319795 non-null  object 
 4   PhysicalHealth    319795 non-null  float64
 5   MentalHealth      319795 non-null  float64
 6   DiffWalking       319795 non-null  object 
 7   Sex               319795 non-null  object 
 8   AgeCategory       319795 non-null  object 
 9   PhysicalActivity  319795 non-null  object 
 10  SleepTime         319795 non-null  float64
 11  Asthma            319795 non-null  object 
 12  KidneyDisease     319795 non-null  object 
 13  SkinCancer        319795 non-null  object 
dtypes: float64(4), object(10)
memory usage: 34.2+ MB


In [4]:
# Class balance of the raw target: one row per label, paired with its count
stroke = df["Stroke"].to_numpy()
unique, counts = np.unique(stroke, return_counts=True)
label_count_table = np.column_stack((unique, counts))
print(label_count_table)

[['No' 307726]
 ['Yes' 12069]]


## Data preprocessing

In [5]:
# Drop the columns judged unimportant by an earlier feature-importance pass.
# Re-running this cell on the already-trimmed frame raises KeyError.
df = df.drop(columns=["Race", "HeartDisease", "GenHealth", "Diabetic"])

In [6]:
# Dependent (target) variable: the raw "Stroke" column, still as Yes/No strings
y = df["Stroke"]

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the Stroke target as integers. LabelEncoder orders classes
# alphabetically, so "No" -> 0 and "Yes" -> 1.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)


In [8]:
# Build the predictor matrix: remove the target, then one-hot encode every
# categorical column (numeric columns pass through unchanged).
data = df.drop(columns=["Stroke"])
X = pd.get_dummies(data)
X.columns

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Smoking_No',
       'Smoking_Yes', 'AlcoholDrinking_No', 'AlcoholDrinking_Yes',
       'DiffWalking_No', 'DiffWalking_Yes', 'Sex_Female', 'Sex_Male',
       'AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34',
       'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
       'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
       'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
       'AgeCategory_80 or older', 'PhysicalActivity_No',
       'PhysicalActivity_Yes', 'Asthma_No', 'Asthma_Yes', 'KidneyDisease_No',
       'KidneyDisease_Yes', 'SkinCancer_No', 'SkinCancer_Yes'],
      dtype='object')

In [9]:
# For each two-valued feature keep a single indicator column; its complement
# carries no extra information, so drop it.
X = X.drop(
    columns=[
        "Sex_Female",
        "Smoking_No",
        "AlcoholDrinking_No",
        "DiffWalking_No",
        "PhysicalActivity_No",
        "Asthma_No",
        "KidneyDisease_No",
        "SkinCancer_No",
    ]
)

In [10]:
# Store the final column names (and their order) of the predictor matrix;
# later cells reuse them for feature importances and for building model input.
feature_names = X.columns
feature_names

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'DiffWalking_Yes', 'Sex_Male',
       'AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34',
       'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
       'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
       'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
       'AgeCategory_80 or older', 'PhysicalActivity_Yes', 'Asthma_Yes',
       'KidneyDisease_Yes', 'SkinCancer_Yes'],
      dtype='object')

In [11]:
# Preview the first rows of the encoded predictor matrix
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,DiffWalking_Yes,Sex_Male,AgeCategory_18-24,AgeCategory_25-29,...,AgeCategory_55-59,AgeCategory_60-64,AgeCategory_65-69,AgeCategory_70-74,AgeCategory_75-79,AgeCategory_80 or older,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,26.58,20.0,30.0,8.0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation (0.25 is also scikit-learn's
# default when no size is given); the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# No-stroke / stroke ratio in the training split: rows are [encoded label, count]
unique, counts = np.unique(y_train, return_counts=True)
train_balance = np.column_stack((unique, counts))
print(train_balance)

[[     0 230857]
 [     1   8989]]


## Logistic Regression Model

In [14]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier, allowed up to 1000 solver iterations
logistic = LogisticRegression(max_iter=1000)

In [15]:
# Fit on the training split only; the test split is kept for evaluation
logistic.fit(X_train,y_train)

LogisticRegression(max_iter=1000)

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

# For a classifier, .score() returns mean accuracy on the given data. The
# variable keeps its existing name `r2`, but the value is not an R^2 statistic.
r2 = logistic.score(X_test, y_test)
print(f"Logistic Regression score: {r2:.4f}")

Logistic Regression score: 0.9614


In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
# Label order follows the encoding: 0 = "no stroke", 1 = "stroke".
predictions = logistic.predict(X_test)
print(
    classification_report(
        y_test,
        predictions,
        target_names=["no stroke", "stroke"],
    )
)

              precision    recall  f1-score   support

   no stroke       0.96      1.00      0.98     76869
      stroke       0.29      0.00      0.00      3080

    accuracy                           0.96     79949
   macro avg       0.62      0.50      0.49     79949
weighted avg       0.94      0.96      0.94     79949



## Decision tree model

In [18]:
from sklearn import tree

# Fix the seed so the fitted tree, and therefore the reported metrics and the
# pickled model, is reproducible across kernel restarts. scikit-learn permutes
# candidate features at each split, so an unseeded tree can differ between
# runs when several splits are equally good. 42 matches the seed already used
# for the train/test split.
dtree = tree.DecisionTreeClassifier(random_state=42)

In [19]:
# Fit the decision tree on the training split only
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [20]:
# .score() on a classifier is mean accuracy on the test split (not an R^2
# value, despite the variable name kept from the original notebook).
r2 = dtree.score(X_test, y_test)
print(f"Tree score: {r2:.4f}")

Tree score: 0.9284


In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the decision tree on the held-out split.
# Label order follows the encoding: 0 = "no stroke", 1 = "stroke".
predictions = dtree.predict(X_test)
print(
    classification_report(
        y_test,
        predictions,
        target_names=["no stroke", "stroke"],
    )
)

              precision    recall  f1-score   support

   no stroke       0.96      0.96      0.96     76869
      stroke       0.10      0.11      0.11      3080

    accuracy                           0.93     79949
   macro avg       0.53      0.54      0.54     79949
weighted avg       0.93      0.93      0.93     79949



## Random forest model

In [22]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so
# without a seed every run yields a different model and different scores.
# Seed it (same value as the train/test split) to make the comparison with
# the other models reproducible.
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf = rf.fit(X_train, y_train)

# Mean accuracy on the held-out split (displayed as the cell output)
rf.score(X_test, y_test)


0.9556717407347184

In [23]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the random forest on the held-out split.
# Label order follows the encoding: 0 = "no stroke", 1 = "stroke".
predictions = rf.predict(X_test)
print(
    classification_report(
        y_test,
        predictions,
        target_names=["no stroke", "stroke"],
    )
)

              precision    recall  f1-score   support

   no stroke       0.96      0.99      0.98     76869
      stroke       0.10      0.02      0.03      3080

    accuracy                           0.96     79949
   macro avg       0.53      0.51      0.50     79949
weighted avg       0.93      0.96      0.94     79949



## Chosen model: Decision Tree Classifier
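Reason: it has the highest F1 score on the positive (stroke) class: 0.11, versus 0.00 for logistic regression and 0.03 for the random forest. Note that all three scores are very low, so this choice is only relative.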

In [24]:
# Importance of each feature in the fitted decision tree
importances = dtree.feature_importances_

# Pair every importance with its column name and list them from most to least important
sorted(zip(importances, feature_names), reverse=True)

[(0.4663401097923856, 'BMI'),
 (0.11090857351666454, 'SleepTime'),
 (0.08526923994354865, 'MentalHealth'),
 (0.07088163694471973, 'PhysicalHealth'),
 (0.037248717802201944, 'PhysicalActivity_Yes'),
 (0.03373240650447048, 'Sex_Male'),
 (0.03268277155836561, 'DiffWalking_Yes'),
 (0.029128621315327563, 'Smoking_Yes'),
 (0.02807259574789684, 'SkinCancer_Yes'),
 (0.02259082850046824, 'Asthma_Yes'),
 (0.011626030356629399, 'AlcoholDrinking_Yes'),
 (0.008885478361278088, 'AgeCategory_75-79'),
 (0.007874632581025153, 'AgeCategory_70-74'),
 (0.007451579793688646, 'AgeCategory_65-69'),
 (0.0073280291972846676, 'AgeCategory_60-64'),
 (0.007005636693971912, 'KidneyDisease_Yes'),
 (0.006541257400060225, 'AgeCategory_55-59'),
 (0.005561790493785104, 'AgeCategory_80 or older'),
 (0.005159922198882409, 'AgeCategory_50-54'),
 (0.003943996278792211, 'AgeCategory_45-49'),
 (0.0037267064330332653, 'AgeCategory_35-39'),
 (0.0033291486780567515, 'AgeCategory_40-44'),
 (0.0021131254532392287, 'AgeCategory_25

## Save model

In [27]:
import pickle

# Save the trained decision tree to a pickle file. The context manager
# guarantees the handle is flushed and closed even if pickling raises, which
# the previous open()/close() pair did not.
with open('../models/DTmodel_stroke_prediction', 'wb') as file:
    pickle.dump(dtree, file)

## Prediction

In [28]:
# Imported here as well so the prediction section does not depend on the
# save cell having been run in the same kernel session.
import pickle

# Load the model from disk, closing the file handle once it has been read.
# SECURITY: unpickling can execute arbitrary code; only load model files that
# this project produced itself.
with open('../models/DTmodel_stroke_prediction', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [59]:
# Show the column names, in order, that the model input must be built from
feature_names

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'DiffWalking_Yes', 'Sex_Male',
       'AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34',
       'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
       'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
       'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
       'AgeCategory_80 or older', 'PhysicalActivity_Yes', 'Asthma_Yes',
       'KidneyDisease_Yes', 'SkinCancer_Yes'],
      dtype='object')

In [66]:
# Template for one model input row: a key per training feature, in training
# order, with every value initialised to None until it is filled in below.
converted_input = {name: None for name in feature_names}

In [99]:
# Sample of the answers a user would submit from the webpage.
# Note that the age field is keyed "Age Category" (with a space), unlike the
# dataset column name.
user_input = {
    "Height": 180,
    "Weight": 60,
    "PhysicalHealth": 2,
    "MentalHealth": 5,
    "SleepTime": 7,
    "Smoking": "No",
    "AlcoholDrinking": "No",
    "DiffWalking": "Yes",
    "Sex": "Female",
    "Age Category": "40-44",
    "PhysicalActivity": "Yes",
    "Asthma": "No",
    "KidneyDisease": "Yes",
    "SkinCancer": "No",
}


In [100]:
# Convert the user's answers into the feature layout the model expects.
# BMI = weight / height_in_metres ** 2; the division by 100 implies Height is
# entered in centimetres (Weight presumably in kilograms).
height_m = user_input["Height"] / 100
converted_input["BMI"] = user_input["Weight"] / height_m**2

# The remaining numeric answers are copied across unchanged
for field in ("PhysicalHealth", "MentalHealth", "SleepTime"):
    converted_input[field] = user_input[field]

# Everything after the four numeric features is a 0/1 indicator column;
# start them all at 0 and let the following cells switch the relevant ones on.
for key in feature_names[4:]:
    converted_input[key] = 0

In [101]:
# Convert the Yes/No answers into the 0/1 indicator columns used in training.
# Each listed field maps to the "<field>_Yes" feature; indicators were already
# initialised to 0, so only "Yes" answers need to be written.
for field in (
    "Smoking",
    "AlcoholDrinking",
    "DiffWalking",
    "PhysicalActivity",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
):
    if user_input[field] == "Yes":
        converted_input[f"{field}_Yes"] = 1

# Sex is answered "Male"/"Female", not "Yes"/"No". The previous comparison
# against "Yes" could never be true, so Sex_Male stayed 0 for every user.
if user_input["Sex"] == "Male":
    converted_input["Sex_Male"] = 1

In [102]:
# Switch on the indicator column for the user's age bucket.
key = "AgeCategory_" + user_input['Age Category']

# Fail early on an unknown bucket: writing it anyway would silently add a new
# key, producing an input row whose columns no longer match the trained model.
if key not in converted_input:
    raise ValueError(f"Unknown age category: {user_input['Age Category']!r}")
converted_input[key] = 1

converted_input

{'BMI': 18.51851851851852,
 'PhysicalHealth': 0,
 'MentalHealth': 0,
 'SleepTime': 8,
 'Smoking_Yes': 0,
 'AlcoholDrinking_Yes': 0,
 'DiffWalking_Yes': 0,
 'Sex_Male': 0,
 'AgeCategory_18-24': 0,
 'AgeCategory_25-29': 0,
 'AgeCategory_30-34': 0,
 'AgeCategory_35-39': 1,
 'AgeCategory_40-44': 0,
 'AgeCategory_45-49': 0,
 'AgeCategory_50-54': 0,
 'AgeCategory_55-59': 0,
 'AgeCategory_60-64': 0,
 'AgeCategory_65-69': 0,
 'AgeCategory_70-74': 0,
 'AgeCategory_75-79': 0,
 'AgeCategory_80 or older': 0,
 'PhysicalActivity_Yes': 1,
 'Asthma_Yes': 0,
 'KidneyDisease_Yes': 0,
 'SkinCancer_Yes': 0}

In [103]:
# Convert the dictionary into a one-row DataFrame (row label 0) so it can be
# passed to the model's predict(); columns follow the dictionary's key order.
input_df = pd.DataFrame(converted_input, index=[0])
input_df

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,DiffWalking_Yes,Sex_Male,AgeCategory_18-24,AgeCategory_25-29,...,AgeCategory_55-59,AgeCategory_60-64,AgeCategory_65-69,AgeCategory_70-74,AgeCategory_75-79,AgeCategory_80 or older,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,18.518519,0,0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [104]:
# Predict the encoded class for the single input row (0 = no stroke, 1 = stroke).
# NOTE(review): the percentages in the messages are hard-coded and appear to be
# taken from the decision tree's test-set classification report; they are not
# a probability computed for this particular person.
prediction = loaded_model.predict(input_df)
if prediction[0] == 0:
    print("You have a 96% chance of having no stroke")
else:
    print("You have an 11% chance of having a stroke")

You have 96% change of having no stroke
