# In this project we will build a Machine Learning model to predict whether an indiviudal will have a stroke.  The data used in this project can be found on kaggle at the following link: https://www.kaggle.com/asaumya/healthcare-data#train_2v.csv

# In this notebook, we build and implement our Machine Learning model.  To view our initial data analysis, please see the notebook titled "Data_Analysis."

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

# Define file path to our data
stoke_data_relevant_features_and_label_file_path = os.path.join("..", "Data", "stroke_data_relevant_features_and_label.csv")

# Create dataframe from local csv file 
stroke_data_relevant_features_and_label = pd.read_csv(stoke_data_relevant_features_and_label_file_path)

# Previe dataframe
stroke_data_relevant_features_and_label.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,,14.0,161.28,19.1,0


#### We want to one hot encode our categorical columns, so we will convert each 0 to "No," and each 1 to "Yes."

In [3]:
# Before we replace 0 and 1 with "no" and "yes",
# we should check to see if either of these numbers are present in the age column
number_of_people_age_0 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 0])
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 0: {number_of_people_age_0}")
print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 0: 0
Number of people of age 1: 34


In [4]:
# When we replace all values of 0 and 1 with "No" and "Yes,"
# we are going to replace ages of 1 with a value of "Yes"
# We will also replace the binary data in the stroke column with strings.
# We will therefore make copies of these rows to put back in the dataframe after our initial replacement

copy_of_data = pd.DataFrame()

# copy_of_data["age"] = stroke_data_relevant_features_and_label["age"]
# copy_of_data["stroke"] = stroke_data_relevant_features_and_label["stroke"]

copy_of_data_age = [stroke_data_relevant_features_and_label["age"]]
copy_of_data_stroke = [stroke_data_relevant_features_and_label["stroke"]]

In [5]:
# Replace each 0 with "No," and each 1 with "Yes."
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(0, "No", inplace=True)
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(1, "Yes", inplace=True)

# Preview dataframe after converting binary data to strings
stroke_data_relevant_features_and_label.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,,14.0,161.28,19.1,0


In [6]:
# Check to see if either if the values of 1 in the age column were changed
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 1: 34


In [7]:
# # Replace the values in the post-replacement age and stroke columns with the original values
# stroke_data_relevant_features_and_label["age"] = copy_of_data_age
# stroke_data_relevant_features_and_label["stroke"] = copy_of_data_stroke

# # Preview dataframe to confirm values in stroke column were fixed
# stroke_data_relevant_features_and_label.head()

In [8]:
# Confirm binary data proplerly converted
print(stroke_data_relevant_features_and_label["hypertension"].value_counts())
print(100*"-")
print(stroke_data_relevant_features_and_label["heart_disease"].value_counts())

0    39339
1     4061
Name: hypertension, dtype: int64
----------------------------------------------------------------------------------------------------
0    41338
1     2062
Name: heart_disease, dtype: int64


In [9]:
# Transform data to one hot encoded data
machine_ready_stroke_data = pd.get_dummies(stroke_data_relevant_features_and_label, columns=["hypertension", "heart_disease", "ever_married", "work_type", "smoking_status"])
machine_ready_stroke_data.head()

Unnamed: 0,age,average_glucose_level,bmi,stroke,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_No,ever_married_Yes,work_type_Self-employed,work_type_children,work_type_other,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3.0,95.12,18.0,0,1,0,1,0,1,0,0,1,0,0,0,0
1,58.0,87.96,39.2,0,0,1,1,0,0,1,0,0,1,0,1,0
2,8.0,110.89,17.6,0,1,0,1,0,1,0,0,0,1,0,0,0
3,70.0,69.04,35.9,0,1,0,1,0,0,1,0,0,1,1,0,0
4,14.0,161.28,19.1,0,1,0,1,0,1,0,0,0,1,0,0,0


In [10]:
# Import Maching Learning algorithms will we try out
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [11]:
# Create our features and label
X = np.array(machine_ready_stroke_data.drop(["stroke"], axis=1))
y = np.array(machine_ready_stroke_data["stroke"].values.reshape(-1,1))

We now have our features and labels, but the data is still imbalanced.  We will try employing SMOTE to handle this issue.

### In the following section, we will run a for-loop to examine what order of SMOTE, split, scale (<em>SSS order</em>) yields the best results.  We will ignore any SSS order that scales before it splits, as this could bias the model.

In [12]:
# Import SMOTE to handle the imbalanced data issue
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import tree to use the DecisionTreeClassifier() algorithm
from sklearn import tree

In [13]:
import warnings
warnings.filterwarnings('ignore')

# We want to determine which order of SMOTE, split, scale (SSS order) is best for this model
# We will use the mean and stdev methods of the statistics library,
# to find out which SSS order yields the highest average accuracy, and which order is the most stable (lowest standard deviation)
from statistics import mean, stdev

# Define variables holding the value for the each argument,
# in order to easily change it in multiple places

## SMOTE() parameters
sampling_strategy_argument = 0.80
k_neighbors_argument = 18

## train_test_split() parameters
test_size_argument = 0.2
random_state_argument = 3

## DecionTreeClassifier() parameters
max_depth_argument = 30
max_leaf_nodes_argument = 60

# For every iteration in the loop,
# we will append the accuracy of the current SSS order to it's own distinct list
# After the loop has finished, we will calculate the average of each list
# The list with the highest average we will call "the most accuracte (on average)"
# we will also calculate the standard deviation of each list
# The list with the lowest standard deviation we will call "the most stable"
SSS_order_1_list = []
SSS_order_2_list = []
SSS_order_3_list = []

# Define an iterator 
i = 1
print(i)

for i in range(10):
    
    # Print the current iteration of the loop,
    # in case we use a large number of iterations
    print(f"Iteration {i}")
    
    # Print the SSS order so we can analyze which one is "best"
    print("1. SMOTE, split, scale")
        
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_SMOTE, y_SMOTE = smote.fit_sample(X, y.ravel())
    y_SMOTE = y_SMOTE.reshape(-1,1)
    
    # Split the data into training and testing sets
    X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, test_size=test_size_argument, random_state=random_state_argument)
    
    # Create scaler for features
    X_scaler = StandardScaler().fit(X_SMOTE_train)
    
    # Scale features
    X_SMOTE_train_scaled = X_scaler.transform(X_SMOTE_train)
    X_SMOTE_test_scaled = X_scaler.transform(X_SMOTE_test)
    
    # Create, fit, and score the Decision Tree Classifier
    classifier = tree.DecisionTreeClassifier(max_depth=max_depth_argument, max_leaf_nodes=max_leaf_nodes_argument)
    classifier = classifier.fit(X=X_SMOTE_train_scaled, y=y_SMOTE_train)
    score = classifier.score(X_SMOTE_test_scaled, y_SMOTE_test)
    
    # Append the score the the SSS_order_1 list,
    # So that we can determine the average accuracy and standard deviation of SSS order 1
    SSS_order_1_list.append(score)
    
    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")
    
####################################################################################################
    
    # Print the SSS order so we can analyze which one is "best"
    print("2. split, SMOTE, scale")
        
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_argument, random_state=random_state_argument)
    
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_train, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)
    
    # Create scaler for features
    X_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)
    
    # Scale features
    X_train_SMOTE_scaled = X_SMOTE_scaler.transform(X_train_SMOTE)
    X_test_scaled = X_SMOTE_scaler.transform(X_test)
    
    # Create, fit, and score the Decision Tree Classifier
    classifier = tree.DecisionTreeClassifier(max_depth=max_depth_argument, max_leaf_nodes=max_leaf_nodes_argument)
    classifier = classifier.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    
    # Append the score the the SSS_order_2 list,
    # So that we can determine the average accuracy and standard deviation of SSS order 2
    SSS_order_2_list.append(score)
    
    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")

####################################################################################################

    # Print the SSS order so we can analyze which one is "best"
    print("3. split, scale, SMOTE")
        
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_argument, random_state=random_state_argument)
    
    # Create scaler for features
    X_scaler = StandardScaler().fit(X_train)
    
    # Scale features
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=k_neighbors_argument)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_sample(X_train_scaled, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

    # Create, fit, and score the Decision Tree Classifier
    classifier = tree.DecisionTreeClassifier(max_depth=max_depth_argument, max_leaf_nodes=max_leaf_nodes_argument)
    classifier = classifier.fit(X=X_train_scaled_SMOTE, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    
    # Append the score the the SSS_order_3 list,
    # So that we can determine the average accuracy and standard deviation of SSS order 3
    SSS_order_3_list.append(score)
    
    # Print the accuracy for the current iteration
    print(f"Accuracy: {score}")

####################################################################################################
    
    # Print a long line with blank lines above and below,
    # to easily see where one iteration of the loop ends, and the next starts
    print()
    print(100*"-")
    print()
    
    # Increase the iterator by one
    # so that the print statement at the beginning will show we're on the next iteration
    i += 1

####################################################################################################

# Find the average accuracy of each SSS order,
# and add each average to a list
average_1 = mean(SSS_order_1_list)
average_2 = mean(SSS_order_2_list)
average_3 = mean(SSS_order_3_list)
averages_list = [average_1, average_2, average_3]

# Use conditionals to determine which SSS order has the highest average accuracy
if max(averages_list) == averages_list[0]:
    most_accurate_order = 1
    average_accuracy_greatest = averages_list[0]
    
elif max(averages_list) == averages_list[1]:
    most_accurate_order = 2
    average_accuracy_greatest = averages_list[1]
    
elif max(averages_list) == averages_list[2]:
    most_accurate_order = 3
    average_accuracy_greatest = averages_list[2]

# Print a message showing which SSS order has the highest average accuracy, along with it's accuracy
print(f"The most accurate order (highest average accuracy) is order {most_accurate_order}, with an average accuracy of {average_accuracy_greatest}")
        
####################################################################################################

# Find the standard deviation of the accuracy of each SSS order,
# and add each standard deviation to a list
standard_deviation_1 = stdev(SSS_order_1_list)
standard_deviation_2 = stdev(SSS_order_2_list)
standard_deviation_3 = stdev(SSS_order_3_list)
standard_deviations_list = [standard_deviation_1, standard_deviation_2, standard_deviation_3]

# Use conditionals to determine which SSS order has the lowest standard deviation
if min(standard_deviations_list) == standard_deviations_list[0]:
    most_stable_order = 1
    lowest_standard_deviation = standard_deviations_list[0]
    
elif min(standard_deviations_list) == standard_deviations_list[1]:
    most_stable_order = 2
    lowest_standard_deviation = standard_deviations_list[1]
    
elif min(standard_deviations_list) == standard_deviations_list[2]:
    most_stable_order = 3
    lowest_standard_deviation = standard_deviations_list[2]
    
# Print a message showing which SSS order has the highest average accuracy, along with it's accuracy
print(f"The most stable order (lowest standard deviation) is order {most_stable_order}, with a standard deviation of {lowest_standard_deviation}")

####################################################################################################

# Print a blank line to separate the lines showing us the "best" orders from the lines showing us the "worst" orders
print()

# Use conditionals to determine which SSS order has the lowest average accuracy
if min(averages_list) == averages_list[0]:
    least_accurate_order = 1
    average_accuracy_least = averages_list[0]
    
elif min(averages_list) == averages_list[1]:
    least_accurate_order = 2
    average_accuracy_least = averages_list[1]
    
elif min(averages_list) == averages_list[2]:
    least_accurate_order = 3
    average_accuracy_least = averages_list[2]

# Print a message showing which SSS order has the highest average accuracy, along with it's accuracy
print(f"The least accurate order (least average accuracy) is order {least_accurate_order}, with an average accuracy of {average_accuracy_least}")
        
####################################################################################################

# Use conditionals to determine which SSS order has the highest standard deviation
if max(standard_deviations_list) == standard_deviations_list[0]:
    least_stable_order = 1
    greatest_standard_deviation = standard_deviations_list[0]
    
elif max(standard_deviations_list) == standard_deviations_list[1]:
    least_stable_order = 2
    greatest_standard_deviation = standard_deviations_list[1]
    
elif max(standard_deviations_list) == standard_deviations_list[2]:
    least_stable_order = 3
    greatest_standard_deviation = standard_deviations_list[2]
    
# Print a message showing which SSS order has the highest average accuracy, along with it's accuracy
print(f"The least stable order (highest standard deviation) is order {least_stable_order}, with a standard deviation of {greatest_standard_deviation}")
    

1
Iteration 0
1. SMOTE, split, scale
Accuracy: 0.9389258245339591
2. split, SMOTE, scale
Accuracy: 0.9796082949308755
3. split, scale, SMOTE
Accuracy: 0.876036866359447

----------------------------------------------------------------------------------------------------

Iteration 1
1. SMOTE, split, scale
Accuracy: 0.9340372832746708
2. split, SMOTE, scale
Accuracy: 0.9797235023041475
3. split, scale, SMOTE
Accuracy: 0.8323732718894009

----------------------------------------------------------------------------------------------------

Iteration 2
1. SMOTE, split, scale
Accuracy: 0.9320166862208317
2. split, SMOTE, scale
Accuracy: 0.9754608294930875
3. split, scale, SMOTE
Accuracy: 0.8588709677419355

----------------------------------------------------------------------------------------------------

Iteration 3
1. SMOTE, split, scale
Accuracy: 0.9362534219788815
2. split, SMOTE, scale
Accuracy: 0.9726958525345623
3. split, scale, SMOTE
Accuracy: 0.8836405529953917

-----------------

#### Now that we have a better idea of which SSS order is "the best," we will try training and testing the model one more time

In [14]:
 # Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
    
# Use SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy=0.2, k_neighbors=2)
X_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_train, y_train.ravel())
y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

# Create scaler for features
X_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)

# Scale features
X_train_SMOTE_scaled = X_SMOTE_scaler.transform(X_train_SMOTE)
X_test_scaled = X_SMOTE_scaler.transform(X_test)

# Create, fit, and score the Decision Tree Classifier
classifier = tree.DecisionTreeClassifier(max_depth=30, max_leaf_nodes=60)
classifier = classifier.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE)
score = classifier.score(X_test_scaled, y_test)

# Print the accuracy
print(score)

0.9798387096774194


In [15]:
# Export out final model
from sklearn.externals import joblib
import pickle

standard_scaler = StandardScaler()
standard_scaler.fit(X_train_SMOTE)

standard_scaler_export_file_path = os.path.join("..", "web_development", "standard_scaler.model")
joblib.dump(standard_scaler, standard_scaler_export_file_path)

classifier_export_file_path = os.path.join("..", "web_development", "stroke_predictor.model")
pickle.dump(classifier, open(classifier_export_file_path, "wb"))

In [16]:
# import graphviz

# feature_names = ["age",
#                  "average_glucose_levels",
#                  "bmi",
#                  "hypertension_0",
#                  "hypertension_1",
#                  "heart_disease_0",
#                  "heart_disease_1",
#                  "ever_married_No",
#                  "ever_married_Yes",
#                  "work_type_Self-employed",
#                  "work_type_children",
#                  "work_type_other",
#                  "smoking_status_formerly_smoked",
#                  "smoking_status_never_smoked",
#                  "smoking_status_smokes"
#                 ]
# class_names=["did_not_have_a_stroke", "had_a_stroke"]

# dot_data = tree.export_graphviz(classifier, out_file=None, 
#                       feature_names=feature_names,
#                       class_names=class_names,  
#                       filled=True, rounded=True,  
#                       special_characters=True)
# graph = graphviz.Source(dot_data)  
# graph 