# In this project we will build a Machine Learning model to predict whether an indiviudal will have a stroke.  The data used in this project can be found on kaggle at the following link: https://www.kaggle.com/asaumya/healthcare-data#train_2v.csv

# In this notebook, we build and implement our Machine Learning model.  To view our initial data analysis, please see the notebook titled "Data_Analysis."

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
import os

# Define file path to our data
stoke_data_relevant_features_and_label_file_path = os.path.join("..", "Data", "stroke_data_relevant_features_and_label.csv")

# Create dataframe from local csv file 
stroke_data_relevant_features_and_label = pd.read_csv(stoke_data_relevant_features_and_label_file_path)

# Previe dataframe
stroke_data_relevant_features_and_label.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,,14.0,161.28,19.1,0


#### We want to one hot encode our categorical columns, so we will convert each 0 to "No," and each 1 to "Yes."

In [31]:
# Before we replace 0 and 1 with "no" and "yes",
# we should check to see if either of these numbers are present in the age column
number_of_people_age_0 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 0])
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 0: {number_of_people_age_0}")
print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 0: 0
Number of people of age 1: 34


In [32]:
# When we replace all values of 0 and 1 with "No" and "Yes,"
# we are going to replace ages of 1 with a value of "Yes"
# We will also replace the binary data in the stroke column with strings.
# We will therefore make copies of these rows to put back in the dataframe after our initial replacement

copy_of_data = pd.DataFrame()

# copy_of_data["age"] = stroke_data_relevant_features_and_label["age"]
# copy_of_data["stroke"] = stroke_data_relevant_features_and_label["stroke"]

copy_of_data_age = [stroke_data_relevant_features_and_label["age"]]
copy_of_data_stroke = [stroke_data_relevant_features_and_label["stroke"]]

In [33]:
# Replace each 0 with "No," and each 1 with "Yes."
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(0, "No", inplace=True)
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(1, "Yes", inplace=True)

# Preview dataframe after converting binary data to strings
stroke_data_relevant_features_and_label.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,,14.0,161.28,19.1,0


In [34]:
# Check to see if either if the values of 1 in the age column were changed
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 1: 34


In [35]:
# # Replace the values in the post-replacement age and stroke columns with the original values
# stroke_data_relevant_features_and_label["age"] = copy_of_data_age
# stroke_data_relevant_features_and_label["stroke"] = copy_of_data_stroke

# # Preview dataframe to confirm values in stroke column were fixed
# stroke_data_relevant_features_and_label.head()

In [36]:
# Confirm binary data proplerly converted
print(stroke_data_relevant_features_and_label["hypertension"].value_counts())
print(100*"-")
print(stroke_data_relevant_features_and_label["heart_disease"].value_counts())

0    39339
1     4061
Name: hypertension, dtype: int64
----------------------------------------------------------------------------------------------------
0    41338
1     2062
Name: heart_disease, dtype: int64


In [37]:
# Transform data to one hot encoded data
machine_ready_stroke_data = pd.get_dummies(stroke_data_relevant_features_and_label, columns=["hypertension", "heart_disease", "ever_married", "work_type", "smoking_status"])
machine_ready_stroke_data.head()

Unnamed: 0,age,average_glucose_level,bmi,stroke,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_No,ever_married_Yes,work_type_Self-employed,work_type_children,work_type_other,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3.0,95.12,18.0,0,1,0,1,0,1,0,0,1,0,0,0,0
1,58.0,87.96,39.2,0,0,1,1,0,0,1,0,0,1,0,1,0
2,8.0,110.89,17.6,0,1,0,1,0,1,0,0,0,1,0,0,0
3,70.0,69.04,35.9,0,1,0,1,0,0,1,0,0,1,1,0,0
4,14.0,161.28,19.1,0,1,0,1,0,1,0,0,0,1,0,0,0


In [38]:
# Import Maching Learning algorithms will we try out
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [39]:
# Create our features and label
X = np.array(machine_ready_stroke_data.drop(["stroke"], axis=1))
y = np.array(machine_ready_stroke_data["stroke"].values.reshape(-1,1))

We now have our features and labels, but the data is still imbalanced.  We will try employing SMOTE to handle this issue.

### In the following section, we will try running several loops to see what the effect of changing several parameters is.

In [40]:
# Import SMOTE to handle the imbalanced data issue
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

# Import tree to use the DecisionTreeClassifier() algorithm
from sklearn import tree

# Import StandardScaler to scale our data
from sklearn.preprocessing import StandardScaler

#### SMOTE parameters

##### In the cell below we examine how accuracy changes when adjusting the SMOTE parameter sampling_strategy

In [68]:
import warnings
warnings.filterwarnings('ignore')

# from sklearn import preprocessing

# Create an array of arguments to iteratively try out
sampling_strategy_arguments = np.arange(0.6, 1, 0.05)

# average_accuracies = []
# most_accurate_orders = []
# most_stable_orders = []
# standard_deviations = []

sampling_strategy_argument = 0.80

for i in range(3):
    print("1. split, SMOTE, scale")
    accuracies_list = []
        
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
    
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_train, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)
    
    # Create scaler for features and label
    X_train_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)
    X_test_scaler = StandardScaler().fit(X_test)
#     y_scaler = StandardScaler().fit(y_train_SMOTE)
    
    # Scale features and labels
    X_train_SMOTE_scaled = X_train_SMOTE_scaler.transform(X_train_SMOTE)
    X_test_scaled = X_test_scaler.transform(X_test)
#     y_train_SMOTE_scaled = y_scaler.transform(y_train_SMOTE)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")

#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")
        
    ####################################################################################################

    print("2. split, scale, SMOTE")
    scores = []
        
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
    
    # Create scaler for features and label
    X_train_scaler = StandardScaler().fit(X_train)
    X_test_scaler = StandardScaler().fit(X_test)
#     y_scaler = StandardScaler().fit(y_train)
    
    # Scale features and labels
    X_train_scaled = X_train_scaler.transform(X_train)
    X_test_scaled = X_test_scaler.transform(X_test)
#     y_train_scaled = y_scaler.transform(y_train)

    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_sample(X_train_scaled, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_train_scaled_SMOTE, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")
        
#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")
        
    ####################################################################################################

    print("3. SMOTE, split, scale")
    scores = []
        
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
    X_SMOTE, y_SMOTE = smote.fit_sample(X, y.ravel())
    y_SMOTE = y_SMOTE.reshape(-1,1)
    
    # Split the data into training and testing sets
    X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, random_state=3)
    
    # Create scaler for features and label
    X_SMOTE_train_scaler = StandardScaler().fit(X_SMOTE_train)
    X_SMOTE_test_scaler = StandardScaler().fit(X_SMOTE_test)
#     y_scaler = StandardScaler().fit(y_SMOTE_train)
    
    # Scale features and labels
    X_SMOTE_train_scaled = X_SMOTE_train_scaler.transform(X_SMOTE_train)
    X_SMOTE_test_scaled = X_SMOTE_test_scaler.transform(X_SMOTE_test)
#     y_SMOTE_train_scaled = y_scaler.transform(y_train_SMOTE)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_SMOTE_train_scaled, y=y_SMOTE_train)
    score = classifier.score(X_SMOTE_test_scaled, y_SMOTE_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")
        
#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")
        
    ####################################################################################################
        
        
    print("4. SMOTE, scale, split")
    scores = []
        
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
    X_SMOTE, y_SMOTE = smote.fit_sample(X, y.ravel())
    y_SMOTE = y_SMOTE.reshape(-1,1)
    
    # Create scaler for features and label
    X_scaler = StandardScaler().fit(X_SMOTE)
#     y_scaler = StandardScaler().fit(y_SMOTE)
    
    # Scale features and labels
    X_SMOTE_scaled = X_scaler.transform(X_SMOTE)
#     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)
    
    # Split the data into training and testing sets
    X_SMOTE_scaled_train, X_SMOTE_scaled_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE_scaled, y_SMOTE, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_SMOTE_scaled_train, y=y_SMOTE_train)
    score = classifier.score(X_test, y_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")
        
#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")
        
    ####################################################################################################


    print("5. scale, split, SMOTE")
    scores = []
        
    # Create scaler for features and label
    X_scaler = StandardScaler().fit(X)
#     y_scaler = StandardScaler().fit(y_SMOTE)
    
    # Scale features and labels
    X_scaled = X_scaler.transform(X)
#     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)

    # Split the data into training and testing sets
    X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, random_state=3)
    
    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
    X_scaled_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_scaled_train, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_scaled_train_SMOTE, y=y_train_SMOTE)
    score = classifier.score(X_scaled_test, y_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")
        
#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")
        
    ####################################################################################################

    print("6. scale, SMOTE, split")
    scores = []
        
    # Create scaler for features and label
    X_scaler = StandardScaler().fit(X)
#     y_scaler = StandardScaler().fit(y_SMOTE)
    
    # Scale features and labels
    X_scaled = X_scaler.transform(X)
#     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)

    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=4)
    X_scaled_SMOTE, y_SMOTE = smote.fit_sample(X_scaled, y.ravel())
    y_SMOTE = y_SMOTE.reshape(-1,1)

    # Split the data into training and testing sets
    X_scaled_SMOTE_train, X_scaled_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_scaled_SMOTE, y_SMOTE, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_scaled_SMOTE_train, y=y_SMOTE_train)
    score = classifier.score(X_scaled_SMOTE_test, y_SMOTE_test)
    accuracies_list.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Accuracy: {score}")  
        
#     average_accuracy = sum(scores)/len(scores)
#     average_accuracies.append(average_accuracy)
#     standard_deviation = np.std(scores)
#     print(f"Average accuracy: {average_accuracy}")
#     print(f"Standard Deviation of accuracy: {standard_deviation}")

    ####################################################################################################

    print()
    print(100*"-")
    print()

    if max(average_accuracies) == average_accuracies[0]:
        print("Most accurate order (on average) is 1: split, SMOTE, scale")
#         most_accurate_orders.append(1)
        most_accurate_order = 1
        
    elif max(average_accuracies) == average_accuracies[1]:
        print("Most accurate order (on average) is 2: split, scale, SMOTE")
#         most_accurate_orders.append(2)
        most_accurate_order = 2
        
    elif max(average_accuracies) == average_accuracies[2]:
        print("Most accurate order (on average) is 3: SMOTE, split, scale")
#         most_accurate_orders.append(3)
        most_accurate_order = 3
        
    elif max(average_accuracies) == average_accuracies[3]:
        print("Most accurate order (on average) is 4: SMOTE, scale, split")
#         most_accurate_orders.append(4)
        most_accurate_order = 4
        
    elif max(average_accuracies) == average_accuracies[4]:
        print("Most accurate order (on average) is 5: scale, split, SMOTE")
#         most_accurate_orders.append(5)
        most_accurate_order = 5
        
    elif max(average_accuracies) == average_accuracies[5]:
        print("Most accurate order (on average) is 6: scale, SMOTE, split")
#         most_accurate_orders.append(6)
        most_accurate_order = 6
        
print(f"Order number {most_accurate_order} was the most accurate the highest number of times.")
# print(f"Order number {} was the most stable (lowest standard deviation)")

1. split, SMOTE, scale
Accuracy: 0.03064516129032258
2. split, scale, SMOTE
Accuracy: 0.7888248847926267
3. SMOTE, split, scale
Accuracy: 0.7960684117217646
4. SMOTE, scale, split
Accuracy: 0.016359447004608296
5. scale, split, SMOTE
Accuracy: 0.7890322580645162
6. scale, SMOTE, split
Accuracy: 0.7844926478256335

----------------------------------------------------------------------------------------------------

1. split, SMOTE, scale
Accuracy: 0.025230414746543778
2. split, scale, SMOTE
Accuracy: 0.7684331797235023
3. SMOTE, split, scale
Accuracy: 0.7767754718948795
4. SMOTE, scale, split
Accuracy: 0.016474654377880184
5. scale, split, SMOTE
Accuracy: 0.811705069124424
6. scale, SMOTE, split
Accuracy: 0.7949734070288873

----------------------------------------------------------------------------------------------------

1. split, SMOTE, scale
Accuracy: 0.025230414746543778
2. split, scale, SMOTE
Accuracy: 0.7678571428571429
3. SMOTE, split, scale
Accuracy: 0.8197413703201585
4. SMO

In [69]:
accuracies_list

[0.025230414746543778,
 0.7678571428571429,
 0.8197413703201585,
 0.016359447004608296,
 0.8119815668202764,
 0.7951298362707269]

In [48]:
import warnings
warnings.filterwarnings('ignore')

# from sklearn import preprocessing

# Create an array of arguments to iteratively try out
sampling_strategy_arguments = np.arange(0.6, 1, 0.05)

average_accuracies = []
most_accurate_orders = []
most_stable_orders = []
standard_deviations = []

for i in range(3):
    print("1. split, SMOTE, scale")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
        
        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
        X_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_train, y_train.ravel())
        y_train_SMOTE = y_train_SMOTE.reshape(-1,1)
        
        # Create scaler for features and label
        X_train_SMOTE_scaler = StandardScaler().fit(X_train_SMOTE)
        X_test_scaler = StandardScaler().fit(X_test)
    #     y_scaler = StandardScaler().fit(y_train_SMOTE)
        
        # Scale features and labels
        X_train_SMOTE_scaled = X_train_SMOTE_scaler.transform(X_train_SMOTE)
        X_test_scaled = X_test_scaler.transform(X_test)
    #     y_train_SMOTE_scaled = y_scaler.transform(y_train_SMOTE)
        
        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_train_SMOTE_scaled, y=y_train_SMOTE)
        score = classifier.score(X_test_scaled, y_test)
        scores.append(score)
        
    #     # Create, fit, and score the decision tree classifier
    #     classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    #     classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    #     score = classifier.score(X_smote_test, y_smote_test)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")

    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()
        
    ####################################################################################################

    print("2. split, scale, SMOTE")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
        
        # Create scaler for features and label
        X_train_scaler = StandardScaler().fit(X_train)
        X_test_scaler = StandardScaler().fit(X_test)
    #     y_scaler = StandardScaler().fit(y_train)
        
        # Scale features and labels
        X_train_scaled = X_train_scaler.transform(X_train)
        X_test_scaled = X_test_scaler.transform(X_test)
    #     y_train_scaled = y_scaler.transform(y_train)

        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
        X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_sample(X_train_scaled, y_train.ravel())
        y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_train_scaled_SMOTE, y=y_train_SMOTE)
        score = classifier.score(X_test_scaled, y_test)
        scores.append(score)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
        
    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()
        
    ####################################################################################################

    print("3. SMOTE, split, scale")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
        X_SMOTE, y_SMOTE = smote.fit_sample(X, y.ravel())
        y_SMOTE = y_SMOTE.reshape(-1,1)
        
        # Split the data into training and testing sets
        X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, random_state=3)
        
        # Create scaler for features and label
        X_SMOTE_train_scaler = StandardScaler().fit(X_SMOTE_train)
        X_SMOTE_test_scaler = StandardScaler().fit(X_SMOTE_test)
    #     y_scaler = StandardScaler().fit(y_SMOTE_train)
        
        # Scale features and labels
        X_SMOTE_train_scaled = X_SMOTE_train_scaler.transform(X_SMOTE_train)
        X_SMOTE_test_scaled = X_SMOTE_test_scaler.transform(X_SMOTE_test)
    #     y_SMOTE_train_scaled = y_scaler.transform(y_train_SMOTE)
        
        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_SMOTE_train_scaled, y=y_SMOTE_train)
        score = classifier.score(X_SMOTE_test_scaled, y_SMOTE_test)
        scores.append(score)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
        
    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()
        
    ####################################################################################################
        
        
    print("4. SMOTE, scale, split")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
        X_SMOTE, y_SMOTE = smote.fit_sample(X, y.ravel())
        y_SMOTE = y_SMOTE.reshape(-1,1)
        
        # Create scaler for features and label
        X_scaler = StandardScaler().fit(X_SMOTE)
    #     y_scaler = StandardScaler().fit(y_SMOTE)
        
        # Scale features and labels
        X_SMOTE_scaled = X_scaler.transform(X_SMOTE)
    #     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)
        
        # Split the data into training and testing sets
        X_SMOTE_scaled_train, X_SMOTE_scaled_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE_scaled, y_SMOTE, random_state=3)
        
        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_SMOTE_scaled_train, y=y_SMOTE_train)
        score = classifier.score(X_test, y_test)
        scores.append(score)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
        
    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()
        
    ####################################################################################################


    print("5. scale, split, SMOTE")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Create scaler for features and label
        X_scaler = StandardScaler().fit(X)
    #     y_scaler = StandardScaler().fit(y_SMOTE)
        
        # Scale features and labels
        X_scaled = X_scaler.transform(X)
    #     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)

        # Split the data into training and testing sets
        X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, random_state=3)
        
        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
        X_scaled_train_SMOTE, y_train_SMOTE = smote.fit_sample(X_scaled_train, y_train.ravel())
        y_train_SMOTE = y_train_SMOTE.reshape(-1,1)
        
        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_scaled_train_SMOTE, y=y_train_SMOTE)
        score = classifier.score(X_scaled_test, y_test)
        scores.append(score)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
        
    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()
        
    ####################################################################################################

    print("6. scale, SMOTE, split")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
        
        # Create scaler for features and label
        X_scaler = StandardScaler().fit(X)
    #     y_scaler = StandardScaler().fit(y_SMOTE)
        
        # Scale features and labels
        X_scaled = X_scaler.transform(X)
    #     y_SMOTE_scaled = y_scaler.transform(y_SMOTE)

        # Use SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=2)
        X_scaled_SMOTE, y_SMOTE = smote.fit_sample(X_scaled, y.ravel())
        y_SMOTE = y_SMOTE.reshape(-1,1)

        # Split the data into training and testing sets
        X_scaled_SMOTE_train, X_scaled_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_scaled_SMOTE, y_SMOTE, random_state=3)
        
        # Create, fit, and score the decision tree classifier
        classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
        classifier = classifier.fit(X=X_scaled_SMOTE_train, y=y_SMOTE_train)
        score = classifier.score(X_scaled_SMOTE_test, y_SMOTE_test)
        scores.append(score)
        
        # Print a list of accuracies based on the current argument
        print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")  
        
    average_accuracy = sum(scores)/len(scores)
    average_accuracies.append(average_accuracy)
    standard_deviation = np.std(scores)
    print(f"Average accuracy: {average_accuracy}")
    print(f"Standard Deviation of accuracy: {standard_deviation}")
    print()

    ####################################################################################################

    print()

    if max(average_accuracies) == average_accuracies[0]:
        print("Most accurate order (on average) is 1: split, SMOTE, scale")
#         most_accurate_orders.append(1)
        most_accurate_order = 1
        
    elif max(average_accuracies) == average_accuracies[1]:
        print("Most accurate order (on average) is 2: split, scale, SMOTE")
#         most_accurate_orders.append(2)
        most_accurate_order = 2
        
    elif max(average_accuracies) == average_accuracies[2]:
        print("Most accurate order (on average) is 3: SMOTE, split, scale")
#         most_accurate_orders.append(3)
        most_accurate_order = 3
        
    elif max(average_accuracies) == average_accuracies[3]:
        print("Most accurate order (on average) is 4: SMOTE, scale, split")
#         most_accurate_orders.append(4)
        most_accurate_order = 4
        
    elif max(average_accuracies) == average_accuracies[4]:
        print("Most accurate order (on average) is 5: scale, split, SMOTE")
#         most_accurate_orders.append(5)
        most_accurate_order = 5
        
    elif max(average_accuracies) == average_accuracies[5]:
        print("Most accurate order (on average) is 6: scale, SMOTE, split")
#         most_accurate_orders.append(6)
        most_accurate_order = 6
        
print(f"Order number {most_accurate_order} was the most accurate the highest number of times.")
# print(f"Order number {} was the most stable (lowest standard deviation)")

1. split, SMOTE, scale
Setting the sampling_strategy parameter to 0.6 yields an accuracy of 0.07085253456221198
Setting the sampling_strategy parameter to 0.65 yields an accuracy of 0.03179723502304147
Setting the sampling_strategy parameter to 0.7000000000000001 yields an accuracy of 0.028110599078341014
Setting the sampling_strategy parameter to 0.7500000000000001 yields an accuracy of 0.026382488479262674
Setting the sampling_strategy parameter to 0.8000000000000002 yields an accuracy of 0.026382488479262674
Setting the sampling_strategy parameter to 0.8500000000000002 yields an accuracy of 0.026382488479262674
Setting the sampling_strategy parameter to 0.9000000000000002 yields an accuracy of 0.4618663594470046
Setting the sampling_strategy parameter to 0.9500000000000003 yields an accuracy of 0.02453917050691244
Average accuracy: 0.08703917050691244
Standard Deviation of accuracy: 0.1424015563215359

2. split, scale, SMOTE
Setting the sampling_strategy parameter to 0.6 yields an a

Setting the sampling_strategy parameter to 0.6 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.65 yields an accuracy of 0.016474654377880184
Setting the sampling_strategy parameter to 0.7000000000000001 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.7500000000000001 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.8000000000000002 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.8500000000000002 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.9000000000000002 yields an accuracy of 0.016359447004608296
Setting the sampling_strategy parameter to 0.9500000000000003 yields an accuracy of 0.016359447004608296
Average accuracy: 0.016373847926267282
Standard Deviation of accuracy: 3.810125735980074e-05

5. scale, split, SMOTE
Setting the sampling_strategy parameter to 0.6 yields an accuracy of 0.8

In [23]:
most_accurate_orders

[6, 6]

In [None]:
average_accuracies

if max(average_accuracies) == average_accuracies[0]:
    print(f"Most accurate order is {1}")
    
elif max(average_accuracies) == average_accuracies[1]:
    print(f"Most accurate order is {2}")
    
elif max(average_accuracies) == average_accuracies[2]:
    print(f"Most accurate order is {3}")
    
elif max(average_accuracies) == average_accuracies[3]:
    print(f"Most accurate order is {4}")
    
elif max(average_accuracies) == average_accuracies[4]:
    print(f"Most accurate order is {5}")
    
elif max(average_accuracies) == average_accuracies[5]:
    print(f"Most accurate order is {6}")

Most accuracte order after 10 tries: <br>
1: 0 <br>
2: 0 <br>
3: 0 <br>
4: 1 <br>
5: 4 <br>
6: 5 <br>

In [None]:
# Create an array of arguments to iteratively try out
sampling_strategy_arguments = np.arange(0.6, 1, 0.05)

print("split, SMOTE, scale")

for sampling_strategy_argument in sampling_strategy_arguments:
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_smote, y_smote = smote.fit_sample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
#     y_smote_train.reshape(-1,1)
#     y_smote_test.reshape(-1,1)

#     X_smote_train = X_smote_train.reshape(-1,1)
#     y_smote_train = y_smote_train.reshape(-1,1)
#     X_smote_test = X_smote_test.reshape(-1,1)
#     y_smote_test = y_smote_test.reshape(-1,1)
    
#     print(X_smote_train_scaled.shape)
#     print(y_smote_train_scaled.shape)
#     print(X_smote_test_scaled.shape)
#     print(y_smote_test_scaled.shape)
    
    # Create scale for features and label
    X_smote_scaler = StandardScaler().fit(X_smote_train)
#     y_smote_scaler = StandardScaler().fit(y_smote_train)
    
    # Scale features and labels
    X_smote_train_scaled = X_smote_scaler.transform(X_smote_train)
    X_smote_test_scaled = X_smote_scaler.transform(X_smote_test)
#     y_smote_train_scaled = y_smote_scaler.transform(y_smote_train)
#     y_smote_test_scaled = y_smote_scaler.transform(y_smote_test)
    
#     # Create, fit, and score the decision tree classifier
#     classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
#     classifier = classifier.fit(X=X_smote_train_scaled, y=y_smote_train)
#     score = classifier.score(X_smote_test, y_smote_test)
    
#     # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")

# classifier.feature_importances_

In [None]:
# Create an array of arguments to iteratively try out
sampling_strategy_arguments = np.arange(0.6, 1, 0.05)

for sampling_strategy_argument in sampling_strategy_arguments:
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_smote, y_smote = smote.fit_sample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
#     y_smote_train.reshape(-1,1)
#     y_smote_test.reshape(-1,1)

#     X_smote_train = X_smote_train.reshape(-1,1)
#     y_smote_train = y_smote_train.reshape(-1,1)
#     X_smote_test = X_smote_test.reshape(-1,1)
#     y_smote_test = y_smote_test.reshape(-1,1)
    
#     print(X_smote_train_scaled.shape)
#     print(y_smote_train_scaled.shape)
#     print(X_smote_test_scaled.shape)
#     print(y_smote_test_scaled.shape)
    
    # Create scale for features and label
    X_smote_scaler = StandardScaler().fit(X_smote_train)
#     y_smote_scaler = StandardScaler().fit(y_smote_train)
    
    # Scale features and labels
    X_smote_train_scaled = X_smote_scaler.transform(X_smote_train)
    X_smote_test_scaled = X_smote_scaler.transform(X_smote_test)
#     y_smote_train_scaled = y_smote_scaler.transform(y_smote_train)
#     y_smote_test_scaled = y_smote_scaler.transform(y_smote_test)
    
#     # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train_scaled, y=y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
#     # Create, fit, and score the decision tree classifier
#     classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
#     classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
#     score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")

# classifier.feature_importances_

import seaborn as sns
sns.heatmap(machine_ready_stroke_data.corr(),cmap='jet')

In [None]:
from sklearn.metrics import precision_recall_fscore_support,confusion_matrix
confusion_matrix(score,y)

##### In the cell below we examine how accuracy changes when adjusting the SMOTE parameter k_neighbors

In [None]:
# Create an array of arguments to iteratively try out
k_neighbors_arguments = np.arange(1, 102, 10)

for k_neighbors_argument in k_neighbors_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=k_neighbors_argument)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the k_neighbor parameter to {k_neighbors_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

#### train_test_split parameters

##### In the cell below we examine how accuracy changes when adjusting the train_test_split parameter test_size

In [None]:
# Create an array of arguments to iteratively try out
test_size_arguments = np.arange(0.05, 0.5, 0.05)

for test_size_argument in test_size_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = test_size_argument, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the train_test_split parameter to {test_size_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

##### In the cell below we examine how accuracy changes when adjusting the train_test_split parameter random_state

In [None]:
# Create an array of arguments to iteratively try out
random_state_arguments = np.arange(1, 10, 1)

for random_state_argument in random_state_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = test_size_argument, random_state=random_state_argument)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the random_state parameter to {random_state_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

#### DecisionTreeClassifier() parameters

##### In the cell below we examine how accuracy changes when adjusting the DecisionTreeClassifier() parameter max_depth

In [None]:
# Create an array of arguments to iteratively try out
max_depth_arguments = np.arange(1, 102, 10)

for max_depth_argument in max_depth_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=max_depth_argument, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the max_depth parameter to {max_depth_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

##### In the cell below we examine how accuracy changes when adjusting the DecisionTreeClassifier() parameter max_depth

In [None]:
# Create an array of arguments to iteratively try out
max_nodes_arguments = np.arange(2, 153, 10)

for max_nodes_argument in max_nodes_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=max_nodes_argument)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the max_node parameter to {max_nodes_argument} yields an accuracy of {score}")

classifier.feature_importances_

#### Now that we have a better idea of what impact each parameter does, we will try one final test below

In [None]:
smote = SMOTE(sampling_strategy=0.85, k_neighbors=60)
X_smote, y_smote = smote.fit_resample(X, y.ravel())

y_smote = y_smote.reshape(-1,1)
    
# Split the data into training and testing sets
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)

# Create, fit, and score the decision tree classifier
classifier = tree.DecisionTreeClassifier(max_depth=100, max_leaf_nodes=100)
classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
score = classifier.score(X_smote_test, y_smote_test)

print(score, "\n")
print(classifier.feature_importances_)

In [None]:
print(y_smote.shape)

In [None]:
from sklearn.externals import joblib

standard_scaler = StandardScaler()
standard_scaler.fit(X_smote)


joblib.dump(standard_scaler, "../web_development/standard_scaler.model")

In [None]:
import graphviz

feature_names = ["age",
                 "average_glucose_levels",
                 "bmi",
                 "hypertension_0",
                 "hypertension_1",
                 "heart_disease_0",
                 "heart_disease_1",
                 "ever_married_No",
                 "ever_married_Yes",
                 "work_type_Self-employed",
                 "work_type_children",
                 "work_type_other",
                 "smoking_status_formerly_smoked",
                 "smoking_status_never_smoked",
                 "smoking_status_smokes"
                ]
class_names=["did_not_have_a_stroke", "had_a_stroke"]

dot_data = tree.export_graphviz(classifier, out_file=None, 
                      feature_names=feature_names,
                      class_names=class_names,  
                      filled=True, rounded=True,  
                      special_characters=True)
graph = graphviz.Source(dot_data)  
graph 

In [70]:
# Export out final model
import pickle

file_name = "../web_development/stroke_predictor.model"
pickle.dump(classifier, open(file_name, "wb"))

In [None]:
from sklearn.preprocessing import StandardScaler

# Create scale for features and label
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

# Scale features and labels
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
# models = []
# models.append(("LR", LogisticRegression()))
# models.append(("CART", DecisionTreeClassifier()))
# models.append(("CART", RandomForestClassifier()))
# models.append(("SVM", SVC()))
# models.append(("NB", GaussianNB()))

# from sklearn import model_selection

# # Evaluate each model in turn
# results = []
# names = []

# for name, model in models:
#     kfold = model_selection.KFold(n_splits=10, random_state=42)
#     cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
#     results.append(cv_results)
#     names.append(name)
#     print(f"{name}: {cv_results.mean()}, {cv_results.std()}")

In [None]:
# # look at this
# y_train.shape

In [None]:
classifier.feature_importances_

In [None]:
# import graphviz

# decision_tree_data = tree.export_graphviz(
#   classifier,
#   out_file=None,
#   feature_names=["age",
#                  "average_glucose_levels",
#                  "bmi",
#                  "hypertension_0",
#                  "hypertension_1",
#                  "heart_disease_0",
#                  "heart_disease_1",
#                  "ever_married_No",
#                  "ever_married_Yes",
#                  "work_type_Self-employed",
#                  "work_type_children",
#                  "work_type_other",
#                  "smoking_status_formerly_smoked",
#                  "smoking_status_never_smoked",
#                  "smoking_status_smokes"
#                 ],
#     class_names=["did_not_have_a_stroke", "had_a_stroke"],
#     filled=True,
#     rounded=False
# )

# graph = graphviz.Source(decision_tree_data)
# #graph

# #graph[size="7.75,10.25"]

In [None]:
import graphviz

decision_tree_data = tree.export_graphviz(
  classifier,
  out_file=None,
  feature_names=["age",
                 "average_glucose_levels",
                 "bmi",
                 "hypertension_0",
                 "hypertension_1",
                 "heart_disease_0",
                 "heart_disease_1",
                 "ever_married_No",
                 "ever_married_Yes",
                 "work_type_Self-employed",
                 "work_type_children",
                 "work_type_other",
                 "smoking_status_formerly_smoked",
                 "smoking_status_never_smoked",
                 "smoking_status_smokes"
                ],
    class_names=["did_not_have_a_stroke", "had_a_stroke"],
    filled=True,
    rounded=False
)



graph = graphviz.Source(decision_tree_data)
graph

#graph[size="7.75,10.25"]

In [None]:
dir(graph)

In [None]:
dir(tree.export_graphviz)

In [None]:
graph.render(format="png")

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Visualizing both classes
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=1)
rf = rf.fit(X_train, np.array(y_train))
rf.score(X_test, np.array(y_test))