# In this project we will build a Machine Learning model to predict whether an indiviudal will have a stroke.  The data used in this project can be found on kaggle at the following link: https://www.kaggle.com/asaumya/healthcare-data#train_2v.csv

# In this notebook, we build and implement our Machine Learning model.  To view our initial data analysis, please see the notebook titled "Data_Analysis."

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

# Define file path to our data
stoke_data_relevant_features_file_path = os.path.join("..", "Data", "stroke_data_relevant_features_and_label.csv")

# Create dataframe from local csv file 
stroke_data_relevant_features = pd.read_csv(stoke_data_relevant_features_file_path)

# Previe dataframe
stroke_data_relevant_features.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,smokes,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,smokes,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,smokes,14.0,161.28,19.1,0


#### We want to one hot encode our categorical columns, so we will convert each 0 to "No," and each 1 to "Yes."

In [3]:
# Replace each 0 with "No," and each 1 with "Yes."
stroke_data_relevant_features.replace(0, "No", inplace=True)
stroke_data_relevant_features.replace(1, "Yes", inplace=True)

# Preview dataframe after converting binary data to strings
stroke_data_relevant_features.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,No,No,No,children,smokes,3,95.12,18.0,No
1,Yes,No,Yes,other,never smoked,58,87.96,39.2,No
2,No,No,No,other,smokes,8,110.89,17.6,No
3,No,No,Yes,other,formerly smoked,70,69.04,35.9,No
4,No,No,No,other,smokes,14,161.28,19.1,No


In [4]:
# Confirm binary data proplerly converted
print(stroke_data_relevant_features["hypertension"].value_counts())
print(100*"-")
print(stroke_data_relevant_features["heart_disease"].value_counts())

No     39339
Yes     4061
Name: hypertension, dtype: int64
----------------------------------------------------------------------------------------------------
No     41338
Yes     2062
Name: heart_disease, dtype: int64


In [5]:
# Transform data to one hot encoded data
machine_ready_stroke_data = pd.get_dummies(stroke_data_relevant_features, columns=["hypertension", "heart_disease", "ever_married", "work_type", "smoking_status"])
machine_ready_stroke_data.head()

Unnamed: 0,age,average_glucose_level,bmi,stroke,hypertension_No,hypertension_Yes,heart_disease_No,heart_disease_Yes,ever_married_No,ever_married_Yes,work_type_Self-employed,work_type_children,work_type_other,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3,95.12,18.0,No,1,0,1,0,1,0,0,1,0,0,0,1
1,58,87.96,39.2,No,0,1,1,0,0,1,0,0,1,0,1,0
2,8,110.89,17.6,No,1,0,1,0,1,0,0,0,1,0,0,1
3,70,69.04,35.9,No,1,0,1,0,0,1,0,0,1,1,0,0
4,14,161.28,19.1,No,1,0,1,0,1,0,0,0,1,0,0,1


In [6]:
# Create our features and label
X = machine_ready_stroke_data.drop(["stroke"], axis=1)
y = machine_ready_stroke_data["stroke"].values.reshape(-1,1)

In [7]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler

# Create scale for features and label
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

# Scale features and labels
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

ValueError: could not convert string to float: 'Yes'