In [1]:
import pandas as pd
import numpy as np

In [2]:
stroke_data = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [4]:
#Bmi has some missing data in the dataset

missing_values = stroke_data.isnull().sum()
highest_missing = missing_values.sort_values(ascending = False)
highest_missing

bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

In [5]:
#min-max scale age and avg_glucose_level

#delete all the duplicate rows
stroke_data = stroke_data.drop_duplicates(keep = 'first')
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
#import pandas_profiling as pp

In [7]:
#profile = pp.ProfileReport(stroke_data)
#profile.to_file("output.html")

In [8]:
#Remove the missing values from BMI column as it has a high correlation with other variables and it can't be simply imputed.
stroke_data_final = stroke_data.dropna()
stroke_data_final

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [9]:
#In the BMI column we are also making sure that any datapoints outside the statistical range are removed.

q1 = np.percentile(stroke_data_final.bmi, 25)
q3 = np.percentile(stroke_data_final.bmi, 75)

iqr = q3 - q1

lower_limit = q1 - (1.5 * iqr)
upper_limit = q3 + (1.5 * iqr)

print(lower_limit, upper_limit)

#Remove all the values that doesn't lie in this range as they are considered outliers.

stroke_data_final = stroke_data_final[(stroke_data_final['bmi'] >= lower_limit) & (stroke_data_final['bmi'] <= upper_limit)]
stroke_data_final

9.099999999999998 47.5


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
#Scale the data using min max scaler for variales age, glucose_level and bmi to not let them over power other variables.
from sklearn.preprocessing import MinMaxScaler

print(min(stroke_data_final['age']), min(stroke_data_final['avg_glucose_level']), min(stroke_data_final['bmi']))
print(max(stroke_data_final['age']), max(stroke_data_final['avg_glucose_level']), max(stroke_data_final['bmi']))

scaler = MinMaxScaler()

stroke_data_final['age'] = scaler.fit_transform(stroke_data_final[['age']])
stroke_data_final['avg_glucose_level'] = scaler.fit_transform(stroke_data_final[['avg_glucose_level']])
stroke_data_final['bmi'] = scaler.fit_transform(stroke_data_final[['bmi']])

0.08 55.12 10.3
82.0 271.74 47.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_data_final['age'] = scaler.fit_transform(stroke_data_final[['age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_data_final['avg_glucose_level'] = scaler.fit_transform(stroke_data_final[['avg_glucose_level']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_data_final['bmi'] 

In [11]:
stroke_data_final

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,0.816895,0,1,Yes,Private,Urban,0.801265,0.706989,formerly smoked,1
2,31112,Male,0.975586,0,1,Yes,Private,Rural,0.234512,0.596774,never smoked,1
3,60182,Female,0.597168,0,0,Yes,Private,Urban,0.536008,0.647849,smokes,1
4,1665,Female,0.963379,1,0,Yes,Self-employed,Rural,0.549349,0.368280,never smoked,1
5,56669,Male,0.987793,0,0,Yes,Private,Urban,0.605161,0.502688,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,0.157715,0,0,No,children,Rural,0.221402,0.223118,Unknown,0
5106,44873,Female,0.987793,0,0,Yes,Self-employed,Urban,0.323516,0.798387,never smoked,0
5107,19723,Female,0.426270,0,0,Yes,Self-employed,Rural,0.128658,0.545699,never smoked,0
5108,37544,Male,0.621582,0,0,Yes,Private,Rural,0.513203,0.411290,formerly smoked,0


In [12]:
#use one hot encoding to convert the categorical data into numerical format
categorical_columns = ['gender','ever_married','work_type','Residence_type','smoking_status']

final_df = pd.get_dummies(stroke_data_final, columns = categorical_columns)
final_df = final_df.loc[: ,final_df.columns != 'id']
final_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.816895,0,1,0.801265,0.706989,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,0.975586,0,1,0.234512,0.596774,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,0.597168,0,0,0.536008,0.647849,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,0.963379,1,0,0.549349,0.368280,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,0.987793,0,0,0.605161,0.502688,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,0.157715,0,0,0.221402,0.223118,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,0.987793,0,0,0.323516,0.798387,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,0.426270,0,0,0.128658,0.545699,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,0.621582,0,0,0.513203,0.411290,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [13]:
#Split the training and testing data
X = final_df.loc[: ,final_df.columns != 'stroke']
Y = final_df.loc[:, final_df.columns == 'stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [14]:
#build the naive bayes classifier for the data
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)

  y = column_or_1d(y, warn=True)


In [15]:
# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 960 points : 711, performance 25.94%


In [16]:
#Build the decision tree classifier for the data
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(random_state = 0)
y_pred = dtree.fit(X_train, Y_train).predict(X_test)


In [17]:
# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 960 points : 86, performance 91.04%


In [18]:
#Save the decision tree model
import pickle

filename = 'dt_model.sav'
saved_model = pickle.dump(dtree, open(filename, 'wb'))

In [19]:
#Save the model as a pickled string
import pickle

#Save the model
filename = 'nb_model.sav'
saved_model = pickle.dump(gnb, open(filename, 'wb'))

In [20]:
#Load the model and predict the results
loaded_model = pickle.load(open('dt_model.sav','rb'))
loaded_model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [21]:
loaded_model.predict_proba(X_test)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [22]:
loaded_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'splitter': 'best'}

In [23]:
import math
s = list(dtree.feature_importances_)

rounded_importance = [round(i,2) for i in s]

In [24]:
print(set(stroke_data_final['work_type']))

{'Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'}


In [25]:
print(final_df.columns)

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Female', 'gender_Male', 'gender_Other',
       'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')


In [28]:
from sklearn.model_selection import train_test_split
#Build the decision tree classifier for the data
from sklearn.tree import DecisionTreeClassifier

gender_df = final_df[["gender_Female","gender_Male","gender_Other","stroke"]]
print(gender_df)

X = gender_df.loc[: ,gender_df.columns != 'stroke']
Y = gender_df.loc[:, gender_df.columns == 'stroke']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state = 42)

dtree_gender = DecisionTreeClassifier(random_state = 0)
y_pred = dtree_gender.fit(X_train, Y_train).predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))


#Save the model
filename = 'dt_gender_model.sav'
saved_model = pickle.dump(dtree_gender, open(filename, 'wb'))

      gender_Female  gender_Male  gender_Other  stroke
0                 0            1             0       1
2                 0            1             0       1
3                 1            0             0       1
4                 1            0             0       1
5                 0            1             0       1
...             ...          ...           ...     ...
5104              1            0             0       0
5106              1            0             0       0
5107              1            0             0       0
5108              0            1             0       0
5109              1            0             0       0

[4799 rows x 4 columns]
Number of mislabeled points out of a total 1920 points : 90, performance 95.31%


In [43]:
from sklearn.model_selection import train_test_split
#Build the decision tree classifier for the data
from sklearn.tree import DecisionTreeClassifier

age_df = final_df[["age","stroke"]]
print(age_df)

X = age_df.loc[: ,age_df.columns != 'stroke']
Y = age_df.loc[:, age_df.columns == 'stroke']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state = 42)

dtree_age = DecisionTreeClassifier(random_state = 0)
y_pred = dtree_age.fit(X_train, Y_train).predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))


#Save the model
filename = 'dt_age_model.sav'
saved_model = pickle.dump(dtree_age, open(filename, 'wb'))

v =pd.DataFrame({'age': list(X_test['age']), 'expected':list(Y_test['stroke']), 'predicted': list(y_pred)})

#op_dict = {"age": list(X_test), "expected": list(Y_test)}
#print(pd.DataFrame.from_dict(op_dict))

           age  stroke
0     0.816895       1
2     0.975586       1
3     0.597168       1
4     0.963379       1
5     0.987793       1
...        ...     ...
5104  0.157715       0
5106  0.987793       0
5107  0.426270       0
5108  0.621582       0
5109  0.536133       0

[4799 rows x 2 columns]
Number of mislabeled points out of a total 1920 points : 90, performance 95.31%


Unnamed: 0,expected,predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
1915,0,0
1916,0,0
1917,0,0
1918,0,0
