In [1]:
import pandas as pd
import numpy as np

In [2]:
stroke_data = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [4]:
#Bmi has some missing data in the dataset

missing_values = stroke_data.isnull().sum()
highest_missing = missing_values.sort_values(ascending = False)
highest_missing

bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

In [5]:
#min-max scale age and avg_glucose_level

#delete all the duplicate rows
stroke_data = stroke_data.drop_duplicates(keep = 'first')
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
#import pandas_profiling as pp

In [7]:
#profile = pp.ProfileReport(stroke_data)
#profile.to_file("output.html")

In [8]:
#Remove the missing values from BMI column as it has a high correlation with other variables and it can't be simply imputed.
stroke_data_final = stroke_data.dropna()

In [9]:
#In the BMI column we are also making sure that any datapoints outside the statistical range are removed.

q1 = np.percentile(stroke_data_final.bmi, 25)
q3 = np.percentile(stroke_data_final.bmi, 75)

iqr = q3 - q1

lower_limit = q1 - (1.5 * iqr)
upper_limit = q3 + (1.5 * iqr)

print(lower_limit, upper_limit)

#Remove all the values that doesn't lie in this range as they are considered outliers.

stroke_data_final = stroke_data_final[(stroke_data_final['bmi'] >= lower_limit) & (stroke_data_final['bmi'] <= upper_limit)]
stroke_data_final
stroke_data_add = stroke_data_final [["age", "bmi", "avg_glucose_level","stroke"]]
stroke_data_add

9.099999999999998 47.5


Unnamed: 0,age,bmi,avg_glucose_level,stroke
0,67.0,36.6,228.69,1
2,80.0,32.5,105.92,1
3,49.0,34.4,171.23,1
4,79.0,24.0,174.12,1
5,81.0,29.0,186.21,1
...,...,...,...,...
5104,13.0,18.6,103.08,0
5106,81.0,40.0,125.20,0
5107,35.0,30.6,82.99,0
5108,51.0,25.6,166.29,0


In [10]:
#Scale the data using min max scaler for variales age, glucose_level and bmi to not let them over power other variables.
from sklearn.preprocessing import MinMaxScaler

print(min(stroke_data_final['age']), min(stroke_data_final['avg_glucose_level']), min(stroke_data_final['bmi']))
print(max(stroke_data_final['age']), max(stroke_data_final['avg_glucose_level']), max(stroke_data_final['bmi']))

scaler = MinMaxScaler()

stroke_data_final['age'] = scaler.fit_transform(stroke_data_final[['age']])
stroke_data_final['avg_glucose_level'] = scaler.fit_transform(stroke_data_final[['avg_glucose_level']])
stroke_data_final['bmi'] = scaler.fit_transform(stroke_data_final[['bmi']])

0.08 55.12 10.3
82.0 271.74 47.5


In [11]:
stroke_data_final

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,0.816895,0,1,Yes,Private,Urban,0.801265,0.706989,formerly smoked,1
2,31112,Male,0.975586,0,1,Yes,Private,Rural,0.234512,0.596774,never smoked,1
3,60182,Female,0.597168,0,0,Yes,Private,Urban,0.536008,0.647849,smokes,1
4,1665,Female,0.963379,1,0,Yes,Self-employed,Rural,0.549349,0.368280,never smoked,1
5,56669,Male,0.987793,0,0,Yes,Private,Urban,0.605161,0.502688,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,0.157715,0,0,No,children,Rural,0.221402,0.223118,Unknown,0
5106,44873,Female,0.987793,0,0,Yes,Self-employed,Urban,0.323516,0.798387,never smoked,0
5107,19723,Female,0.426270,0,0,Yes,Self-employed,Rural,0.128658,0.545699,never smoked,0
5108,37544,Male,0.621582,0,0,Yes,Private,Rural,0.513203,0.411290,formerly smoked,0


In [12]:
#use one hot encoding to convert the categorical data into numerical format
categorical_columns = ['gender','ever_married','work_type','Residence_type','smoking_status']

final_df = pd.get_dummies(stroke_data_final, columns = categorical_columns)
naive_df = final_df[['id','age','stroke']]
bmi_df = final_df[['id','bmi','stroke']]
glucose_df = final_df[['id','avg_glucose_level','stroke']]

required_df = final_df.loc[: ,final_df.columns != 'id']
glucose_df

Unnamed: 0,id,avg_glucose_level,stroke
0,9046,0.801265,1
2,31112,0.234512,1
3,60182,0.536008,1
4,1665,0.549349,1
5,56669,0.605161,1
...,...,...,...
5104,14180,0.221402,0
5106,44873,0.323516,0
5107,19723,0.128658,0
5108,37544,0.513203,0


In [13]:
#Split the training and testing data
X = naive_df.loc[: ,naive_df.columns != 'stroke']
Y = naive_df.loc[:, naive_df.columns == 'stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

#build the naive bayes classifier for the data
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

#Save the model as a pickled string
import pickle

#Save the model
filename = 'nb_model.sav'
saved_model = pickle.dump(gnb, open(filename, 'wb'))

b = gnb.predict_proba(X_test)
a = list(b[0])
print(round(a[0], 2)*100, round(a[1], 2)*100)

Number of mislabeled points out of a total 960 points : 55, performance 94.27%
96.0 4.0


  y = column_or_1d(y, warn=True)


In [14]:
#Split the training and testing data
X = bmi_df.loc[: ,bmi_df.columns != 'stroke']
Y = bmi_df.loc[:, bmi_df.columns == 'stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.5, random_state = 42)

#build the naive bayes classifier for the data
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

#Save the model as a pickled string
import pickle

#Save the model
filename = 'nb_model_bmi.sav'
saved_model = pickle.dump(gnb, open(filename, 'wb'))

b = gnb.predict_proba(X_test)
a = list(b[0])
print(round(a[0], 2)*100, round(a[1], 2)*100)

Number of mislabeled points out of a total 2400 points : 103, performance 95.71%
96.0 4.0


  y = column_or_1d(y, warn=True)


In [15]:
#Split the training and testing data
X = glucose_df.loc[: ,glucose_df.columns != 'stroke']
Y = glucose_df.loc[:, glucose_df.columns == 'stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.5, random_state = 42)

#build the naive bayes classifier for the data
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

#Save the model as a pickled string
import pickle

#Save the model
filename = 'nb_model_glucose.sav'
saved_model = pickle.dump(gnb, open(filename, 'wb'))

b = gnb.predict_proba(X_test)
a = list(b[0])
print(round(a[0], 2)*100, round(a[1], 2)*100)

Number of mislabeled points out of a total 2400 points : 103, performance 95.71%
96.0 4.0


  y = column_or_1d(y, warn=True)


In [16]:
#Build the decision tree classifier for the data

#Split the training and testing data
X = required_df.loc[: ,required_df.columns != 'stroke']
Y = required_df.loc[:, required_df.columns == 'stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)


from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(random_state = 0)
y_pred = dtree.fit(X_train, Y_train).predict(X_test)


# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          Y_test.shape[0],
          (Y_test["stroke"] != y_pred).sum(),
          100*(1-(Y_test["stroke"] != y_pred).sum()/X_test.shape[0])
))

#Save the decision tree model
import pickle

filename = 'dt_model.sav'
saved_model = pickle.dump(dtree, open(filename, 'wb'))

Number of mislabeled points out of a total 960 points : 86, performance 91.04%


In [35]:
stroke_yes = stroke_data_add[stroke_data_add["stroke"] == 1]
stroke_yes.to_csv("only_strokes.csv")

In [37]:
stroke_yes['age_bins'] = pd.qcut(stroke_yes['age'], q=[0, .1, .2, .4, .8, 1])
print(stroke_yes['age_bins'].value_counts())

stroke_yes['bmi_bins'] = pd.qcut(stroke_yes['bmi'], q=[0, .1, .2, .4, .8, 1])
print(stroke_yes['bmi_bins'].value_counts())

stroke_yes['glucose_bins'] = pd.qcut(stroke_yes['avg_glucose_level'], q=[0, .1, .2, .4, .8, 1])
print(stroke_yes['glucose_bins'].value_counts())

ages = pd.qcut(stroke_yes['avg_glucose_level'], q=[0, .1, .2, .4, .8, 1])
ages

(68.0, 79.0]      77
(57.0, 68.0]      44
(79.0, 82.0]      39
(51.0, 57.0]      24
(13.999, 51.0]    23
Name: age_bins, dtype: int64
(28.04, 34.58]                 82
(34.58, 47.5]                  42
(25.6, 28.04]                  40
(23.46, 25.6]                  22
(16.898999999999997, 23.46]    21
Name: bmi_bins, dtype: int64
(95.968, 205.686]    82
(205.686, 271.74]    42
(76.482, 95.968]     41
(56.109, 70.204]     21
(70.204, 76.482]     21
Name: glucose_bins, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_yes['age_bins'] = pd.qcut(stroke_yes['age'], q=[0, .1, .2, .4, .8, 1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_yes['bmi_bins'] = pd.qcut(stroke_yes['bmi'], q=[0, .1, .2, .4, .8, 1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_yes['glucose_bins'] = pd.qcut(stroke_yes['

0      (205.686, 271.74]
2      (95.968, 205.686]
3      (95.968, 205.686]
4      (95.968, 205.686]
5      (95.968, 205.686]
             ...        
243    (205.686, 271.74]
244     (76.482, 95.968]
245     (56.109, 70.204]
246     (76.482, 95.968]
248     (76.482, 95.968]
Name: avg_glucose_level, Length: 207, dtype: category
Categories (5, interval[float64]): [(56.109, 70.204] < (70.204, 76.482] < (76.482, 95.968] < (95.968, 205.686] < (205.686, 271.74]]