In [1]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pandas as pd
import numpy as np

In [2]:
#loading data
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Preprocessing data

In [3]:
#checking for null values, if present replacing with mean 
df = df.fillna(-1)
df.isnull().any()

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

In [4]:
X = df[['age','gender','hypertension','heart_disease','avg_glucose_level','bmi','work_type','Residence_type',
        'smoking_status']].values
X[:5]

array([[67.0, 'Male', 0, 1, 228.69, 36.6, 'Private', 'Urban',
        'formerly smoked'],
       [61.0, 'Female', 0, 0, 202.21, -1.0, 'Self-employed', 'Rural',
        'never smoked'],
       [80.0, 'Male', 0, 1, 105.92, 32.5, 'Private', 'Rural',
        'never smoked'],
       [49.0, 'Female', 0, 0, 171.23, 34.4, 'Private', 'Urban', 'smokes'],
       [79.0, 'Female', 1, 0, 174.12, 24.0, 'Self-employed', 'Rural',
        'never smoked']], dtype=object)

In [5]:
# Labeling categorical data
gender = preprocessing.LabelEncoder()
gender.fit(['Female','Male','Other'])
X[:,1] = gender.transform(X[:,1]) 

wt = preprocessing.LabelEncoder()
wt.fit(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'])
X[:,6] = wt.transform(X[:,6]) 

res = preprocessing.LabelEncoder()
res.fit(['Rural', 'Urban'])
X[:,7] = res.transform(X[:,7]) 

smoke = preprocessing.LabelEncoder()
smoke.fit(['Unknown', 'formerly smoked', 'never smoked', 'smokes'])
X[:,8] = smoke.transform(X[:,8]) 

In [6]:
y = df['stroke']
y[:5]

0    1
1    1
2    1
3    1
4    1
Name: stroke, dtype: int64

In [7]:
# spliting data into trainset and testset
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

## Model without using boosting

In [8]:
model = DecisionTreeClassifier(criterion="gini", max_depth = 10)

In [9]:
model.fit(X_trainset,y_trainset)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [10]:
prediction = model.predict(X_testset)

In [11]:
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, prediction))

DecisionTrees's Accuracy:  0.9282452707110241


## Model using adaboost

In [12]:
from sklearn.ensemble import AdaBoostClassifier
aboost_obj = AdaBoostClassifier(n_estimators=10)

In [13]:
aboosted_model = aboost_obj.fit(X_trainset,y_trainset)
aboosted_model

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=10, random_state=None)

In [14]:
aboosted_prediction =aboosted_model.predict(X_testset)

In [15]:
print("Boosted DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, aboosted_prediction))

Boosted DecisionTrees's Accuracy:  0.954337899543379


## Model using gradient boost

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gboosted_model = GradientBoostingClassifier(n_estimators = 10)

In [17]:
gboosted_model.fit(X_trainset,y_trainset)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [18]:
gboosted_prediction = gboosted_model.predict(X_testset)

In [19]:
print("Boosted DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, gboosted_prediction))

Boosted DecisionTrees's Accuracy:  0.9562948467058056
