In [1]:
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
data.drop(columns='id', inplace=True)

In [3]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = ['gender','ever_married','work_type','Residence_type','smoking_status']

def encode(c):
    le.fit(data[c])
    data[c] = le.fit_transform(data[c])
    
for c in cols:
    encode(c)

In [5]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [6]:
data['bmi'].fillna(data['bmi'].median(), inplace=True)

In [7]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# data['bmi'] = scaler.fit_transform(data[['bmi']])
# data['avg_glucose_level'] = scaler.fit_transform(data[['avg_glucose_level']])

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data['bmi'] = scaler.fit_transform(data[['bmi']])
data['avg_glucose_level'] = scaler.fit_transform(data[['avg_glucose_level']])

In [9]:
X = data.drop(columns='stroke',axis=1)
y = data['stroke']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
802,1,79.00,0,0,1,3,0,0.143384,-0.047025,1
3927,0,62.00,0,0,1,2,1,-0.393728,0.966119,0
2337,0,21.00,0,0,0,2,0,-1.029783,0.628404,2
3910,1,31.00,0,0,1,0,1,-0.893296,0.199767,1
1886,0,31.00,0,0,0,2,0,-1.027354,-1.164081,2
...,...,...,...,...,...,...,...,...,...,...
4426,1,43.00,0,0,1,0,0,-0.400796,0.225745,2
466,0,61.00,1,0,1,2,0,1.411298,4.070495,3
3092,0,1.16,0,0,0,4,1,-0.195845,-1.436850,0
3772,1,80.00,0,0,1,3,0,1.986176,0.277701,1


In [12]:
y_train

802     0
3927    0
2337    0
3910    0
1886    0
       ..
4426    0
466     0
3092    0
3772    0
860     0
Name: stroke, Length: 4088, dtype: int64

In [13]:
X_test

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
4688,1,31.00,0,0,0,3,0,-0.912069,-0.761421,0
4478,1,40.00,0,0,1,3,0,-0.902351,-0.073003,2
3849,0,8.00,0,0,0,4,1,-0.700713,-0.826366,0
4355,0,79.00,1,0,1,3,0,-0.651684,-1.216037,2
3826,0,75.00,0,0,1,0,0,-0.251279,-0.215882,2
...,...,...,...,...,...,...,...,...,...,...
3605,1,1.88,0,0,0,4,0,0.835315,-0.098981,0
4934,1,1.32,0,0,0,4,0,0.019265,-0.098981,0
4835,1,49.00,1,0,1,3,0,0.290472,0.199767,1
4105,0,56.00,0,0,0,3,1,0.496527,-0.514630,3


In [14]:
y_test

4688    0
4478    0
3849    0
4355    0
3826    0
       ..
3605    0
4934    0
4835    0
4105    0
2902    0
Name: stroke, Length: 1022, dtype: int64

## SVM

In [15]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
X_test_pred = svc.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.9393346379647749
[[960  62]
 [  0   0]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      1022
           1       0.00      0.00      0.00         0

    accuracy                           0.94      1022
   macro avg       0.50      0.47      0.48      1022
weighted avg       1.00      0.94      0.97      1022



## Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [19]:
X_test_pred = dt.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.9148727984344422
[[925  52]
 [ 35  10]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       977
           1       0.16      0.22      0.19        45

    accuracy                           0.91      1022
   macro avg       0.56      0.58      0.57      1022
weighted avg       0.93      0.91      0.92      1022



## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [21]:
X_test_pred = rf.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.9393346379647749
[[959  61]
 [  1   1]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      1020
           1       0.02      0.50      0.03         2

    accuracy                           0.94      1022
   macro avg       0.51      0.72      0.50      1022
weighted avg       1.00      0.94      0.97      1022



## Undersampling

In [22]:
data['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [23]:
stroke = data[data.stroke == 0]
no_stroke = data[data.stroke == 1]

In [24]:
stroke_sample = stroke.sample(n=249)

In [25]:
new_data = pd.concat([stroke_sample,no_stroke],axis=0)

In [26]:
new_data.shape

(498, 11)

In [27]:
new_data['stroke'].value_counts()

0    249
1    249
Name: stroke, dtype: int64

In [28]:
X = new_data.drop(columns='stroke',axis=1)
y = new_data['stroke']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## SVM

In [30]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

In [31]:
X_test_pred = svc.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.74
[[29  4]
 [22 45]]
              precision    recall  f1-score   support

           0       0.57      0.88      0.69        33
           1       0.92      0.67      0.78        67

    accuracy                           0.74       100
   macro avg       0.74      0.78      0.73       100
weighted avg       0.80      0.74      0.75       100



## Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [33]:
X_test_pred = dt.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.73
[[35 11]
 [16 38]]
              precision    recall  f1-score   support

           0       0.69      0.76      0.72        46
           1       0.78      0.70      0.74        54

    accuracy                           0.73       100
   macro avg       0.73      0.73      0.73       100
weighted avg       0.73      0.73      0.73       100



## Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [35]:
X_test_pred = rf.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.76
[[33  6]
 [18 43]]
              precision    recall  f1-score   support

           0       0.65      0.85      0.73        39
           1       0.88      0.70      0.78        61

    accuracy                           0.76       100
   macro avg       0.76      0.78      0.76       100
weighted avg       0.79      0.76      0.76       100



## Oversampling - SMOTE

In [36]:
X = data.drop(columns='stroke', axis=1)
y = data['stroke']

In [37]:
from imblearn.over_sampling import SMOTE
X_res,y_res = SMOTE().fit_resample(X,y)

In [38]:
X_res.shape, y_res.shape

((9722, 10), (9722,))

In [39]:
y_res.value_counts()

1    4861
0    4861
Name: stroke, dtype: int64

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_res,y_res,test_size=0.2,random_state=42)

## SVM

In [41]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

In [42]:
X_test_pred = svc.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.7645244215938304
[[653 136]
 [322 834]]
              precision    recall  f1-score   support

           0       0.67      0.83      0.74       789
           1       0.86      0.72      0.78      1156

    accuracy                           0.76      1945
   macro avg       0.76      0.77      0.76      1945
weighted avg       0.78      0.76      0.77      1945



## Decision Tree

In [43]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [44]:
X_test_pred = dt.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.9146529562982005
[[879  70]
 [ 96 900]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       949
           1       0.93      0.90      0.92       996

    accuracy                           0.91      1945
   macro avg       0.91      0.91      0.91      1945
weighted avg       0.92      0.91      0.91      1945



## Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [46]:
X_test_pred = rf.predict(X_test)
acc_score = accuracy_score(X_test_pred, y_test)
print('Accuracy score on testing data: ', acc_score)

conf_matrix = confusion_matrix(X_test_pred, y_test)
print(conf_matrix)

class_report = classification_report(X_test_pred, y_test)
print(class_report)

Accuracy score on testing data:  0.9439588688946016
[[902  36]
 [ 73 934]]
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       938
           1       0.96      0.93      0.94      1007

    accuracy                           0.94      1945
   macro avg       0.94      0.94      0.94      1945
weighted avg       0.94      0.94      0.94      1945

