## Stroke prediction 

In [1]:
## import the libraries .

import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [2]:
## load the data.

df=pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
## read shape and observe top 5 rows of the data.

print(df.shape)
df.head(5)

(5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
## percentage of null values.

df.isnull().mean()*100

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [5]:
## drop null values.

df1=df.dropna()

In [6]:
## observe the cen tral tendencies.

df1.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,42.865374,0.091872,0.049501,105.30515,28.893237,0.042575
std,20995.098457,22.555115,0.288875,0.216934,44.424341,7.854067,0.201917
min,77.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,18605.0,25.0,0.0,0.0,77.07,23.5,0.0
50%,37608.0,44.0,0.0,0.0,91.68,28.1,0.0
75%,55220.0,60.0,0.0,0.0,113.57,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:
## looking for correlation between features if any.

df.corr()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
id,1.0,0.003538,0.00355,-0.001296,0.001092,0.003084,0.006388
age,0.003538,1.0,0.276398,0.263796,0.238171,0.333398,0.245257
hypertension,0.00355,0.276398,1.0,0.108306,0.174474,0.167811,0.127904
heart_disease,-0.001296,0.263796,0.108306,1.0,0.161857,0.041357,0.134914
avg_glucose_level,0.001092,0.238171,0.174474,0.161857,1.0,0.175502,0.131945
bmi,0.003084,0.333398,0.167811,0.041357,0.175502,1.0,0.042374
stroke,0.006388,0.245257,0.127904,0.134914,0.131945,0.042374,1.0


In [8]:
## import  preprocessing from sklearn.
## read top 5 dtype object features.

from sklearn import preprocessing

df_cat = df1.select_dtypes(include=['object'])
df_cat.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,Yes,Private,Urban,formerly smoked
2,Male,Yes,Private,Rural,never smoked
3,Female,Yes,Private,Urban,smokes
4,Female,Yes,Self-employed,Rural,never smoked
5,Male,Yes,Private,Urban,formerly smoked


In [9]:
## fit the features of label encoding.


le = preprocessing.LabelEncoder()
df_cat = df_cat.apply(le.fit_transform)
df_cat.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,1,1,2,1,1
2,1,1,2,0,2
3,0,1,2,1,3
4,0,1,3,0,2
5,1,1,2,1,1


In [10]:
## drop the features and concat the encoded values.

df1 = df1.drop(df_cat.columns, axis=1)
df2 = pd.concat([df1, df_cat], axis=1)
df2.head()


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married,work_type,Residence_type,smoking_status
0,9046,67.0,0,1,228.69,36.6,1,1,1,2,1,1
2,31112,80.0,0,1,105.92,32.5,1,1,1,2,0,2
3,60182,49.0,0,0,171.23,34.4,1,0,1,2,1,3
4,1665,79.0,1,0,174.12,24.0,1,0,1,3,0,2
5,56669,81.0,0,0,186.21,29.0,1,1,1,2,1,1


In [11]:
## drop the column 'id'.

df2=df2.drop(['id'],axis=1)

In [14]:
## assisgning values for X(independent values) and y(dependent values).

X=df2.drop(['stroke'],axis=1)
y=df2[['stroke']]

In [15]:
print(X.shape)
print(y.shape)

(4909, 10)
(4909, 1)


In [29]:
y.value_counts()

stroke
0         4700
1          209
dtype: int64

The data is imbalanced, from the above observation.

In [32]:
## Get the Yes and the No dataset 

Yes = df2[df2['stroke']==1]

No = df2[df2['stroke']==0]

In [33]:
print(Yes.shape,No.shape)

(209, 11) (4700, 11)


In [34]:
## import libraries for over sampling.

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

In [36]:
# Implementing Oversampling for Handling Imbalanced.

smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_sample(X,y)

In [37]:
X_res.shape,y_res.shape

((9304, 10), (9304, 1))

In [38]:
## import test train split from sklearn.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state = 51)

In [39]:
## import roc and aoc from skearn 

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [40]:
from sklearn.linear_model import LogisticRegression
log_classifier=LogisticRegression()
log_classifier.fit(X_train, y_train)
ytrain_pred = log_classifier.predict_proba(X_train)
print('Logistic train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = log_classifier.predict_proba(X_test)
print('Logistic test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

Logistic train roc-auc: 0.8936244859941298
Logistic test roc-auc: 0.8870744197972322


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
## assigning y predicted values.

y_pred = log_classifier.predict(X_test)

In [42]:
## import matrics from sklearn 

from sklearn import metrics 

In [43]:
# Print the confusion matrix 
print(metrics.confusion_matrix(y_test,y_pred))

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_test,y_pred, digits=3))

[[740 187]
 [175 759]]
              precision    recall  f1-score   support

           0      0.809     0.798     0.803       927
           1      0.802     0.813     0.807       934

    accuracy                          0.805      1861
   macro avg      0.806     0.805     0.805      1861
weighted avg      0.806     0.805     0.805      1861



In [44]:
from sklearn.metrics import classification_report,confusion_matrix

In [45]:
print(confusion_matrix(y_test,y_pred))

[[740 187]
 [175 759]]


In [46]:
## print classification report.

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       927
           1       0.80      0.81      0.81       934

    accuracy                           0.81      1861
   macro avg       0.81      0.81      0.81      1861
weighted avg       0.81      0.81      0.81      1861



In [47]:
print((metrics.accuracy_score(y_pred,y_test)*100))

80.54809242342826
