# Stroke notebook

## 1. Importing needed libraries

In [2]:
%matplotlib inline
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pp
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, f1_score, recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## 2. Importing the data

In [3]:
# Read the stroke dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Report the (rows, columns) shape, then preview the first five records.
print("Data Shape - ", data.shape)
data.head()

Data Shape -  (5110, 12)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## 3. Understanding the data

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:
# Show the dtype pandas inferred for each column.
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [9]:
# Count the missing values in each column.
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

`bmi` has 201 missing values. Those rows will be removed entirely.

In [17]:
# Keep only complete records: any row that contains a missing value is discarded.
data_new = data.dropna(axis=0, how='any')
# 4909 rows remain now that the rows with an empty BMI have been removed.
print(data_new)

         id  gender  age  hypertension  heart_disease ever_married  \
0      9046    Male   67             0              1          Yes   
2     31112    Male   80             0              1          Yes   
3     60182  Female   49             0              0          Yes   
4      1665  Female   79             1              0          Yes   
5     56669    Male   81             0              0          Yes   
...     ...     ...  ...           ...            ...          ...   
5104  14180  Female   13             0              0           No   
5106  44873  Female   81             0              0          Yes   
5107  19723  Female   35             0              0          Yes   
5108  37544    Male   51             0              0          Yes   
5109  44679  Female   44             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  formerly smoked   

The `smoking_status` column has some rows marked `Unknown`, meaning it is not known whether the person smokes. Those rows will be removed.

In [27]:
# Keep only the rows with a known smoking status and drop the `id` column,
# which is not needed for the analysis.
#
# A new frame is built instead of mutating `data_new` with `inplace=True`, so this
# cell can be re-run safely: filtering twice is a no-op, and `errors='ignore'`
# keeps the drop from raising KeyError once `id` has already been removed.
data_new = (
    data_new
    .loc[data_new['smoking_status'] != 'Unknown']
    .drop(columns='id', errors='ignore')
)

# With the unknown smoking statuses removed, only 3426 rows remain.
print(data_new)

      gender  age  hypertension  heart_disease ever_married      work_type  \
0       Male   67             0              1          Yes        Private   
2       Male   80             0              1          Yes        Private   
3     Female   49             0              0          Yes        Private   
4     Female   79             1              0          Yes  Self-employed   
5       Male   81             0              0          Yes        Private   
...      ...  ...           ...            ...          ...            ...   
5100    Male   82             1              0          Yes  Self-employed   
5102  Female   57             0              0          Yes        Private   
5106  Female   81             0              0          Yes  Self-employed   
5107  Female   35             0              0          Yes  Self-employed   
5108    Male   51             0              0          Yes        Private   

     Residence_type  avg_glucose_level   bmi   smoking_status  