In [44]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns

In [2]:
# Loading data into the pandas data frame
df=pd.read_csv('healthcarestroke.csv')

In [3]:
# To view dataset
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [5]:
# To check information about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# To check shape of data
df.shape

(5110, 12)

In [7]:
# To check null values
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [10]:
# BMI column only having 201 null values so have to fill those null values
df['bmi'].fillna(value=df['bmi'].mean(),inplace=True)

In [11]:
# checking the null values again
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [12]:
# Now there is no null values in our datase

In [13]:
# Have to change the categorical variable to numerical value because machine can understand numerical values

In [19]:
df['gender_n']=LabelEncoder().fit_transform(df['gender'])
df['evermarried_n']=LabelEncoder().fit_transform(df['ever_married'])
df['worktype_n']=LabelEncoder().fit_transform(df['work_type'])
df['residencetype_n']=LabelEncoder().fit_transform(df['Residence_type'])
df['smokingstatus_n']=LabelEncoder().fit_transform(df['smoking_status'])

In [21]:
df1=df[['age','gender_n','hypertension','heart_disease','evermarried_n','worktype_n','residencetype_n','avg_glucose_level','bmi','smokingstatus_n','stroke']]

In [22]:
# new numerical variable only dataset
df1

Unnamed: 0,age,gender_n,hypertension,heart_disease,evermarried_n,worktype_n,residencetype_n,avg_glucose_level,bmi,smokingstatus_n,stroke
0,67.0,1,0,1,1,2,1,228.69,36.600000,1,1
1,61.0,0,0,0,1,3,0,202.21,28.893237,2,1
2,80.0,1,0,1,1,2,0,105.92,32.500000,2,1
3,49.0,0,0,0,1,2,1,171.23,34.400000,3,1
4,79.0,0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,0,1,0,1,2,1,83.75,28.893237,2,0
5106,81.0,0,0,0,1,3,1,125.20,40.000000,2,0
5107,35.0,0,0,0,1,3,0,82.99,30.600000,2,0
5108,51.0,1,0,0,1,2,0,166.29,25.600000,1,0


In [26]:
# Assigning dependendent and independent variables
x=df1.drop('stroke',axis=1)
y=df['stroke']

In [27]:
# To split the data for training and testing
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [28]:
# Creating randomforest model(one of the bagging algorithm by decision tree)
rfr=RandomForestClassifier()

In [29]:
rfr.fit(x_train,y_train)

RandomForestClassifier()

In [31]:
# Now our machine learning model trained by given data now its available to prediction
y_predicted=rfr.predict(x_test)

In [34]:
#our machine learning model having 95% accuracy
accuracy_score(y_test,y_predicted)

0.9500978473581213

In [36]:
y_test

4458    0
585     0
2751    0
4068    0
3346    0
       ..
4120    0
29      1
2696    0
2608    0
4024    0
Name: stroke, Length: 1022, dtype: int64

In [37]:
y_predicted

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [40]:
x


Unnamed: 0,age,gender_n,hypertension,heart_disease,evermarried_n,worktype_n,residencetype_n,avg_glucose_level,bmi,smokingstatus_n
0,67.0,1,0,1,1,2,1,228.69,36.600000,1
1,61.0,0,0,0,1,3,0,202.21,28.893237,2
2,80.0,1,0,1,1,2,0,105.92,32.500000,2
3,49.0,0,0,0,1,2,1,171.23,34.400000,3
4,79.0,0,1,0,1,3,0,174.12,24.000000,2
...,...,...,...,...,...,...,...,...,...,...
5105,80.0,0,1,0,1,2,1,83.75,28.893237,2
5106,81.0,0,0,0,1,3,1,125.20,40.000000,2
5107,35.0,0,0,0,1,3,0,82.99,30.600000,2
5108,51.0,1,0,0,1,2,0,166.29,25.600000,1


In [43]:
# Example prediction
rfr.predict([[55,1,0,0,1,2,1,240,27,1]])



array([0], dtype=int64)

 ## steps done
    1.imported libraries
    2.loading dataset
    3.Data cleaning
    4.Data modeling
    5.Data spliting for training and testing
    6.Trainig machine learning model
    7.Made prediction with trained model
    8.Model accuracy checked