In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

In [2]:
stroke_data = pd.read_csv(r'C:\Users\Robin Aluma\Desktop\Stroke_Detection\csv_data\stroke_data\healthcare-dataset-stroke-data.csv')
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
stroke_data.shape

(5110, 12)

In [4]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


#Only the bmi has missing values which requirees imputing.

In [5]:
stroke_data['age'] = stroke_data['age'].astype('int')

In [6]:
stroke_data['hypertension'].value_counts()


hypertension
0    4612
1     498
Name: count, dtype: int64

In [7]:
stroke_data['stroke'] = stroke_data['stroke'].replace({0: 'No', 1: 'yes'})

In [8]:
stroke_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,yes
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,yes
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,yes
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,yes
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,yes
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,No
5106,44873,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,No
5107,19723,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,No
5108,37544,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,No


In [9]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,yes
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,yes
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,yes
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,yes
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,yes


In [10]:
stroke_data.drop(columns='id',inplace=True)

In [34]:
stroke_data['stroke'].value_counts()

stroke
No     4861
yes     249
Name: count, dtype: int64

In [11]:
stroke_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,yes
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,yes
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,yes
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,yes
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,yes


In [12]:
x_variables = stroke_data.drop(columns='stroke')
x_variables

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
...,...,...,...,...,...,...,...,...,...,...
5105,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked
5106,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked
5107,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked
5108,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked


In [13]:
y_variable = stroke_data['stroke']
y_variable

0       yes
1       yes
2       yes
3       yes
4       yes
       ... 
5105     No
5106     No
5107     No
5108     No
5109     No
Name: stroke, Length: 5110, dtype: object

In [14]:
y_variable.value_counts()

stroke
No     4861
yes     249
Name: count, dtype: int64

# The y variable is imbalanced.Its therefore necessary to balance the data.

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x_variables,y_variable,test_size=0.30,random_state=42)


In [16]:

categorical_columns = x_train.select_dtypes(include='object').columns
numerical_columns = x_train.select_dtypes(exclude='object').columns

In [17]:
numerical_columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'], dtype='object')

In [18]:
categorical_pipeline = Pipeline(
    steps=[
        ('encoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
    ]
)

In [19]:
from sklearn import set_config

In [20]:
numerical_pipeline = Pipeline(
    steps=[
        ('impute',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

In [21]:
transformation_pipeline = ColumnTransformer(
    transformers=[
        ('category',categorical_pipeline,categorical_columns),
        ('numeric',numerical_pipeline,numerical_columns)
    ]
)

In [22]:
import numpy as np

In [23]:
transformation_pipeline

In [24]:
train_data = transformation_pipeline.fit_transform(x_train)
test_data = transformation_pipeline.transform(x_test)


In [25]:
transformation_pipeline.get_feature_names_out()

array(['category__gender_Female', 'category__gender_Male',
       'category__ever_married_No', 'category__ever_married_Yes',
       'category__work_type_Govt_job', 'category__work_type_Never_worked',
       'category__work_type_Private', 'category__work_type_Self-employed',
       'category__work_type_children', 'category__Residence_type_Rural',
       'category__Residence_type_Urban',
       'category__smoking_status_Unknown',
       'category__smoking_status_formerly smoked',
       'category__smoking_status_never smoked',
       'category__smoking_status_smokes', 'numeric__age',
       'numeric__hypertension', 'numeric__heart_disease',
       'numeric__avg_glucose_level', 'numeric__bmi'], dtype=object)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [27]:

scaled_train_data = np.c_[train_data,np.array(y_train)]
scaled_test_data = np.c_[test_data,np.array(y_test)]

In [28]:
scaled_df = pd.DataFrame(scaled_test_data)
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,-0.546524,-0.317199,-0.239469,-0.909718,-0.761931,No
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,-0.147145,-0.317199,-0.239469,-0.899927,-0.07338,No
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,-1.56716,-0.317199,-0.239469,-0.696751,-0.826888,No
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.583498,3.152592,-0.239469,-0.647348,-1.216634,No
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.405996,-0.317199,-0.239469,-0.24389,-0.216287,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1528,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,-1.034654,-0.317199,-0.239469,0.027604,-0.125346,No
1529,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.006617,-0.317199,-0.239469,-0.8414,0.290382,No
1530,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.058394,-0.317199,-0.239469,-0.265698,-0.216287,No
1531,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.607238,-0.317199,-0.239469,-0.808464,-0.632015,No


In [29]:
transformed_train_data = pd.DataFrame(transformation_pipeline.fit_transform(x_train),columns=transformation_pipeline.get_feature_names_out())
transformed_train_data

Unnamed: 0,category__gender_Female,category__gender_Male,category__ever_married_No,category__ever_married_Yes,category__work_type_Govt_job,category__work_type_Never_worked,category__work_type_Private,category__work_type_Self-employed,category__work_type_children,category__Residence_type_Rural,category__Residence_type_Urban,category__smoking_status_Unknown,category__smoking_status_formerly smoked,category__smoking_status_never smoked,category__smoking_status_smokes,numeric__age,numeric__hypertension,numeric__heart_disease,numeric__avg_glucose_level,numeric__bmi
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.744661,-0.317199,-0.239469,-0.340693,-1.645354
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.635275,-0.317199,-0.239469,2.266541,-0.787914
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.030357,3.152592,-0.239469,-0.321555,-0.307227
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.095368,3.152592,-0.239469,1.001645,1.485602
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.829115,-0.317199,-0.239469,-0.515384,0.926967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3572,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.014019,-0.317199,-0.239469,-0.394547,0.225425
3573,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.784740,3.152592,-0.239469,1.431363,4.070914
3574,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.877788,-0.317199,-0.239469,-0.188033,-1.437489
3575,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.627873,-0.317199,-0.239469,2.010625,0.277391


In [30]:
transformed_test_data = pd.DataFrame(transformation_pipeline.transform(x_test),columns=transformation_pipeline.get_feature_names_out())
transformed_test_data

Unnamed: 0,category__gender_Female,category__gender_Male,category__ever_married_No,category__ever_married_Yes,category__work_type_Govt_job,category__work_type_Never_worked,category__work_type_Private,category__work_type_Self-employed,category__work_type_children,category__Residence_type_Rural,category__Residence_type_Urban,category__smoking_status_Unknown,category__smoking_status_formerly smoked,category__smoking_status_never smoked,category__smoking_status_smokes,numeric__age,numeric__hypertension,numeric__heart_disease,numeric__avg_glucose_level,numeric__bmi
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.546524,-0.317199,-0.239469,-0.909718,-0.761931
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.147145,-0.317199,-0.239469,-0.899927,-0.073380
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.567160,-0.317199,-0.239469,-0.696751,-0.826888
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.583498,3.152592,-0.239469,-0.647348,-1.216634
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.405996,-0.317199,-0.239469,-0.243890,-0.216287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1528,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.034654,-0.317199,-0.239469,0.027604,-0.125346
1529,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.006617,-0.317199,-0.239469,-0.841400,0.290382
1530,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.058394,-0.317199,-0.239469,-0.265698,-0.216287
1531,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.607238,-0.317199,-0.239469,-0.808464,-0.632015


In [31]:
model = LogisticRegression()
model.fit(transformed_train_data,y_train)

In [32]:
predicted = model.predict(transformed_test_data)
predicted

array(['No', 'No', 'No', ..., 'No', 'No', 'No'],
      shape=(1533,), dtype=object)

In [33]:
metrics = confusion_matrix(predicted,y_test)
metrics

array([[1444,   88],
       [   0,    1]])