# Stroke Prediction 

In [416]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Loading the data and getting a quick overview

In [417]:
# Location of the stroke dataset CSV. Set the STROKE_DATA_PATH environment
# variable to point at a local copy; when it is unset the original author's
# path is used, so existing setups keep working unchanged.
DATA_PATH = Path(os.environ.get(
    'STROKE_DATA_PATH',
    'C:\\Users\\Tyron\\OneDrive\\Desktop\\Machine Learning Projects\\Machine-Learning-Projects\\Stroke_Prediction\\healthcare-dataset-stroke-data.csv',
))
data = pd.read_csv(DATA_PATH)
data.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [418]:
# Print each column's dtype and non-null count for the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [419]:
# Flag, per column, whether it contains at least one missing value.
data.isna().any()

id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool

In [420]:
# Count the missing values in each column.
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

### Data Preprocessing

#### Splitting the data into train and test sets

In [421]:
from sklearn.model_selection import train_test_split

In [422]:
# The target is the `stroke` column; the features are every remaining column
# except `id`.
y = data['stroke']
X = data.drop(columns=['stroke', 'id'])

In [423]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y. If stroke cases turn out to
# be rare, consider passing stratify=y -- confirm the class balance first.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [424]:
# Report the shape of each of the four split pieces.
for label, split in (
    ('X_Train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
):
    print(label, split.shape)

X_Train:  (4088, 10)
X_test:  (1022, 10)
y_train:  (4088,)
y_test:  (1022,)


#### Handling Missing Values

In [425]:
from sklearn.impute import SimpleImputer

In [426]:
# Number of missing bmi values in the training split.
X_train['bmi'].isna().sum()

156

In [427]:
# Fill missing bmi values with the median, learned from the training split only.
imputer = SimpleImputer(strategy='median')
# X_train was derived from `data` by train_test_split; take an explicit copy so
# the column assignment below writes to an independent frame (this also avoids
# pandas' SettingWithCopyWarning where it applies).
X_train = X_train.copy()
# fit_transform is given a single-column frame and returns a 2-D (n, 1) result;
# ravel() flattens it to the 1-D shape a single column expects.
X_train['bmi'] = imputer.fit_transform(X_train[['bmi']]).ravel()

In [428]:
# Spot-check the first few bmi values.
X_train['bmi'].head()

802     28.5
3927    36.3
2337    33.7
3910    30.4
1886    19.9
Name: bmi, dtype: float64

In [429]:
# Count the missing values in each training feature column.
X_train.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

#### Handling Categorical Variables

In [430]:
from sklearn.preprocessing import OneHotEncoder

In [431]:
# handle_unknown='ignore' makes the fitted encoder emit an all-zero row for a
# category it did not see during fit instead of raising when it is later
# applied to other data (e.g. X_test). The encoding of the data it is fitted
# on is unaffected.
encoder = OneHotEncoder(handle_unknown='ignore')

In [432]:
# Pull out the object-dtype (string) feature columns that need encoding.
cat_data = X_train.select_dtypes(include=['object'])
cat_data.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
802,Male,Yes,Self-employed,Rural,formerly smoked
3927,Female,Yes,Private,Urban,Unknown
2337,Female,No,Private,Rural,never smoked
3910,Male,Yes,Govt_job,Urban,formerly smoked
1886,Female,No,Private,Rural,never smoked


In [433]:
# Encode straight from X_train's object columns rather than from `cat_data`:
# this cell overwrites `cat_data`, so reading from it would re-encode the
# already-encoded 0/1 columns if the cell were run a second time.
encoded = encoder.fit_transform(X_train.select_dtypes('object'))
# The encoder returns a sparse matrix; densify it and label the columns with
# the generated "<feature>_<category>" names.
cat_data = pd.DataFrame(encoded.toarray(), columns=encoder.get_feature_names_out())
cat_data.head()

Unnamed: 0,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


#### Handling Numerical Variables

In [434]:
from sklearn.preprocessing import RobustScaler

In [435]:
# Scaler for the numeric features (the variable keeps its original name,
# `scalar`, because other cells refer to it).
scalar = RobustScaler()
# Select the integer and float feature columns.
numeric_dtypes = ['int64', 'float64']
num_data = X_train.select_dtypes(include=numeric_dtypes)
num_data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
802,79.0,0,0,112.64,28.5
3927,62.0,0,0,88.32,36.3
2337,21.0,0,0,59.52,33.7
3910,31.0,0,0,65.7,30.4
1886,31.0,0,0,59.63,19.9


In [436]:
# Fit the scaler on the training numeric columns and scale them in one step.
scaled_values = scalar.fit_transform(num_data)
# Wrap the scaled values back into a frame with the scaler's output column names.
num_data = pd.DataFrame(scaled_values, columns=scalar.get_feature_names_out())
num_data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
0,0.944444,0.0,0.0,0.552905,0.055556
1,0.472222,0.0,0.0,-0.095283,0.922222
2,-0.666667,0.0,0.0,-0.862873,0.633333
3,-0.388889,0.0,0.0,-0.698161,0.266667
4,-0.388889,0.0,0.0,-0.859941,-0.9


#### Creating the Final Training DataFrame

In [437]:
# Join the scaled numeric and one-hot encoded columns side by side.
# reset_index(drop=True) lines both frames up by position without creating the
# temporary 'index' columns that would otherwise have to be dropped afterwards.
features = pd.concat(
    [num_data.reset_index(drop=True), cat_data.reset_index(drop=True)],
    axis=1,
)
# Rows are still in the order produced by train_test_split, so reuse y_train's
# index labels: features and target then stay aligned for any label-based
# pandas operation. A length mismatch raises here instead of passing silently.
features.index = y_train.index
X_train = features
X_train.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.944444,0.0,0.0,0.552905,0.055556,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.472222,0.0,0.0,-0.095283,0.922222,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.666667,0.0,0.0,-0.862873,0.633333,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.388889,0.0,0.0,-0.698161,0.266667,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.388889,0.0,0.0,-0.859941,-0.9,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Model Selection 

In [442]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

### Model Creation

### Model Evaluation

### Saving the model