### Import libraries and create data frame

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Prepare Data

In [9]:
stroke_df = pd.read_csv("Resources/healthcare-dataset-stroke-data.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [15]:
# Handle Missing Values
# Fill missing values in the 'bmi' column with the mean
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())

In [16]:
# Standardize Data Formats
# Convert categorical columns to 'category' type
stroke_df['gender'] = stroke_df['gender'].str.lower().astype('category')
stroke_df['ever_married'] = stroke_df['ever_married'].str.lower().astype('category')
stroke_df['work_type'] = stroke_df['work_type'].str.lower().astype('category')
stroke_df['Residence_type'] = stroke_df['Residence_type'].str.lower().astype('category')
stroke_df['smoking_status'] = stroke_df['smoking_status'].str.lower().astype('category')

In [18]:
# Ensure Consistency
# Convert 'id' to string 
stroke_df['id'] = stroke_df['id'].astype(str)

In [19]:
# Display the cleaned DataFrame and its data types
print(stroke_df)
print(stroke_df.dtypes)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    male  67.0             0              1          yes   
1     51676  female  61.0             0              0          yes   
2     31112    male  80.0             0              1          yes   
3     60182  female  49.0             0              0          yes   
4      1665  female  79.0             1              0          yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  female  80.0             1              0          yes   
5106  44873  female  81.0             0              0          yes   
5107  19723  female  35.0             0              0          yes   
5108  37544    male  51.0             0              0          yes   
5109  44679  female  44.0             0              0          yes   

          work_type Residence_type  avg_glucose_level        bmi  \
0           private          urban             228.69  36.600000   
1     self-

In [4]:
stroke_df['smoking_status'].value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

### Split into X and y

In [93]:
X = stroke_df.drop(columns=['id', 'stroke'])
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [94]:
y = stroke_df['stroke']

### Train test split

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

### Handle Nulls

In [96]:
X_train.isna().sum()/len(stroke_df)

gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  0.029941
smoking_status       0.000000
dtype: float64

In [98]:
(X_train['smoking_status'] == 'Unknown').sum()/len(stroke_df)

0.22857142857142856

In [99]:
#drop the 2.99% null bmi rows
X_train.dropna(inplace = True)

X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
1914,Male,43.0,0,0,Yes,Private,Rural,77.86,28.9,never smoked
1751,Female,65.0,0,0,Yes,Private,Urban,88.82,28.2,formerly smoked
396,Male,60.0,1,0,Yes,Private,Rural,213.37,36.0,never smoked
1783,Male,2.0,0,0,No,children,Rural,65.67,16.6,Unknown
2361,Female,30.0,0,0,Yes,Private,Urban,101.98,23.2,Unknown


In [103]:
X_train.loc[X_train['smoking_status'] == 'Unknown'].describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
count,1119.0,1119.0,1119.0,1119.0,1119.0
mean,29.955889,0.030384,0.025022,97.790751,25.604021
std,24.964771,0.171719,0.156263,34.949039,7.975154
min,0.08,0.0,0.0,55.28,10.3
25%,8.0,0.0,0.0,76.385,19.5
50%,22.0,0.0,0.0,88.83,24.5
75%,51.0,0.0,0.0,108.8,30.05
max,82.0,1.0,1.0,254.6,97.6


In [104]:
X_train.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi
count,3679.0,3679.0,3679.0,3679.0,3679.0
mean,42.930318,0.092416,0.050285,104.948323,28.96393
std,22.531573,0.289652,0.218563,44.628451,7.809128
min,0.08,0.0,0.0,55.12,10.3
25%,25.0,0.0,0.0,76.505,23.7
50%,45.0,0.0,0.0,91.04,28.1
75%,60.0,0.0,0.0,113.265,33.1
max,82.0,1.0,1.0,271.74,97.6


In [65]:
'''
for x in stroke_df:
    display(stroke_df[x].value_counts())
'''

'\nfor x in stroke_df:\n    display(stroke_df[x].value_counts())\n'

## Encode the dataset

In [None]:
#encode# List 
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

#One-Hot Encoding with pd.get_dummies
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)

#Check the data
X_train_encoded.head()


## Resample Data