In [1]:
import pandas as pd
import numpy as np

# Location of the raw stroke-prediction CSV (Kaggle input mount).
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"

# `df` is the untouched raw frame; every later experiment works on a copy of it.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(5110, 12)

In [3]:
# Missing values per column of the raw frame (only `bmi` has any).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

**Filling with the Mean**

In [4]:
# Mean imputation: replace every missing `bmi` with the mean of the observed
# values. Work on a copy so the raw frame `df` keeps its NaNs.
df_mean = df.copy()
bmi_mean = df_mean["bmi"].mean()  # NaNs are skipped when computing the mean
df_mean["bmi"] = df_mean["bmi"].fillna(bmi_mean)

In [5]:
# Sanity check: no column of the mean-imputed copy should have missing values.
df_mean.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

**Filling with the Median**

In [6]:
# Median imputation: replace every missing `bmi` with the median of the
# observed values. Work on a copy so the raw frame `df` keeps its NaNs.
df_median = df.copy()
# Use .median() here. The previous code used .sum(), which filled the gaps with
# the total of all observed BMIs instead of a typical value.
df_median["bmi"] = df_median["bmi"].fillna(df_median["bmi"].median())

In [7]:
# Sanity check: no column of the median-imputed copy should have missing values.
df_median.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

**Filling with a Specific Value**

In [8]:
# Constant imputation: replace every missing `bmi` with a fixed placeholder.
# NOTE(review): 0 acts as a sentinel here, not a realistic BMI, so any statistic
# computed on this column afterwards will be pulled towards 0.
BMI_FILL_VALUE = 0

df_specific = df.copy()
df_specific["bmi"] = df_specific["bmi"].fillna(BMI_FILL_VALUE)

In [9]:
# Sanity check: no column of the constant-filled copy should have missing values.
df_specific.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

**Real Dataset**

In [10]:
# The raw frame still has its missing `bmi` values: every imputation above
# was applied to a copy, never to `df` itself.
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

**Deleting the columns with missing data**

In [11]:
# Independent copy of the raw frame for the column-deletion experiment.
data = df.copy()

In [12]:
# Missing values per column of the working copy, before any column is removed.
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [13]:
# Remove the only column that contains missing values.
# Derive the result from the raw frame rather than dropping in place: an
# in-place drop raises KeyError when the cell is re-run (the column is already
# gone), whereas this form gives the same result on every run.
data = df.drop(columns=["bmi"])

In [14]:
# Preview the frame after the `bmi` column has been removed.
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,never smoked,1


**2. Age Categorization**

In [15]:
# Inspect the raw `age` column (float values) before binning it into categories.
data["age"]

0       67.0
1       61.0
2       80.0
3       49.0
4       79.0
        ... 
5105    80.0
5106    81.0
5107    35.0
5108    51.0
5109    44.0
Name: age, Length: 5110, dtype: float64

In [16]:
# Bin `age` into ordered life-stage categories.
# pd.cut intervals are right-inclusive by default, so the edges below mean
# (0, 3] -> Baby, (3, 17] -> Child, (17, 63] -> Adult, (63, 99] -> Elderly.
age_bin_edges = [0, 3, 17, 63, 99]
age_bin_labels = ["Baby", "Child", "Adult", "Elderly"]

data["Label"] = pd.cut(x=data["age"], bins=age_bin_edges, labels=age_bin_labels)
data["Label"]

0       Elderly
1         Adult
2       Elderly
3         Adult
4       Elderly
         ...   
5105    Elderly
5106    Elderly
5107      Adult
5108      Adult
5109      Adult
Name: Label, Length: 5110, dtype: category
Categories (4, object): ['Baby' < 'Child' < 'Adult' < 'Elderly']

In [17]:
# Number of rows that fall into each age category.
data["Label"].value_counts()

Label
Adult      3174
Elderly    1080
Child       635
Baby        221
Name: count, dtype: int64

**Label Encoding**

In [18]:
# Re-display the raw frame (categorical columns still hold strings) before encoding.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [19]:
from sklearn.preprocessing import LabelEncoder

# Keep the class name `LabelEncoder` intact and bind the instance to its own
# name. Rebinding the class name to an instance makes this cell fail on re-run
# ("'LabelEncoder' object is not callable").
label_encoder = LabelEncoder()

# Encode on a copy so the raw frame `df` keeps its original string values.
data_LabelEncoder = df.copy()

# Columns to replace with integer codes.
# NOTE(review): `id` is an identifier, not a category; label-encoding it only
# replaces each id with its rank. Confirm this is intended.
encode = ["id", "gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

for column in encode:
    # fit_transform re-fits the encoder on every call, so each column gets
    # its own independent mapping even though one instance is reused.
    data_LabelEncoder[column] = label_encoder.fit_transform(data_LabelEncoder[column])


In [20]:
# Preview the label-encoded copy: the listed columns now hold integer codes.
data_LabelEncoder.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,671,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,3610,0,61.0,0,0,1,3,0,202.21,,2,1
2,2151,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,4226,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,114,0,79.0,1,0,1,3,0,174.12,24.0,2,1


**Ordinal Encoding**

In [21]:
from sklearn.preprocessing import OrdinalEncoder

# Encode on a copy so the raw frame `df` is left untouched.
data_Ordinal = df.copy()

encoder = OrdinalEncoder()

# Double brackets keep the selection two-dimensional (a one-column DataFrame),
# which is the input shape the encoder is given here.
Residence_type = data_Ordinal[["Residence_type"]]

# Store the float codes next to the original column instead of overwriting it.
data_Ordinal["Residence_type_encoded"] = encoder.fit_transform(Residence_type)

print(data_Ordinal["Residence_type_encoded"])

0       1.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
5105    1.0
5106    1.0
5107    0.0
5108    0.0
5109    1.0
Name: Residence_type_encoded, Length: 5110, dtype: float64


**One-hot encoding**

In [22]:
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes fit_transform return a dense NumPy array,
# which prints as a plain 0/1 matrix.
encoder = OneHotEncoder(sparse_output=False)

# One-column DataFrame (2-D) holding the feature to expand.
residence_data = df[["Residence_type"]]

# One output column per distinct residence type; exactly one 1 per row.
onehot = encoder.fit_transform(residence_data)
print(onehot)


[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [0. 1.]]


**Min-Max Scaling**

In [23]:
# Min-max scaling of `age` on a copy of the raw frame:
#     scaled = (x - min) / (max - min)
# which maps the smallest age to 0 and the largest to 1.
dataframe = df.copy()

age_min = dataframe["age"].min()
age_span = dataframe["age"].max() - age_min
dataframe["age"] = (dataframe["age"] - age_min) / age_span

print(dataframe["age"])

0       0.816895
1       0.743652
2       0.975586
3       0.597168
4       0.963379
          ...   
5105    0.975586
5106    0.987793
5107    0.426270
5108    0.621582
5109    0.536133
Name: age, Length: 5110, dtype: float64


In [24]:
# Preview the frame after scaling: only `age` has changed; all other columns are as loaded.
dataframe.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,0.816895,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,0.743652,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,0.975586,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,0.597168,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,0.963379,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
