In [33]:
!pip install catboost matplotlib pandas scikit-learn kaggle optuna ipywidgets kaleido shap jupyterlab-rise



In [34]:
#!dir ../../../home/vscode

In [35]:
import os
from pathlib import Path
container_check = os.getenv("iscontainer")
if container_check=="y":
    config_dir = Path("/home/vscode/.config/kaggle")
    config_dir.mkdir(parents=True, exist_ok=True)
    
    with open(config_dir / "kaggle.json", "w") as dst:
        with open("./kaggle.json", "r") as src:
            dst.write(src.read())

# Download dataset

In [36]:
import os
import zipfile
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()



dataset_name = "fedesoriano/stroke-prediction-dataset"
download_folder = Path("data/stroke-prediction")
download_folder.mkdir(parents=True, exist_ok=True)

api.dataset_download_files(dataset_name, path=str(download_folder), unzip=True)



Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset


In [37]:
!dir "./data/stroke-prediction"

healthcare-dataset-stroke-data.csv


# Load data

In [38]:

import pandas as pd

df = pd.read_csv(download_folder / "healthcare-dataset-stroke-data.csv")
df_ids = df.pop("id")

df.head(5)




     


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Ordinal encoder for categorical features

In [39]:
from sklearn.preprocessing import OrdinalEncoder

def create_ordinal_encoder(categories_order):
    return OrdinalEncoder(categories=categories_order)

smoking_status = [['formerly smoked', 'never smoked', 'smokes','Unknown']]  

ordinal_encoder_smoking_status = create_ordinal_encoder(smoking_status)


# One-hot encoding categorical non-order features

In [40]:
from sklearn.preprocessing import OneHotEncoder

hot_encoder = OneHotEncoder(drop='first')

# Pipeline for transforming columns

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('ever_married', hot_encoder, ['ever_married']),
        ('work_type', hot_encoder, ['work_type']),
        ('gender', hot_encoder, ['gender']),
        ('Residence_type', hot_encoder, ['Residence_type']),
        ('smoking_status', ordinal_encoder_smoking_status, ['smoking_status'])
    ],
    remainder='passthrough'  
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

encoded_data = preprocessor.fit_transform(df)

encoded_data

array([[  1.  ,   0.  ,   1.  , ..., 228.69,  36.6 ,   1.  ],
       [  1.  ,   0.  ,   0.  , ..., 202.21,    nan,   1.  ],
       [  1.  ,   0.  ,   1.  , ..., 105.92,  32.5 ,   1.  ],
       ...,
       [  1.  ,   0.  ,   0.  , ...,  82.99,  30.6 ,   0.  ],
       [  1.  ,   0.  ,   1.  , ..., 166.29,  25.6 ,   0.  ],
       [  1.  ,   0.  ,   0.  , ...,  85.28,  26.2 ,   0.  ]],
      shape=(5110, 15))

# Creation of a data frame from transformed data

In [42]:
transformed_df = pd.DataFrame(
  encoded_data,
  columns=preprocessor.get_feature_names_out())

transformed_df

Unnamed: 0,ever_married__ever_married_Yes,work_type__work_type_Never_worked,work_type__work_type_Private,work_type__work_type_Self-employed,work_type__work_type_children,gender__gender_Male,gender__gender_Other,Residence_type__Residence_type_Urban,smoking_status__smoking_status,remainder__age,remainder__hypertension,remainder__heart_disease,remainder__avg_glucose_level,remainder__bmi,remainder__stroke
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,67.0,0.0,1.0,228.69,36.6,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,61.0,0.0,0.0,202.21,,1.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,80.0,0.0,1.0,105.92,32.5,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,49.0,0.0,0.0,171.23,34.4,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,79.0,1.0,0.0,174.12,24.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,80.0,1.0,0.0,83.75,,0.0
5106,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,81.0,0.0,0.0,125.20,40.0,0.0
5107,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,35.0,0.0,0.0,82.99,30.6,0.0
5108,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,51.0,0.0,0.0,166.29,25.6,0.0


# Division into training and test data

In [43]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(transformed_df, test_size=0.2, random_state=42)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1022 entries, 4688 to 2902
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ever_married__ever_married_Yes        1022 non-null   float64
 1   work_type__work_type_Never_worked     1022 non-null   float64
 2   work_type__work_type_Private          1022 non-null   float64
 3   work_type__work_type_Self-employed    1022 non-null   float64
 4   work_type__work_type_children         1022 non-null   float64
 5   gender__gender_Male                   1022 non-null   float64
 6   gender__gender_Other                  1022 non-null   float64
 7   Residence_type__Residence_type_Urban  1022 non-null   float64
 8   smoking_status__smoking_status        1022 non-null   float64
 9   remainder__age                        1022 non-null   float64
 10  remainder__hypertension               1022 non-null   float64
 11  remainder__heart_di

In [44]:
df_train.info()





     


<class 'pandas.core.frame.DataFrame'>
Index: 4088 entries, 802 to 860
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ever_married__ever_married_Yes        4088 non-null   float64
 1   work_type__work_type_Never_worked     4088 non-null   float64
 2   work_type__work_type_Private          4088 non-null   float64
 3   work_type__work_type_Self-employed    4088 non-null   float64
 4   work_type__work_type_children         4088 non-null   float64
 5   gender__gender_Male                   4088 non-null   float64
 6   gender__gender_Other                  4088 non-null   float64
 7   Residence_type__Residence_type_Urban  4088 non-null   float64
 8   smoking_status__smoking_status        4088 non-null   float64
 9   remainder__age                        4088 non-null   float64
 10  remainder__hypertension               4088 non-null   float64
 11  remainder__heart_dise

In [None]:
y_train = df_train.pop("remainder__stroke")
X_train = df_train


802     0.0
3927    0.0
2337    0.0
3910    0.0
1886    0.0
       ... 
4426    0.0
466     0.0
3092    0.0
3772    0.0
860     0.0
Name: remainder__stroke, Length: 4088, dtype: float64

In [46]:
y_test = df_test.pop("remainder__stroke")
X_test = df_test

In [47]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4088 entries, 802 to 860
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ever_married__ever_married_Yes        4088 non-null   float64
 1   work_type__work_type_Never_worked     4088 non-null   float64
 2   work_type__work_type_Private          4088 non-null   float64
 3   work_type__work_type_Self-employed    4088 non-null   float64
 4   work_type__work_type_children         4088 non-null   float64
 5   gender__gender_Male                   4088 non-null   float64
 6   gender__gender_Other                  4088 non-null   float64
 7   Residence_type__Residence_type_Urban  4088 non-null   float64
 8   smoking_status__smoking_status        4088 non-null   float64
 9   remainder__age                        4088 non-null   float64
 10  remainder__hypertension               4088 non-null   float64
 11  remainder__heart_dise

4356    0.0
1055    0.0
1002    0.0
1305    0.0
1721    0.0
745     0.0
2649    0.0
3518    0.0
3555    0.0
2569    0.0
2610    0.0
1175    0.0
3667    0.0
1438    0.0
2091    0.0
2312    0.0
1788    0.0
2018    0.0
2877    0.0
1730    0.0
2172    0.0
1129    0.0
893     0.0
1934    0.0
62      1.0
2622    0.0
2867    0.0
1281    0.0
478     0.0
1413    0.0
1183    0.0
4353    0.0
1503    0.0
2194    0.0
3849    0.0
3742    0.0
1038    0.0
3733    0.0
4019    0.0
731     0.0
3887    0.0
2629    0.0
227     1.0
2313    0.0
1397    0.0
287     0.0
2902    0.0
1545    0.0
4666    0.0
1456    0.0
Name: remainder__stroke, dtype: float64