In [1]:
import warnings
warnings.filterwarnings("ignore")

#### Stroke Prediction

In [None]:
# import libraries 

import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
import joblib



In [3]:
# Download dataset
path = kagglehub.dataset_download("fedesoriano/stroke-prediction-dataset")

# Load CSV file with pandas
df = pd.read_csv(path + "/healthcare-dataset-stroke-data.csv")


In [4]:
print(df[["gender" , "stroke"]].head(3))

   gender  stroke
0    Male       1
1  Female       1
2    Male       1


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None


In [6]:
print(df.describe())

                 id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.000000  
75%           114.090000    33.100000     0

In [7]:
# class balance 

df["stroke"].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [8]:
# check null values and deal with it 
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [9]:
# Calculate mean BMI for each gender
bmi_means = df.groupby('gender')['bmi'].mean()
print(bmi_means)

gender
Female    29.065758
Male      28.647936
Other     22.400000
Name: bmi, dtype: float64


In [10]:
# fill the null values of bmi with gender specific 

df["bmi"] = df["bmi"].fillna(df.groupby("gender")["bmi"].transform("mean"))

In [11]:
df["bmi"].mean()

np.float64(28.891519349393853)

In [12]:
# check null values for verification
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [13]:
print(df.dtypes)

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [14]:
categorical_cols = ["gender" ,"ever_married" , "work_type" , "Residence_type" , "smoking_status"]

In [15]:
for col in categorical_cols:
    print(col , " Unique Value : " , df[col].unique() )

gender  Unique Value :  ['Male' 'Female' 'Other']
ever_married  Unique Value :  ['Yes' 'No']
work_type  Unique Value :  ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type  Unique Value :  ['Urban' 'Rural']
smoking_status  Unique Value :  ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [16]:
# convert categorical  data to numeric

df["gender"] = df["gender"].map({"Male" : 0 , "Female" : 1 , "Other" : 2})
df["ever_married"] = df["ever_married"].map({"Yes" : 0 , "No" : 1})
df["Residence_type"] = df["Residence_type"].map({"Urban" : 0 , "Rural" : 1 })
df["work_type"] = df["work_type"].map({"Private":0, "Self-employed":1, "Govt_job":2, "children":3, "Never_worked":4})
df["smoking_status"] = df["smoking_status"].map({"formerly smoked":0 ,  "never smoked":1, "smokes":2 ,  "Unknown":3})


In [17]:
print(df.dtypes)

id                     int64
gender                 int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object


In [18]:
# normalize

scaler = StandardScaler()

col_to_scale = ["bmi" , "avg_glucose_level" , "age"]
df[col_to_scale] = scaler.fit_transform(df[col_to_scale])

In [19]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,1.051434,0,1,0,0,0,2.706375,1.001442,0,1
1,51676,1,0.78607,0,0,0,1,1,2.121559,0.022636,1,1
2,31112,0,1.62639,0,1,0,0,1,-0.005028,0.468793,1,1
3,60182,1,0.255342,0,0,0,0,0,1.437358,0.715631,2,1
4,1665,1,1.582163,1,0,0,1,1,1.501184,-0.635478,1,1


In [20]:
X = df.drop(["stroke" , "id"] ,axis=1)
y = df["stroke"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state=42, stratify=y
)

In [22]:
smote = SMOTE(random_state=42)
X_train_res  , y_train_res = smote.fit_resample(X_train , y_train)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())


Before SMOTE: stroke
0    3889
1     199
Name: count, dtype: int64
After SMOTE: stroke
0    3889
1    3889
Name: count, dtype: int64


In [23]:
models = {
    "Logistic Regression" : LogisticRegression(), 
    "Random Forest": RandomForestClassifier() , 
    "Decision Tree" : DecisionTreeClassifier() , 
    "Gradient Boosting": GradientBoostingClassifier()
} 

In [24]:
result = {}
best_model = None
best_acc = 0.0

for name , model in models.items():
    model.fit(X_train , y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test , y_pred)
    result[name] = acc
    print(name , " Accuracy : " , acc*100)
    print(classification_report(y_test , y_pred))

    if acc >  best_acc :
        best_model = (name , model)

print("Best Model : " , best_model[0] , "with Accuracy" , best_acc)


Logistic Regression  Accuracy :  95.10763209393346
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022

Random Forest  Accuracy :  94.71624266144813
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022

Decision Tree  Accuracy :  90.50880626223092
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       972
           1       0.15      0.20      0.17        50

    accuracy                           0.

In [25]:
joblib.dump(best_model[1], f"{best_model[0].replace(' ', '_')}_best_model.pkl")
print(f"Best model saved as {best_model[0].replace(' ', '_')}_best_model.pkl")


Best model saved as Gradient_Boosting_best_model.pkl
