In [12]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [13]:
data=pd.read_csv('healthcare_stroke_.csv')
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [14]:
data.info()
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [15]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [16]:
data.shape

(5110, 12)

In [17]:
X = data.drop('stroke', axis=1)
y = data['stroke']

In [18]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numerical:", numerical_cols)
print("Categorical:", categorical_cols)


Numerical: ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
Categorical: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [19]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [20]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [21]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [32]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(random_state=42)

clf_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', LogisticRegression(class_weight='balanced'))
])


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf_pipeline.fit(X_train, y_train)


In [34]:
y_pred = clf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.75      0.85       960
           1       0.17      0.81      0.28        62

    accuracy                           0.75      1022
   macro avg       0.58      0.78      0.57      1022
weighted avg       0.93      0.75      0.82      1022



In [38]:
from sklearn.ensemble import RandomForestClassifier

smote = SMOTE(random_state=42)

rf_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_pipeline.fit(X_train, y_train)


In [40]:
y_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       960
           1       0.10      0.03      0.05        62

    accuracy                           0.92      1022
   macro avg       0.52      0.51      0.50      1022
weighted avg       0.89      0.92      0.90      1022



In [41]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))


[[942  18]
 [ 60   2]]


In [42]:
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [43]:
train_size = int(0.8 * len(data))

train_data = data[:train_size]
test_data = data[train_size:]

In [44]:
X_train = train_data.drop('stroke', axis=1)
y_train = train_data['stroke']

X_test = test_data.drop('stroke', axis=1)
y_test = test_data['stroke']

In [48]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Transformerlər
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Pipeline: preprocessing + SMOTE + classifier
smote = SMOTE(random_state=42)
clf = RandomForestClassifier(random_state=42)

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', clf)
])

# Stratified K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipeline, X, y, cv=skf, scoring='f1')

print("Cross-validated F1 scores:", scores)
print("Average F1 score:", np.mean(scores))




Cross-validated F1 scores: [0.08695652 0.11111111 0.05405405 0.14084507 0.15      ]
Average F1 score: 0.10859335146536617


In [49]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# rf_pipeline - artıq təlim olunmuş pipeline (preprocessing + model)

# Test datası üçün ehtimalları al
y_probs = rf_pipeline.predict_proba(X_test)[:, 1]  # class 1 ehtimalları

# Fərqli threshold ilə yeni prediksiya yaradaq (məsələn, 0.3)
threshold = 0.3
y_pred_thresh = (y_probs >= threshold).astype(int)

print(f"Threshold = {threshold}")
print("Classification Report:\n", classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))


Threshold = 0.3
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       979
           1       0.96      1.00      0.98        43

    accuracy                           1.00      1022
   macro avg       0.98      1.00      0.99      1022
weighted avg       1.00      1.00      1.00      1022

Confusion Matrix:
 [[977   2]
 [  0  43]]


In [50]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Ən yaxşı parametr kombinasiyası:", grid_search.best_params_)
print("Ən yaxşı CV f1-score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Test set nəticələri:\n", classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Ən yaxşı parametr kombinasiyası: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Ən yaxşı CV f1-score: 0.177732474075835
Test set nəticələri:
               precision    recall  f1-score   support

           0       0.97      0.92      0.95       979
           1       0.19      0.42      0.26        43

    accuracy                           0.90      1022
   macro avg       0.58      0.67      0.60      1022
weighted avg       0.94      0.90      0.92      1022



In [51]:
y_probs = best_model.predict_proba(X_test)[:, 1]
threshold = 0.3
y_pred_thresh = (y_probs >= threshold).astype(int)

from sklearn.metrics import classification_report, confusion_matrix
print(f"Threshold = {threshold}")
print(classification_report(y_test, y_pred_thresh))
print(confusion_matrix(y_test, y_pred_thresh))


Threshold = 0.3
              precision    recall  f1-score   support

           0       0.99      0.76      0.86       979
           1       0.13      0.81      0.22        43

    accuracy                           0.76      1022
   macro avg       0.56      0.79      0.54      1022
weighted avg       0.95      0.76      0.83      1022

[[745 234]
 [  8  35]]
