In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [41]:
df = pd.read_csv('stroke.csv', sep=';', decimal='.')
display(df)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5105,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5106,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5107,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [42]:
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [43]:
df['heart_disease'].fillna(df['heart_disease'].interpolate(), inplace=True)
df['avg_glucose_level'].fillna(df['avg_glucose_level'].interpolate(), inplace=True)
df['bmi'].fillna(df['bmi'].interpolate(), inplace=True)

In [44]:
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_columns = [col for col in df.columns if col not in categorical_columns + ['stroke']]
x_numeric = df[numerical_columns]
x_categorical = df[categorical_columns]

encoder = LabelEncoder()
x_categorical_encoded = x_categorical.apply(encoder.fit_transform)

X = pd.concat([x_numeric, x_categorical_encoded], axis=1)
df.drop(['stroke'], axis=1)
y = df['stroke']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
udzial_pozytywny_train = y_train.sum() / len(y_train)
udzial_negatywny_train = 1 - udzial_pozytywny_train
print(f'Udział negatywny: {udzial_negatywny_train:.4f}')
print(f'Udział pozytywny: {udzial_pozytywny_train:.4f}')

Udział negatywny: 0.9542
Udział pozytywny: 0.0458


In [31]:
udzial_pozytywny_test = y_test.sum() / len(y_test)
udzial_negatywny_test = 1 - udzial_pozytywny_test
print(f'Udział negatywny: {udzial_negatywny_test:.4f}')
print(f'Udział pozywtny: {udzial_pozytywny_test:.4f}')

Udział negatywny: 0.9393
Udział pozywtny: 0.0607


Model KNN

In [42]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'Dokładność KNN: {accuracy_knn:.2%}')

classification_report_knn = classification_report(y_test, y_pred_knn)
print(f'Raport klasyfikacji dla modelu KNN: {classification_report_knn}')
conf_matrix = confusion_matrix(y_test, y_pred_knn)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')

Dokładność KNN: 93.05%
Raport klasyfikacji dla modelu KNN:               precision    recall  f1-score   support

           0       0.94      0.99      0.96       960
           1       0.15      0.03      0.05        62

    accuracy                           0.93      1022
   macro avg       0.55      0.51      0.51      1022
weighted avg       0.89      0.93      0.91      1022

Specificity: 0.9885416666666667
Sensitivity: 0.03225806451612903


Model regresji logistycznej

In [43]:
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Dokładność LR: {accuracy_lr:.2%}')

classification_report_lr = classification_report(y_test, y_pred_lr)
print(f'Raport klasyfikacji dla modelu LR: {classification_report_lr}')

conf_matrix = confusion_matrix(y_test, y_pred_lr)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')

Dokładność LR: 93.93%
Raport klasyfikacji dla modelu LR:               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022

Specificity: 1.0
Sensitivity: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Oversampling dla algorytmu KNN

In [44]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)
knn_model_over = KNeighborsClassifier(n_neighbors=3)
knn_model_over.fit(X_train_over, y_train_over)
y_pred_over_knn = knn_model_over.predict(X_test)
accuracy_over_knn = accuracy_score(y_test, y_pred_over_knn)
report_over_knn = classification_report(y_test, y_pred_over_knn)
print(f'Z oversamplingiem:')
print(f'Dokładność: {accuracy_over_knn:.2%}')
print(f'Raport klasyfikacji: {report_over_knn}')

conf_matrix = confusion_matrix(y_test, y_pred_over_knn)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')


Z oversamplingiem:
Dokładność: 88.55%
Raport klasyfikacji:               precision    recall  f1-score   support

           0       0.95      0.93      0.94       960
           1       0.15      0.19      0.17        62

    accuracy                           0.89      1022
   macro avg       0.55      0.56      0.55      1022
weighted avg       0.90      0.89      0.89      1022

Specificity: 0.9302083333333333
Sensitivity: 0.1935483870967742


Undersampling dla algorytmu KNN

In [45]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
knn_model_under = KNeighborsClassifier(n_neighbors=3)
knn_model_under.fit(X_train_under, y_train_under)
y_pred_under_knn = knn_model_under.predict(X_test)
accuracy_under_knn = accuracy_score(y_test, y_pred_under_knn)
report_under_knn = classification_report(y_test, y_pred_under_knn)
print(f'Z undersamplingiem:')
print(f'Dokładność: {accuracy_over_knn:.2%}')
print(f'Raport klasyfikacji: {report_under_knn}')

conf_matrix = confusion_matrix(y_test, y_pred_under_knn)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')

Z undersamplingiem:
Dokładność: 88.55%
Raport klasyfikacji:               precision    recall  f1-score   support

           0       0.97      0.65      0.78       960
           1       0.12      0.71      0.20        62

    accuracy                           0.66      1022
   macro avg       0.54      0.68      0.49      1022
weighted avg       0.92      0.66      0.75      1022

Specificity: 0.6520833333333333
Sensitivity: 0.7096774193548387


Oversampling dla algorytmu regresji logistycznej

In [46]:
oversampler = RandomOverSampler(sampling_strategy='minority')
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)
logistic_regression_model_over = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model_over.fit(X_train_over, y_train_over)
y_pred_over_lr = logistic_regression_model_over.predict(X_test)
accuracy_over_lr = accuracy_score(y_test, y_pred_over_lr)
report_over_lr = classification_report(y_test, y_pred_over_lr)
print(f'Z oversamplingiem:')
print(f'Dodładność: {accuracy_over_lr:.2%}')
print(f'Raport klasyfikacji: {report_over_lr}')

conf_matrix = confusion_matrix(y_test, y_pred_over_lr)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')

Z oversamplingiem:
Dodładność: 73.58%
Raport klasyfikacji:               precision    recall  f1-score   support

           0       0.98      0.73      0.84       960
           1       0.16      0.79      0.27        62

    accuracy                           0.74      1022
   macro avg       0.57      0.76      0.55      1022
weighted avg       0.93      0.74      0.80      1022

Specificity: 0.7322916666666667
Sensitivity: 0.7903225806451613


Undersampling dla algorytmu regresji logistycznej

In [47]:
undersampler = RandomUnderSampler(sampling_strategy='majority')
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)
logistic_regression_model_under = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model_under.fit(X_train_under, y_train_under)
y_pred_under_lr = logistic_regression_model_under.predict(X_test)
accuracy_under_lr = accuracy_score(y_test, y_pred_under_lr)
report_under_lr = classification_report(y_test, y_pred_under_lr)
print(f'Z undersamplingiem:')
print(f'Dokładność: {accuracy_under_lr:.2%}')
print(f'Raport klasyfikacji: {report_under_lr}')

conf_matrix = confusion_matrix(y_test, y_pred_under_lr)
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity}')

true_positives = conf_matrix[1, 1]
false_negatives = conf_matrix[1, 0]

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity}')

Z undersamplingiem:
Dokładność: 73.19%
Raport klasyfikacji:               precision    recall  f1-score   support

           0       0.98      0.73      0.84       960
           1       0.16      0.79      0.26        62

    accuracy                           0.73      1022
   macro avg       0.57      0.76      0.55      1022
weighted avg       0.93      0.73      0.80      1022

Specificity: 0.728125
Sensitivity: 0.7903225806451613


In [48]:
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Tworzenie modelu z dostosowanym stosunkiem klas
# Ustawienie class_weight na 10:90 dla klasy 1:0
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000, class_weight={0: 0.2, 1: 0.8})

# Trenowanie modelu na oryginalnym zestawie danych
logistic_regression_model.fit(X_train, y_train)

# Prognozowanie i ocena modelu
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Dokładność LR: {accuracy_lr:.2%}')

# Raport klasyfikacji
classification_report_lr = classification_report(y_test, y_pred_lr)
print(f'Raport klasyfikacji dla modelu LR:\n{classification_report_lr}')

# Obliczanie specificity i sensitivity
conf_matrix = confusion_matrix(y_test, y_pred_lr)
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity:.4f}')

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity:.4f}')


Dokładność LR: 92.17%
Raport klasyfikacji dla modelu LR:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       960
           1       0.31      0.24      0.27        62

    accuracy                           0.92      1022
   macro avg       0.63      0.60      0.62      1022
weighted avg       0.91      0.92      0.92      1022

Specificity: 0.9656
Sensitivity: 0.2419


In [12]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ustawienie stosunku klasy pozytywnej do negatywnej na 10:90
rus = RandomUnderSampler(sampling_strategy={0: 6, 1: 4}, random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Sprawdzenie nowego rozkładu klas
print(f'Nowy rozkład klas: {Counter(y_train_res)}')

# Trenowanie modelu na zbalansowanym zestawie danych
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model.fit(X_train_res, y_train_res)

# Prognozowanie i ocena modelu
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Dokładność LR: {accuracy_lr:.2%}')

# Raport klasyfikacji
classification_report_lr = classification_report(y_test, y_pred_lr)
print(f'Raport klasyfikacji dla modelu LR:\n{classification_report_lr}')

# Obliczanie specificity i sensitivity
conf_matrix = confusion_matrix(y_test, y_pred_lr)
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity:.4f}')

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity:.4f}')


Nowy rozkład klas: Counter({0: 6, 1: 4})
Dokładność LR: 85.03%
Raport klasyfikacji dla modelu LR:
              precision    recall  f1-score   support

           0       0.97      0.87      0.92       960
           1       0.22      0.56      0.31        62

    accuracy                           0.85      1022
   macro avg       0.59      0.72      0.61      1022
weighted avg       0.92      0.85      0.88      1022

Specificity: 0.8688
Sensitivity: 0.5645


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ustawienie stosunku klasy pozytywnej do negatywnej na 10:90
rus = RandomUnderSampler(sampling_strategy={0: 6, 1: 4}, random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Sprawdzenie nowego rozkładu klas
print(f'Nowy rozkład klas: {Counter(y_train_res)}')

# Trenowanie modelu na zbalansowanym zestawie danych
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model.fit(X_train_res, y_train_res)

# Prognozowanie i ocena modelu
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Dokładność LR: {accuracy_lr:.2%}')

# Raport klasyfikacji
classification_report_lr = classification_report(y_test, y_pred_lr)
print(f'Raport klasyfikacji dla modelu LR:\n{classification_report_lr}')

# Obliczanie specificity i sensitivity
conf_matrix = confusion_matrix(y_test, y_pred_lr)
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity:.4f}')

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity:.4f}')


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Tworzenie modelu z dostosowanym stosunkiem klas
# Ustawienie class_weight na 90:10 dla klasy 0:1
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000, class_weight={0: 7, 1: 3})

# Trenowanie modelu na oryginalnym zestawie danych
logistic_regression_model.fit(X_train, y_train)

# Prognozowanie i ocena modelu
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Dokładność LR: {accuracy_lr:.2%}')

# Raport klasyfikacji
classification_report_lr = classification_report(y_test, y_pred_lr)
print(f'Raport klasyfikacji dla modelu LR:\n{classification_report_lr}')

# Obliczanie specificity i sensitivity
conf_matrix = confusion_matrix(y_test, y_pred_lr)
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

specificity = true_negatives / (true_negatives + false_positives)
print(f'Specificity: {specificity:.4f}')

sensitivity = true_positives / (true_positives + false_negatives)
print(f'Sensitivity: {sensitivity:.4f}')


Dokładność LR: 93.93%
Raport klasyfikacji dla modelu LR:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022

Specificity: 1.0000
Sensitivity: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
