In [227]:
import warnings

import dalex as dx
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

# Load the data set (change the path so it points at your copy of the file).
df = pd.read_csv('zad7_Stroke.csv', sep=';', decimal='.')

# Split the column names into categorical features and numerical features;
# 'stroke' is the target and belongs to neither list.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
non_numerical = categorical_columns + ['stroke']
numerical_columns = [name for name in df.columns if name not in non_numerical]
x_numeric = df[numerical_columns]
x_categorical = df[categorical_columns]

# Integer-encode every categorical column.
# NOTE: one LabelEncoder instance is refit for each column in turn, so after
# this line `encoder` only remembers the mapping of the last column.
encoder = LabelEncoder()
x_categorical_encoded = x_categorical.apply(lambda column: encoder.fit_transform(column))

# Show the first rows.
print(df.head())

   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   
3  Female  49.0             0              0          Yes        Private   
4  Female  79.0             1              0          Yes  Self-employed   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0          Urban             228.69  36.6  formerly smoked       1  
1          Rural             202.21   NaN     never smoked       1  
2          Rural             105.92  32.5     never smoked       1  
3          Urban             171.23  34.4           smokes       1  
4          Rural             174.12  24.0     never smoked       1  


In [228]:
# Fill missing values in the numeric columns by linear interpolation between
# neighbouring rows.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form updates a column selection in
# place, which triggers a chained-assignment warning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is enabled. `interpolate()` keeps all
# non-missing values as they are, so the filled column is the same as before.
#
# NOTE(review): the preview shows only 0/1 values for `heart_disease`, so
# interpolating it can produce fractional values - confirm this is intended.
for column in ['heart_disease', 'avg_glucose_level', 'bmi']:
    df[column] = df[column].interpolate()

In [229]:
# Replace missing `work_type` entries with the most frequent category.
columns_to_impute = ['work_type']
imputer = SimpleImputer(strategy='most_frequent')

work_type_values = df[columns_to_impute].values
work_type_filled = imputer.fit_transform(work_type_values)
df[columns_to_impute] = work_type_filled

In [230]:
# Separate the features from the target column.
X = df.drop('stroke', axis=1)
y = df['stroke']

# Keep 600 rows for training; every remaining row goes to the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(df) - 600, random_state=9)

# Preprocessing step of the pipeline. `preprocessor` was used here without
# being defined in any visible cell, so the notebook failed with a NameError on
# a fresh kernel. Categorical columns are one-hot encoded (categories unseen
# during fitting are ignored at prediction time); numerical columns are passed
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('numerical', 'passthrough', numerical_columns),
    ]
)

# Train the model; max_iter is raised so the solver has room to converge.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(max_iter=1000))])
model.fit(X_train, y_train)

Profile ceteris-paribus (PCP)

In [231]:
# Wrap the fitted pipeline in a dalex explainer built on the training data.
exp = dx.Explainer(model, X_train, y_train, label="Logistic Regression")

# Observation to analyse: the training row at positional index 10.
obs = X_train.iloc[10]

# Ceteris-paribus profile for that single observation.
pcp = exp.predict_profile(obs)

# Plot the profile, first for example numerical variables, then for
# categorical ones.
numeric_profile_vars = ["age", "avg_glucose_level", "bmi"]
categorical_profile_vars = ["Residence_type", "work_type"]
pcp.plot(variables=numeric_profile_vars)
pcp.plot(variables=categorical_profile_vars, variable_type="categorical")

Preparation of a new explainer is initiated

  -> data              : 600 rows 10 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 600 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : Logistic Regression
  -> predict function  : <function yhat_proba_default at 0x000001FA9D1E83A0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00126, mean = 0.0483, max = 0.481
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.443, mean = -2.43e-09, max = 0.998
  -> model_info        : package sklearn

A new explainer has been created!


Calculating ceteris paribus:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating ceteris paribus: 100%|██████████| 10/10 [00:00<00:00, 131.58it/s]


In [232]:
# Show which integer code LabelEncoder assigns to each `gender` label.
gender_encoder = LabelEncoder()
gender_encoder.fit(df['gender'])
gender_labels = gender_encoder.classes_
print(dict(zip(gender_labels, gender_encoder.transform(gender_labels))))

{'Female': 0, 'Male': 1}


In [233]:
# Show which integer code LabelEncoder assigns to each `Residence_type` label.
residence_encoder = LabelEncoder()
residence_encoder.fit(df['Residence_type'])
residence_labels = residence_encoder.classes_
print(dict(zip(residence_labels, residence_encoder.transform(residence_labels))))

{'Rural': 0, 'Urban': 1}


In [234]:
# Show which integer code LabelEncoder assigns to each `smoking_status` label.
smoke_encoder = LabelEncoder()
smoke_encoder.fit(df['smoking_status'])
smoke_labels = smoke_encoder.classes_
print(dict(zip(smoke_labels, smoke_encoder.transform(smoke_labels))))

{'Unknown': 0, 'formerly smoked': 1, 'never smoked': 2, 'smokes': 3}


Wykresy częściowej zależności (PDP)

Zmienne ilościowe:

In [235]:
# Partial-dependence profile of the prediction with respect to `age`.
# model_profile draws a random sample of observations from the explainer data,
# so the seed is fixed (same value as the train/test split) to make the plot
# reproducible across runs.
pdp_age = exp.model_profile(variables="age", random_state=9)
pdp_age.plot()

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 20.00it/s]


In [236]:
# Partial-dependence profile for `age`, computed separately for each gender.
# `groups` takes the name of a column that already exists in the data, so no
# helper column is created. The former `gender_text` helper was never used and
# mapped integer keys onto a column that holds 'Female'/'Male' strings, which
# produces only NaN; it also added a non-feature column to X_train after the
# explainer had been built.
# The seed is fixed because model_profile samples observations at random.
pdp_age_gender = exp.model_profile(variables=['age'], groups='gender', random_state=9)
pdp_age_gender.plot(title="PDP for age with gender groups")

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 20.41it/s]


In [237]:
# Partial-dependence profile for `age`, computed separately for each residence
# type. `groups` takes the name of an existing column, so no helper column is
# created: the former `Residence_type_text` helper was never used and mapped
# integer keys onto a column that holds 'Rural'/'Urban' strings (only NaN).
# The result gets its own name instead of overwriting `pdp_age_gender`.
# The seed is fixed because model_profile samples observations at random.
pdp_age_residence = exp.model_profile(variables=['age'], groups='Residence_type', random_state=9)
pdp_age_residence.plot(title="PDP for age with Residence_type groups")

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 19.62it/s]


In [238]:
# Partial-dependence profile for `age`, computed separately for each smoking
# status. `groups` takes the name of an existing column, so the unused
# `smoke_text` helper column (which held integer codes despite its name) is no
# longer added to X_train.
# The title now names the actual grouping variable (it previously said
# "gender groups"), and the result gets its own name instead of overwriting
# `pdp_age_gender`.
# The seed is fixed because model_profile samples observations at random.
pdp_age_smoking = exp.model_profile(variables=['age'], groups='smoking_status', random_state=9)
pdp_age_smoking.plot(title="PDP for age with smoking_status groups")

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 20.40it/s]


Zmienne kategoryczne:

In [239]:
# Partial-dependence profile for the categorical variable `gender`.
# The seed is fixed because model_profile samples observations at random;
# without it the plot changes from run to run.
pdp_gender = exp.model_profile(variables=['gender'], variable_type='categorical', random_state=9)
pdp_gender.plot(title="PDP for gender")

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 71.49it/s]


In [240]:
# Partial-dependence profile for the categorical variable `Residence_type`.
# The seed is fixed because model_profile samples observations at random;
# without it the plot changes from run to run.
pdp_residence_type = exp.model_profile(variables=['Residence_type'], variable_type='categorical', random_state=9)
pdp_residence_type.plot(title="PDP for Residence_type")


Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 99.99it/s]


In [241]:
# Partial-dependence profile for the categorical variable `smoking_status`.
# The seed is fixed because model_profile samples observations at random;
# without it the plot changes from run to run.
pdp_smoking_status_type = exp.model_profile(variables=['smoking_status'], variable_type='categorical', random_state=9)
pdp_smoking_status_type.plot(title="PDP for smoking_status")

Calculating ceteris paribus: 100%|██████████| 1/1 [00:00<00:00, 90.92it/s]


Dekompozycja predykcji: wykresy BD i wartości SHAP

Wykresy BD (Break-down Plots for Interactions)

In [242]:
# Observation to explain: the training row at positional index 10, all columns.
obs = X_train.iloc[10, :]

In [243]:
# `obs` is a Series; turn it into a one-row frame before asking the model for
# class probabilities.
obs_frame = obs.to_frame().T
prediction = model.predict_proba(obs_frame)
print(prediction)

[[0.87460559 0.12539441]]


In [244]:
# Break-down (with interactions) explanations of the same observation for
# three different variable orderings.
variable_orders = (
    ['age', 'gender', 'smoking_status'],
    ['gender', 'smoking_status', 'age'],
    ['smoking_status', 'age', 'gender'],
)
bd1, bd2, bd3 = (
    exp.predict_parts(obs, type='break_down_interactions', order=variable_order)
    for variable_order in variable_orders
)

Wizualizacja wykresów BD i wartości SHAP

In [245]:
# Plot the three break-down explanations, one figure per variable ordering.
# The dalex plot() calls do not draw onto the current matplotlib figure, so the
# former plt.figure / plt.tight_layout / plt.show calls only produced an empty
# "Figure ... with 0 Axes" output; they have been removed.
# Each plot gets a title naming its ordering so the three can be told apart.
bd1.plot(title="Break Down: age, gender, smoking_status")
bd2.plot(title="Break Down: gender, smoking_status, age")
bd3.plot(title="Break Down: smoking_status, age, gender")

<Figure size 1500x500 with 0 Axes>

In [246]:
# SHAP attributions for the selected observation.
# The estimate is based on randomly sampled variable orderings, so the seed is
# fixed (same value as the train/test split) to keep the result reproducible.
shap = exp.predict_parts(obs, type='shap', random_state=9)

In [247]:
# Plot the SHAP attributions once, limited to the ten most important variables.
# The dalex plot() call does not draw onto the current matplotlib figure, so
# the former plt.figure / plt.tight_layout / plt.show calls only produced an
# empty "Figure ... with 0 Axes" output; they have been removed.
# The extra bare `shap.plot()` call was dropped as well: max_vars=10 matches
# the documented default, so it rendered the same figure a second time.
shap.plot(max_vars=10)

<Figure size 1000x400 with 0 Axes>