In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score  # Импортируем roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [19]:
# --- Configuration ---------------------------------------------------------
RANDOM_STATE = 42
# Legacy global NumPy seed; the split and the classifier below are seeded
# explicitly through random_state=RANDOM_STATE.
np.random.seed(RANDOM_STATE)

# --- Load data and make a stratified hold-out split --------------------------
data = pd.read_csv('train.csv')

X = data.drop('Survived', axis=1)
y = data['Survived']

# stratify=y keeps the class ratio of 'Survived' the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE, stratify=y
)

# --- Baseline ----------------------------------------------------------------
# Constant "always 0" score: it cannot rank any sample above another, so it
# serves as the reference point that the model has to beat.
baseline_pred = np.zeros_like(y_test)
baseline_roc_auc = roc_auc_score(y_test, baseline_pred)
print(f'Baseline ROC AUC: {baseline_roc_auc:.4f}')

# --- Preprocessing -----------------------------------------------------------
# Numeric columns: fill gaps with the median, then standardize.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_features = ['Sex', 'Pclass', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Only the columns listed above are passed to the transformers.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Model -------------------------------------------------------------------
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE)),
])
model.fit(X_train, y_train)

# Probability of class 1 (survived) for every hold-out row.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC AUC is computed on probabilities, so it does not depend on a threshold.
model_roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'Model ROC AUC: {model_roc_auc:.4f}')

# --- Coefficients ------------------------------------------------------------
# The guard keeps the cell usable if the classifier is swapped for one that
# does not expose linear coefficients.
classifier = model.named_steps['classifier']
if hasattr(classifier, 'coef_'):
    onehot = (
        model.named_steps['preprocessor']
        .named_transformers_['cat']
        .named_steps['onehot']
    )
    feature_names = onehot.get_feature_names_out(categorical_features)
    # Names are assembled in the same order as the ColumnTransformer entries:
    # the 'num' block first, then the one-hot columns of the 'cat' block.
    all_features = numeric_features + list(feature_names)
    coef = classifier.coef_[0]

    # zip() silently truncates to the shorter input, which would mislabel or
    # drop coefficients if the feature lists and the fitted model diverge.
    if len(all_features) != len(coef):
        raise ValueError(
            f'Feature name count ({len(all_features)}) does not match '
            f'coefficient count ({len(coef)})'
        )

    print('\nFeature importance: ')
    for feature, importance in zip(all_features, coef):
        # Sign gives the direction (positive -> higher predicted survival
        # probability); absolute value gives the strength of the effect.
        print(f'{feature}:{importance:.4f}')


Baseline ROC AUC: 0.5000
Model ROC AUC: 0.8338

Feature importance: 
Age:-0.4793
Fare:0.0360
Sex_female:1.3218
Sex_male:-1.2082
Pclass_1:1.0757
Pclass_2:0.1513
Pclass_3:-1.1134
Embarked_C:0.1924
Embarked_Q:0.1964
Embarked_S:-0.2753


Выбрал метрику ROC AUC: она измеряет способность модели различать классы (выжил / не выжил) и не зависит от выбранного порога классификации. Модель LogisticRegression выбрана как простая и интерпретируемая модель бинарной классификации.