In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

RANDOM_STATE = 42

In [24]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


F1-score подходит лучше, чем accuracy, так как классы Survived несбалансированы:
погибших больше, чем выживших. Accuracy может быть вводящей в заблуждение.
F1-score отражает баланс точности и полноты и лучше показывает качество модели.


In [26]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

In [27]:
# Находим самый частый класс
majority_class = y_train.mode()[0]
majority_class

np.int64(0)

In [28]:
# Константное предсказание
baseline_pred = np.full_like(y_test, fill_value=majority_class)

baseline_f1 = f1_score(y_test, baseline_pred)
print("Baseline F1-score:", baseline_f1)

Baseline F1-score: 0.0


In [29]:
numeric_features = ["Age", "Fare", "SibSp", "Parch", "Pclass"]
categorical_features = ["Sex", "Embarked"]

# Заполним пропуски
X_train["Age"] = X_train["Age"].fillna(X_train["Age"].median())
X_test["Age"] = X_test["Age"].fillna(X_train["Age"].median())

X_train["Embarked"] = X_train["Embarked"].fillna("Unknown")
X_test["Embarked"] = X_test["Embarked"].fillna("Unknown")

# ColumnTransformer для OHE
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("logreg", LogisticRegression(max_iter=200, random_state=RANDOM_STATE))
    ]
)


In [30]:
model.fit(X_train, y_train);

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
preds = model.predict(X_test)

f1 = f1_score(y_test, preds)
acc = accuracy_score(y_test, preds)

print("Model F1-score:", f1)
print("Model Accuracy:", acc)

Model F1-score: 0.734375
Model Accuracy: 0.8100558659217877
