In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the data.
# The dataset directory is defined once so that moving the data only requires
# changing this single constant instead of every read_csv call.
DATA_DIR = r"C:\AI-Lab\kaggle-titanic\data"

# Raw f-strings keep the backslashes literal (e.g. "\t" in "\train.csv" is not
# interpreted as a tab), so the resulting paths are identical to the originals.
train_df = pd.read_csv(rf"{DATA_DIR}\train.csv")
test_df = pd.read_csv(rf"{DATA_DIR}\test.csv")
gender_submission = pd.read_csv(rf"{DATA_DIR}\gender_submission.csv")

# Extract the title from the passenger name.
def extract_title(names: pd.Series) -> pd.Series:
    """Return the title embedded in each name of ``names``.

    The title is the part of the name that follows the first comma and
    precedes the next period, with surrounding whitespace stripped
    (e.g. ``"Surname, Mr. Given"`` -> ``"Mr"``).

    Parameters
    ----------
    names : pd.Series
        String series holding the full names.

    Returns
    -------
    pd.Series
        Series aligned with ``names`` containing the extracted titles.
        A name without a comma produces a missing value instead of raising.
    """
    after_comma = names.str.split(",").str[1]
    return after_comma.str.split(".").str[0].str.strip()


# Add the Title column and drop the raw Name column, applying exactly the same
# transformation to the train and test frames.
train_df = train_df.assign(Title=extract_title(train_df["Name"])).drop(columns=["Name"])
test_df = test_df.assign(Title=extract_title(test_df["Name"])).drop(columns=["Name"])

# Feature columns, grouped by the kind of preprocessing they receive.
numerical_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_features = ["Sex", "Embarked", "Title"]

# Numerical columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fitting are ignored instead
# of raising an error.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)





# Split the data: separate the target from the predictors, then hold out
# 20% of the rows for validation (fixed seed for a reproducible split).
y = train_df["Survived"]
X = train_df.drop(columns=["Survived", "PassengerId"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model: preprocessing followed by a logistic-regression classifier.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)


# Fit the whole pipeline on the training split and predict the validation split.
y_pred = model.fit(X_train, y_train).predict(X_val)

# Report accuracy on the held-out validation rows.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Predict on the test set; PassengerId is an identifier, not a feature.
test_X = test_df.drop("PassengerId", axis=1)
test_predictions = model.predict(test_X)




Validation Accuracy: 0.8156


In [95]:
# Optional ad-hoc inspection of the training frame, kept disabled.
# Uncomment individual lines when exploring the data interactively.
#pd.set_option('display.max_rows', None) 
#pd.set_option('display.max_columns', None) 
#train_df
#train_df.describe()
#train_df.isna().sum()



In [103]:
# Frequency of each passenger title in the training data.
# The Title column is already derived from Name when the data is prepared, and
# Name is dropped right afterwards. The existing column is therefore reused
# here; re-deriving it required an `extract_name` helper that is not defined
# and a Name column that no longer exists, so the cell failed on a clean run.
unique_title = train_df['Title'].unique()
title_counts = train_df['Title'].value_counts()
print(title_counts)





Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Ms                1
Mme               1
Don               1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64
