In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [9]:
# Source files: the training set (contains the `survived` column used below)
# and the reserved set that predictions are produced for at the end.
train_path = 'titanic_train.csv'
reserved_path = 'titanic_reserved.csv'

df_train = pd.read_csv(train_path)
X_test = pd.read_csv(reserved_path)

In [4]:
# Preview the reserved frame. It is loaded above under the name `X_test`;
# no `df_test` variable is defined anywhere, so that name fails on a fresh kernel.
X_test

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.5500,,S,
1,1,"Snyder, Mr. John Pillsbury",male,24.0000,1,0,21228,82.2667,B45,S,"Minneapolis, MN"
2,2,"Ashby, Mr. John",male,57.0000,0,0,244346,13.0000,,S,"West Hoboken, NJ"
3,3,"Rosblom, Mr. Viktor Richard",male,18.0000,1,1,370129,20.2125,,S,
4,2,"Doling, Miss. Elsie",female,18.0000,0,1,231919,23.0000,,S,Southampton
...,...,...,...,...,...,...,...,...,...,...,...
323,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6000,,C,"Philadelphia, PA"
324,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45.0000,0,0,111428,26.5500,,S,"New York, NY"
325,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C,"New York, NY"
326,2,"Caldwell, Master. Alden Gates",male,0.8333,0,2,248738,29.0000,,S,"Bangkok, Thailand / Roseville, IL"


In [7]:
# Separate the target column from the predictor columns.
target_col = 'survived'
y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

In [8]:
# Number of training rows; used later to cut the combined frame back into train / test.
len_X = X_train.shape[0]

In [13]:
# Stack train and test predictors (train rows first) under a fresh 0..n-1 index
# so feature engineering is applied to both at once.
frames = [X_train, X_test]
data = pd.concat(frames, ignore_index=True)

In [16]:
import re
def extract_title(name):
    """Return the honorific found in a passenger name, without its trailing dot.

    The honorific is taken to be the first capitalised word that is immediately
    followed by a period, e.g. ``"Sage, Mr. Frederick"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Full passenger name. Missing values (``None`` / ``NaN``) and other
        non-string inputs are tolerated.

    Returns
    -------
    str or None
        The honorific, or ``None`` when ``name`` is not a string or contains
        no capitalised word followed by a period.
    """
    # re.search raises TypeError on None/NaN; treat a missing name as "no honorific".
    if not isinstance(name, str):
        return None
    # The dot sits outside the capture group, so no slicing is needed afterwards.
    title_search = re.search(r'([A-Z][a-z]+)\.', name)
    return title_search.group(1) if title_search else None

# Create the new 'honorific' column by running extract_title over every value of 'name'.
passenger_names = data['name']
data['honorific'] = passenger_names.map(extract_title)

In [18]:
# Show which honorifics were extracted and how many distinct ones there are.
honorifics = data['honorific']
print(honorifics.unique())
print(honorifics.nunique())

['Miss' 'Mrs' 'Mr' 'Master' 'Mlle' 'Rev' 'Ms' 'Col' 'Dona' 'Dr' 'Countess'
 'Major' 'Don' 'Capt' 'Sir' 'Lady' 'Mme' 'Jonkheer']
18


In [23]:
# Rarer honorifics that get folded into one of the three common groups.
Mr = ['Rev', 'Col', 'Dr', 'Major', 'Don', 'Capt', 'Sir']
Mrs = ['Dona', 'Countess']
Miss = ['Mlle', 'Ms']


def change_name(name):
    """Collapse a rare honorific into 'Mr', 'Mrs' or 'Miss'.

    Any value not listed in ``Mr``, ``Mrs`` or ``Miss`` is returned unchanged.
    That includes the common honorifics themselves ('Mr', 'Mrs', 'Miss',
    'Master'), the unlisted 'Lady', 'Mme' and 'Jonkheer', and ``None``.
    """
    # Groups are checked in the same order as the original if/elif chain.
    for label, rare_titles in (('Mr', Mr), ('Mrs', Mrs), ('Miss', Miss)):
        if name in rare_titles:
            return label
    return name
    
    
# Replace each extracted honorific with its normalised group label.
data['honorific'] = data['honorific'].map(change_name)

In [27]:
# Mean age per honorific group (missing ages are skipped by the mean),
# computed in the order Miss, Master, Mr, Mrs.
mean_miss, mean_master, mean_mr, mean_mrs = (
    np.mean(data.loc[data['honorific'] == title, 'age'])
    for title in ('Miss', 'Master', 'Mr', 'Mrs')
)

In [28]:
# Fill each missing age with the mean age of the passenger's honorific group.
# A vectorised lookup replaces the row-by-row `apply`: `map` yields the group
# mean for the four known honorifics and NaN for anything else, and `fillna`
# only touches rows whose age is missing.
mean_age_by_title = {
    'Miss': mean_miss,
    'Master': mean_master,
    'Mr': mean_mr,
    'Mrs': mean_mrs,
}
# Computed before any filling so the fallback reflects observed ages only.
observed_mean_age = data['age'].mean()

data['age'] = (
    data['age']
    .fillna(data['honorific'].map(mean_age_by_title))
    # Honorifics outside the four groups (or names without one) have no group
    # mean; fall back to the overall mean so no NaN reaches the model.
    .fillna(observed_mean_age)
)


In [36]:
# Sum the `sibsp` and `parch` columns into one `fam_size` feature and drop the
# two source columns. Reassigning instead of `inplace=True` keeps the original
# frame object unmodified and matches the drop style used in the next cell.
data = (
    data
    .assign(fam_size=data['sibsp'] + data['parch'])
    .drop(columns=['sibsp', 'parch'])
)

Unnamed: 0,pclass,name,sex,age,ticket,fare,cabin,embarked,home.dest,honorific,fam_size
0,3,"Smyth, Miss. Julia",female,21.824335,335432,7.7333,,Q,,Miss,0
1,3,"Glynn, Miss. Mary Agatha",female,21.824335,335677,7.7500,,Q,"Co Clare, Ireland Washington, DC",Miss,0
2,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.000000,2688,7.2292,,C,,Mrs,0
3,3,"Vovk, Mr. Janko",male,22.000000,349252,7.8958,,S,,Mr,0
4,3,"de Pelsmaeker, Mr. Alfons",male,16.000000,345778,9.5000,,S,,Mr,0
...,...,...,...,...,...,...,...,...,...,...,...
1304,1,"Brewe, Dr. Arthur Jackson",male,32.802479,112379,39.6000,,C,"Philadelphia, PA",Mr,0
1305,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45.000000,111428,26.5500,,S,"New York, NY",Mr,0
1306,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,36.982558,PC 17604,82.1708,,C,"New York, NY",Mrs,1
1307,2,"Caldwell, Master. Alden Gates",male,0.833300,248738,29.0000,,S,"Bangkok, Thailand / Roseville, IL",Master,2


In [37]:
# Discard the `home.dest` and `cabin` columns; they are not used as features.
unused_columns = ['home.dest', 'cabin']
data = data.drop(unused_columns, axis='columns')

In [41]:
# Missing `embarked` values are set to 'S', the most frequent value in the counts shown below.
data['embarked'] = data['embarked'].fillna('S')

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [49]:
# Drop the free-text / identifier columns, then one-hot encode what remains.
# drop_first=True omits the first level of each encoded column.
model_inputs = data.drop(columns=['name', 'honorific', 'ticket'])
df = pd.get_dummies(model_inputs, drop_first=True)

In [53]:
# Cut the encoded frame back into its two parts: the first len_X rows are the
# training rows, everything after them is the reserved set.
X_train, X_test = df.iloc[:len_X], df.iloc[len_X:]

### Подбор оптимальных гиперпараметров

In [71]:
# Hold out 20% of the training rows (stratified on the target) and compare the
# F1 score with and without class re-weighting. Scores print in the order
# 'balanced', then None.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
    stratify=y_train,
)
for weight in ('balanced', None):
    model = LogisticRegression(class_weight=weight, max_iter=1000, random_state=0)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    print(f1_score(y_te, y_pred))

0.7044025157232704
0.7034482758620689


### Предсказание для тестового набора данных

In [70]:
# Refit on every training row with class_weight='balanced', which scored
# marginally higher on the hold-out split above, then predict the reserved set.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=0,
).fit(X_train, y_train)
model.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,