In [26]:
import numpy as np
import pandas as pd
import config

In [27]:
df = pd.read_csv(config.CONFIG['paths']['train_with_folds'])
print('Shape', df.shape)
df.head(3)

Shape (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,fold
0,1,0,2,"Braund, Mr. Owen Harris",1,-0.565419,1,0,A/5 21171,-0.879247,2,1
1,2,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.663488,1,0,PC 17599,1.360456,0,4
2,3,1,2,"Heikkinen, Miss. Laina",0,-0.258192,0,0,STON/O2. 3101282,-0.798092,2,3


# Декомпозиция Name

In [28]:
def extract_title(name):
    if pd.isna(name):
        return 'Unknown'
    s = str(name).split(',')
    if len(s) < 2:
        return 'Unknown'
    part = s[1].strip().split('.')
    if not part:
        return 'Unknown'
    return part[0].strip()

df['Title'] = df['Name'].map(extract_title)
print(df['Title'].value_counts())

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64


In [29]:
title_map = {
    'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3,
    'Dr': 4, 'Rev': 4, 'Col': 4, 'Major': 4, 'Mlle': 4, 'Countess': 4,
    'Ms': 4, 'Lady': 4, 'Jonkheer': 4, 'Don': 4, 'Dona': 4, 'Mme': 4, 'Capt': 4, 'Sir': 4
}

df['Title'] = df['Title'].map(lambda x: title_map.get(x, 4))
print(df['Title'].value_counts().sort_index())

Title
0    517
1    182
2    125
3     40
4     27
Name: count, dtype: int64


# Комбинация фичей

считаем размер семьи, одинокий и плата за чел

In [30]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df['Fare_per_person'] = df['Fare'] / df['FamilySize'].clip(lower=1)
df[['SibSp', 'Parch', 'FamilySize', 'IsAlone', 'Fare_per_person']].head(8)

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone,Fare_per_person
0,1,0,2,0,-0.439623
1,1,0,2,0,0.680228
2,0,0,1,1,-0.798092
3,1,0,2,0,0.530721
4,0,0,1,1,-0.783739
5,0,0,1,1,-0.738202
6,0,0,1,1,1.037563
7,3,1,5,0,0.027285


# Прочие трансформации

In [31]:
df['Age_Pclass'] = df['Age'] * (df['Pclass'] + 1)
df['Age_bin'] = pd.qcut(df['Age'], q=5, labels=False, duplicates='drop')
print(df['Age_bin'].value_counts().sort_index())

Age_bin
0    179
1    360
2    175
3    177
Name: count, dtype: int64


In [32]:
df_fe = df.drop(columns=['Name', 'Ticket'], errors='ignore')
feature_cols = [c for c in df_fe.columns if c not in ['Survived', 'fold'] and pd.api.types.is_numeric_dtype(df_fe[c])]
print('Фичи', feature_cols)

Фичи ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Fare_per_person', 'Age_Pclass', 'Age_bin']


In [33]:
path_fe = config.CONFIG['paths']['train_with_folds_fe']
df_fe.to_csv(path_fe, index=False)
print('Сохранено:', path_fe, 'Shape:', df_fe.shape)

Сохранено: C:\newTry2\classicMLpractice\ProjectKaggle\checkpoints\train_with_folds_fe.csv Shape: (891, 16)


# сравнения до и после Фич инжиниринг

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [35]:
from pandas.core.common import random_state


def run_cv_fe(df, target_col='Survived', n_splits=5):
    feature_cols = [c for c in df.columns if c not in [target_col, 'fold'] and pd.api.types.is_numeric_dtype(df[c])]
    scores = []
    for fold in range(n_splits):
        train_mask = df['fold'] != fold
        val_mask = df['fold'] == fold
        X_train = df.loc[train_mask, feature_cols]
        y_train = df.loc[train_mask, target_col]
        X_val = df.loc[val_mask, feature_cols]
        y_val = df.loc[val_mask, target_col]
        model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))

    return np.mean(scores), np.std(scores)

df_baseline = pd.read_csv(config.CONFIG['paths']['train_with_folds']).select_dtypes(include=[np.number])
mean_base, std_base = run_cv_fe(df_baseline)
mean_fe, std_fe = run_cv_fe(df_fe)

print('Без FE', round(mean_base, 4), '+-', round(std_base, 4))
print('С FE', round(mean_fe, 4), '+-', round(std_fe, 4))

Без FE 0.8294 +- 0.0156
С FE 0.8317 +- 0.0122
