# Evolver Loop 2: Feature Engineering Deep Dive

This notebook validates the high-impact features identified in Loop 1 and prepares for the next experiment.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print("Data loaded successfully")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Data loaded successfully
Train shape: (891, 12)
Test shape: (418, 11)


## 1. Validate Ticket Frequency Feature

In [3]:
# Combine train and test for ticket frequency calculation
combined = pd.concat([train, test], axis=0, sort=False)
combined['TicketFreq'] = combined.groupby('Ticket')['Ticket'].transform('count')

# Split back
train['TicketFreq'] = combined.iloc[:len(train)]['TicketFreq']
test['TicketFreq'] = combined.iloc[len(train):]['TicketFreq']

# Analyze survival by ticket frequency
ticket_survival = train.groupby('TicketFreq')['Survived'].agg(['count', 'sum', 'mean']).round(3)
ticket_survival.columns = ['Count', 'Survived', 'SurvivalRate']
print("Survival by Ticket Frequency:")
print(ticket_survival)

# Focus on shared tickets (freq > 1)
shared_tickets = train[train['TicketFreq'] > 1]
solo_tickets = train[train['TicketFreq'] == 1]

print(f"\nShared tickets survival rate: {shared_tickets['Survived'].mean():.3f} ({len(shared_tickets)} passengers)")
print(f"Solo tickets survival rate: {solo_tickets['Survived'].mean():.3f} ({len(solo_tickets)} passengers)")
print(f"Absolute difference: {shared_tickets['Survived'].mean() - solo_tickets['Survived'].mean():.3f}")

Survival by Ticket Frequency:
            Count  Survived  SurvivalRate
TicketFreq                               
1             481       130         0.270
2             181        93         0.514
3             101        66         0.653
4              44        32         0.727
5              21         7         0.333
6              19         4         0.211
7              24         5         0.208
8              13         5         0.385
11              7         0         0.000

Shared tickets survival rate: 0.517 (410 passengers)
Solo tickets survival rate: 0.270 (481 passengers)
Absolute difference: 0.247


## 2. Validate Cabin Side Feature

In [5]:
# Extract cabin side (even/odd) from cabin numbers
def extract_cabin_side(cabin):
    if pd.isna(cabin):
        return np.nan
    # Get first cabin number if multiple
    cabin = str(cabin).split()[0]
    # Extract numbers
    numbers = ''.join(filter(str.isdigit, cabin))
    if numbers:
        return int(numbers) % 2  # 0=even, 1=odd
    return np.nan

train['CabinSide'] = train['Cabin'].apply(extract_cabin_side)
test['CabinSide'] = test['Cabin'].apply(extract_cabin_side)

# Analyze survival by cabin side
cabin_side_survival = train.groupby('CabinSide')['Survived'].agg(['count', 'sum', 'mean']).round(3)
cabin_side_survival.columns = ['Count', 'Survived', 'SurvivalRate']
print("Survival by Cabin Side (0=Even, 1=Odd):")
print(cabin_side_survival)

# For passengers with cabin info
cabin_known = train[train['CabinSide'].notna()]
even_side = cabin_known[cabin_known['CabinSide'] == 0]
odd_side = cabin_known[cabin_known['CabinSide'] == 1]

if len(even_side) > 0 and len(odd_side) > 0:
    print(f"\nEven side survival rate: {even_side['Survived'].mean():.3f} ({len(even_side)} passengers)")
    print(f"Odd side survival rate: {odd_side['Survived'].mean():.3f} ({len(odd_side)} passengers)")
    print(f"Absolute difference: {even_side['Survived'].mean() - odd_side['Survived'].mean():.3f}")

Survival by Cabin Side (0=Even, 1=Odd):
           Count  Survived  SurvivalRate
CabinSide                               
0.0          108        66         0.611
1.0           88        67         0.761

Even side survival rate: 0.611 (108 passengers)
Odd side survival rate: 0.761 (88 passengers)
Absolute difference: -0.150


## 3. Validate Name Length Feature

In [6]:
# Calculate name length
train['NameLength'] = train['Name'].apply(len)
test['NameLength'] = test['Name'].apply(len)

# Correlation with survival
correlation = train['NameLength'].corr(train['Survived'])
print(f"Correlation between NameLength and Survived: {correlation:.3f}")

# Bin name length and analyze
train['NameLengthBin'] = pd.qcut(train['NameLength'], 5, labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'])
name_length_survival = train.groupby('NameLengthBin')['Survived'].mean()
print("\nSurvival by Name Length Quintile:")
print(name_length_survival.round(3))

Correlation between NameLength and Survived: 0.332

Survival by Name Length Quintile:
NameLengthBin
Q1    0.221
Q2    0.301
Q3    0.320
Q4    0.442
Q5    0.675
Name: Survived, dtype: float64


## 4. Validate Fare Binning

In [7]:
# Create 5-bin fare categories
train['FareBin5'] = pd.qcut(train['Fare'], 5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])
test['FareBin5'] = pd.qcut(test['Fare'], 5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])

fare_survival = train.groupby('FareBin5')['Survived'].agg(['count', 'sum', 'mean']).round(3)
fare_survival.columns = ['Count', 'Survived', 'SurvivalRate']
print("Survival by 5 Fare Bins:")
print(fare_survival)

Survival by 5 Fare Bins:
          Count  Survived  SurvivalRate
FareBin5                               
VeryLow     179        39         0.218
Low         184        37         0.201
Medium      172        73         0.424
High        180        80         0.444
VeryHigh    176       113         0.642


## 5. Quick Model Test with New Features

In [None]:
from sklearn.preprocessing import LabelEncoder

# Prepare features for quick test
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Add engineered features
engineered_features = ['TicketFreq', 'CabinSide', 'NameLength', 'FareBin5']

# Combine all features
all_features = features + engineered_features

# Create a copy for preprocessing
train_test = pd.concat([train, test], axis=0, sort=False)

# Fill missing values
train_test['Age'].fillna(train_test['Age'].median(), inplace=True)
train_test['Fare'].fillna(train_test['Fare'].median(), inplace=True)
train_test['Embarked'].fillna(train_test['Embarked'].mode()[0], inplace=True)
train_test['CabinSide'].fillna(-1, inplace=True)  # -1 for unknown

# Encode categorical variables
le_sex = LabelEncoder()
train_test['Sex'] = le_sex.fit_transform(train_test['Sex'])

le_embarked = LabelEncoder()
train_test['Embarked'] = le_embarked.fit_transform(train_test['Embarked'])

le_farebin = LabelEncoder()
train_test['FareBin5'] = le_farebin.fit_transform(train_test['FareBin5'].astype(str))

# Split back
train_processed = train_test.iloc[:len(train)]
test_processed = train_test.iloc[len(train):]

# Prepare data for modeling
X = train_processed[all_features]
y = train['Survived']

# Quick CV test
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    scores.append(accuracy_score(y_val, y_pred))

print(f"CV scores with new features: {scores}")
print(f"Mean CV accuracy: {np.mean(scores):.4f} Â± {np.std(scores):.4f}")
print(f"Improvement over baseline: {np.mean(scores) - 0.817:.4f}")

## Summary of Findings

This analysis validates the high-impact features identified in Loop 1.