# Feature Engineering for Threshold-Tuned Ensemble

## Goal: Improve LB by adding proven features

Current best: LB 0.7847 (Threshold-Tuned Ensemble, CV 0.8373)

## Key Learnings from Previous Experiments:
1. Stacking FAILED - simpler models work better
2. ~31% survival rate (130-133 survivors) is optimal for LB
3. CV alone is NOT reliable - same CV gave LB 0.7847 vs 0.7631

## Features to Add:
1. FamilySize = SibSp + Parch + 1
2. IsAlone = 1 if FamilySize == 1
3. Has_Cabin = 1 if Cabin is not null
4. TicketFreq = Number of passengers sharing same ticket
5. FareBin = Quartile-based fare categories

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Read the competition tables from the fixed data directory
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Quick sanity report: table sizes and the share of survivors in train
train_shape, test_shape = train.shape, test.shape
train_survival_rate = train['Survived'].mean()

print(f"Train shape: {train_shape}")
print(f"Test shape: {test_shape}")
print(f"Training survival rate: {train_survival_rate:.3f}")

In [None]:
# Enhanced preprocessing with NEW features
def preprocess_enhanced(train_df, test_df):
    """Derive the model features for the train and test passenger tables.

    Both inputs are copied, so the caller's frames are not modified. The
    statistics used for imputation (fare median, age medians) are computed
    from the training copy only; the ticket frequency is counted over train
    and test together.

    Columns added to each frame: Title, Title_Code, Sex_Code, Embarked_Code,
    FamilySize, IsAlone, Has_Cabin, TicketFreq and FareBin. Missing values in
    Embarked, Fare and Age are filled in place.

    Args:
        train_df: Training passengers with the columns Name, Sex, Embarked,
            SibSp, Parch, Cabin, Ticket, Fare, Pclass and Age.
        test_df: Test passengers with the same columns.

    Returns:
        A ``(train_data, test_data)`` tuple holding the processed copies.
    """
    train_data = train_df.copy()
    test_data = test_df.copy()
    frames = (train_data, test_data)

    # ============ TITLE EXTRACTION ============
    # Collapse the raw honorifics into five groups; anything not listed
    # (or not extractable from the name) ends up as 'Rare'.
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    title_order = ['Mr', 'Miss', 'Mrs', 'Master', 'Rare']
    title_map = {title: code for code, title in enumerate(title_order)}
    for df in frames:
        raw_title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        df['Title'] = raw_title.map(title_mapping).fillna('Rare')
        df['Title_Code'] = df['Title'].map(title_map)

    # ============ SEX ENCODING ============
    for df in frames:
        df['Sex_Code'] = (df['Sex'] == 'male').astype(int)

    # ============ EMBARKED ============
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    for df in frames:
        df['Embarked'] = df['Embarked'].fillna('S')
        df['Embarked_Code'] = df['Embarked'].map(embarked_map)

    # ============ NEW FEATURES 1-3: FamilySize, IsAlone, Has_Cabin ============
    for df in frames:
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
        df['Has_Cabin'] = df['Cabin'].notna().astype(int)

    # ============ NEW FEATURE 4: TicketFreq ============
    # Number of passengers sharing a ticket, counted across train AND test.
    ticket_counts = pd.concat(
        [train_data['Ticket'], test_data['Ticket']]
    ).value_counts()
    for df in frames:
        df['TicketFreq'] = df['Ticket'].map(ticket_counts)

    # ============ FARE IMPUTATION (from train only) ============
    train_fare_median = train_data['Fare'].median()
    for df in frames:
        df['Fare'] = df['Fare'].fillna(train_fare_median)

    # ============ NEW FEATURE 5: FareBin ============
    # Fixed, hard-coded bin edges (presumably the training-set fare
    # quartiles -- they are not recomputed here).
    fare_bins = [-0.001, 7.91, 14.45, 31, 1000]
    for df in frames:
        binned = pd.cut(df['Fare'], bins=fare_bins, labels=[0, 1, 2, 3])
        # Fares outside the edges come back as NaN and are put into bin 1.
        df['FareBin'] = binned.fillna(1).astype(int)

    # ============ AGE IMPUTATION (from train only) ============
    # Median age per (Title, Pclass) group. Groups whose ages are all missing
    # produce a NaN median; dropping them makes such passengers use the
    # overall training median instead of silently keeping a NaN age.
    group_age = (
        train_data.groupby(['Title', 'Pclass'])['Age'].median().dropna().to_dict()
    )
    train_age_median = train_data['Age'].median()
    for df in frames:
        fallback_age = [
            group_age.get(key, train_age_median)
            for key in zip(df['Title'], df['Pclass'])
        ]
        df['Age'] = np.where(df['Age'].isna(), fallback_age, df['Age'])

    return train_data, test_data

# Run the feature pipeline once and keep both processed tables around
processed_frames = preprocess_enhanced(train, test)
train_processed, test_processed = processed_frames
print("Preprocessing complete!")

In [None]:
# Print the value distribution of every engineered feature (training set)
print("NEW FEATURES SUMMARY:")
print("="*50)

# (feature name, order the counts by feature value instead of by frequency)
distribution_report = [
    ('FamilySize', True),
    ('IsAlone', False),
    ('Has_Cabin', False),
    ('TicketFreq', True),
    ('FareBin', True),
]

for feature, order_by_value in distribution_report:
    print(f"\n{feature} distribution (train):")
    counts = train_processed[feature].value_counts()
    if order_by_value:
        counts = counts.sort_index()
    print(counts)

In [None]:
# Survival rate and group size for each value of the engineered features
def _survival_stats(column, value):
    """Return (mean of Survived, row count) for training rows where column == value."""
    selected = train_processed[column] == value
    return train_processed.loc[selected, 'Survived'].mean(), selected.sum()


print("\nSURVIVAL RATES BY NEW FEATURES:")
print("="*50)

print("\nFamilySize survival rates:")
for size in sorted(train_processed['FamilySize'].unique()):
    rate, count = _survival_stats('FamilySize', size)
    print(f"  Size {size}: {rate:.3f} (n={count})")

print("\nIsAlone survival rates:")
for alone, label in ((0, 'With family'), (1, 'Alone')):
    rate, count = _survival_stats('IsAlone', alone)
    print(f"  {label}: {rate:.3f} (n={count})")

print("\nHas_Cabin survival rates:")
for cabin, label in ((0, 'No cabin'), (1, 'Has cabin')):
    rate, count = _survival_stats('Has_Cabin', cabin)
    print(f"  {label}: {rate:.3f} (n={count})")

print("\nFareBin survival rates:")
for bin_val in sorted(train_processed['FareBin'].unique()):
    rate, count = _survival_stats('FareBin', bin_val)
    print(f"  Bin {bin_val}: {rate:.3f} (n={count})")

In [None]:
# Baseline feature set: the ORIGINAL 8 columns
original_features = [
    'Pclass', 'Sex_Code', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked_Code', 'Title_Code',
]

# ENHANCED set: baseline columns followed by the five engineered ones
enhanced_features = [
    *original_features,
    'FamilySize', 'IsAlone', 'Has_Cabin', 'TicketFreq', 'FareBin',
]

print(f"Original features ({len(original_features)}): {original_features}")
print(f"\nEnhanced features ({len(enhanced_features)}): {enhanced_features}")

# Training matrices and target as plain NumPy arrays
y = train_processed['Survived'].to_numpy()
X_orig = train_processed[original_features].to_numpy()
X_enhanced = train_processed[enhanced_features].to_numpy()

# Matching test matrices plus the ids needed for the submission file
test_ids = test_processed['PassengerId'].to_numpy()
X_test_orig = test_processed[original_features].to_numpy()
X_test_enhanced = test_processed[enhanced_features].to_numpy()

print(f"\nOriginal X shape: {X_orig.shape}")
print(f"Enhanced X shape: {X_enhanced.shape}")

In [None]:
# Voting ensemble factory (per the original note: same setup as exp_003, LB 0.7847)
def create_voting_ensemble():
    """Return a new, unfitted soft-voting ensemble of four classifiers.

    Members: random forest ('rf'), logistic regression ('lr'), gradient
    boosting ('gb') and an RBF-kernel SVC with probability estimates ('svc').
    All members use random_state=42.
    """
    members = [
        ('rf', RandomForestClassifier(
            n_estimators=100, max_depth=5, min_samples_leaf=5,
            random_state=42, n_jobs=-1,
        )),
        ('lr', LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
        ('gb', GradientBoostingClassifier(
            n_estimators=100, max_depth=3, learning_rate=0.1,
            random_state=42,
        )),
        # probability=True so the SVC can contribute to soft voting
        ('svc', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)),
    ]
    return VotingClassifier(estimators=members, voting='soft')

print("Voting Ensemble defined (same as exp_003)")

In [None]:
# 5-fold stratified CV of the ensemble on the ORIGINAL feature set
print("COMPARING ORIGINAL vs ENHANCED FEATURES")
print("="*60)

# The splitter and the scaler are shared with the cells that follow
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

print("\n1. ORIGINAL FEATURES (8 features):")
oof_probs_orig = np.zeros(len(X_orig))
fold_scores_orig = []

for fold_no, (fit_rows, holdout_rows) in enumerate(kfold.split(X_orig, y), start=1):
    # Scaling statistics come from the fitting rows only
    fit_X = scaler.fit_transform(X_orig[fit_rows])
    holdout_X = scaler.transform(X_orig[holdout_rows])

    ensemble = create_voting_ensemble()
    ensemble.fit(fit_X, y[fit_rows])

    # Keep the positive-class probability as the out-of-fold prediction
    holdout_prob = ensemble.predict_proba(holdout_X)[:, 1]
    oof_probs_orig[holdout_rows] = holdout_prob

    holdout_acc = accuracy_score(y[holdout_rows], (holdout_prob >= 0.5).astype(int))
    fold_scores_orig.append(holdout_acc)
    print(f"  Fold {fold_no}: {holdout_acc:.4f}")

cv_orig = np.mean(fold_scores_orig)
print(f"  Mean CV (threshold 0.5): {cv_orig:.4f} (+/- {np.std(fold_scores_orig):.4f})")

In [None]:
# 5-fold stratified CV of the ensemble on the ENHANCED feature set
print("\n2. ENHANCED FEATURES (13 features):")
oof_probs_enhanced = np.zeros(len(X_enhanced))
fold_scores_enhanced = []

for fold_no, (fit_rows, holdout_rows) in enumerate(kfold.split(X_enhanced, y), start=1):
    # Scaling statistics come from the fitting rows only
    fit_X = scaler.fit_transform(X_enhanced[fit_rows])
    holdout_X = scaler.transform(X_enhanced[holdout_rows])

    ensemble = create_voting_ensemble()
    ensemble.fit(fit_X, y[fit_rows])

    # Keep the positive-class probability as the out-of-fold prediction
    holdout_prob = ensemble.predict_proba(holdout_X)[:, 1]
    oof_probs_enhanced[holdout_rows] = holdout_prob

    holdout_acc = accuracy_score(y[holdout_rows], (holdout_prob >= 0.5).astype(int))
    fold_scores_enhanced.append(holdout_acc)
    print(f"  Fold {fold_no}: {holdout_acc:.4f}")

cv_enhanced = np.mean(fold_scores_enhanced)
print(f"  Mean CV (threshold 0.5): {cv_enhanced:.4f} (+/- {np.std(fold_scores_enhanced):.4f})")

In [None]:
# Side-by-side CV summary of the two feature sets
rule = "="*60
print("\n" + rule)
print("COMPARISON SUMMARY")
print(rule)

spread_orig = np.std(fold_scores_orig)
spread_enhanced = np.std(fold_scores_enhanced)
print(f"\nOriginal (8 features):  CV = {cv_orig:.4f} (+/- {spread_orig:.4f})")
print(f"Enhanced (13 features): CV = {cv_enhanced:.4f} (+/- {spread_enhanced:.4f})")
print(f"\nDifference: {cv_enhanced - cv_orig:+.4f}")

# Only a strictly higher mean accuracy counts as an improvement
verdict = (
    "\n✓ Enhanced features IMPROVED CV!"
    if cv_enhanced > cv_orig
    else "\n✗ Enhanced features did NOT improve CV"
)
print(verdict)

In [None]:
# Threshold sweep for the ENHANCED feature set
rule = "="*60
print("\n" + rule)
print("THRESHOLD ANALYSIS FOR ENHANCED FEATURES")
print(rule)

# Refit the scaler and the ensemble on all training rows to score the test set
X_all_scaled = scaler.fit_transform(X_enhanced)
X_test_scaled = scaler.transform(X_test_enhanced)

final_model = create_voting_ensemble()
final_model.fit(X_all_scaled, y)
test_probs = final_model.predict_proba(X_test_scaled)[:, 1]

thresholds = [0.45, 0.50, 0.52, 0.55, 0.58, 0.60, 0.62, 0.65]

print(f"\n{'Threshold':<12} {'Survivors':<12} {'Survival Rate':<15} {'OOF Accuracy':<15}")
print("-"*55)

for thresh in thresholds:
    # Test-set survivor count and share at this cut-off
    test_preds = (test_probs >= thresh).astype(int)
    survivors = test_preds.sum()
    survival_rate = test_preds.mean()

    # Accuracy of the out-of-fold probabilities at the same cut-off
    oof_acc = accuracy_score(y, (oof_probs_enhanced >= thresh).astype(int))

    # Flag rows whose survivor count falls in the 125-135 band
    marker = " <- TARGET" if 125 <= survivors <= 135 else ""

    print(f"{thresh:<12.2f} {survivors:<12} {survival_rate:<15.3f} {oof_acc:<15.4f}{marker}")

In [None]:
# Bisect on the threshold until the test set yields ~130 predicted survivors
target_survivors = 130


def _predicted_survivors(threshold):
    """Number of test passengers whose probability is at least `threshold`."""
    return (test_probs >= threshold).sum()


low, high = 0.4, 0.7
while high - low > 0.001:
    mid = (low + high) / 2
    if _predicted_survivors(mid) > target_survivors:
        # Too many survivors predicted: the cut-off has to go up
        low = mid
    else:
        high = mid

# Use the centre of the final bracket as the threshold
optimal_threshold = (low + high) / 2
optimal_survivors = _predicted_survivors(optimal_threshold)

print(f"\nOptimal threshold for ~{target_survivors} survivors: {optimal_threshold:.3f}")
print(f"Actual survivors: {optimal_survivors}")
print(f"Survival rate: {optimal_survivors/len(test_probs):.3f}")

# Out-of-fold accuracy when the same threshold is applied to the CV probabilities
oof_preds_opt = (oof_probs_enhanced >= optimal_threshold).astype(int)
oof_acc_opt = accuracy_score(y, oof_preds_opt)
print(f"OOF accuracy at optimal threshold: {oof_acc_opt:.4f}")

In [None]:
# Binarise the test probabilities at the tuned threshold and write the submission
test_preds_final = (test_probs >= optimal_threshold).astype(int)

submission = pd.DataFrame({'PassengerId': test_ids})
submission['Survived'] = test_preds_final

submission.to_csv('/home/submission/submission.csv', index=False)

predicted = submission['Survived']
print(f"Submission saved with {len(submission)} rows")
print("\nSurvived distribution:")
print(predicted.value_counts())
print(f"\nSurvival rate: {predicted.mean():.3f}")

In [None]:
# Recap of the whole experiment
rule = "="*70
print("\n" + rule)
print("EXPERIMENT SUMMARY: Feature Engineering")
print(rule)

print("\nFeatures added:")
print("  - FamilySize (SibSp + Parch + 1)")
print("  - IsAlone (1 if FamilySize == 1)")
print("  - Has_Cabin (1 if Cabin is not null)")
print("  - TicketFreq (passengers sharing same ticket)")
print("  - FareBin (quartile-based fare categories)")

print("\nResults:")
print(f"  Original (8 features):  CV = {cv_orig:.4f}")
print(f"  Enhanced (13 features): CV = {cv_enhanced:.4f}")
print(f"  Improvement: {cv_enhanced - cv_orig:+.4f}")

# Survivor count and percentage in the submission that was just built
n_survivors = submission['Survived'].sum()
pct_survivors = submission['Survived'].mean()*100

print("\nThreshold-tuned submission:")
print(f"  Threshold: {optimal_threshold:.3f}")
print(f"  OOF accuracy: {oof_acc_opt:.4f}")
print(f"  Survivors: {n_survivors} ({pct_survivors:.1f}%)")

print("\nComparison to previous best (exp_003):")
print("  exp_003: CV 0.8373, LB 0.7847, 130 survivors (31.1%)")
print(f"  This:    CV {oof_acc_opt:.4f}, ?? LB, {n_survivors} survivors ({pct_survivors:.1f}%)")

# 0.8373 is the CV score quoted for exp_003 in the line printed above
recommendation = (
    "\n✓ CV IMPROVED! Consider submitting."
    if oof_acc_opt > 0.8373
    else "\n✗ CV did not improve. May not be worth submitting."
)
print(recommendation)

In [None]:
# Save candidate submission
import shutil
from pathlib import Path

candidate_path = Path('/home/code/submission_candidates/candidate_005.csv')
# shutil.copy does not create missing directories, so make sure the
# candidates folder exists before copying the submission into it.
candidate_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy('/home/submission/submission.csv', candidate_path)
print("Saved candidate_005.csv")