# Simple Blending of Best LB Models

## Goal: Combine exp_001 (Simple RF) and exp_003 (Threshold-Tuned Ensemble)

## Key Learnings:
1. **CV is NOT predictive of LB** - Higher CV often means worse LB
2. **Simpler models work better** - 8 features beat 13 features
3. **~31% survival rate is optimal** - The best LB models all predict 130-131 survivors (out of 418 test passengers)
4. **Stacking and complex features overfit** - Both hurt LB

## Submission History:
| Exp | Model | CV | LB | Survivors |
|-----|-------|-----|-----|----------|
| exp_001 | Simple RF (7 features) | 0.8238 | 0.7775 | 131 |
| exp_003 | Threshold-Tuned Ensemble (8 features) | 0.8373 | 0.7847 | 130 |

## Approach:
1. Re-train both models with 5-fold CV to get OOF probabilities and fold-averaged test probabilities
2. Simple average: 0.5 * RF_prob + 0.5 * Ensemble_prob
3. Apply threshold for ~31% survival rate (130 survivors)
4. Compare predictions with exp_003

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load data
# Both CSVs are read from fixed absolute paths under /home/data.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Print each frame's (rows, columns) as a quick sanity check.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (891, 12)
Test shape: (418, 11)


In [2]:
# Preprocessing for exp_001 (Simple RF - 7 features, NO Title)
def preprocess_simple(train_df, test_df):
    """Return preprocessed copies of the train and test frames (no Title feature).

    Adds ``Sex_Code`` (1 for 'male', 0 otherwise) and ``Embarked_Code``
    (S=0, C=1, Q=2), fills missing ``Embarked`` with 'S', and fills missing
    ``Fare`` / ``Age`` with medians computed on the training frame only.
    The input frames are not modified.

    Returns:
        A ``(train_data, test_data)`` tuple of new DataFrames.
    """
    port_codes = {'S': 0, 'C': 1, 'Q': 2}

    # Imputation statistics come from the training frame only, so nothing
    # about the test distribution leaks into the fill values.
    train_medians = {column: train_df[column].median() for column in ('Fare', 'Age')}

    prepared = []
    for source in (train_df, test_df):
        frame = source.copy()
        frame['Sex_Code'] = (frame['Sex'] == 'male').astype(int)
        frame['Embarked'] = frame['Embarked'].fillna('S')
        frame['Embarked_Code'] = frame['Embarked'].map(port_codes)
        for column, fill_value in train_medians.items():
            frame[column] = frame[column].fillna(fill_value)
        prepared.append(frame)

    train_data, test_data = prepared
    return train_data, test_data

# Build the no-Title frames; the 7-column Simple RF matrices are sliced from these.
train_simple, test_simple = preprocess_simple(train, test)
print("Simple preprocessing complete (7 features)")

Simple preprocessing complete (7 features)


In [3]:
# Preprocessing for exp_003 (Threshold-Tuned Ensemble - 8 features, WITH Title)
def preprocess_with_title(train_df, test_df):
    """Return preprocessed copies of the train and test frames, with Title features.

    Adds ``Title`` (honorific extracted from ``Name`` and collapsed to one of
    Mr / Miss / Mrs / Master / Rare), ``Title_Code``, ``Sex_Code`` (1 for
    'male') and ``Embarked_Code`` (S=0, C=1, Q=2). Missing ``Embarked`` is
    filled with 'S' and missing ``Fare`` with the train median. Missing
    ``Age`` is filled with the train median of the passenger's
    (Title, Pclass) group, falling back to the overall train median when the
    group is absent from train or has no known ages. The input frames are
    not modified.

    Returns:
        A ``(train_data, test_data)`` tuple of new DataFrames.
    """
    train_data = train_df.copy()
    test_data = test_df.copy()

    # Lookup tables are loop-invariant, so they are built once up front.
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    }
    title_order = ['Mr', 'Miss', 'Mrs', 'Master', 'Rare']
    title_map = {t: i for i, t in enumerate(title_order)}
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}

    # Fare imputation value (from train only), taken before any filling.
    train_fare_median = train_data['Fare'].median()

    for df in (train_data, test_data):
        # Title extraction: the word directly before the first '.' in Name;
        # anything not in the mapping (or not matched at all) becomes 'Rare'.
        raw_title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        df['Title'] = raw_title.map(title_mapping).fillna('Rare')
        df['Title_Code'] = df['Title'].map(title_map)

        # Sex encoding
        df['Sex_Code'] = (df['Sex'] == 'male').astype(int)

        # Embarked
        df['Embarked'] = df['Embarked'].fillna('S')
        df['Embarked_Code'] = df['Embarked'].map(embarked_map)

        # Fare
        df['Fare'] = df['Fare'].fillna(train_fare_median)

    # Age imputation (from train only, by Title/Pclass)
    age_medians = train_data.groupby(['Title', 'Pclass'])['Age'].median()
    train_age_median = train_data['Age'].median()

    def fill_age(row, medians, fallback):
        """Return the row's Age, imputing it from the group median if missing."""
        if not pd.isna(row['Age']):
            return row['Age']
        try:
            group_median = medians[(row['Title'], row['Pclass'])]
        except KeyError:
            # Group never seen in train.
            return fallback
        # A group whose train ages are all missing has a NaN median; without
        # this check the NaN would be returned and survive imputation.
        return fallback if pd.isna(group_median) else group_median

    train_data['Age'] = train_data.apply(lambda x: fill_age(x, age_medians, train_age_median), axis=1)
    test_data['Age'] = test_data.apply(lambda x: fill_age(x, age_medians, train_age_median), axis=1)

    return train_data, test_data

# Build the Title-augmented frames; the 8-column ensemble matrices are sliced from these.
train_title, test_title = preprocess_with_title(train, test)
print("Title preprocessing complete (8 features)")

Title preprocessing complete (8 features)


In [4]:
# Prepare features
# exp_001 uses the 7 base columns (no Title); exp_003 uses the same columns
# plus Title_Code as an 8th.
features_simple = [
    'Pclass', 'Sex_Code', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Code',
]
features_title = [*features_simple, 'Title_Code']

# Target and submission ids come straight from the raw frames.
y = train['Survived'].values
test_ids = test['PassengerId'].values

# Train / test matrices for each feature set, as plain NumPy arrays.
X_simple, X_test_simple = (
    train_simple[features_simple].values,
    test_simple[features_simple].values,
)
X_title, X_test_title = (
    train_title[features_title].values,
    test_title[features_title].values,
)

print(f"Simple features (7): {features_simple}")
print(f"Title features (8): {features_title}")
print(f"\nX_simple shape: {X_simple.shape}")
print(f"X_title shape: {X_title.shape}")

Simple features (7): ['Pclass', 'Sex_Code', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Code']
Title features (8): ['Pclass', 'Sex_Code', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Code', 'Title_Code']

X_simple shape: (891, 7)
X_title shape: (891, 8)


In [5]:
# Define models
# exp_001: Simple RF
def create_simple_rf():
    """Return a new, unfitted random forest with the exp_001 hyperparameters."""
    rf_params = dict(
        n_estimators=100,
        max_depth=5,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
    )
    return RandomForestClassifier(**rf_params)

# exp_003: Voting Ensemble
def create_voting_ensemble():
    """Return a new, unfitted soft-voting ensemble of RF, LR, GB and SVC.

    Every member is seeded with ``random_state=42``. The SVC is built with
    ``probability=True`` so it can take part in soft (probability) voting.
    """
    members = [
        ('rf', RandomForestClassifier(
            n_estimators=100, max_depth=5, min_samples_leaf=5,
            random_state=42, n_jobs=-1,
        )),
        ('lr', LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
        ('gb', GradientBoostingClassifier(
            n_estimators=100, max_depth=3, learning_rate=0.1,
            random_state=42,
        )),
        ('svc', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)),
    ]
    return VotingClassifier(estimators=members, voting='soft')

# Confirmation only: this cell defines the factories; nothing is instantiated or fitted here.
print("Models defined")

Models defined


In [6]:
# Generate OOF probabilities for both models
# One seeded, shuffled 5-fold stratified splitter is created here and reused
# by the Voting Ensemble loop in the next cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Not used in this cell; the Voting Ensemble loop re-fits it on every fold.
scaler = StandardScaler()

# Model 1: Simple RF (7 features)
print("Training Simple RF (exp_001 style)...")
# One out-of-fold probability per training row, and a running fold-average
# of probabilities for the test rows.
oof_probs_rf = np.zeros(len(X_simple))
test_probs_rf = np.zeros(len(X_test_simple))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_simple, y)):
    X_train, X_val = X_simple[train_idx], X_simple[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # A fresh model per fold, so no fitted state carries over between folds.
    model = create_simple_rf()
    model.fit(X_train, y_train)
    
    # Column 1 of predict_proba is taken as the positive (Survived) class,
    # assuming 0/1 labels. Each fold adds 1/n_splits of its test
    # probabilities, so test_probs_rf ends up as the mean across fold models.
    oof_probs_rf[val_idx] = model.predict_proba(X_val)[:, 1]
    test_probs_rf += model.predict_proba(X_test_simple)[:, 1] / kfold.n_splits
    
    # Per-fold accuracy at a fixed 0.5 cut-off.
    fold_acc = accuracy_score(y_val, (oof_probs_rf[val_idx] >= 0.5).astype(int))
    print(f"  Fold {fold+1}: {fold_acc:.4f}")

# Accuracy over all pooled OOF predictions at 0.5 (not the mean of the fold accuracies).
rf_cv = accuracy_score(y, (oof_probs_rf >= 0.5).astype(int))
print(f"  Simple RF CV: {rf_cv:.4f}")

Training Simple RF (exp_001 style)...


  Fold 1: 0.8436


  Fold 2: 0.8090


  Fold 3: 0.8090


  Fold 4: 0.8202


  Fold 5: 0.8258
  Simple RF CV: 0.8215


In [7]:
# Model 2: Voting Ensemble (8 features)
print("\nTraining Voting Ensemble (exp_003 style)...")
# One out-of-fold probability per training row, and a running fold-average
# of probabilities for the test rows.
oof_probs_ens = np.zeros(len(X_title))
test_probs_ens = np.zeros(len(X_test_title))

# Reuses the kfold splitter from the Simple RF cell.
# NOTE(review): presumably yields the same folds as the RF loop (same seed,
# same y) - confirm if the blend relies on fold alignment.
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_title, y)):
    X_train, X_val = X_title[train_idx], X_title[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Scale for SVC and LR
    # The scaler is fit on the training fold only and then applied to the
    # validation fold and the full test matrix. The whole ensemble is fit on
    # the scaled matrix, so the RF and GB members see scaled inputs too.
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test_title)
    
    model = create_voting_ensemble()
    model.fit(X_train_scaled, y_train)
    
    # Column 1 of predict_proba is taken as the positive (Survived) class,
    # assuming 0/1 labels; test probabilities are averaged across fold models.
    oof_probs_ens[val_idx] = model.predict_proba(X_val_scaled)[:, 1]
    test_probs_ens += model.predict_proba(X_test_scaled)[:, 1] / kfold.n_splits
    
    # Per-fold accuracy at a fixed 0.5 cut-off.
    fold_acc = accuracy_score(y_val, (oof_probs_ens[val_idx] >= 0.5).astype(int))
    print(f"  Fold {fold+1}: {fold_acc:.4f}")

# Accuracy over all pooled OOF predictions at 0.5 (not the mean of the fold accuracies).
ens_cv = accuracy_score(y, (oof_probs_ens >= 0.5).astype(int))
print(f"  Voting Ensemble CV: {ens_cv:.4f}")


Training Voting Ensemble (exp_003 style)...


  Fold 1: 0.8436


  Fold 2: 0.8258


  Fold 3: 0.8202


  Fold 4: 0.8371


  Fold 5: 0.8427
  Voting Ensemble CV: 0.8339


In [8]:
# Simple Blending: Average probabilities
print("\n" + "="*60)
print("SIMPLE BLENDING")
print("="*60)

# Test different blend weights
# Each pair is (RF weight, Ensemble weight); every pair listed sums to 1.
weights_to_test = [(0.5, 0.5), (0.4, 0.6), (0.3, 0.7), (0.6, 0.4)]

print(f"\n{'Weight (RF, Ens)':<20} {'OOF CV':<12} {'Test Survivors':<15}")
print("-"*50)

for w_rf, w_ens in weights_to_test:
    # Weighted average of the two models' probabilities, for OOF and test rows.
    oof_blend = w_rf * oof_probs_rf + w_ens * oof_probs_ens
    test_blend = w_rf * test_probs_rf + w_ens * test_probs_ens
    
    # Fixed 0.5 cut-off here; threshold tuning happens in later cells.
    oof_preds = (oof_blend >= 0.5).astype(int)
    test_preds = (test_blend >= 0.5).astype(int)
    
    # OOF accuracy against the training labels, and the number of test rows
    # predicted as survivors.
    cv = accuracy_score(y, oof_preds)
    survivors = test_preds.sum()
    
    print(f"({w_rf:.1f}, {w_ens:.1f}){'':<12} {cv:.4f}{'':<6} {survivors}")


SIMPLE BLENDING

Weight (RF, Ens)     OOF CV       Test Survivors 
--------------------------------------------------
(0.5, 0.5)             0.8227       142
(0.4, 0.6)             0.8272       149
(0.3, 0.7)             0.8316       151
(0.6, 0.4)             0.8215       142


In [9]:
# Use 50-50 blend and apply threshold tuning
# blend_oof / blend_test are the probabilities every later cell works from.
blend_oof = 0.5 * oof_probs_rf + 0.5 * oof_probs_ens
blend_test = 0.5 * test_probs_rf + 0.5 * test_probs_ens

print("\n" + "="*60)
print("THRESHOLD ANALYSIS FOR 50-50 BLEND")
print("="*60)

# Hand-picked grid of cut-offs to tabulate.
thresholds = [0.45, 0.48, 0.50, 0.52, 0.55, 0.58, 0.60, 0.62, 0.65]

print(f"\n{'Threshold':<12} {'Survivors':<12} {'Survival Rate':<15} {'OOF Accuracy':<15}")
print("-"*55)

for thresh in thresholds:
    # The same cut-off is applied to test rows (survivor count and rate) and
    # to OOF rows (accuracy against the training labels).
    test_preds = (blend_test >= thresh).astype(int)
    oof_preds = (blend_oof >= thresh).astype(int)
    survivors = test_preds.sum()
    survival_rate = test_preds.mean()
    oof_acc = accuracy_score(y, oof_preds)
    
    # Flag rows whose predicted test survivor count falls in the 128-132 band.
    marker = ""
    if 128 <= survivors <= 132:
        marker = " <- TARGET"
    
    print(f"{thresh:<12.2f} {survivors:<12} {survival_rate:<15.3f} {oof_acc:<15.4f}{marker}")


THRESHOLD ANALYSIS FOR 50-50 BLEND

Threshold    Survivors    Survival Rate   OOF Accuracy   
-------------------------------------------------------
0.45         158          0.378           0.8305         
0.48         152          0.364           0.8294         
0.50         142          0.340           0.8227         
0.52         139          0.333           0.8294         
0.55         128          0.306           0.8316          <- TARGET
0.58         121          0.289           0.8350         
0.60         118          0.282           0.8328         
0.62         116          0.278           0.8238         
0.65         109          0.261           0.8204         


In [10]:
# Find optimal threshold for ~130 survivors
target_survivors = 130

# Bisection on the cut-off inside a hard-coded [0.4, 0.7] bracket. It relies
# on the survivor count being non-increasing as the threshold rises.
# The threshold is chosen from the distribution of test-set predictions only;
# no labels are involved in this search.
low, high = 0.4, 0.7
while high - low > 0.001:
    mid = (low + high) / 2
    survivors = (blend_test >= mid).sum()
    if survivors > target_survivors:
        # Too many predicted survivors: raise the lower bound.
        low = mid
    else:
        # At or below target: pull the upper bound down.
        high = mid

# NOTE(review): the midpoint of the final bracket is used, so the resulting
# count is not guaranteed to equal target_survivors (it can land on either
# side of the boundary); the actual count is recomputed and printed below.
optimal_threshold = (low + high) / 2
optimal_survivors = (blend_test >= optimal_threshold).sum()

print(f"\nOptimal threshold for ~{target_survivors} survivors: {optimal_threshold:.3f}")
print(f"Actual survivors: {optimal_survivors}")
print(f"Survival rate: {optimal_survivors/len(blend_test):.3f}")

# Calculate OOF accuracy at optimal threshold
oof_preds_opt = (blend_oof >= optimal_threshold).astype(int)
oof_acc_opt = accuracy_score(y, oof_preds_opt)
print(f"OOF accuracy at optimal threshold: {oof_acc_opt:.4f}")


Optimal threshold for ~130 survivors: 0.544
Actual survivors: 130
Survival rate: 0.311
OOF accuracy at optimal threshold: 0.8339


In [11]:
# Compare with exp_003 (best LB)
print("\n" + "="*60)
print("COMPARISON WITH BEST LB (exp_003)")
print("="*60)

# Load exp_003 predictions
exp_003 = pd.read_csv('/home/code/submission_candidates/candidate_003.csv')

# Create blend predictions
# Final 0/1 labels for the test rows at the tuned threshold.
blend_preds = (blend_test >= optimal_threshold).astype(int)

# Compare
# NOTE(review): the comparison is positional - it assumes candidate_003.csv
# has its rows in the same order as test.csv; PassengerId is not checked.
agreement = (exp_003['Survived'].values == blend_preds).sum()
disagreement = len(blend_preds) - agreement

print(f"\nAgreement with exp_003: {agreement}/{len(blend_preds)} ({agreement/len(blend_preds)*100:.1f}%)")
print(f"Disagreement: {disagreement} passengers")

# Analyze disagreements
if disagreement > 0:
    diff_mask = exp_003['Survived'].values != blend_preds
    # diff_ids is computed but not used anywhere in this cell.
    diff_ids = test_ids[diff_mask]
    exp_003_vals = exp_003['Survived'].values[diff_mask]
    blend_vals = blend_preds[diff_mask]
    
    # Split the disagreements by direction.
    print(f"\nDisagreements:")
    print(f"  exp_003=1, blend=0: {((exp_003_vals == 1) & (blend_vals == 0)).sum()}")
    print(f"  exp_003=0, blend=1: {((exp_003_vals == 0) & (blend_vals == 1)).sum()}")


COMPARISON WITH BEST LB (exp_003)

Agreement with exp_003: 406/418 (97.1%)
Disagreement: 12 passengers

Disagreements:
  exp_003=1, blend=0: 6
  exp_003=0, blend=1: 6


In [12]:
# Also compare with exp_001 (Simple RF)
exp_001 = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')

# NOTE(review): positional comparison - assumes candidate_001.csv has its
# rows in the same order as test.csv.
agreement_001 = (exp_001['Survived'].values == blend_preds).sum()
print(f"\nAgreement with exp_001: {agreement_001}/{len(blend_preds)} ({agreement_001/len(blend_preds)*100:.1f}%)")

# Summary
# The CV figures are the ones recomputed in this notebook; the LB figures
# are literals typed into the strings.
print(f"\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"\nSimple RF (exp_001): CV {rf_cv:.4f}, LB 0.7775")
print(f"Voting Ensemble (exp_003): CV {ens_cv:.4f}, LB 0.7847")
print(f"50-50 Blend: CV {oof_acc_opt:.4f}")
print(f"\nBlend survivors: {optimal_survivors} ({optimal_survivors/len(blend_test)*100:.1f}%)")
print(f"exp_003 survivors: {exp_003['Survived'].sum()} ({exp_003['Survived'].mean()*100:.1f}%)")


Agreement with exp_001: 411/418 (98.3%)

SUMMARY

Simple RF (exp_001): CV 0.8215, LB 0.7775
Voting Ensemble (exp_003): CV 0.8339, LB 0.7847
50-50 Blend: CV 0.8339

Blend survivors: 130 (31.1%)
exp_003 survivors: 130 (31.1%)


In [13]:
# Create submission
# Two columns: the test PassengerId values and the thresholded blend labels.
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': blend_preds
})

# Written without the index, so the CSV holds only the two columns above.
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} rows")
print(f"\nSurvived distribution:")
print(submission['Survived'].value_counts())
print(f"\nSurvival rate: {submission['Survived'].mean():.3f}")

Submission saved with 418 rows

Survived distribution:
Survived
0    288
1    130
Name: count, dtype: int64

Survival rate: 0.311


In [14]:
# Final summary
print("\n" + "="*70)
print("EXPERIMENT SUMMARY: Simple Blending")
print("="*70)

print(f"\nApproach:")
print(f"  - Blend Simple RF (7 features) + Voting Ensemble (8 features)")
print(f"  - Weight: 50% RF + 50% Ensemble")
print(f"  - Threshold: {optimal_threshold:.3f}")

print(f"\nResults:")
print(f"  Simple RF CV: {rf_cv:.4f}")
print(f"  Voting Ensemble CV: {ens_cv:.4f}")
print(f"  Blend CV (threshold {optimal_threshold:.3f}): {oof_acc_opt:.4f}")

print(f"\nSubmission:")
print(f"  Survivors: {submission['Survived'].sum()} ({submission['Survived'].mean()*100:.1f}%)")
print(f"  Differs from exp_003 by: {disagreement} passengers")

# The exp_003 figures on the next line are hard-coded literals taken from the
# submission history, not the ens_cv value recomputed in this notebook.
print(f"\nComparison to previous best:")
print(f"  exp_003: CV 0.8373, LB 0.7847, 130 survivors")
print(f"  Blend:   CV {oof_acc_opt:.4f}, ?? LB, {submission['Survived'].sum()} survivors")

# The blend is flagged as worth considering only if its OOF accuracy reaches
# the hard-coded exp_003 CV (0.8373) AND it changes at least one prediction.
if oof_acc_opt >= 0.8373 and disagreement > 0:
    print(f"\n✓ CV similar/better AND predictions differ - worth considering")
else:
    print(f"\n⚠ CV lower or predictions same as exp_003")


EXPERIMENT SUMMARY: Simple Blending

Approach:
  - Blend Simple RF (7 features) + Voting Ensemble (8 features)
  - Weight: 50% RF + 50% Ensemble
  - Threshold: 0.544

Results:
  Simple RF CV: 0.8215
  Voting Ensemble CV: 0.8339
  Blend CV (threshold 0.544): 0.8339

Submission:
  Survivors: 130 (31.1%)
  Differs from exp_003 by: 12 passengers

Comparison to previous best:
  exp_003: CV 0.8373, LB 0.7847, 130 survivors
  Blend:   CV 0.8339, ?? LB, 130 survivors

⚠ CV lower or predictions same as exp_003


In [15]:
# Save candidate
import shutil
# Copy the submission written earlier into the candidates folder as candidate_006.csv.
shutil.copy('/home/submission/submission.csv', '/home/code/submission_candidates/candidate_006.csv')
print("Saved candidate_006.csv")

Saved candidate_006.csv


In [None]:
# Analyze the 12 differing passengers
print("ANALYZING 12 DIFFERING PASSENGERS")
print("="*60)

# Positions (in test-row order) where the blend and exp_003 labels differ.
exp_003_labels = exp_003['Survived'].values
diff_mask = exp_003_labels != blend_preds
diff_indices = np.flatnonzero(diff_mask)

# Get test data for these passengers, with both predictions and the blended
# probability attached as extra columns.
test_diff = test.iloc[diff_indices].assign(
    exp_003_pred=exp_003_labels[diff_mask],
    blend_pred=blend_preds[diff_mask],
    blend_prob=blend_test[diff_mask],
)

cols = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'blend_prob']
mask_10 = (test_diff['exp_003_pred'] == 1) & (test_diff['blend_pred'] == 0)
mask_01 = (test_diff['exp_003_pred'] == 0) & (test_diff['blend_pred'] == 1)

# One section per disagreement direction; the table is printed only when
# that direction actually has rows.
sections = (
    (f"\nPassengers where exp_003=1, blend=0 (exp_003 predicts survive, blend predicts die):", mask_10),
    (f"\nPassengers where exp_003=0, blend=1 (exp_003 predicts die, blend predicts survive):", mask_01),
)
for heading, mask in sections:
    print(heading)
    if mask.sum() > 0:
        print(test_diff.loc[mask, cols].to_string())