# Adversarial Validation + Regularized Ensemble

**Goal:** Address the CV-LB gap (cross-validation accuracy exceeds public leaderboard accuracy by 0.0645) through:
1. Adversarial validation to identify distribution shift
2. Stronger regularization in base models
3. Feature selection to remove noisy features

**Current best:** Voting Ensemble CV 0.8372, LB 0.7727
**Target:** Reduce CV-LB gap, improve generalization

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Read the train and test splits from their fixed locations.
train, test = map(pd.read_csv, ('/home/data/train.csv', '/home/data/test.csv'))

# Print (rows, columns) of each split as a quick sanity check.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
# Feature engineering (same as voting ensemble)
def extract_title(name):
    """Return the honorific found in a passenger name, or "" if there is none.

    The title is the run of ASCII letters that follows a space and is
    immediately followed by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").

    Args:
        name: The raw name value. Non-string values (such as a NaN produced by
            a missing cell) are treated as having no title instead of raising.

    Returns:
        The title without the trailing period, or an empty string.
    """
    # re.search raises TypeError on NaN/None, so guard before matching.
    if not isinstance(name, str):
        return ""
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

def process_features(df):
    """Return a copy of ``df`` with the engineered passenger features added.

    Reads the columns Name, SibSp, Parch, Cabin, Embarked, Fare and Pclass.
    Adds Title, FamilySize, IsAlone, FamilySize_Bin, Has_Cabin and Deck, and
    fills missing values in Embarked and Fare. The input frame is not modified.

    Args:
        df: Raw passenger DataFrame.

    Returns:
        A new DataFrame containing the original columns plus the new features.
    """
    df = df.copy()

    # Title extraction: normalise spelling variants, then collapse every title
    # outside the four common ones into 'Rare'. Using "not common" rather than
    # an explicit list of rare titles means a title that has not been seen
    # before cannot slip through and become NaN when titles are encoded later.
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    df['Title'] = df['Title'].where(df['Title'].isin(common_titles), 'Rare')

    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # Bins: 1 -> 0, 2-4 -> 1, 5+ -> 2. The last bin is open-ended so that a
    # family larger than 11 still gets a label instead of NaN (which would
    # make astype(int) raise).
    df['FamilySize_Bin'] = pd.cut(
        df['FamilySize'], bins=[0, 1, 4, np.inf], labels=[0, 1, 2]
    ).astype(int)

    # Cabin features: 'U' marks an unknown deck (missing Cabin).
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'U')

    # Embarked: missing values are filled with the hard-coded port 'S'.
    df['Embarked'] = df['Embarked'].fillna('S')

    # Fare: fill missing values with the median fare of the same Pclass,
    # computed within this frame.
    if df['Fare'].isna().any():
        df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))
        # A class with no observed fare has a NaN median; fall back to the
        # overall median so no NaN reaches the models.
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    return df

# Run the shared feature-engineering step on each split independently.
train_processed, test_processed = map(process_features, (train, test))

print("Features processed.")

In [None]:
# Age imputation
def impute_age(train_df, test_df):
    """Fill missing Age values using medians learned from the training frame.

    Each missing age is replaced by the median training age of the passenger's
    (Pclass, Sex, Title) group. If that group was never seen in training, or
    all of its training ages are missing, the overall training median is used.
    Statistics come from ``train_df`` only and are applied to both frames.

    Args:
        train_df: Training frame with Pclass, Sex, Title and Age columns.
        test_df: Test frame with the same columns.

    Returns:
        A ``(train_df, test_df)`` tuple of copies with Age filled in. The
        input frames are not modified.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()

    group_cols = ['Pclass', 'Sex', 'Title']
    # A group whose ages are all missing has a NaN median. Dropping those
    # entries makes the lookup below miss, so the fallback is used instead of
    # silently writing NaN back into Age.
    age_medians = train_df.groupby(group_cols)['Age'].median().dropna().to_dict()
    fallback = train_df['Age'].median()

    def fill_missing(df):
        # Only rows with a missing age are touched; known ages stay as they are.
        missing = df['Age'].isna()
        if missing.any():
            group_keys = zip(*(df.loc[missing, col] for col in group_cols))
            df.loc[missing, 'Age'] = [age_medians.get(key, fallback) for key in group_keys]
        return df

    return fill_missing(train_df), fill_missing(test_df)

# Fill missing ages in both splits. impute_age derives its medians from its
# first argument (the training split) and returns modified copies of both.
train_processed, test_processed = impute_age(train_processed, test_processed)

# Age_Bin feature
def add_age_bin(df):
    """Return a copy of ``df`` with an integer ``Age_Bin`` column added.

    Ages are bucketed as [0, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2 and
    (48, 100] -> 3.

    Args:
        df: DataFrame with a fully populated Age column. Missing ages, or ages
            outside 0-100, cannot be converted to an integer label and make
            the ``astype(int)`` call raise.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.copy()
    # include_lowest=True closes the first interval on the left, so an age of
    # exactly 0 lands in bin 0 instead of becoming NaN.
    df['Age_Bin'] = pd.cut(
        df['Age'],
        bins=[0, 16, 32, 48, 100],
        labels=[0, 1, 2, 3],
        include_lowest=True,
    ).astype(int)
    return df

# Attach the Age_Bin column to each split.
train_processed, test_processed = map(add_age_bin, (train_processed, test_processed))

# Both counts should be 0 once imputation has run.
print(f"Age imputed. Missing: Train={train_processed['Age'].isna().sum()}, Test={test_processed['Age'].isna().sum()}")

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Replace the categorical columns of both frames with integer codes.

    Sex, Embarked, Title and Deck are each mapped through a fixed lookup
    table. A value that is absent from its table becomes NaN, as with
    ``Series.map``.

    Args:
        train_df: Training frame containing the four categorical columns.
        test_df: Test frame containing the same columns.

    Returns:
        A ``(train_df, test_df)`` tuple of encoded copies. The input frames
        are not modified.
    """
    # One lookup table per column; the same tables are used for both frames.
    code_tables = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'S': 0, 'C': 1, 'Q': 2},
        'Title': {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5},
        'Deck': {'U': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8},
    }

    encoded_frames = []
    for frame in (train_df, test_df):
        encoded = frame.copy()
        for column, codes in code_tables.items():
            encoded[column] = encoded[column].map(codes)
        encoded_frames.append(encoded)

    return tuple(encoded_frames)

train_encoded, test_encoded = encode_features(train_processed, test_processed)

# Column order of the model matrices (same feature set as voting ensemble exp_001).
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'Has_Cabin',
    'Deck', 'FamilySize_Bin', 'Age_Bin',
]

# Plain NumPy arrays, so folds can be selected with integer index arrays.
X_train = train_encoded[features].to_numpy()
y_train = train_encoded['Survived'].to_numpy()
X_test = test_encoded[features].to_numpy()

print(f"Feature matrix: Train {X_train.shape}, Test {X_test.shape}")

In [None]:
# ADVERSARIAL VALIDATION: Can we distinguish train from test?
print("="*60)
print("ADVERSARIAL VALIDATION")
print("="*60)

# Stack both splits and label each row by origin: 0 = train, 1 = test.
X_combined = np.concatenate((X_train, X_test), axis=0)
y_adversarial = np.concatenate((
    np.zeros(len(X_train), dtype=int),
    np.ones(len(X_test), dtype=int),
))

# If a classifier can tell the two splits apart, their distributions differ.
adv_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
adv_scores = cross_val_score(adv_clf, X_combined, y_adversarial, cv=5, scoring='roc_auc')

print(f"\nAdversarial AUC: {np.mean(adv_scores):.4f} (+/- {np.std(adv_scores):.4f})")
print(f"  - AUC = 0.5 means no distribution shift (ideal)")
print(f"  - AUC > 0.6 means significant shift (problematic)")

# A mean AUC above 0.55 is reported as shift; anything else as minimal shift.
if not (np.mean(adv_scores) > 0.55):
    print(f"\n✓ Minimal distribution shift. AUC = {np.mean(adv_scores):.4f}")
else:
    print(f"\n⚠️ Distribution shift detected! AUC = {np.mean(adv_scores):.4f}")

In [None]:
# Feature importance for adversarial classification.
# cross_val_score fits clones only, so the estimator is fitted here on all rows.
adv_clf.fit(X_combined, y_adversarial)

adv_importance = pd.DataFrame({'feature': features, 'importance': adv_clf.feature_importances_})
adv_importance = adv_importance.sort_values('importance', ascending=False)

print("\nFeatures that distinguish train from test (higher = more shift):")
print(adv_importance.to_string(index=False))

# Flag every feature whose adversarial importance exceeds 0.10.
high_importance = adv_importance['importance'] > 0.10
problematic_features = adv_importance.loc[high_importance, 'feature'].tolist()
print(f"\nProblematic features (importance > 0.10): {problematic_features}")

In [None]:
# APPROACH 1: Regularized Voting Ensemble (stronger regularization)
print("\n" + "="*60)
print("APPROACH 1: REGULARIZED VOTING ENSEMBLE")
print("="*60)

# (name, estimator) pairs. The trailing notes give the exp_001 setting that
# each value replaces.
models_regularized = [
    # C=0.01 (was 0.1)
    ('lr', LogisticRegression(C=0.01, max_iter=1000, random_state=42)),
    # depth=4 (was 6)
    ('rf', RandomForestClassifier(
        n_estimators=100, max_depth=4, min_samples_leaf=8, random_state=42)),
    # depth=2 (was 3)
    ('gb', GradientBoostingClassifier(
        n_estimators=50, max_depth=2, learning_rate=0.1, random_state=42)),
    # depth=4 (was 6)
    ('et', ExtraTreesClassifier(
        n_estimators=100, max_depth=4, min_samples_leaf=8, random_state=42)),
    # reduced
    ('ada', AdaBoostClassifier(n_estimators=50, learning_rate=0.3, random_state=42)),
    # C=0.5 (was 1.0)
    ('svc', SVC(kernel='rbf', C=0.5, probability=True, random_state=42)),
    # Added L1/L2 regularization via reg_alpha / reg_lambda
    ('xgb', XGBClassifier(
        n_estimators=50, max_depth=2, learning_rate=0.1,
        reg_alpha=1.0, reg_lambda=2.0,
        use_label_encoder=False, eval_metric='logloss', random_state=42)),
]

print("Models with stronger regularization:")
for name, _ in models_regularized:
    print(f"  - {name}")

In [None]:
# 10-fold CV for regularized ensemble
from sklearn.base import clone  # imported once rather than on every inner-loop pass

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
n_models = len(models_regularized)
needs_scaling = ('lr', 'svc')  # these two are fitted on standardised inputs

cv_scores_reg = []
oof_predictions_reg = np.zeros(len(X_train))
test_predictions_reg = np.zeros(len(X_test))
individual_scores_reg = {name: [] for name, _ in models_regularized}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # The scaler is fitted on the training part of the fold only.
    scaler = StandardScaler().fit(X_tr)
    scaled_inputs = (scaler.transform(X_tr), scaler.transform(X_val), scaler.transform(X_test))
    raw_inputs = (X_tr, X_val, X_test)

    # Running sums of class probabilities over the models in this fold.
    fold_probs = np.zeros((len(X_val), 2))
    test_fold_probs = np.zeros((len(X_test), 2))

    for name, model in models_regularized:
        fit_X, val_X, test_X = scaled_inputs if name in needs_scaling else raw_inputs

        # Fit a fresh clone so no state carries over between folds.
        m = clone(model)
        m.fit(fit_X, y_tr)

        fold_probs += m.predict_proba(val_X)
        test_fold_probs += m.predict_proba(test_X)
        individual_scores_reg[name].append(accuracy_score(y_val, m.predict(val_X)))

    # Soft vote: average the probabilities, then threshold class 1 at 0.5.
    fold_probs /= n_models
    test_fold_probs /= n_models

    val_pred_ensemble = (fold_probs[:, 1] >= 0.5).astype(int)
    oof_predictions_reg[val_idx] = val_pred_ensemble
    # Test probabilities are averaged across folds.
    test_predictions_reg += test_fold_probs[:, 1] / kfold.n_splits

    fold_acc = accuracy_score(y_val, val_pred_ensemble)
    cv_scores_reg.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

print(f"\n{'='*50}")
print(f"Regularized Ensemble CV: {np.mean(cv_scores_reg):.4f} (+/- {np.std(cv_scores_reg):.4f})")
print(f"Previous Voting Ensemble: 0.8372 (+/- 0.0239)")
print(f"Change: {np.mean(cv_scores_reg) - 0.8372:+.4f}")

In [None]:
# Individual model performance: mean and spread of per-fold accuracy.
print("\nIndividual Model Performance (Regularized):")
print("="*50)
for name in individual_scores_reg:
    scores = individual_scores_reg[name]
    print(f"{name:5s}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

In [None]:
# APPROACH 2: Feature Selection - Remove noisy/redundant features
print("\n" + "="*60)
print("APPROACH 2: FEATURE SELECTION")
print("="*60)

# Columns dropped from the original feature list:
# - IsAlone (redundant with FamilySize)
# - Parch (redundant with FamilySize)
# - Deck (77% unknown, may add noise)
# The remaining columns keep their original relative order.
removed_features = {'IsAlone', 'Parch', 'Deck'}
features_reduced = [col for col in features if col not in removed_features]

print(f"Original features: {len(features)}")
print(f"Reduced features: {len(features_reduced)}")
print(f"Removed: IsAlone, Parch, Deck")

X_train_reduced = train_encoded[features_reduced].to_numpy()
X_test_reduced = test_encoded[features_reduced].to_numpy()

In [None]:
# CV with reduced features + regularized models
from sklearn.base import clone  # imported once rather than on every inner-loop pass

n_models = len(models_regularized)
needs_scaling = ('lr', 'svc')  # these two are fitted on standardised inputs

cv_scores_reduced = []
oof_predictions_reduced = np.zeros(len(X_train_reduced))
test_predictions_reduced = np.zeros(len(X_test_reduced))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_reduced, y_train)):
    X_tr, X_val = X_train_reduced[train_idx], X_train_reduced[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # The scaler is fitted on the training part of the fold only.
    scaler = StandardScaler().fit(X_tr)
    scaled_inputs = (
        scaler.transform(X_tr),
        scaler.transform(X_val),
        scaler.transform(X_test_reduced),
    )
    raw_inputs = (X_tr, X_val, X_test_reduced)

    # Running sums of class probabilities over the models in this fold.
    fold_probs = np.zeros((len(X_val), 2))
    test_fold_probs = np.zeros((len(X_test_reduced), 2))

    for name, model in models_regularized:
        fit_X, val_X, test_X = scaled_inputs if name in needs_scaling else raw_inputs

        # Fit a fresh clone so no state carries over between folds.
        m = clone(model)
        m.fit(fit_X, y_tr)

        fold_probs += m.predict_proba(val_X)
        test_fold_probs += m.predict_proba(test_X)

    # Soft vote: average the probabilities, then threshold class 1 at 0.5.
    fold_probs /= n_models
    test_fold_probs /= n_models

    val_pred_ensemble = (fold_probs[:, 1] >= 0.5).astype(int)
    oof_predictions_reduced[val_idx] = val_pred_ensemble
    # Test probabilities are averaged across folds.
    test_predictions_reduced += test_fold_probs[:, 1] / kfold.n_splits

    fold_acc = accuracy_score(y_val, val_pred_ensemble)
    cv_scores_reduced.append(fold_acc)

print(f"Reduced Features + Regularized CV: {np.mean(cv_scores_reduced):.4f} (+/- {np.std(cv_scores_reduced):.4f})")
print(f"Regularized (all features) CV:     {np.mean(cv_scores_reg):.4f} (+/- {np.std(cv_scores_reg):.4f})")
print(f"Original Voting Ensemble CV:       0.8372 (+/- 0.0239)")

In [None]:
# Summary and choose best approach
print("\n" + "="*60)
print("SUMMARY")
print("="*60)

# label -> (mean CV accuracy, std). The exp_001 entry is a recorded constant;
# the other two come from the CV runs in this notebook.
results = {
    'Original Voting (exp_001)': (0.8372, 0.0239),
    'Regularized (all features)': (np.mean(cv_scores_reg), np.std(cv_scores_reg)),
    'Regularized (reduced features)': (np.mean(cv_scores_reduced), np.std(cv_scores_reduced)),
}

print("\nCV Scores:")
for name, (mean, std) in results.items():
    print(f"  {name}: {mean:.4f} (+/- {std:.4f})")

# The entry with the highest mean CV accuracy wins; ties keep the earliest entry.
best_name = max(results, key=lambda label: results[label][0])
best_cv, best_std = results[best_name]

print(f"\nBest approach: {best_name}")
print(f"Best CV: {best_cv:.4f} (+/- {best_std:.4f})")

# Linear CV -> LB calibration with fixed coefficients.
# NOTE(review): the origin of 2.55 and -1.37 is not shown in this notebook.
expected_lb = 2.55 * best_cv - 1.37
print(f"\nExpected LB (using calibration): {expected_lb:.4f}")
print(f"Previous best LB: 0.7727")

In [None]:
# Generate submission using best approach
import os

print("\n" + "="*60)
print("GENERATING SUBMISSION")
print("="*60)

# Use the all-features ensemble when it is the overall best entry, or when its
# mean CV accuracy is at least that of the reduced-feature ensemble. Otherwise
# use the reduced-feature ensemble. Averaged test probabilities are
# thresholded at 0.5.
if 'Regularized (all features)' in best_name or np.mean(cv_scores_reg) >= np.mean(cv_scores_reduced):
    test_pred_binary = (test_predictions_reg >= 0.5).astype(int)
    print("Using: Regularized ensemble with all features")
else:
    test_pred_binary = (test_predictions_reduced >= 0.5).astype(int)
    print("Using: Regularized ensemble with reduced features")

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_pred_binary
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
print(f"\nComparison:")
print(f"  Original Voting: 255 died, 163 survived")
print(f"  This model:      {(test_pred_binary == 0).sum()} died, {(test_pred_binary == 1).sum()} survived")