# Evolver Loop 3 Analysis: Interaction Features & Misclassification Patterns

This notebook investigates class-gender interaction patterns and prepares for the next experiment incorporating validated features and XGBoost.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Read both splits from their fixed CSV locations
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the (rows, columns) of each split
for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_label} shape: {split_frame.shape}")

# Show the target's class balance as proportions rather than raw counts
print("\nTarget distribution:")
target_share = train['Survived'].value_counts(normalize=True)
print(target_share)

In [None]:
# Feature engineering functions from previous analysis
def engineer_features(df):
    """Return a copy of ``df`` with engineered Titanic features added.

    The input frame is left untouched; all work happens on a copy.

    Reads the columns ``Name``, ``SibSp``, ``Parch``, ``Age``, ``Fare``,
    ``Cabin`` and ``Ticket`` and adds:

    * ``Title`` - honorific parsed from ``Name`` (uncommon ones -> ``'Rare'``,
      French/alternate spellings folded into ``Miss``/``Mrs``)
    * ``FamilySize``, ``IsAlone`` - family size including the passenger
    * ``AgeGroup`` - categorical age band (missing ages stay missing)
    * ``FarePerPerson`` - ``Fare`` divided by ``FamilySize``
    * ``Deck`` - first character of ``Cabin`` or ``'Unknown'``
    * ``TicketFreq`` - number of rows in *this frame* sharing the ticket
    * ``CabinNumber``, ``CabinSide`` - first number in ``Cabin`` and its
      parity (``'Even'``/``'Odd'``/``'Unknown'``)
    * ``NameLength`` - character length of ``Name``
    * ``FareBin5`` - quintile bin of ``Fare`` computed on *this frame*

    Note that ``TicketFreq`` and ``FareBin5`` are derived from the frame
    passed in, so train and test get their own counts and quantile edges
    when processed separately.
    """
    df = df.copy()

    # Title extraction: the letters between a space and the first period.
    # Raw string avoids the invalid-escape warning for "\.".
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age groups (right-closed bins; rows with missing Age get NaN)
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                            labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'])

    # Fare per person
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Deck from cabin
    df['Deck'] = df['Cabin'].str[0].fillna('Unknown')

    # NEW FEATURES from validation
    # Ticket frequency (counted within the frame being processed)
    ticket_counts = df['Ticket'].value_counts()
    df['TicketFreq'] = df['Ticket'].map(ticket_counts)

    # Cabin side (odd/even) from the first number found in the cabin string
    df['CabinNumber'] = df['Cabin'].str.extract(r'(\d+)', expand=False)
    df['CabinNumber'] = pd.to_numeric(df['CabinNumber'], errors='coerce')
    # Rows without a cabin number fall through the mapping as NaN and are
    # labelled explicitly instead of relying on a NaN dictionary key.
    df['CabinSide'] = (df['CabinNumber'] % 2).map({0: 'Even', 1: 'Odd'}).fillna('Unknown')

    # Name length
    df['NameLength'] = df['Name'].str.len()

    # Fare binning (5 quantile-based categories)
    df['FareBin5'] = pd.qcut(df['Fare'], q=5,
                             labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])

    return df

# Run the same feature pipeline over each split independently
train_fe, test_fe = (engineer_features(split) for split in (train, test))

print("Features engineered successfully")
print("New features added: TicketFreq, CabinSide, NameLength, FareBin5")

In [None]:
# Class/gender interaction analysis: build crossed categorical features, then
# look at survival rates for the subgroups flagged as hard to classify.

banner = "=" * 60

# Every interaction is "<base column>_Sex", built identically on train and test
interactions = []
for base_col in ('Pclass', 'AgeGroup', 'FareBin5'):
    crossed_name = f'{base_col}_Sex'
    for frame in (train_fe, test_fe):
        frame[crossed_name] = frame[base_col].astype(str) + '_' + frame['Sex']
    interactions.append(crossed_name)

print("Interaction features created:")
for inter in interactions:
    print(f"- {inter}")

# Survival statistics per Pclass x Sex cell, best-surviving group first
print("\n" + banner)
print("SURVIVAL RATES BY CLASS-GENDER INTERACTION")
print(banner)

survival_by_interaction = (
    train_fe.groupby('Pclass_Sex')['Survived']
    .agg(['count', 'sum', 'mean'])
    .assign(survival_rate=lambda stats: stats['mean'])
    .sort_values('survival_rate', ascending=False)
)

for group_key, rate, n_survived, n_total in zip(
    survival_by_interaction.index,
    survival_by_interaction['survival_rate'],
    survival_by_interaction['sum'],
    survival_by_interaction['count'],
):
    print(f"{group_key:>12}: {rate:.1%} survival ({int(n_survived)}/{int(n_total)} passengers)")

# Subgroups singled out as carrying a high misclassification risk
print("\n" + banner)
print("PROBLEMATIC GROUPS (High misclassification risk)")
print(banner)

is_female = train_fe['Sex'] == 'female'
is_male = train_fe['Sex'] == 'male'


def _survival_summary(label, group):
    """Format one subgroup line: rate plus survivors/total."""
    outcomes = group['Survived']
    return f"{label}: {outcomes.mean():.1%} survival ({outcomes.sum()}/{len(group)})"


# Third-class females
third_class_females = train_fe[(train_fe['Pclass'] == 3) & is_female]
print(_survival_summary("3rd class females", third_class_females))

# First/second-class males
high_class_males = train_fe[train_fe['Pclass'].isin([1, 2]) & is_male]
print(_survival_summary("1st/2nd class males", high_class_males))

# Males aged 25 or under travelling in 2nd/3rd class (rows with missing Age
# fail the comparison and are therefore excluded)
young_males = train_fe[is_male & (train_fe['Age'] <= 25) & train_fe['Pclass'].isin([2, 3])]
if len(young_males):
    print(_survival_summary("Young males (≤25) in 2nd/3rd", young_males))

In [None]:
# Quick test with interaction features: 5-fold stratified CV comparing a random
# forest on the baseline features against one that also sees the interactions.
from sklearn.ensemble import RandomForestClassifier


def _make_preprocessor(columns, categorical):
    """Build an ordinal-encoding ColumnTransformer for exactly ``columns``.

    Only the entries of ``categorical`` that appear in ``columns`` are routed
    to the encoder; all other columns are passed through unchanged. Building
    one transformer per feature set matters because a ColumnTransformer that
    names a column absent from the fitted frame raises an error.
    """
    present = [col for col in columns if col in categorical]
    return ColumnTransformer(
        transformers=[
            ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), present)
        ], remainder='passthrough'
    )


# Prepare features
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
                'Title', 'FamilySize', 'IsAlone', 'AgeGroup', 'FarePerPerson', 'Deck',
                'TicketFreq', 'CabinSide', 'NameLength', 'FareBin5'] + interactions

train_clean = train_fe[feature_cols + ['Survived']].copy()
test_clean = test_fe[feature_cols].copy()

# Split columns by dtype. Everything non-numeric is treated as categorical so
# that the category-dtype columns produced by pd.cut / pd.qcut (AgeGroup,
# FareBin5) are imputed and encoded too, not only object columns.
numeric_cols = train_clean.select_dtypes(include=[np.number]).columns
categorical_cols = train_clean.select_dtypes(exclude=[np.number]).columns
categorical_features = [col for col in categorical_cols if col != 'Survived']

# Impute with statistics taken from the training frame only. Assign the result
# back instead of calling fillna(inplace=True) on a column selection, which is
# chained assignment and has no effect under pandas copy-on-write.
for col in numeric_cols:
    if col != 'Survived':
        median_val = train_clean[col].median()
        train_clean[col] = train_clean[col].fillna(median_val)
        test_clean[col] = test_clean[col].fillna(median_val)

for col in categorical_features:
    mode_val = train_clean[col].mode()[0]
    # Cast to object after filling so the encoder sees plain labels
    train_clean[col] = train_clean[col].fillna(mode_val).astype(object)
    test_clean[col] = test_clean[col].fillna(mode_val).astype(object)

# One preprocessor per feature set; `preprocessor` covers the full feature set
baseline_cols = [col for col in feature_cols if col not in interactions]
baseline_preprocessor = _make_preprocessor(baseline_cols, categorical_features)
preprocessor = _make_preprocessor(feature_cols, categorical_features)

X = train_clean.drop('Survived', axis=1)
y = train_clean['Survived']

# Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Test with and without interactions
scores_without = []
scores_with = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Without interactions (baseline features only); encoder fitted on the
    # training fold only
    X_train_base = X_train[baseline_cols]
    X_val_base = X_val[baseline_cols]

    X_train_base_enc = baseline_preprocessor.fit_transform(X_train_base)
    X_val_base_enc = baseline_preprocessor.transform(X_val_base)

    rf_base = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    rf_base.fit(X_train_base_enc, y_train)
    pred_base = rf_base.predict(X_val_base_enc)
    score_base = accuracy_score(y_val, pred_base)
    scores_without.append(score_base)

    # With interactions
    X_train_enc = preprocessor.fit_transform(X_train)
    X_val_enc = preprocessor.transform(X_val)

    rf_inter = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    rf_inter.fit(X_train_enc, y_train)
    pred_inter = rf_inter.predict(X_val_enc)
    score_inter = accuracy_score(y_val, pred_inter)
    scores_with.append(score_inter)

    print(f"Fold {fold+1}: Baseline={score_base:.4f}, With interactions={score_inter:.4f} (diff: {score_inter-score_base:+.4f})")

print(f"\n{'='*60}")
print(f"Average without interactions: {np.mean(scores_without):.4f} ± {np.std(scores_without):.4f}")
print(f"Average with interactions: {np.mean(scores_with):.4f} ± {np.std(scores_with):.4f}")
# Signed format so a regression is not printed as "+-0.0012"
print(f"Improvement: {np.mean(scores_with) - np.mean(scores_without):+.4f}")

In [None]:
# Feature importance analysis with interactions
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Refit the encoder and a forest on every training row; the importances are
# read from this single full-data fit
X_enc = preprocessor.fit(X).transform(X)
rf_full = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_enc, y)

# Rebuild column names in the order the fitted transformer reports them:
# the 'cat' entry contributes its own column list, any other entry is taken
# to be the passthrough remainder (the non-categorical columns of X)
feature_names = []
for name, transformer, columns in preprocessor.transformers_:
    if name != 'cat':
        passthrough_cols = [col for col in X.columns if col not in categorical_features]
        feature_names.extend(passthrough_cols)
        continue
    feature_names.extend(columns)

importances = rf_full.feature_importances_
indices = np.argsort(importances)[::-1]  # positions sorted by descending importance

print("Top 15 most important features:")
top_n = min(15, len(feature_names))
for rank, idx in enumerate(indices[:top_n], start=1):
    print(f"{rank:2d}. {feature_names[idx]:<20} {importances[idx]:.4f}")

# Report each interaction feature's importance, or note that it is absent
print("\nInteraction feature importances:")
for inter in interactions:
    try:
        idx = feature_names.index(inter)
    except ValueError:
        print(f"- {inter}: Not found in features")
    else:
        print(f"- {inter}: {importances[idx]:.4f}")