# Evolver Loop 1 Analysis

This analysis identifies improvement opportunities for the Titanic competition.
Focus areas: feature patterns, error analysis, and proven techniques from the Kaggle meta.

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Load data
# Single place to change when the competition files live somewhere else.
DATA_DIR = '/home/data'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Quick sanity summary: frame shapes and class balance of the target.
print("Dataset shapes:")
print(f"Train: {train_df.shape}")
print(f"Test: {test_df.shape}")
print(f"\nTarget distribution: {train_df['Survived'].value_counts().to_dict()}")
print(f"Survival rate: {train_df['Survived'].mean():.3f}")

Dataset shapes:
Train: (891, 12)
Test: (418, 11)

Target distribution: {0: 549, 1: 342}
Survival rate: 0.384


In [7]:
# Count nulls per column in each split, then show only the columns that have any.
missing_train = train_df.isna().sum()
missing_test = test_df.isna().sum()

print("Missing values in training data:")
print(missing_train.loc[missing_train.gt(0)])

print("\nMissing values in test data:")
print(missing_test.loc[missing_test.gt(0)])

Missing values in training data:
Age         177
Cabin       687
Embarked      2
dtype: int64

Missing values in test data:
Age       86
Fare       1
Cabin    327
dtype: int64


In [8]:
# Analyze baseline features from exp_000
# Recreate baseline features
def create_baseline_features(df):
    """Return a copy of ``df`` with the baseline engineered features added.

    The input frame is left untouched. Columns added to the copy:

    * ``Title``         -- honorific parsed from ``Name`` (the word followed by
      a '.') and collapsed to Mr / Mrs / Miss / Master / Other. A title that is
      parsed but not listed in the mapping is bucketed as 'Other'; a name with
      no parsable title stays NaN.
    * ``FamilySize``    -- ``SibSp + Parch + 1``.
    * ``IsAlone``       -- 1 when ``FamilySize == 1``, otherwise 0.
    * ``AgeGroup``      -- ``Age`` cut into Child / Teen / Adult / MiddleAge /
      Senior using right-closed bins (0, 12], (12, 18], (18, 35], (35, 60],
      (60, 100]; missing ages stay NaN.
    * ``FarePerPerson`` -- ``Fare / FamilySize``.
    * ``Deck``          -- first character of ``Cabin``, 'Unknown' when the
      cabin is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``SibSp``, ``Parch``, ``Age``, ``Fare`` and
        ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns above appended.
    """
    df = df.copy()

    # Extract title from name
    raw_title = df['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)

    # Simplify titles
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other'
    }
    title = raw_title.map(title_mapping)
    # A title that was parsed but is absent from the mapping would otherwise
    # turn into NaN; route it to the catch-all bucket instead.
    unmapped = raw_title.notna() & title.isna()
    df['Title'] = title.mask(unmapped, 'Other')

    # Family size
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Is alone
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age groups
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                           labels=['Child', 'Teen', 'Adult', 'MiddleAge', 'Senior'])

    # Fare per person
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Extract deck from cabin
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('Unknown')

    return df

train_baseline = create_baseline_features(train_df)

# Feature groups of the baseline experiment
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeGroup', 'Deck']

# For each key categorical feature: group size, number of survivors, survival rate
print("Survival rates by key features:")
for group_col in ('Pclass', 'Sex', 'Title', 'Deck'):
    print(f"\nBy {group_col}:")
    survival_table = train_baseline.groupby(group_col)['Survived'].agg(['count', 'sum', 'mean'])
    print(survival_table)

Survival rates by key features:

By Pclass:
        count  sum      mean
Pclass                      
1         216  136  0.629630
2         184   87  0.472826
3         491  119  0.242363

By Sex:
        count  sum      mean
Sex                         
female    314  233  0.742038
male      577  109  0.188908

By Title:
        count  sum      mean
Title                       
Master     40   23  0.575000
Miss      185  130  0.702703
Mr        517   81  0.156673
Mrs       126  100  0.793651
Other      23    8  0.347826

By Deck:
         count  sum      mean
Deck                         
A           15    7  0.466667
B           47   35  0.744681
C           59   35  0.593220
D           33   25  0.757576
E           32   24  0.750000
F           13    8  0.615385
G            4    2  0.500000
T            1    0  0.000000
Unknown    687  206  0.299854


In [None]:
# Analyze ticket patterns - potential improvement area
print("Ticket value examples:")
print(train_df['Ticket'].head(20).tolist())

# Extract ticket prefix patterns.
# The prefix is everything before the trailing ticket number, with dots,
# slashes and spaces removed and upper-cased, so spelling variants such as
# 'C.A.', 'CA.' and 'CA' collapse into one value and 'A/5' is kept distinct
# from 'A/4'. (A plain '^([A-Z/]+)' match stops at the first '.', digit or
# lowercase letter and would turn 'C.A.' into 'C' and 'A/5' into 'A/'.)
ticket_prefix = (
    train_df['Ticket']
    .str.replace(r'\s*\d+$', '', regex=True)   # drop the trailing ticket number
    .str.replace(r'[./\s]', '', regex=True)    # strip punctuation / inner spaces
    .str.upper()
)
# Purely numeric tickets (empty prefix) and missing tickets are labelled 'None'.
train_df['TicketPrefix'] = ticket_prefix.mask(ticket_prefix.isna() | ticket_prefix.eq(''), 'None')

print("\nTicket prefixes and survival rates:")
ticket_analysis = train_df.groupby('TicketPrefix')['Survived'].agg(['count', 'sum', 'mean']).sort_values('count', ascending=False)
print(ticket_analysis.head(10))

# Analyze ticket frequency (shared tickets = families/groups)
# Number of training rows that carry the same ticket string as this row.
train_df['TicketFreq'] = train_df.groupby('Ticket')['Ticket'].transform('count')
print(f"\nTicket frequency distribution:")
print(train_df['TicketFreq'].value_counts().head())

print("\nSurvival by ticket frequency:")
print(train_df.groupby('TicketFreq')['Survived'].agg(['count', 'mean']))

# Also add Deck feature for later use
# (first character of Cabin; 'Unknown' when the cabin is missing)
train_df['Deck'] = train_df['Cabin'].str[0]
train_df['Deck'] = train_df['Deck'].fillna('Unknown')

In [None]:
# Analyze cabin patterns more deeply
print("Cabin examples:")
print(train_df['Cabin'].dropna().head(20).tolist())

# Extract more detailed cabin information
# First run of digits in the cabin string; NaN when the cabin is missing or
# carries no number.
train_df['CabinNum'] = train_df['Cabin'].str.extract('([0-9]+)', expand=False)
train_df['CabinNum'] = pd.to_numeric(train_df['CabinNum'], errors='coerce')

# Cabin location (odd/even might indicate port/starboard)
# The missing case has to be tested first: `NaN % 2 == 0` is False, so a plain
# np.where(..., 'Even', 'Odd') would label every passenger without a cabin
# number as 'Odd' and a later fillna would never fire.
cabin_num = train_df['CabinNum']
train_df['CabinSide'] = np.select(
    [cabin_num.isna(), cabin_num % 2 == 0],
    ['Unknown', 'Even'],
    default='Odd',
)

print("\nSurvival by cabin side (odd/even):")
cabin_side_analysis = train_df.groupby('CabinSide')['Survived'].agg(['count', 'sum', 'mean'])
print(cabin_side_analysis)

# Analyze cabin deck + side combination
# Relies on the 'Deck' column added to train_df by the ticket-analysis cell.
train_df['DeckSide'] = train_df['Deck'] + '_' + train_df['CabinSide']
print("\nSurvival by deck and side:")
deck_side_analysis = train_df.groupby('DeckSide')['Survived'].agg(['count', 'mean']).sort_values('mean', ascending=False)
print(deck_side_analysis.head(10))

In [None]:
# Analyze name features
print("Name examples:")
print(train_df['Name'].head(10).tolist())

# Name length
train_df['NameLength'] = train_df['Name'].str.len()
print(f"\nName length statistics:")
print(train_df['NameLength'].describe())

# Correlation with survival
name_length_corr = train_df['NameLength'].corr(train_df['Survived'])
print(f"\nCorrelation between name length and survival: {name_length_corr:.3f}")

# Analyze fare patterns more deeply
print("\nFare statistics:")
print(train_df['Fare'].describe())

# Fare bins
# pd.cut intervals are right-closed, so without include_lowest=True the first
# bin is (0, 7.91] and every passenger with Fare == 0 gets NaN and silently
# drops out of the table below.
fare_bins = [0, 7.91, 14.45, 31.0, 100, 600]
fare_labels = ['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh']
train_df['FareBin'] = pd.cut(train_df['Fare'], bins=fare_bins, labels=fare_labels,
                             include_lowest=True)

print("\nSurvival by fare bins:")
# observed=False keeps every label in the table even if a bin is empty.
fare_bin_analysis = train_df.groupby('FareBin', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
print(fare_bin_analysis)

In [None]:
# Feature importance analysis using baseline model
from sklearn.metrics import accuracy_score

# Columns fed to the baseline model, split by how they are preprocessed
numeric_features_baseline = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson']
categorical_features_baseline = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeGroup', 'Deck']

y = train_baseline['Survived']
X_baseline = train_baseline[numeric_features_baseline + categorical_features_baseline]

# Numeric columns: median imputation only
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill gaps with a sentinel, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features_baseline),
    ('cat', categorical_transformer, categorical_features_baseline),
])

# Preprocessing + random forest, fitted on the full training frame
rf_baseline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
])
rf_baseline.fit(X_baseline, y)

# Rebuild the post-encoding column names. The order mirrors the
# ColumnTransformer above: numeric block first, then the one-hot columns.
fitted_preprocessor = rf_baseline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
categorical_features_encoded = list(
    fitted_encoder.get_feature_names_out(categorical_features_baseline)
)
all_features = numeric_features_baseline + categorical_features_encoded

importances = rf_baseline.named_steps['classifier'].feature_importances_

feature_importance_df = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values('importance', ascending=False)

print("Top 20 feature importances:")
print(feature_importance_df.head(20))

In [None]:
# Analyze misclassifications to identify patterns
from sklearn.model_selection import cross_val_predict

# Get cross-validated predictions
y_pred = cross_val_predict(rf_baseline, X_baseline, y, cv=5)

# Identify misclassifications
# Boolean Series indexed like y: True where the prediction is wrong.
misclassified = (y_pred != y)
misclassified_indices = misclassified[misclassified].index

print(f"Total misclassified: {misclassified.sum()} out of {len(y)} ({misclassified.mean():.1%})")

# Analyze misclassifications by groups
# Error rate per group = misclassified rows / all rows of that group.
# count = group size, sum = number misclassified, mean = error rate.
# (A crosstab over only the misclassified rows, normalised per row, would
# print 1.0 for every group and say nothing about the rate.)
for group_col in ('Pclass', 'Sex', 'Title'):
    print(f"\nMisclassification rates by {group_col}:")
    print(misclassified.groupby(train_baseline[group_col]).agg(['count', 'sum', 'mean']))

# Look at some specific misclassified cases
print("\nSample misclassified cases (True label -> Predicted):")
misclassified_df = train_baseline.loc[misclassified_indices].copy()
misclassified_df['TrueLabel'] = y.loc[misclassified_indices]
# y_pred is a positional NumPy array, so select with the boolean mask rather
# than with index labels.
misclassified_df['PredictedLabel'] = y_pred[misclassified.to_numpy()]

print(misclassified_df[['Name', 'Pclass', 'Sex', 'Age', 'Title', 'Fare', 'TrueLabel', 'PredictedLabel']].head(10))