# Evolver Loop 1 Analysis

Analysis of current state and identification of improvement opportunities.

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the train/test splits. The directory lives in a single constant so the
# notebook can be pointed at another data location with one edit.
DATA_DIR = '/home/data'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("\nTrain info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()

In [None]:
# Load the metrics written by the baseline experiment (001).
# The encoding is given explicitly so decoding the JSON does not depend on the
# platform's default locale encoding.
with open('/home/code/experiments/001_baseline_results.json', 'r', encoding='utf-8') as f:
    baseline_results = json.load(f)

print("Baseline Results:")
# The plus-minus sign was previously mis-encoded (UTF-8 bytes read as Latin-1),
# which printed two garbage characters instead of a single "±".
print(f"Mean CV Score: {baseline_results['mean_cv_score']:.4f} ± {baseline_results['std_cv_score']:.4f}")
print(f"OOF Accuracy: {baseline_results['oof_accuracy']:.4f}")
print(f"CV Scores: {[f'{score:.4f}' for score in baseline_results['cv_scores']]}")

# Feature importance
# NOTE(review): head(10) shows the first ten rows in stored order; they are the
# "top" features only if the baseline saved them sorted by importance - confirm.
importance_df = pd.DataFrame(baseline_results['feature_importance'])
print("\nTop Features:")
print(importance_df.head(10))

In [None]:
# Per-column null counts for the training split ...
print("Missing Values in Train:")
train_missing = train_df.isna().sum()
print(train_missing)

# ... and for the test split.
print("\nMissing Values in Test:")
test_missing = test_df.isna().sum()
print(test_missing)

# Class balance of the target, expressed as fractions rather than raw counts.
print("\nTarget distribution in train:")
target_share = train_df['Survived'].value_counts(normalize=True)
print(target_share)

In [None]:
# Analyze feature distributions by target
def summarize_survival(df, feature, target='Survived'):
    """Return per-group row count, target sum and target mean.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to group by.
    target : str, default 'Survived'
        Name of the column to aggregate. Assumed to be a 0/1 indicator, so
        that its sum is a survivor count and its mean a survival rate.

    Returns
    -------
    pd.DataFrame
        One row per distinct value of ``feature`` with the columns
        ``Total`` (count), ``Survived`` (sum) and ``Survival_Rate`` (mean).
    """
    summary = df.groupby(feature)[target].agg(['count', 'sum', 'mean'])
    summary.columns = ['Total', 'Survived', 'Survival_Rate']
    return summary


# Sex feature
print("Survival by Sex:")
sex_survival = summarize_survival(train_df, 'Sex')
print(sex_survival)

# Pclass feature
print("\nSurvival by Pclass:")
pclass_survival = summarize_survival(train_df, 'Pclass')
print(pclass_survival)

# Embarked feature
# groupby leaves out rows whose key is NaN, so any passengers with a missing
# Embarked value are not counted in this table.
print("\nSurvival by Embarked:")
embarked_survival = summarize_survival(train_df, 'Embarked')
print(embarked_survival)

In [None]:
# Analyze Age patterns
print("Age statistics:")
print(train_df['Age'].describe())

# Create age groups and analyze survival.
# pd.cut builds right-closed intervals by default, i.e. (0, 12], (12, 18],
# (18, 35], (35, 60], (60, 100]; values outside them (and missing ages)
# become NaN in AgeGroup.
age_bins = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=age_bins, labels=age_labels)

print("\nSurvival by AgeGroup:")
# AgeGroup is categorical. `observed` is passed explicitly because relying on
# its default raises a FutureWarning in recent pandas and the default differs
# between versions; observed=False keeps one row per label, even if empty.
age_survival = train_df.groupby('AgeGroup', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
age_survival.columns = ['Total', 'Survived', 'Survival_Rate']
print(age_survival)

In [None]:
# Family-related features.
# FamilySize is SibSp + Parch + 1 (the +1 presumably counts the passenger).
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch']).add(1)
print("Survival by FamilySize:")
family_survival = (
    train_df.groupby('FamilySize')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(family_survival.head(10))

# IsAlone is 1 when FamilySize equals 1, otherwise 0.
train_df['IsAlone'] = train_df['FamilySize'].eq(1).astype(int)
print("\nSurvival by IsAlone:")
alone_survival = (
    train_df.groupby('IsAlone')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(alone_survival)

In [None]:
# Analyze Fare patterns
print("Fare statistics:")
print(train_df['Fare'].describe())

# Create fare bins: five quantile-based buckets (quintiles), labelled from the
# cheapest to the most expensive fares.
train_df['FareBin'] = pd.qcut(train_df['Fare'], q=5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])
print("\nSurvival by FareBin:")
# FareBin is categorical. `observed` is passed explicitly because relying on
# its default raises a FutureWarning in recent pandas and the default differs
# between versions; observed=False keeps one row per label, even if empty.
fare_survival = train_df.groupby('FareBin', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
fare_survival.columns = ['Total', 'Survived', 'Survival_Rate']
print(fare_survival)

In [None]:
# Title analysis.
# The pattern captures the first run of letters that follows a space and is
# immediately followed by a period; names without such a match yield NaN.
title_pattern = ' ([A-Za-z]+)\\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)

print("Title distribution:")
title_counts = train_df['Title'].value_counts()
print(title_counts)

print("\nSurvival by Title:")
title_survival = (
    train_df.groupby('Title')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(title_survival)

In [None]:
# Cabin analysis.
# HasCabin flags rows with a non-null Cabin; CabinLetter is the first character
# of the Cabin string, with 'Unknown' standing in where Cabin is missing.
cabin_col = train_df['Cabin']
train_df['HasCabin'] = cabin_col.notnull().astype(int)
train_df['CabinLetter'] = cabin_col.str.get(0).fillna('Unknown')

print("Survival by HasCabin:")
cabin_survival = (
    train_df.groupby('HasCabin')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(cabin_survival)

print("\nSurvival by CabinLetter:")
cabin_letter_survival = (
    train_df.groupby('CabinLetter')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(cabin_letter_survival)

In [None]:
# Candidate features for the next iteration.
print("Potential new feature ideas:")

# 1. Length of the Name string, and its correlation with the target.
train_df['NameLength'] = train_df['Name'].str.len()
name_length_corr = train_df['NameLength'].corr(train_df['Survived'])
print(f"\n1. NameLength correlation with survival: {name_length_corr:.4f}")

# 2. Ticket prefix: the first run of letters found in Ticket; tickets that
#    contain no letters at all are labelled 'None'.
train_df['TicketPrefix'] = (
    train_df['Ticket']
    .str.extract('([A-Za-z]+)', expand=False)
    .fillna('None')
)
print("\n2. TicketPrefix distribution:")
print(train_df['TicketPrefix'].value_counts().head())

# 3. Fare per person (already in baseline). Needs the FamilySize column that
#    must already exist on train_df when this cell runs.
train_df['FarePerPerson'] = train_df['Fare'].div(train_df['FamilySize'])
fare_pp_corr = train_df['FarePerPerson'].corr(train_df['Survived'])
print(f"\n3. FarePerPerson correlation with survival: {fare_pp_corr:.4f}")

# 4. Family survival rate - only noted here, not computed, because it would
#    have to be built inside cross-validation to avoid target leakage.
print("\n4. Family survival rate feature - needs careful implementation to avoid leakage")

In [None]:
# Data-leakage notes: a purely textual report; the only computed value is the
# baseline mean CV score read from baseline_results.
print("Data Leakage Analysis:")
print("=" * 50)

# Section 1 - how the baseline currently preprocesses the data.
print("\n".join((
    "\n1. Current preprocessing approach:",
    "- Median imputation: Done globally before CV",
    "- Label encoding: Fit on combined train+test before CV",
    "- This leaks test distribution information into training",
)))

# Section 2 - why the reported CV score may be too optimistic.
print("\n2. Impact on CV scores:")
print(f"- Current CV score: {baseline_results['mean_cv_score']:.4f}")
print("\n".join((
    "- This may be optimistic due to leakage",
    "- LB score may be lower than CV",
)))

# Section 3 - remediation steps for the next experiment.
print("\n".join((
    "\n3. What needs to be fixed:",
    "- Move ALL preprocessing inside CV loop",
    "- Fit imputers/encoders on train fold only",
    "- Transform val/test with fitted transformers",
)))