# Evolver Loop 1 Analysis

Analysis of current state and identification of improvement opportunities.

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load the train and test CSVs into DataFrames.
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("\nTrain info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_df.info()

Train shape: (891, 12)
Test shape: (418, 11)

Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [2]:
# Read the metrics stored by the baseline experiment (001).
with open('/home/code/experiments/001_baseline_results.json', 'r') as results_file:
    baseline_results = json.load(results_file)

print("Baseline Results:")
cv_mean = baseline_results['mean_cv_score']
cv_std = baseline_results['std_cv_score']
print(f"Mean CV Score: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"OOF Accuracy: {baseline_results['oof_accuracy']:.4f}")
# Per-fold scores, each rendered with four decimals and shown as a list of strings.
fold_scores = [format(fold_score, '.4f') for fold_score in baseline_results['cv_scores']]
print(f"CV Scores: {fold_scores}")

# Feature importances from the results file; display the first ten rows as stored.
importance_df = pd.DataFrame(baseline_results['feature_importance'])
print("\nTop Features:")
print(importance_df.head(10))

Baseline Results:
Mean CV Score: 0.8541 ± 0.0226
OOF Accuracy: 0.8541
CV Scores: ['0.8827', '0.8652', '0.8146', '0.8483', '0.8596']

Top Features:
         feature   importance
0            Sex  1524.733730
1  FarePerPerson   602.632620
2            Age   579.309369
3           Fare   539.415274
4         Pclass   453.235429
5          Title   373.418227
6    CabinLetter   221.311582
7       Embarked   136.412549
8          SibSp   100.128826
9     FamilySize    86.340292


In [3]:
# Count missing entries per column, first for train and then for test.
train_missing = train_df.isna().sum()
print("Missing Values in Train:")
print(train_missing)

test_missing = test_df.isna().sum()
print("\nMissing Values in Test:")
print(test_missing)

# Class balance of the target, expressed as proportions rather than raw counts.
survived_share = train_df['Survived'].value_counts(normalize=True)
print("\nTarget distribution in train:")
print(survived_share)

Missing Values in Train:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing Values in Test:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Target distribution in train:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [4]:
# Survival broken down by three categorical features. Each table holds the
# group size (Total), the number of survivors (Survived) and their ratio
# (Survival_Rate), built with named aggregation so the columns are labelled
# in a single step.

# Sex
print("Survival by Sex:")
sex_survival = train_df.groupby('Sex')['Survived'].agg(
    Total='count', Survived='sum', Survival_Rate='mean'
)
print(sex_survival)

# Passenger class
print("\nSurvival by Pclass:")
pclass_survival = train_df.groupby('Pclass')['Survived'].agg(
    Total='count', Survived='sum', Survival_Rate='mean'
)
print(pclass_survival)

# Port of embarkation
print("\nSurvival by Embarked:")
embarked_survival = train_df.groupby('Embarked')['Survived'].agg(
    Total='count', Survived='sum', Survival_Rate='mean'
)
print(embarked_survival)

Survival by Sex:
        Total  Survived  Survival_Rate
Sex                                   
female    314       233       0.742038
male      577       109       0.188908

Survival by Pclass:
        Total  Survived  Survival_Rate
Pclass                                
1         216       136       0.629630
2         184        87       0.472826
3         491       119       0.242363

Survival by Embarked:
          Total  Survived  Survival_Rate
Embarked                                
C           168        93       0.553571
Q            77        30       0.389610
S           644       217       0.336957


In [5]:
# Summary statistics of the Age column (describe() ignores missing values).
print("Age statistics:")
print(train_df['Age'].describe())

# Bucket ages into five labelled bins; pd.cut yields a categorical column, and
# rows whose Age is missing receive a missing AgeGroup.
age_bins = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
train_df['AgeGroup'] = pd.cut(train_df['Age'], bins=age_bins, labels=age_labels)

print("\nSurvival by AgeGroup:")
# AgeGroup is categorical: grouping on it without an explicit `observed`
# argument raises a pandas FutureWarning because the default is changing.
# observed=False pins the current behaviour (all categories are kept) and
# silences the warning.
age_survival = train_df.groupby('AgeGroup', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
age_survival.columns = ['Total', 'Survived', 'Survival_Rate']
print(age_survival)

Age statistics:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

Survival by AgeGroup:
            Total  Survived  Survival_Rate
AgeGroup                                  
Child          69        40       0.579710
Teen           70        30       0.428571
YoungAdult    358       137       0.382682
Adult         195        78       0.400000
Senior         22         5       0.227273


  age_survival = train_df.groupby('AgeGroup')['Survived'].agg(['count', 'sum', 'mean'])


In [6]:
# Family size = siblings/spouses + parents/children + the passenger themself.
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch']).add(1)
print("Survival by FamilySize:")
family_survival = (
    train_df.groupby('FamilySize')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(family_survival.head(10))

# IsAlone is 1 when the passenger travels with no relatives (FamilySize == 1), else 0.
train_df['IsAlone'] = train_df['FamilySize'].eq(1).astype(int)
print("\nSurvival by IsAlone:")
alone_survival = (
    train_df.groupby('IsAlone')['Survived']
    .agg(['count', 'sum', 'mean'])
    .rename(columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival_Rate'})
)
print(alone_survival)

Survival by FamilySize:
            Total  Survived  Survival_Rate
FamilySize                                
1             537       163       0.303538
2             161        89       0.552795
3             102        59       0.578431
4              29        21       0.724138
5              15         3       0.200000
6              22         3       0.136364
7              12         4       0.333333
8               6         0       0.000000
11              7         0       0.000000

Survival by IsAlone:
         Total  Survived  Survival_Rate
IsAlone                                
0          354       179       0.505650
1          537       163       0.303538


In [7]:
# Summary statistics of the Fare column.
print("Fare statistics:")
print(train_df['Fare'].describe())

# Split fares into five quantile-based bins; pd.qcut yields a categorical column.
train_df['FareBin'] = pd.qcut(train_df['Fare'], q=5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])
print("\nSurvival by FareBin:")
# FareBin is categorical: grouping on it without an explicit `observed`
# argument raises a pandas FutureWarning because the default is changing.
# observed=False pins the current behaviour (all categories are kept) and
# silences the warning.
fare_survival = train_df.groupby('FareBin', observed=False)['Survived'].agg(['count', 'sum', 'mean'])
fare_survival.columns = ['Total', 'Survived', 'Survival_Rate']
print(fare_survival)

Fare statistics:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

Survival by FareBin:
          Total  Survived  Survival_Rate
FareBin                                 
VeryLow     179        39       0.217877
Low         184        37       0.201087
Medium      172        73       0.424419
High        180        80       0.444444
VeryHigh    176       113       0.642045


  fare_survival = train_df.groupby('FareBin')['Survived'].agg(['count', 'sum', 'mean'])


In [8]:
# Pull the honorific out of Name: the first run of letters that follows a
# space and is immediately followed by a period.
title_pattern = ' ([A-Za-z]+)\\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)

title_counts = train_df['Title'].value_counts()
print("Title distribution:")
print(title_counts)

print("\nSurvival by Title:")
title_survival = (
    train_df.groupby('Title')['Survived']
    .agg(['count', 'sum', 'mean'])
    .set_axis(['Total', 'Survived', 'Survival_Rate'], axis=1)
)
print(title_survival)

Title distribution:
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

Survival by Title:
          Total  Survived  Survival_Rate
Title                                   
Capt          1         0       0.000000
Col           2         1       0.500000
Countess      1         1       1.000000
Don           1         0       0.000000
Dr            7         3       0.428571
Jonkheer      1         0       0.000000
Lady          1         1       1.000000
Major         2         1       0.500000
Master       40        23       0.575000
Miss        182       127       0.697802
Mlle          2         2       1.000000
Mme           1         1       1.000000
Mr          517        81       0.156673
Mrs         125        99       0.792000


In [9]:
# Cabin-derived features:
#   HasCabin    - 1 when a cabin value is recorded, 0 when it is missing
#   CabinLetter - first character of the cabin string, 'Unknown' when missing
cabin_recorded = train_df['Cabin'].notna()
train_df['HasCabin'] = cabin_recorded.astype(int)
train_df['CabinLetter'] = train_df['Cabin'].str[0].fillna('Unknown')

print("Survival by HasCabin:")
cabin_survival = train_df.groupby('HasCabin')['Survived'].agg(
    Total='count', Survived='sum', Survival_Rate='mean'
)
print(cabin_survival)

print("\nSurvival by CabinLetter:")
cabin_letter_survival = train_df.groupby('CabinLetter')['Survived'].agg(
    Total='count', Survived='sum', Survival_Rate='mean'
)
print(cabin_letter_survival)

Survival by HasCabin:
          Total  Survived  Survival_Rate
HasCabin                                
0           687       206       0.299854
1           204       136       0.666667

Survival by CabinLetter:
             Total  Survived  Survival_Rate
CabinLetter                                
A               15         7       0.466667
B               47        35       0.744681
C               59        35       0.593220
D               33        25       0.757576
E               32        24       0.750000
F               13         8       0.615385
G                4         2       0.500000
T                1         0       0.000000
Unknown        687       206       0.299854


In [11]:
# Quick probes of candidate features; each one is added to train_df as a column.
print("Potential new feature ideas:")

# 1. Number of characters in the passenger's name
train_df['NameLength'] = train_df['Name'].str.len()
name_length_corr = train_df['NameLength'].corr(train_df['Survived'])
print(f"\n1. NameLength correlation with survival: {name_length_corr:.4f}")

# 2. Ticket prefix: the first run of letters in the ticket string, or the
#    literal string 'None' when the ticket contains no letters at all
train_df['TicketPrefix'] = (
    train_df['Ticket'].str.extract('([A-Za-z]+)', expand=False).fillna('None')
)
prefix_counts = train_df['TicketPrefix'].value_counts()
print("\n2. TicketPrefix distribution:")
print(prefix_counts.head())

# 3. Fare split evenly across the family (already in baseline);
#    relies on the FamilySize column being present on train_df
train_df['FarePerPerson'] = train_df['Fare'].div(train_df['FamilySize'])
fare_pp_corr = train_df['FarePerPerson'].corr(train_df['Survived'])
print(f"\n3. FarePerPerson correlation with survival: {fare_pp_corr:.4f}")

# 4. Family survival rate (would need cross-validation to avoid leakage)
print("\n4. Family survival rate feature - needs careful implementation to avoid leakage")

Potential new feature ideas:

1. NameLength correlation with survival: 0.3323

2. TicketPrefix distribution:
TicketPrefix
None    661
PC       60
C        33
A        29
STON     18
Name: count, dtype: int64

3. FarePerPerson correlation with survival: 0.2216

4. Family survival rate feature - needs careful implementation to avoid leakage


In [12]:
# Print the data-leakage notes for the baseline as a three-part text report.
# Each section is emitted with one print call, one message per output line.
print("Data Leakage Analysis:", "=" * 50, sep="\n")

# Section 1: how the baseline preprocessing is described
print(
    "\n1. Current preprocessing approach:",
    "- Median imputation: Done globally before CV",
    "- Label encoding: Fit on combined train+test before CV",
    "- This leaks test distribution information into training",
    sep="\n",
)

# Section 2: what that implies for the reported CV score
print(
    "\n2. Impact on CV scores:",
    f"- Current CV score: {baseline_results['mean_cv_score']:.4f}",
    "- This may be optimistic due to leakage",
    "- LB score may be lower than CV",
    sep="\n",
)

# Section 3: remediation steps
print(
    "\n3. What needs to be fixed:",
    "- Move ALL preprocessing inside CV loop",
    "- Fit imputers/encoders on train fold only",
    "- Transform val/test with fitted transformers",
    sep="\n",
)

Data Leakage Analysis:

1. Current preprocessing approach:
- Median imputation: Done globally before CV
- Label encoding: Fit on combined train+test before CV
- This leaks test distribution information into training

2. Impact on CV scores:
- Current CV score: 0.8541
- This may be optimistic due to leakage
- LB score may be lower than CV

3. What needs to be fixed:
- Move ALL preprocessing inside CV loop
- Fit imputers/encoders on train fold only
- Transform val/test with fitted transformers
