# Loop 11 Analysis: exp_011 Results and Next Steps

## Key Results from exp_011
- CV: 0.82032 (+/- 0.01408) - **BEST CV achieved so far!**
- Exceeds exp_003's 0.81951 by +0.00081
- 10-fold CV with stronger regularization
- Fold range: 0.79862 - 0.84253 (spread of about 4.4 percentage points)

## Questions to Answer
1. Should we submit exp_011?
2. What's causing the high fold variance?
3. Is GroupKFold worth trying?

In [None]:
import pandas as pd
import numpy as np

# Read the train and test CSVs so the group structure can be inspected.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# PassengerId is split on '_' and the leading piece is stored as an integer
# 'Group' column on both frames.
for frame in (train, test):
    frame['Group'] = frame['PassengerId'].str.split('_').str[0].astype('int64')

# Groups seen in either split, used for the combined unique count.
all_groups = pd.concat([train['Group'], test['Group']])

print(f"Train: {train.shape}, Test: {test.shape}")
print(f"\nUnique groups in train: {train['Group'].nunique()}")
print(f"Unique groups in test: {test['Group'].nunique()}")
print(f"Total unique groups: {all_groups.nunique()}")

In [None]:
# Summarize how many training rows fall into each group.
# group_sizes: number of rows per 'Group' value, indexed by group.
group_sizes = train.groupby('Group').size()

# How many groups exist for each group size, ordered by size.
size_histogram = group_sizes.value_counts().sort_index()
print("Group size distribution:")
print(size_histogram)

print(f"\nMean group size: {group_sizes.mean():.2f}")
print(f"Max group size: {group_sizes.max()}")

# Boolean masks over groups: sum() gives the count, mean() the share.
is_solo = group_sizes == 1
is_shared = group_sizes > 1
print(f"Solo travelers (group size 1): {is_solo.sum()} ({is_solo.mean()*100:.1f}%)")
print(f"Groups with 2+ members: {is_shared.sum()} ({is_shared.mean()*100:.1f}%)")

In [None]:
# Analyze transported rate by group
# Cast the target to integers so that per-group means read as transported rates.
train['Transported'] = train['Transported'].astype(int)
# One row per group: mean of Transported and number of (non-null) members.
group_transported = train.groupby('Group')['Transported'].agg(['mean', 'count'])

print("Transported rate by group size:")
# Iterate over the group sizes actually present instead of a fixed 1-8 range,
# so a larger group size is never silently left out of the report.
for size in sorted(group_transported['count'].unique()):
    mask = group_transported['count'] == size
    # Unweighted mean of the per-group rates for groups of this size.
    rate = group_transported.loc[mask, 'mean'].mean()
    n_groups = mask.sum()
    print(f"  Size {size}: {rate:.3f} transported rate ({n_groups} groups)")

# Check if groups have consistent outcomes
# The sample variance is NaN for single-member groups, so dropna() leaves only
# groups with 2+ members; the percentages printed below are therefore relative
# to multi-member groups, not to all groups.
group_variance = train.groupby('Group')['Transported'].var().dropna()
is_mixed = group_variance > 0
is_consistent = group_variance == 0
print(f"\nGroups with mixed outcomes (variance > 0): {is_mixed.sum()} ({is_mixed.mean()*100:.1f}%)")
print(f"Groups with consistent outcomes (variance = 0): {is_consistent.sum()} ({is_consistent.mean()*100:.1f}%)")

In [None]:
# CV-LB relationship analysis
from scipy import stats

# Experiments submitted so far: local CV score vs. leaderboard (LB) score.
submissions = [
    {'exp': 'exp_000', 'cv': 0.8067, 'lb': 0.7971},
    {'exp': 'exp_003', 'cv': 0.8195, 'lb': 0.8045},
    {'exp': 'exp_004', 'cv': 0.8193, 'lb': 0.8041},
    {'exp': 'exp_006', 'cv': 0.8171, 'lb': 0.8010},
]

df_sub = pd.DataFrame(submissions)
print("CV-LB Relationship:")
print(df_sub.to_string(index=False))

# Best LB is read from the table so it cannot drift out of sync with the
# submissions listed above.
best_lb = df_sub['lb'].max()

# Linear regression to predict LB from CV.
# NOTE: the fit uses only the four points above, so treat the prediction as a
# rough guide rather than a reliable estimate.
slope, intercept, r_value, p_value, std_err = stats.linregress(df_sub['cv'], df_sub['lb'])
print(f"\nLinear model: LB = {slope:.3f} * CV + {intercept:.3f}")
print(f"R² = {r_value**2:.3f}")

# Predict LB for exp_011
exp_011_cv = 0.82032
predicted_lb = slope * exp_011_cv + intercept
print(f"\nexp_011 CV: {exp_011_cv:.5f}")
print(f"Predicted LB: {predicted_lb:.5f}")
print(f"Gap from best LB ({best_lb:.4f}): {predicted_lb - best_lb:.5f}")

In [None]:
# Decide whether exp_011 should be submitted, based on the regression's
# predicted LB compared with the best LB obtained so far (0.8045).
print("=== SUBMISSION DECISION ===")
print("\nexp_011 CV: 0.82032 (+/- 0.01408)")
print("Best previous CV: 0.81951 (exp_003)")
print("Improvement: +0.00081 (+0.10%)")

print(f"\nPredicted LB: {predicted_lb:.5f}")
print("Best LB so far: 0.8045 (exp_003)")
print(f"Predicted improvement: {predicted_lb - 0.8045:.5f}")

# Both outcomes recommend submitting; only the stated rationale differs.
if predicted_lb > 0.8045:
    verdict_lines = [">>> SUBMIT exp_011 - predicted to beat best LB"]
else:
    verdict_lines = [
        ">>> SUBMIT exp_011 anyway - regularization may help generalization",
        "    - CV is best achieved so far",
        "    - Regularization could reduce CV-LB gap",
        "    - Worth testing with 6 submissions remaining",
    ]

print("\n--- RECOMMENDATION ---")
for verdict_line in verdict_lines:
    print(verdict_line)

In [None]:
# Print the case for and against switching the CV scheme to GroupKFold.
# Number of single-member groups, interpolated into the caveats below.
n_solo_groups = (group_sizes == 1).sum()

# Each entry is (heading, lines printed beneath it), emitted in order.
report_sections = [
    (
        "\nWhy GroupKFold might help:",
        [
            "1. Passengers in same group likely have correlated outcomes",
            "2. Standard StratifiedKFold may leak information between folds",
            "3. High fold variance (4.4% range) suggests sensitivity to data splits",
        ],
    ),
    (
        "\nPotential issues with GroupKFold:",
        [
            f"1. Many solo travelers ({n_solo_groups} groups of size 1)",
            "2. Group sizes vary (1-8), making folds uneven",
            "3. May reduce effective sample size per fold",
        ],
    ),
    (
        "\nRecommendation: Try GroupKFold as next experiment",
        [
            "- If variance decreases, it's capturing group structure",
            "- If CV drops significantly, groups may not be as correlated as expected",
        ],
    ),
]

print("=== GROUPKFOLD ANALYSIS ===")
for heading, body_lines in report_sections:
    print(heading)
    for body_line in body_lines:
        print(body_line)