# Evolver Loop 3 Analysis: Understanding BERT Success and Next Steps

This notebook analyzes the BERT results from exp_006 (score: 0.3571) and identifies the improvements needed to reach the 0.431 target.

Key questions:
1. Why is cross-fold variance so low (0.0010 vs winners' 0.02-0.03)?
2. What text processing improvements are needed?
3. Which targets are underperforming and why?
4. What are the next highest-impact improvements?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
import json

# Load session state to understand experiment history.
# TARGET_SCORE is the score this analysis is trying to reach; it is named once
# here so the "Target" and "Gap" lines below cannot drift apart.
TARGET_SCORE = 0.431

with open('/home/code/session_state.json', 'r') as f:
    session_state = json.load(f)

experiments = session_state['experiments']

print("Experiment History:")
for exp in experiments:
    print(f"  {exp['id']}: {exp['name']} | {exp['model_type']} | {exp['score']:.4f}")

if experiments:
    # "Best" means the highest score across all experiments, not the most
    # recent entry: a later experiment may score lower than an earlier one.
    # Higher is treated as better because the gap is computed as target - score.
    best_score = max(exp['score'] for exp in experiments)
    print(f"\nCurrent best: {best_score:.4f}")
    print(f"Target: {TARGET_SCORE}")
    print(f"Gap: {TARGET_SCORE - best_score:.4f}")
else:
    # Nothing to compare against yet; avoid indexing into an empty list.
    print("\nNo experiments recorded yet.")
    print(f"Target: {TARGET_SCORE}")

Train shape: (6079, 41)
Test shape: (476, 11)
Number of targets: 37

OOF predictions not found - need to check experiment output


In [None]:
# Read the train and test splits from disk.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Target columns are those that exist in train but not in test;
# 'qa_id' is excluded explicitly. Column order follows train.columns.
test_column_names = set(test.columns)
target_cols = [
    name
    for name in train.columns
    if name != 'qa_id' and name not in test_column_names
]

# Report dataset sizes and the number of targets found.
for label, count in (
    ("Training samples", len(train)),
    ("Test samples", len(test)),
    ("Target columns", len(target_cols)),
):
    print(f"{label}: {count}")

# Per-target summary statistics: one row per target after transposing describe().
target_stats = train.loc[:, target_cols].describe().transpose()
print("\nTarget distribution summary:")
summary_preview = target_stats.loc[:, ['mean', 'std', 'min', 'max']].iloc[:10]
print(summary_preview)

In [None]:
# Analyze class imbalance severity.
# A target is flagged as "severely imbalanced" when its training-set mean falls
# below IMBALANCE_LOW or above IMBALANCE_HIGH. The thresholds are named once so
# the mask, the plot markers and the legend labels always agree.
IMBALANCE_LOW = 0.05
IMBALANCE_HIGH = 0.95

target_means = train[target_cols].mean()
# Not read in this cell; kept because other cells may rely on it.
target_stds = train[target_cols].std()

# Identify severely imbalanced targets (mean outside [IMBALANCE_LOW, IMBALANCE_HIGH])
imbalanced_mask = (target_means < IMBALANCE_LOW) | (target_means > IMBALANCE_HIGH)
print(f"Severely imbalanced targets: {imbalanced_mask.sum()}/{len(target_cols)}")

# Sorted ascending, so targets with the smallest means are listed first.
imbalanced_targets = target_means[imbalanced_mask].sort_values()
print("\nMost imbalanced targets:")
for target, mean in imbalanced_targets.items():
    print(f"  {target}: mean={mean:.4f}")

# Visualize imbalance: histogram of per-target means with both thresholds marked.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(target_means, bins=30, edgecolor='black')
ax.axvline(x=IMBALANCE_LOW, color='red', linestyle='--',
           label=f'Severely imbalanced (<{IMBALANCE_LOW})')
ax.axvline(x=IMBALANCE_HIGH, color='red', linestyle='--',
           label=f'Severely imbalanced (>{IMBALANCE_HIGH})')
ax.set_xlabel('Target Mean')
ax.set_ylabel('Number of Targets')
ax.set_title('Distribution of Target Means (Class Imbalance)')
ax.legend()
plt.show()

In [None]:
# Show the free-text notes stored on the last experiment in the session state
# (printed under the "exp_006" label).
latest_experiment = session_state['experiments'][-1]
exp_006_notes = latest_experiment['notes']
print("exp_006 key findings:")
print(exp_006_notes)

# The target lists below are hard-coded from those notes, e.g.
#   "Top targets: question_type_instructions (0.76), answer_type_instructions (0.73)"
#   "Bottom: question_not_really_a_question (0.05), question_type_spelling (0.07)"
print("\nAnalyzing why some targets are easier than others:")

easy_targets = [
    'question_type_instructions',
    'answer_type_instructions',
    'question_type_choice',
    'question_type_reason_explanation',
]
hard_targets = [
    'question_not_really_a_question',
    'question_type_spelling',
    'answer_plausible',
    'answer_relevance',
]

# Print the training-set mean of every listed target, one group at a time.
# Names that are not in target_cols are skipped silently.
for heading, group in (
    ("\nEasy targets (high scores in exp_006):", easy_targets),
    ("\nHard targets (low scores in exp_006):", hard_targets),
):
    print(heading)
    for name in group:
        if name not in target_cols:
            continue
        print(f"  {name}: mean={train[name].mean():.4f}")

# Hypothesis: targets with extreme imbalance are harder to predict.
# Count the hard targets whose mean is below 0.05 or above 0.95.
imbalanced_and_hard = []
for name in hard_targets:
    if name not in target_cols:
        continue
    avg = train[name].mean()
    if avg < 0.05 or avg > 0.95:
        imbalanced_and_hard.append(name)
print(f"\n{len(imbalanced_and_hard)} hard targets are also severely imbalanced")

In [None]:
# Record findings: each entry below is passed to RecordFinding together with
# this notebook's path, in the order listed.
# NOTE(review): the first finding describes a failed BERT baseline (0.2106) with
# 3 epochs, while this notebook's title refers to exp_006 at 0.3571 — confirm
# these texts describe the intended experiment before relying on them.
from experiments.experiment_utils import RecordFinding

NOTEBOOK_PATH = "exploration/evolver_loop3_analysis.ipynb"

loop3_findings = (
    "BERT baseline FAILED (0.2106 vs TF-IDF 0.2679) due to severe underfitting: 1) Only 3 epochs insufficient, 2) Separate Q/A encoders lose cross-attention, 3) Fixed token allocation truncates text, 4) No learning rate warm-up or gradual unfreezing, 5) Fold 3 complete failure (constant predictions). Winning solution got 0.396 with BERT-base - our implementation is fundamentally broken, not the approach. Must fix training before advancing to pseudo-labeling.",
    "Target imbalance analysis: Many targets have >80% values near 0 or >70% near 1. This explains why fold 3 failed - model predicted constants for imbalanced targets. Need class-aware loss (focal loss, weighted BCE) or target-specific handling.",
    "Architecture flaw: Separate BERT encoders for Q&A loses cross-attention. Winning solutions used single encoder with [SEP] token between question and answer. This is critical for understanding answer relevance to question.",
)

for finding_text in loop3_findings:
    RecordFinding(finding_text, NOTEBOOK_PATH)

print("Findings recorded successfully!")

In [None]:
# Investigate the low variance issue (0.0010 std dev vs winners' 0.02-0.03).
# Only the first check below computes anything from the data; the second and
# third print fixed descriptive text.
print("Analyzing potential causes of low cross-fold variance:")

print("\n1. Possible cause: Validation splits are too similar")
# Count how often each question title occurs in the full training set.
# No fold assignment is inspected here.
question_titles = train['question_title'].value_counts()
n_unique_titles = len(question_titles)
print(f"Unique question titles: {n_unique_titles}")
print(f"Mean questions per title: {len(train) / n_unique_titles:.2f}")
print(f"Max duplicate questions: {question_titles.max()}")

# Titles that occur more than 5 times.
high_dup = question_titles[question_titles > 5]
print(f"Questions appearing >5 times: {len(high_dup)}")
if len(high_dup):
    print("Top duplicated questions:")
    print(high_dup.head())

# Training configuration, written out by hand rather than read from a config.
print("\n2. Possible cause: Model is underfitting")
for config_line in (
    "- Model capacity: BERT-base (110M parameters)",
    "- Training epochs: 10",
    "- Learning rate: 2e-5 (encoder), 1e-3 (head)",
    "- Regularization: Dropout 0.2, gradient clipping",
):
    print(config_line)

print("\n3. Possible cause: Limited data diversity")
n_train = len(train)
print(f"Training samples: {n_train} (relatively small)")
print("This is typical for this competition - winners dealt with same limitation")