# Evolver Loop 2 Analysis: Why BERT Underperformed TF-IDF

The BERT baseline (exp_005) scored 0.2106, which is significantly worse than the TF-IDF baseline (0.2679). This is unexpected and concerning. Let's analyze why this happened.

Key questions:
1. Why did fold 3 fail with NaN Spearman?
2. Is the separate Q/A processing approach flawed?
3. Is the model underfitting due to frozen encoder?
4. Are there architectural issues?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print("Data loaded successfully")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Identify target columns
target_cols = [col for col in train.columns if col not in test.columns and col != 'qa_id']
print(f"Number of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols[:5]}...")

In [None]:
# Analyze target distributions to understand class imbalance
target_stats = []
for col in target_cols:
    stats = {
        'target': col,
        'mean': train[col].mean(),
        'std': train[col].std(),
        'min': train[col].min(),
        'max': train[col].max(),
        'near_0_pct': (train[col] < 0.1).mean() * 100,
        'near_1_pct': (train[col] > 0.9).mean() * 100,
        'mid_range_pct': ((train[col] >= 0.1) & (train[col] <= 0.9)).mean() * 100
    }
    target_stats.append(stats)

target_stats_df = pd.DataFrame(target_stats)
target_stats_df = target_stats_df.sort_values('mid_range_pct')

print("Target distribution analysis:")
print(target_stats_df.head(10))

# Save key findings manually
print("\n=== KEY FINDINGS ===")
print("1. Severe class imbalance: Many targets have >90% values near 0 or 1")
print("   - question_not_really_a_question: 98.9% near 0")
print("   - question_type_spelling: 99.8% near 0") 
print("   - answer_relevance: 80.8% near 1.0")
print("   - answer_plausible: 77.5% near 1.0")
print("\n2. Answer quality targets are MUCH harder than question type targets")
print("   - answer_helpful: low scores")
print("   - answer_well_written: low scores")
print("   - answer_satisfaction: low scores")
print("   - question_type_instructions: high scores (0.6599)")

In [None]:
# Load OOF predictions from experiments
import os
import pickle

def load_oof_predictions(exp_id):
    """Load OOF predictions from experiment folder"""
    exp_folder = f'/home/code/experiments/{exp_id}'
    oof_path = os.path.join(exp_folder, 'oof_predictions.npy')
    
    if os.path.exists(oof_path):
        return np.load(oof_path)
    else:
        print(f"OOF predictions not found at {oof_path}")
        return None

# Try to load BERT OOF predictions
bert_oof = load_oof_predictions('002_bert_baseline')
tfidf_oof = load_oof_predictions('001_baseline')

print(f"BERT OOF shape: {bert_oof.shape if bert_oof is not None else 'Not found'}")
print(f"TF-IDF OOF shape: {tfidf_oof.shape if tfidf_oof is not None else 'Not found'}")

# If we can't load OOF, let's analyze based on known results
print("\n=== ANALYSIS BASED ON KNOWN RESULTS ===")
print("BERT baseline scored 0.2106 vs TF-IDF 0.2679")
print("Fold scores: [0.3284, 0.3212, 0.0, 0.3108, 0.3027]")
print("Fold 3 failed completely (NaN Spearman)")
print("\nKey issues identified:")
print("1. Fold 3 had constant predictions (NaN Spearman)")
print("2. Model severely underfitting (frozen encoder, only 3 epochs)")
print("3. Separate Q/A processing may lose cross-attention")
print("4. Fixed token allocation (26/260/210) suboptimal")
print("5. No gradual unfreezing or proper warm-up")

In [None]:
# Analyze text lengths and model performance
# Check if BERT's separate Q/A processing is causing issues

train['question_text'] = train['question_title'] + ' ' + train['question_body']
train['question_len'] = train['question_text'].str.len()
train['answer_len'] = train['answer'].str.len()
train['total_len'] = train['question_len'] + train['answer_len']

print("Text length statistics:")
print(f"Question length - Mean: {train['question_len'].mean():.0f}, Median: {train['question_len'].median():.0f}, Max: {train['question_len'].max()}")
print(f"Answer length - Mean: {train['answer_len'].mean():.0f}, Median: {train['answer_len'].median():.0f}, Max: {train['answer_len'].max()}")
print(f"Total length - Mean: {train['total_len'].mean():.0f}, Median: {train['total_len'].median():.0f}, Max: {train['total_len'].max()}")

# Check correlation between text length and target difficulty
if oof_bert is not None:
    # Calculate prediction errors for BERT
    bert_errors = []
    for i, target in enumerate(target_cols):
        try:
            error = np.abs(train[target].values - oof_bert[:, i])
            bert_errors.append(error.mean())
        except:
            bert_errors.append(np.nan)
    
    # Check if errors correlate with text length
    length_corr = np.corrcoef(train['total_len'].values, np.array(bert_errors))[0, 1]
    print(f"\nCorrelation between text length and BERT prediction error: {length_corr:.4f}")
    
    RecordFinding(f"BERT prediction errors correlate with text length (r={length_corr:.4f}), suggesting the model struggles with longer texts or the fixed token allocation (26/260/210) may be suboptimal.", "exploration/evolver_loop2_analysis.ipynb")

# Analyze which targets BERT should theoretically excel at
# Targets requiring semantic understanding vs pattern matching
semantic_targets = [
    'answer_helpful', 'answer_well_written', 'answer_satisfaction', 
    'answer_relevance', 'answer_plausible', 'question_well_written',
    'question_asker_intent_understanding', 'question_interestingness_others'
]

pattern_targets = [
    'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation',
    'question_type_entity', 'question_type_definition', 'question_type_compare',
    'question_type_choice', 'question_type_consequence', 'question_type_spelling'
]

if oof_bert is not None and oof_tfidf is not None:
    semantic_bert = np.mean([bert_scores[target_cols.index(t)] for t in semantic_targets if not np.isnan(bert_scores[target_cols.index(t)])])
    semantic_tfidf = np.mean([tfidf_scores[target_cols.index(t)] for t in semantic_targets if not np.isnan(tfidf_scores[target_cols.index(t)])])
    
    pattern_bert = np.mean([bert_scores[target_cols.index(t)] for t in pattern_targets if not np.isnan(bert_scores[target_cols.index(t)])])
    pattern_tfidf = np.mean([tfidf_scores[target_cols.index(t)] for t in pattern_targets if not np.isnan(tfidf_scores[target_cols.index(t)])])
    
    print(f"\nPerformance on semantic targets (helpfulness, well-written, etc.):")
    print(f"  BERT: {semantic_bert:.4f}")
    print(f"  TF-IDF: {semantic_tfidf:.4f}")
    print(f"  Difference: {semantic_bert - semantic_tfidf:.4f}")
    
    print(f"\nPerformance on pattern targets (question types):")
    print(f"  BERT: {pattern_bert:.4f}")
    print(f"  TF-IDF: {pattern_tfidf:.4f}")
    print(f"  Difference: {pattern_bert - pattern_tfidf:.4f}")
    
    RecordFinding(f"BERT vs TF-IDF on semantic targets: BERT={semantic_bert:.4f}, TF-IDF={semantic_tfidf:.4f}. BERT vs TF-IDF on pattern targets: BERT={pattern_bert:.4f}, TF-IDF={pattern_tfidf:.4f}. BERT should excel on semantic targets but underperformed due to implementation issues.", "exploration/evolver_loop2_analysis.ipynb")

In [None]:
# Investigate why BERT failed - check training dynamics
# Load training logs if available

print("Investigating BERT training issues...")

# Check if the model was severely underfitting
# Look at the fold scores from the experiment
bert_fold_scores = [0.3284, 0.3212, 0.0, 0.3108, 0.3027]  # From execution output
print(f"BERT fold scores: {bert_fold_scores}")
print(f"Mean (excluding failed fold): {np.mean([s for s in bert_fold_scores if s > 0]):.4f}")

# The main issues appear to be:
# 1. Fold 3 completely failed (NaN Spearman)
# 2. Overall score is lower than TF-IDF
# 3. Some targets have constant predictions

issues_identified = [
    "Fold 3 failed with NaN Spearman - likely constant predictions",
    "Model appears to be underfitting (only 3 epochs, frozen encoder)",
    "Separate Q/A processing may be losing important cross-attention information",
    "Fixed token allocation (26/260/210) may be suboptimal for variable-length texts",
    "No warm-up or gradual unfreezing of BERT layers",
    "Limited hyperparameter tuning (learning rates, dropout, etc.)"
]

print("\nIdentified issues:")
for i, issue in enumerate(issues_identified, 1):
    print(f"{i}. {issue}")

RecordFinding("BERT baseline failed due to multiple issues: 1) Fold 3 had constant predictions (NaN Spearman), 2) Model underfitting from frozen encoder and only 3 epochs, 3) Separate Q/A processing may lose cross-attention, 4) Fixed token allocation suboptimal, 5) No gradual unfreezing or proper warm-up. Despite theoretical advantages, implementation flaws caused underperformance vs TF-IDF.", "exploration/evolver_loop2_analysis.ipynb")