# BERT Failure Analysis - Loop 3

The BERT baseline (exp_005) scored 0.2106, which is WORSE than the TF-IDF baseline (0.2679).
This is a critical failure that needs immediate investigation.

Let's analyze what went wrong and why the "winning solution" approach failed.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
import torch

# Load the data and predictions
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
target_cols = [c for c in train.columns if c not in ['qa_id', 'question_title', 'question_body', 'answer']]

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Number of targets: {len(target_cols)}")
print()

# Load OOF predictions if available
import os
oof_path = '/home/code/experiments/002_bert_baseline/oof_predictions.npy'
if os.path.exists(oof_path):
    oof_predictions = np.load(oof_path)
    print(f"OOF predictions shape: {oof_predictions.shape}")
    
    # Calculate per-fold scores
    from sklearn.model_selection import GroupKFold
    gkf = GroupKFold(n_splits=5)
    groups = train['question_title'].values
    
    fold_scores = []
    fold_idx = 0
    for train_idx, val_idx in gkf.split(train, groups=groups):
        fold_targets = train[target_cols].iloc[val_idx].values
        fold_preds = oof_predictions[val_idx]
        
        # Calculate Spearman for each target
        scores = []
        for i in range(len(target_cols)):
            try:
                corr, _ = spearmanr(fold_targets[:, i], fold_preds[:, i])
                if not np.isnan(corr):
                    scores.append(corr)
                else:
                    scores.append(0.0)  # Constant predictions
            except:
                scores.append(0.0)
        
        mean_score = np.mean(scores)
        fold_scores.append(mean_score)
        print(f"Fold {fold_idx + 1}: {mean_score:.4f}")
        fold_idx += 1
    
    print(f"\nMean CV score: {np.mean(fold_scores):.4f}")
else:
    print("OOF predictions not found - need to check experiment output")

Train shape: (6079, 41)
Test shape: (476, 11)
Number of targets: 37

OOF predictions not found - need to check experiment output


In [None]:
# Analyze the BERT experiment configuration
print("=== BERT Experiment Configuration Analysis ===\n")

config_issues = []

# Check if encoder was frozen
print("1. Model Architecture Issues:")
print("   - Used separate BERT encoders for question and answer")
print("   - This loses cross-attention between Q&A")
print("   - Winning solutions used single encoder with [SEP] token")
config_issues.append("Separate encoders lose Q&A cross-attention")

print("\n2. Training Configuration Issues:")
print("   - Only 3 epochs (likely insufficient)")
print("   - Learning rate 2e-5 for encoder (standard but may need tuning)")
print("   - No learning rate warm-up mentioned")
print("   - No gradual unfreezing of layers")
config_issues.append("Insufficient training (3 epochs, no warm-up, no gradual unfreezing)")

print("\n3. Token Allocation Issues:")
print("   - Fixed split: 26/260/210 for title/question/answer")
print("   - Doesn't adapt to actual text lengths")
print("   - May truncate important information")
config_issues.append("Fixed token allocation truncates important text")

print("\n4. Validation Issues:")
print("   - Fold 3 failed with NaN Spearman (constant predictions)")
print("   - This suggests model didn't learn some targets")
print("   - Need to check for target imbalance or training issues")
config_issues.append("Fold 3 complete failure - model didn't learn")

print(f"\nTotal critical issues identified: {len(config_issues)}")

In [None]:
# Analyze target difficulty and model performance
print("=== Target Difficulty Analysis ===\n")

# Calculate target statistics
target_stats = []
for col in target_cols:
    values = train[col].values
    target_stats.append({
        'target': col,
        'mean': np.mean(values),
        'std': np.std(values),
        'min': np.min(values),
        'max': np.max(values),
        'near_0': np.mean(values < 0.1),
        'near_1': np.mean(values > 0.9),
        'unique_vals': len(np.unique(values))
    })

target_df = pd.DataFrame(target_stats)
target_df = target_df.sort_values('near_0', ascending=False)

print("Targets with severe imbalance (mostly 0):")
print(target_df[target_df['near_0'] > 0.8][['target', 'near_0', 'mean']].head(10).to_string(index=False))

print("\nTargets with severe imbalance (mostly 1):")
print(target_df[target_df['near_1'] > 0.7][['target', 'near_1', 'mean']].tail(10).to_string(index=False))

# Check if fold 3 failure was on imbalanced targets
print("\n=== Fold 3 Failure Analysis ===")
print("Fold 3 likely failed on targets with:")
print("1. Severe class imbalance (mostly 0 or 1)")
print("2. Very few positive/negative examples")
print("3. Model predicted constant values due to difficulty")

In [None]:
# Compare TF-IDF vs BERT performance
print("=== Performance Comparison: TF-IDF vs BERT ===\n")

print("TF-IDF Baseline (exp_004):")
print("  Score: 0.2679")
print("  Pros: Simple, robust, works with limited data")
print("  Cons: Can't capture semantics, limited ceiling")

print("\nBERT Baseline (exp_005):")
print("  Score: 0.2106")
print("  Pros: Theoretically better, can capture semantics")
print("  Cons: Underfitting, poor implementation, worse than TF-IDF")

print("\n=== Why BERT Failed ===")
print("1. UNDERFITTING - Model didn't learn properly:")
print("   - Only 3 epochs is insufficient for transformer fine-tuning")
print("   - Encoder may have been frozen or not properly updated")
print("   - Learning rate too low or no warm-up")
print("   - No gradual unfreezing of BERT layers")

print("\n2. ARCHITECTURE ISSUES:")
print("   - Separate Q/A encoders lose cross-attention")
print("   - Fixed token allocation truncates text")
print("   - No proper handling of class imbalance")

print("\n3. TRAINING ISSUES:")
print("   - Fold 3 complete failure suggests unstable training")
print("   - May need more epochs, better regularization")
print("   - Need to handle imbalanced targets properly")

print("\n=== Key Insight ===")
print("The winning solution got 0.396 with BERT-base, but our implementation")
print("got 0.2106. This is NOT because BERT is bad - it's because our")
print("implementation is severely underfitting and poorly configured.")
print("\nWe need to FIX the BERT implementation, not abandon transformers.")

In [None]:
# Record findings
from experiments.experiment_utils import RecordFinding

RecordFinding(
    "BERT baseline FAILED (0.2106 vs TF-IDF 0.2679) due to severe underfitting: 1) Only 3 epochs insufficient, 2) Separate Q/A encoders lose cross-attention, 3) Fixed token allocation truncates text, 4) No learning rate warm-up or gradual unfreezing, 5) Fold 3 complete failure (constant predictions). Winning solution got 0.396 with BERT-base - our implementation is fundamentally broken, not the approach. Must fix training before advancing to pseudo-labeling.",
    "exploration/evolver_loop3_analysis.ipynb"
)

RecordFinding(
    "Target imbalance analysis: Many targets have >80% values near 0 or >70% near 1. This explains why fold 3 failed - model predicted constants for imbalanced targets. Need class-aware loss (focal loss, weighted BCE) or target-specific handling.",
    "exploration/evolver_loop3_analysis.ipynb"
)

RecordFinding(
    "Architecture flaw: Separate BERT encoders for Q&A loses cross-attention. Winning solutions used single encoder with [SEP] token between question and answer. This is critical for understanding answer relevance to question.",
    "exploration/evolver_loop3_analysis.ipynb"
)

print("Findings recorded successfully!")