In [1]:
"""
Cross-Encoder Reranking Comparison - Complete evaluation with parallelization
Tests: Baseline (no reranking), 2x over-fetch, 5x over-fetch
"""
import sys
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import time

# Find project root: walk upward from the current working directory until a
# directory containing a `src` folder is found. The loop stops before testing
# the filesystem root itself (where `current == current.parent`).
current = Path.cwd()
while current != current.parent:
    if (current / 'src').exists():
        project_root = current
        break
    current = current.parent
else:
    # No ancestor contained `src` (the loop ended without `break`):
    # fall back to the parent of the working directory.
    project_root = Path.cwd().parent

# Load environment variables from the project's .env file. This runs before
# the OPENAI_API_KEY check below.
load_dotenv(project_root / '.env')

# Add src to the front of the import path (once) so the `retrieval` package
# imported below can be resolved.
src_path = str(project_root / 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Force reload the module to get latest changes. Only applies when the module
# was already imported earlier in this interpreter session; on a fresh session
# the plain import below is all that happens.
import importlib
if 'retrieval.retriever' in sys.modules:
    importlib.reload(sys.modules['retrieval.retriever'])

from retrieval.retriever import KnowledgeBaseRetriever

# Verify API key: abort with exit status 1 when OPENAI_API_KEY is unset or empty.
import os
if not os.getenv('OPENAI_API_KEY'):
    print("ERROR: OPENAI_API_KEY not found!")
    sys.exit(1)
else:
    print("✓ OpenAI API key loaded")

# Load test data. Rows of this frame are later read through the 'text' and
# 'category' columns by evaluate_single_query.
print("Loading test data...")
test_df = pd.read_csv(project_root / 'data/processed/test_processed.csv')
# Passed as the first argument to every KnowledgeBaseRetriever built below.
vector_db_path = str(project_root / 'data/vector_db')

# Banner with the number of queries that each evaluation run will process.
print("="*80)
print("CROSS-ENCODER RERANKING COMPARISON (with parallel processing)")
print(f"Test set size: {len(test_df)} queries")
print("="*80)

def evaluate_single_query(row_tuple, retriever):
    """Score one test row against the retriever's top result.

    Written to be submitted to a worker pool: it takes the ``(index, row)``
    pair as a single argument and never raises for retriever failures.

    Args:
        row_tuple: ``(idx, row)`` pair; ``row`` is indexed with ``'text'``
            (the query) and ``'category'`` (the expected label).
        retriever: Object exposing ``retrieve(query, n_results=1)`` whose
            result is indexed as ``result['metadatas'][0]['category']``.

    Returns:
        ``(idx, is_correct, error)``. ``error`` is ``None`` on success, the
        text of the last exception when a non-ChromaDB error persists through
        the final attempt, or ``"Max retries exceeded"`` when every attempt
        hit a ``CollectionQueryEvent`` error.
    """
    idx, row = row_tuple
    query, true_category = row['text'], row['category']

    max_retries = 3
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            top_hit = retriever.retrieve(query, n_results=1)['metadatas'][0]
            return idx, top_hit['category'] == true_category, None
        except Exception as exc:
            failure = str(exc)

        if 'CollectionQueryEvent' in failure:
            # ChromaDB internal event errors are treated as harmless:
            # pause briefly and try again.
            delay = 0.1
        elif attempt == final_attempt:
            # Out of attempts for a real error: report it to the caller.
            return idx, False, failure
        else:
            # Exponential backoff between attempts: 0.5s, then 1.0s.
            delay = 0.5 * (2 ** attempt)

        time.sleep(delay)

    # Only reachable when every attempt ended in a CollectionQueryEvent error.
    return idx, False, "Max retries exceeded"

def parallel_evaluate(retriever, test_df, max_workers=10):
    """Evaluate a retriever over every row of ``test_df`` using a thread pool.

    Each ``(index, row)`` pair from ``test_df.iterrows()`` is submitted to
    ``evaluate_single_query`` and the results are tallied as they complete,
    with a tqdm progress bar.

    Args:
        retriever: Passed through unchanged to ``evaluate_single_query``.
        test_df: DataFrame of test queries.
        max_workers: Size of the thread pool.

    Returns:
        ``(correct, total)`` where ``total`` is ``len(test_df)`` and
        ``correct`` is the number of rows whose prediction matched. Rows that
        errored are counted as incorrect.
    """
    total = len(test_df)
    correct = 0
    errors = []

    # Bind the retriever so each task only needs its (idx, row) pair.
    eval_func = partial(evaluate_single_query, retriever=retriever)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its row index so a failure that escapes the
        # worker can still be attributed to a specific query.
        futures = {
            executor.submit(eval_func, (idx, row)): idx
            for idx, row in test_df.iterrows()
        }

        for future in tqdm(as_completed(futures), total=total, desc="Evaluating"):
            try:
                idx, is_correct, error = future.result()
            except Exception as e:
                # The worker raised outside its own retry handling (for
                # example while reading the row). Record it with the other
                # errors so the final report does not undercount failures.
                errors.append((futures[future], f"Unexpected error: {e}"))
                continue

            if is_correct:
                correct += 1
            if error:
                errors.append((idx, error))

    # Report errors if any: always show a bounded sample so a large number of
    # failures is still diagnosable without flooding the output.
    if errors:
        print(f"\n⚠️ {len(errors)} queries had errors (counted as incorrect)")
        for idx, error in errors[:5]:
            print(f"  Query {idx}: {error[:100]}")
        if len(errors) > 5:
            print(f"  ... and {len(errors) - 5} more")

    return correct, total

# The three runs below share one pattern: build a retriever, time a full
# parallel evaluation, and derive the accuracy percentage.
# Durations use time.perf_counter(), a monotonic clock intended for measuring
# elapsed intervals, so a system clock adjustment during a multi-minute run
# cannot distort the timings. Accuracy falls back to 0.0 for an empty test set
# instead of raising ZeroDivisionError.

# Test 1: BASELINE (no reranking)
print("\n[1/3] Testing BASELINE (no reranking)...")
print("Initializing baseline retriever...")
retriever_baseline = KnowledgeBaseRetriever(
    vector_db_path,
    use_reranking=False
)
start_time = time.perf_counter()
correct_baseline, total_baseline = parallel_evaluate(retriever_baseline, test_df, max_workers=10)
baseline_time = time.perf_counter() - start_time
accuracy_baseline = correct_baseline / total_baseline * 100 if total_baseline else 0.0
print(f"✓ Baseline completed in {baseline_time/60:.1f} minutes")

# Test 2: Reranking with 2x over-fetch
print("\n[2/3] Testing RERANKING 2x (TinyBERT)...")
print("Initializing reranker with 2x over-fetch...")
retriever_2x = KnowledgeBaseRetriever(
    vector_db_path,
    use_reranking=True,
    rerank_multiplier=2
)
start_time = time.perf_counter()
correct_2x, total_2x = parallel_evaluate(retriever_2x, test_df, max_workers=10)
rerank_2x_time = time.perf_counter() - start_time
accuracy_2x = correct_2x / total_2x * 100 if total_2x else 0.0
print(f"✓ Reranking 2x completed in {rerank_2x_time/60:.1f} minutes")

# Test 3: Reranking with 5x over-fetch
print("\n[3/3] Testing RERANKING 5x (TinyBERT)...")
print("Initializing reranker with 5x over-fetch...")
retriever_5x = KnowledgeBaseRetriever(
    vector_db_path,
    use_reranking=True,
    rerank_multiplier=5
)
start_time = time.perf_counter()
correct_5x, total_5x = parallel_evaluate(retriever_5x, test_df, max_workers=10)
rerank_5x_time = time.perf_counter() - start_time
accuracy_5x = correct_5x / total_5x * 100 if total_5x else 0.0
print(f"✓ Reranking 5x completed in {rerank_5x_time/60:.1f} minutes")

# Results Summary: one line per method with accuracy, raw counts, the delta
# against the baseline and the run time in minutes.
print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)
print(f"\nBaseline (no reranking): {accuracy_baseline:.1f}% ({correct_baseline}/{total_baseline}) [{baseline_time/60:.1f} min]")
# The `+` format flag emits the sign itself, so a regression prints as "-0.4%"
# rather than the malformed "+-0.4%" a hard-coded "+" prefix would produce.
print(f"Reranking 2x (TinyBERT): {accuracy_2x:.1f}% ({correct_2x}/{total_2x}) [{accuracy_2x-accuracy_baseline:+.1f}%] [{rerank_2x_time/60:.1f} min]")
print(f"Reranking 5x (TinyBERT): {accuracy_5x:.1f}% ({correct_5x}/{total_5x}) [{accuracy_5x-accuracy_baseline:+.1f}%] [{rerank_5x_time/60:.1f} min]")

# Pick the method with the highest accuracy. max() keeps the first entry on a
# tie, so the baseline wins unless a reranking run strictly beats it.
results = [
    ("Baseline (no reranking)", accuracy_baseline, baseline_time),
    ("Reranking 2x", accuracy_2x, rerank_2x_time),
    ("Reranking 5x", accuracy_5x, rerank_5x_time)
]
best_name, best_accuracy, best_time = max(results, key=lambda entry: entry[1])

print("\n" + "-"*80)
if best_name != "Baseline (no reranking)":
    # A reranking variant won: report the gain and whether it clears the
    # one-percentage-point bar used here to call it significant.
    improvement = best_accuracy - accuracy_baseline
    print(f"✓ RECOMMENDATION: Use {best_name}")
    print(f"  Improvement over baseline: +{improvement:.1f}%")
    print(
        "  ⚠️ Note: Marginal improvement - consider baseline for simplicity"
        if improvement < 1.0
        else "  ✓ Significant improvement worth the added complexity!"
    )
else:
    print(
        "✓ RECOMMENDATION: Use baseline (no reranking)",
        "  The embedding model is already excellent for this dataset!",
        "  Reranking adds complexity without improvement.",
        sep="\n",
    )

# Performance/accuracy tradeoff analysis: for each reranking run, show the
# accuracy delta and the run-time overhead relative to the baseline, plus
# accuracy points gained per percent of extra time.
print("\n" + "-"*80)
print("PERFORMANCE vs ACCURACY TRADEOFF")
print("-"*80)
for name, acc, exec_time in results:
    time_overhead = ((exec_time / baseline_time) - 1) * 100 if name != "Baseline (no reranking)" else 0
    acc_gain = acc - accuracy_baseline

    if name == "Baseline (no reranking)":
        print(f"• {name:25s}: {acc:.1f}% (reference)")
    else:
        # The ratio is only meaningful when the run was actually slower than
        # the baseline; otherwise it is reported as 0.
        efficiency_ratio = acc_gain / time_overhead if time_overhead > 0 else 0
        # The `+` format flag prints the real sign, so a loss in accuracy or a
        # faster run shows as "-0.4%" / "-3%" instead of "+-0.4%" / "+-3%".
        print(f"• {name:25s}: {acc:.1f}% ({acc_gain:+.1f}%) | {time_overhead:+.0f}% time | Ratio: {efficiency_ratio:.3f}")

# Detailed analysis: classify the outcome by how each reranking run compares
# with the baseline accuracy.
print("\n" + "-"*80)
print("ANALYSIS")
print("-"*80)
improved_2x = accuracy_2x > accuracy_baseline
improved_5x = accuracy_5x > accuracy_baseline
if improved_2x and improved_5x:
    print("• Both reranking strategies improve over baseline")
    if accuracy_5x > accuracy_2x + 0.5:
        print("• 5x over-fetch provides notably better results")
        print("  → More candidates allows better reranking")
    elif accuracy_2x > accuracy_5x:
        print("• 2x over-fetch performs better than 5x")
        print("  → Diminishing returns with too many candidates")
    else:
        print("• Similar performance between 2x and 5x")
        print("  → 2x is more efficient with comparable results")
elif improved_2x or improved_5x:
    # Exactly one strategy beat the baseline. This case must be tested before
    # the "underperforms" check, otherwise a single regression would label a
    # genuinely mixed outcome as an overall failure.
    print("• Mixed results - some reranking helps, some doesn't")
elif accuracy_2x < accuracy_baseline or accuracy_5x < accuracy_baseline:
    # Neither strategy improved and at least one is strictly worse.
    print("• Reranking underperforms baseline")
    print("  Possible reasons:")
    print("  - Embedding model already captures semantic similarity well")
    print("  - Cross-encoder not well-suited for this domain")
    print("  - Dataset characteristics favor pure embedding search")
else:
    # Both strategies tie the baseline exactly: nothing helped, nothing hurt.
    print("• Reranking matches baseline accuracy - no measurable gain")

print("\n" + "="*80)

# Persist the comparison as a CSV, one row per method. Time overhead is the
# percentage increase in run time relative to the baseline run.
results_df = pd.DataFrame(
    [
        (
            'Baseline',
            correct_baseline,
            total_baseline,
            accuracy_baseline,
            0.0,
            baseline_time/60,
            0.0,
        ),
        (
            'Reranking 2x',
            correct_2x,
            total_2x,
            accuracy_2x,
            accuracy_2x - accuracy_baseline,
            rerank_2x_time/60,
            ((rerank_2x_time/baseline_time)-1)*100,
        ),
        (
            'Reranking 5x',
            correct_5x,
            total_5x,
            accuracy_5x,
            accuracy_5x - accuracy_baseline,
            rerank_5x_time/60,
            ((rerank_5x_time/baseline_time)-1)*100,
        ),
    ],
    columns=[
        'Method',
        'Correct',
        'Total',
        'Accuracy',
        'vs_Baseline',
        'Time_Minutes',
        'Time_Overhead_Pct',
    ],
)

output_path = project_root / 'data/processed/reranking_comparison.csv'
results_df.to_csv(output_path, index=False)
print(f"\n✓ Results saved to: {output_path}")

✓ OpenAI API key loaded
Loading test data...
CROSS-ENCODER RERANKING COMPARISON (with parallel processing)
Test set size: 3080 queries

[1/3] Testing BASELINE (no reranking)...
Initializing baseline retriever...


Evaluating: 100%|██████████| 3080/3080 [01:49<00:00, 28.18it/s]


✓ Baseline completed in 1.8 minutes

[2/3] Testing RERANKING 2x (TinyBERT)...
Initializing reranker with 2x over-fetch...


Evaluating: 100%|██████████| 3080/3080 [02:04<00:00, 24.81it/s]


✓ Reranking 2x completed in 2.1 minutes

[3/3] Testing RERANKING 5x (TinyBERT)...
Initializing reranker with 5x over-fetch...


Evaluating: 100%|██████████| 3080/3080 [02:07<00:00, 24.19it/s]

✓ Reranking 5x completed in 2.1 minutes

RESULTS SUMMARY

Baseline (no reranking): 90.5% (2788/3080) [1.8 min]
Reranking 2x (TinyBERT): 91.3% (2812/3080) [+0.8%] [2.1 min]
Reranking 5x (TinyBERT): 90.9% (2800/3080) [+0.4%] [2.1 min]

--------------------------------------------------------------------------------
✓ RECOMMENDATION: Use Reranking 2x
  Improvement over baseline: +0.8%
  ⚠️ Note: Marginal improvement - consider baseline for simplicity

--------------------------------------------------------------------------------
PERFORMANCE vs ACCURACY TRADEOFF
--------------------------------------------------------------------------------
• Baseline (no reranking)  : 90.5% (reference)
• Reranking 2x             : 91.3% (+0.8%) | +14% time | Ratio: 0.054
• Reranking 5x             : 90.9% (+0.4%) | +17% time | Ratio: 0.023

--------------------------------------------------------------------------------
ANALYSIS
--------------------------------------------------------------------------


