# S-C Evidence Pipeline Evaluation (No Post-processing)

This notebook implements the NV-Embed-v2 + jina-reranker-v3 pipeline with the best HPO config (Trial 33) and runs 5-fold cross-validation with comprehensive metrics.

## Best Config (Trial 33, nDCG@10=0.8205)
- **Retriever:** NV-Embed-v2
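- **Reranker:** jina-reranker-v3 (LoRA fine-tuned) — note: the configuration cell below substitutes `BAAI/bge-reranker-v2-m3`, because jina-reranker-v3 requires a newer `transformers` release that breaks NV-Embed-v2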
- **Training:** no_evidence=True (includes queries with no positive evidence)
- **Loss:** BCE + Pairwise-Softplus + Lambda

In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Set
from tqdm.auto import tqdm

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW

# Add project root to path
project_root = Path("..")
sys.path.insert(0, str(project_root))

from final_sc_review.data.io import load_criteria, load_groundtruth, load_sentence_corpus
from final_sc_review.data.splits import split_post_ids
from final_sc_review.metrics.ranking import ndcg_at_k, recall_at_k, mrr_at_k, map_at_k
from final_sc_review.retriever.zoo import RetrieverZoo
from final_sc_review.reranker.losses import HybridRerankerLoss

## 1. Configuration

In [None]:
# Hyperparameters of the best HPO run (Trial 33)
BEST_PARAMS = dict(
    # -- optimisation schedule --
    batch_size=1,
    num_epochs=1,
    learning_rate=4.447467238603695e-05,
    weight_decay=8.769982161626777e-05,
    grad_accum=2,
    # -- loss components and their mixing weights --
    pointwise_type='bce',
    pairwise_type='pairwise_softplus',
    listwise_type='lambda',
    w_list=1.0755666826190335,
    w_pair=1.8398728897689836,
    w_point=0.813832693617893,
    # -- loss parameters --
    temperature=0.9342605824607415,
    sigma=1.5735217400312576,
    margin=0.7247599691970003,
    max_pairs=100,
    # -- LoRA adapter --
    lora_r=16,
    lora_alpha=16,
    lora_dropout=0.05,
)

# Retrieval / reranking pipeline settings
RETRIEVER_NAME = "nv-embed-v2"
# BGE-reranker-v2-m3 is used because it is compatible with transformers 4.44.2;
# jina-reranker-v3 requires newer transformers, which breaks NV-Embed-v2
RERANKER_MODEL_ID = "BAAI/bge-reranker-v2-m3"
TOP_K_RETRIEVER = 20  # number of retriever candidates handed to the reranker
N_FOLDS = 5
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Input locations, all relative to the project root
DATA_DIR = project_root / "data"
GROUNDTRUTH_PATH = DATA_DIR / "groundtruth/evidence_sentence_groundtruth.csv"
CORPUS_PATH = DATA_DIR / "groundtruth/sentence_corpus.jsonl"
CRITERIA_PATH = DATA_DIR / "DSM5/MDD_Criteira.json"
# RetrieverZoo internally appends its own '/retriever_zoo' subdirectory to this path
CACHE_DIR = DATA_DIR / "cache"

# Echo the effective configuration so it is recorded in the notebook output
print(f"Device: {DEVICE}")
print(f"Retriever: {RETRIEVER_NAME}")
print(f"Reranker: {RERANKER_MODEL_ID}")
print(f"Top-K Retriever: {TOP_K_RETRIEVER}")
print(f"N Folds: {N_FOLDS}")

## 2. Load Data

In [None]:
# Read the three inputs: relevance labels, the sentence corpus, and the criteria
groundtruth = load_groundtruth(GROUNDTRUTH_PATH)
sentences = load_sentence_corpus(CORPUS_PATH)
criteria = load_criteria(CRITERIA_PATH)

print(f"Groundtruth rows: {len(groundtruth)}")
print(f"Sentences: {len(sentences)}")
print(f"Criteria: {len(criteria)}")

# Index the corpus two ways in a single pass: by owning post and by sentence UID
sentences_by_post = defaultdict(list)
sentence_map = {}
for sentence in sentences:
    sentences_by_post[sentence.post_id].append(sentence)
    sentence_map[sentence.sent_uid] = sentence

# criterion_id -> criterion text, used as the query string
criteria_map = {}
for criterion in criteria:
    criteria_map[criterion.criterion_id] = criterion.text

# Distinct post IDs that appear in the groundtruth, in sorted order
all_post_ids = sorted({row.post_id for row in groundtruth})
print(f"Unique posts: {len(all_post_ids)}")

## 3. Create 5-Fold Cross-Validation Splits

In [None]:
def create_kfold_splits(post_ids: List[str], n_folds: int = 5, seed: int = 42) -> List[Tuple[List[str], List[str]]]:
    """Create k-fold cross-validation splits at post level (post-disjoint).

    The post IDs are shuffled once and cut into ``n_folds`` contiguous slices
    of ``len(post_ids) // n_folds`` posts; the last slice additionally absorbs
    the remainder. Each slice serves as the validation set of one fold and
    all remaining posts form that fold's training set.

    Args:
        post_ids: Post identifiers to split. The caller's list is not modified.
        n_folds: Number of folds; must be between 1 and ``len(post_ids)``.
        seed: Seed for the shuffle.

    Returns:
        A list of ``(train_posts, val_posts)`` tuples, one per fold.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1 or larger than the number
            of posts (which would produce empty validation folds).

    Note:
        This seeds NumPy's *global* random state via ``np.random.seed``, so it
        also affects any later ``np.random`` call in the same process.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be at least 1, got {n_folds}")
    if n_folds > len(post_ids):
        raise ValueError(
            f"n_folds ({n_folds}) cannot exceed the number of posts ({len(post_ids)})"
        )

    np.random.seed(seed)
    # np.array copies the input, so shuffling never touches the caller's list
    shuffled = np.array(post_ids)
    np.random.shuffle(shuffled)

    fold_size = len(shuffled) // n_folds
    folds = []

    for i in range(n_folds):
        start = i * fold_size
        # The final fold runs to the end so leftover posts are not dropped
        end = len(shuffled) if i == n_folds - 1 else start + fold_size

        val_posts = shuffled[start:end].tolist()
        train_posts = np.concatenate([shuffled[:start], shuffled[end:]]).tolist()
        folds.append((train_posts, val_posts))

    return folds

# Post-level folds shared by every later cell
folds = create_kfold_splits(all_post_ids, n_folds=N_FOLDS)

# Report how many posts land on each side of every fold
for i, fold in enumerate(folds):
    train_posts, val_posts = fold
    print(f"Fold {i+1}: Train={len(train_posts)} posts, Val={len(val_posts)} posts")

## 4. Build Query Data from Groundtruth

In [None]:
def build_query_data(groundtruth, post_ids: Set[str], include_no_evidence: bool = True):
    """Group groundtruth rows into one query per (post, criterion) pair.

    Only rows whose ``post_id`` is in ``post_ids`` are considered. Queries are
    returned in the order in which their pair first appears in ``groundtruth``.

    Each returned dict contains:
        - post_id, criterion_id
        - criterion_text: looked up in ``criteria_map``; falls back to the
          criterion ID itself when the ID is not in the map
        - gold_uids: set of sentence UIDs whose row has ``groundtruth == 1``
        - is_no_evidence: True when ``gold_uids`` is empty

    When ``include_no_evidence`` is False, queries without any positive
    sentence are left out of the result.
    """
    grouped = {}

    for record in groundtruth:
        if record.post_id in post_ids:
            pair = (record.post_id, record.criterion_id)
            entry = grouped.get(pair)
            if entry is None:
                entry = grouped[pair] = {
                    'post_id': record.post_id,
                    'criterion_id': record.criterion_id,
                    'criterion_text': criteria_map.get(record.criterion_id, record.criterion_id),
                    'gold_uids': set(),
                }

            if record.groundtruth == 1:
                entry['gold_uids'].add(record.sent_uid)

    kept = []
    for entry in grouped.values():
        lacks_gold = len(entry['gold_uids']) == 0
        entry['is_no_evidence'] = lacks_gold
        if not lacks_gold or include_no_evidence:
            kept.append(entry)

    return kept

# Sanity check on the full dataset: build every query, then partition the
# queries by whether they have at least one positive sentence
all_queries = build_query_data(groundtruth, set(all_post_ids), include_no_evidence=True)

queries_with_evidence = []
queries_no_evidence = []
for q in all_queries:
    if q['is_no_evidence']:
        queries_no_evidence.append(q)
    else:
        queries_with_evidence.append(q)

print(f"Total queries: {len(all_queries)}")
print(f"Queries with evidence: {len(queries_with_evidence)}")
print(f"Queries with no evidence: {len(queries_no_evidence)}")

## 5. Initialize Retriever

In [None]:
# Build the retriever once; the cross-validation loop below reuses this same
# instance for every fold instead of re-creating it.
retriever_zoo = RetrieverZoo(sentences=sentences, cache_dir=CACHE_DIR)
retriever = retriever_zoo.get_retriever(RETRIEVER_NAME)

# Encode the sentence corpus before any retrieval call.
# NOTE(review): rebuild=False presumably lets the retriever reuse embeddings
# cached under CACHE_DIR rather than re-encoding — confirm in RetrieverZoo.
print(f"Encoding corpus for {RETRIEVER_NAME}...")
retriever.encode_corpus(rebuild=False)
print("Retriever ready!")

## 6. Define Training and Assessment Functions

In [None]:
def prepare_training_data(queries, retriever, top_k: int = 20):
    """Retrieve candidate sentences for each query and attach relevance labels.

    Despite the name, the cross-validation loop uses this for both the
    training and the validation queries of a fold.

    A query is skipped when its post has fewer than two sentences in
    ``sentences_by_post``, when retrieval raises, or when fewer than two
    candidates come back. Retrieval failures are no longer swallowed silently:
    they are counted and summarised in a single printed warning after the loop.

    Args:
        queries: Query dicts as produced by ``build_query_data``.
        retriever: Object exposing ``retrieve_within_post(query=, post_id=, top_k=)``
            whose results carry ``sent_uid``, ``text`` and ``score`` attributes.
        top_k: Maximum number of candidates requested per query (capped at the
            number of sentences in the post).

    Returns:
        A list of dicts with keys ``query``, ``post_id``, ``criterion_id``,
        ``gold_uids`` (list), ``is_no_evidence`` and ``candidates``; each
        candidate is a dict with ``sent_uid``, ``text``, ``score`` and a
        binary ``label`` (1 when the sentence is in the query's gold set).
    """
    train_data = []
    # (post_id, criterion_id, exception) for every query whose retrieval raised
    failures = []

    for query in tqdm(queries, desc="Preparing training data"):
        post_id = query['post_id']
        criterion_text = query['criterion_text']
        gold_uids = query['gold_uids']

        # At least two sentences are needed for a ranking to be meaningful
        post_sentences = sentences_by_post.get(post_id, [])
        if len(post_sentences) < 2:
            continue

        # Retrieval stays best-effort (a failing query is skipped, not fatal),
        # but the failure is recorded so it can be reported afterwards
        try:
            results = retriever.retrieve_within_post(
                query=criterion_text,
                post_id=post_id,
                top_k=min(top_k, len(post_sentences)),
            )
        except Exception as exc:
            failures.append((post_id, query.get('criterion_id'), exc))
            continue

        if len(results) < 2:
            continue

        candidates = [
            {
                'sent_uid': r.sent_uid,
                'text': r.text,
                'score': r.score,
                'label': 1 if r.sent_uid in gold_uids else 0,
            }
            for r in results
        ]

        train_data.append({
            'query': criterion_text,
            'post_id': post_id,
            'criterion_id': query['criterion_id'],
            'gold_uids': list(gold_uids),
            'is_no_evidence': query['is_no_evidence'],
            'candidates': candidates,
        })

    if failures:
        first_post, first_criterion, first_exc = failures[0]
        print(
            f"  Warning: retrieval failed for {len(failures)} queries; they were skipped "
            f"(first failure: post_id={first_post}, criterion_id={first_criterion}: {first_exc!r})"
        )

    return train_data


def train_reranker(train_data, params, verbose=True):
    """Train a LoRA-adapted reranker on the training data.

    A fresh tokenizer and sequence-classification model are loaded from
    ``RERANKER_MODEL_ID`` on every call, wrapped with LoRA adapters, and
    trained with ``HybridRerankerLoss``. Each query together with at most
    ``TOP_K_RETRIEVER`` of its candidates forms one forward/backward pass;
    gradients are accumulated over ``params['grad_accum']`` passes per
    optimizer step.

    Args:
        train_data: Sequence of dicts as built by ``prepare_training_data``.
            Read here: ``'query'`` and ``'candidates'`` (each candidate
            supplying ``'text'`` and ``'label'``). The sequence itself is left
            in its original order; a shuffled copy is iterated instead.
        params: Hyperparameter dict. Keys read: ``lora_r``, ``lora_alpha``,
            ``lora_dropout``, ``pointwise_type``, ``pairwise_type``,
            ``listwise_type``, ``w_point``, ``w_pair``, ``w_list``,
            ``temperature``, ``sigma``, ``margin``, ``learning_rate``,
            ``weight_decay``, ``num_epochs`` and ``grad_accum``.
        verbose: When true, print the number of steps and the average loss.

    Returns:
        ``(model, tokenizer)`` — the PEFT-wrapped model (moved to ``DEVICE``)
        and the tokenizer it was trained with.
    """
    # Load fresh model
    tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_ID, trust_remote_code=True)
    # Set padding token if not defined (required for batch processing)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model - use float16 for BGE-reranker-v2-m3 (doesn't support bfloat16)
    # NOTE(review): the float16 weights are optimised below without gradient
    # scaling — confirm this is numerically stable for the chosen learning rate.
    model = AutoModelForSequenceClassification.from_pretrained(
        RERANKER_MODEL_ID,
        torch_dtype=torch.float16,
        trust_remote_code=True
    )
    # Set pad_token_id on model config
    model.config.pad_token_id = tokenizer.pad_token_id

    # Apply LoRA - target modules differ by model architecture
    # BGE-reranker-v2-m3 (RoBERTa): query, key, value
    # jina-reranker-v3 (Qwen3): q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
    if "bge" in RERANKER_MODEL_ID.lower():
        target_modules = ["query", "key", "value"]
    else:
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    lora_config = LoraConfig(
        r=params['lora_r'],
        lora_alpha=params['lora_alpha'],
        lora_dropout=params['lora_dropout'],
        target_modules=target_modules,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(model, lora_config)
    model = model.to(DEVICE)

    # Create loss function
    # NOTE(review): params['max_pairs'] and params['batch_size'] are not read
    # anywhere in this function — confirm whether the loss should receive max_pairs.
    loss_fn = HybridRerankerLoss(
        pointwise_type=params['pointwise_type'],
        pairwise_type=params['pairwise_type'],
        listwise_type=params['listwise_type'],
        w_point=params['w_point'],
        w_pair=params['w_pair'],
        w_list=params['w_list'],
        temperature=params['temperature'],
        sigma=params['sigma'],
        margin=params['margin'],
    )

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
    grad_accum = params['grad_accum']

    # Training
    model.train()
    total_loss = 0
    step = 0

    # Shuffle a copy so the caller's list keeps its order; the same order is
    # then reused for every epoch
    shuffled = list(train_data)
    np.random.shuffle(shuffled)

    for _ in range(params['num_epochs']):
        for query_data in shuffled:
            query_text = query_data['query']
            candidates = query_data['candidates'][:TOP_K_RETRIEVER]

            if len(candidates) < 2:
                continue

            # Prepare inputs: one (query, sentence) pair per candidate
            texts = [[query_text, c['text']] for c in candidates]
            labels = torch.tensor([c['label'] for c in candidates], dtype=torch.float32, device=DEVICE)

            inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            # Forward
            outputs = model(**inputs)
            scores = outputs.logits.squeeze(-1)

            # Loss, scaled down so the accumulated gradient is an average
            loss = loss_fn(scores, labels)
            loss = loss / grad_accum
            loss.backward()

            step += 1
            if step % grad_accum == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Undo the scaling so the reported loss is per query
            total_loss += loss.item() * grad_accum

    # Apply gradients left over from an incomplete accumulation window;
    # otherwise the last (step % grad_accum) queries would never update the model
    if step % grad_accum != 0:
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / step if step > 0 else 0
    if verbose:
        print(f"  Training done. Steps: {step}, Avg loss: {avg_loss:.4f}")

    return model, tokenizer


def assess_reranker(model, tokenizer, val_data, verbose=True):
    """Assess reranker on validation data.

    For every query that has at least two candidates and at least one gold
    sentence, the (truncated) retriever candidates are scored by ``model`` and
    re-sorted by descending score. nDCG, recall, MRR and MAP are computed at
    cutoffs 1, 5, 10 and 20 for both the retriever order and the reranked
    order. Queries without gold sentences are skipped entirely.

    Ties between reranker scores keep the retriever's relative order, and NaN
    scores are ranked last (a stable descending sort is used).

    Args:
        model: Scoring model; called as ``model(**inputs)`` and expected to
            return an object with a ``logits`` tensor (one score per pair
            after squeezing the last dimension).
        tokenizer: Callable producing the model inputs for a list of
            ``[query, sentence]`` pairs.
        val_data: Dicts as built by ``prepare_training_data``.
        verbose: When false, the progress bar is disabled.

    Returns:
        - metrics: dict with the per-cutoff means (keys such as
          ``'ret_ndcg@10'`` / ``'rerank_ndcg@10'``) plus ``'n_queries'``.
          Metric keys are absent when no query was assessed.
        - detailed_results: list of per-query results for investigation
          (both rankings with text truncated to 100 characters, plus the
          per-query nDCG and recall values).
    """
    model.eval()

    cutoffs = (1, 5, 10, 20)
    # Order matters: it determines the key order of the returned metrics dict
    metric_names = (
        'ret_ndcg', 'ret_recall', 'ret_mrr', 'ret_map',
        'rerank_ndcg', 'rerank_recall', 'rerank_mrr', 'rerank_map',
    )
    all_results = {k: [] for k in cutoffs}
    detailed_results = []

    with torch.no_grad():
        for query_data in tqdm(val_data, desc="Assessing", disable=not verbose):
            query_text = query_data['query']
            candidates = query_data['candidates'][:TOP_K_RETRIEVER]
            gold_uids = set(query_data['gold_uids'])

            if len(candidates) < 2:
                continue

            # Skip queries with no positives for metric calculation
            if not gold_uids:
                continue

            # Score candidates
            texts = [[query_text, c['text']] for c in candidates]
            inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            outputs = model(**inputs)
            # Cast to float32 before converting: the values are unchanged for
            # float16 logits, and dtypes NumPy cannot represent do not break .numpy()
            scores = outputs.logits.squeeze(-1).float().cpu().numpy()

            # Retriever-only ranking is simply the candidate order
            retriever_ranking = [c['sent_uid'] for c in candidates]

            # Rerank by descending score. Sorting the negated scores with a
            # stable sort keeps tied candidates in retriever order and places
            # NaN scores last, instead of reversing ties / ranking NaN first.
            ranked_indices = np.argsort(-scores, kind="stable")
            reranked_candidates = [candidates[i] for i in ranked_indices]
            reranked_ids = [c['sent_uid'] for c in reranked_candidates]
            reranked_scores = [float(scores[i]) for i in ranked_indices]

            # Compute metrics for each k
            query_metrics = {}
            for k in cutoffs:
                per_query = {
                    # Retriever metrics
                    'ret_ndcg': ndcg_at_k(gold_uids, retriever_ranking, k),
                    'ret_recall': recall_at_k(gold_uids, retriever_ranking, k),
                    'ret_mrr': mrr_at_k(gold_uids, retriever_ranking, k),
                    'ret_map': map_at_k(gold_uids, retriever_ranking, k),
                    # Reranker metrics
                    'rerank_ndcg': ndcg_at_k(gold_uids, reranked_ids, k),
                    'rerank_recall': recall_at_k(gold_uids, reranked_ids, k),
                    'rerank_mrr': mrr_at_k(gold_uids, reranked_ids, k),
                    'rerank_map': map_at_k(gold_uids, reranked_ids, k),
                }
                all_results[k].append(per_query)

                query_metrics[f'ndcg@{k}'] = {
                    'retriever': per_query['ret_ndcg'], 'reranker': per_query['rerank_ndcg']}
                query_metrics[f'recall@{k}'] = {
                    'retriever': per_query['ret_recall'], 'reranker': per_query['rerank_recall']}

            # Store detailed result for investigation. Scores are converted to
            # built-in floats so the structure can be dumped to JSON later.
            detailed_results.append({
                'post_id': query_data['post_id'],
                'criterion_id': query_data['criterion_id'],
                'query_text': query_text,
                'gold_uids': list(gold_uids),
                'retriever_ranking': [
                    {'sent_uid': c['sent_uid'], 'text': c['text'][:100], 'score': float(c['score']),
                     'is_positive': c['sent_uid'] in gold_uids}
                    for c in candidates
                ],
                'reranker_ranking': [
                    {'sent_uid': c['sent_uid'],
                     'text': c['text'][:100],
                     'reranker_score': score,
                     'is_positive': c['sent_uid'] in gold_uids}
                    for c, score in zip(reranked_candidates, reranked_scores)
                ],
                'metrics': query_metrics,
            })

    # Aggregate metrics: mean over assessed queries, per cutoff
    metrics = {}
    for k in cutoffs:
        if all_results[k]:
            for name in metric_names:
                metrics[f'{name}@{k}'] = np.mean([r[name] for r in all_results[k]])

    metrics['n_queries'] = len(all_results[10])

    return metrics, detailed_results

## 7. Run 5-Fold Cross-Validation

In [None]:
# Per-fold aggregate metrics and the per-query details of all folds combined
fold_metrics = []
all_detailed_results = []

for fold_idx, (train_posts, val_posts) in enumerate(folds):
    print(f"\n{'='*60}")
    print(f"FOLD {fold_idx + 1}/{N_FOLDS}")
    print(f"{'='*60}")
    print(f"Train posts: {len(train_posts)}, Val posts: {len(val_posts)}")

    # Build query data for this fold (no-evidence queries are kept on both
    # sides; assess_reranker skips them when computing metrics)
    train_queries = build_query_data(groundtruth, set(train_posts), include_no_evidence=True)
    val_queries = build_query_data(groundtruth, set(val_posts), include_no_evidence=True)

    print(f"Train queries: {len(train_queries)}, Val queries: {len(val_queries)}")

    # Prepare training and validation data with retriever candidates
    print("\nPreparing training data...")
    train_data = prepare_training_data(train_queries, retriever, top_k=TOP_K_RETRIEVER)

    print("Preparing validation data...")
    val_data = prepare_training_data(val_queries, retriever, top_k=TOP_K_RETRIEVER)

    print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

    # Train a fresh reranker for this fold
    print("\nTraining reranker...")
    model, tokenizer = train_reranker(train_data, BEST_PARAMS, verbose=True)

    # Assess
    print("\nAssessing...")
    metrics, detailed = assess_reranker(model, tokenizer, val_data, verbose=True)

    # Add fold info to detailed results
    for d in detailed:
        d['fold'] = fold_idx + 1

    fold_metrics.append(metrics)
    all_detailed_results.extend(detailed)

    # Print fold results. The delta uses the '+' sign flag so a regression is
    # shown as "-0.0123" rather than "+-0.0123".
    print(f"\nFold {fold_idx + 1} Results:")
    print(f"  Queries: {metrics['n_queries']}")
    print(f"  Retriever nDCG@10: {metrics['ret_ndcg@10']:.4f}")
    print(f"  Reranker nDCG@10:  {metrics['rerank_ndcg@10']:.4f} ({metrics['rerank_ndcg@10'] - metrics['ret_ndcg@10']:+.4f})")

    # Clean up: drop this fold's model before the next one is loaded
    del model, tokenizer
    torch.cuda.empty_cache()

## 8. Aggregate Results Across Folds

In [None]:
# Collapse the per-fold metric dicts: query counts are summed, every other
# metric is summarised by its mean and standard deviation across folds
aggregated = {}
for key in fold_metrics[0]:
    per_fold = [m[key] for m in fold_metrics]
    if key != 'n_queries':
        aggregated[f'{key}_mean'] = np.mean(per_fold)
        aggregated[f'{key}_std'] = np.std(per_fold)
    else:
        aggregated[key] = sum(per_fold)

metric_names = ['ndcg', 'recall', 'mrr', 'map']
cutoffs = [1, 5, 10, 20]
rule = "-"*65

# Banner
print("\n" + "="*80)
print("5-FOLD CROSS-VALIDATION RESULTS")
print(f"Model: NV-Embed-v2 + {RERANKER_MODEL_ID.split('/')[-1]} (Trial 33 config, no_evidence=True)")
print("="*80)

print(f"\nTotal queries assessed: {aggregated['n_queries']}")

# Table header
print(f"\n{'Metric':<15} {'@1':>12} {'@5':>12} {'@10':>12} {'@20':>12}")
print(rule)

# One table section per pipeline stage (retriever first, then reranker),
# each closed by a horizontal rule
for prefix, label_template in (("ret", "Ret {:<10}"), ("rerank", "Rerank {:<7}")):
    for metric_name in metric_names:
        row = label_template.format(metric_name.upper())
        for k in cutoffs:
            mean = aggregated[f'{prefix}_{metric_name}@{k}_mean']
            std = aggregated[f'{prefix}_{metric_name}@{k}_std']
            row += f" {mean:.4f}+/-{std:.3f}"
        print(row)
    print(rule)

# Absolute and relative change of the fold means
print("\nImprovement (Reranker - Retriever):")
for metric_name in metric_names:
    row = f"{metric_name.upper():<12}"
    for k in cutoffs:
        ret_mean = aggregated[f'ret_{metric_name}@{k}_mean']
        rerank_mean = aggregated[f'rerank_{metric_name}@{k}_mean']
        diff = rerank_mean - ret_mean
        if ret_mean > 0:
            pct = diff / ret_mean * 100
        else:
            pct = 0
        row += f" {diff:+.4f} ({pct:+.1f}%)"
    print(row)

In [None]:
# One row per fold, labelled "Fold 1" .. "Fold N", one column per metric
fold_df = pd.DataFrame(
    fold_metrics,
    index=[f"Fold {i+1}" for i in range(len(fold_metrics))],
)

# Headline metrics shown in the per-fold comparison
key_metrics = [
    'ret_ndcg@10',
    'rerank_ndcg@10',
    'ret_recall@10',
    'rerank_recall@10',
    'n_queries',
]
print("\nPer-Fold Metrics:")
display(fold_df.loc[:, key_metrics].round(4))

## 9. Detailed Results for Investigation

In [None]:
# Flatten the per-query results into one summary row each (nDCG@10 only)
summary_rows = []
for d in all_detailed_results:
    ndcg10 = d['metrics']['ndcg@10']
    summary_rows.append({
        'fold': d['fold'],
        'post_id': d['post_id'],
        'criterion_id': d['criterion_id'],
        # first 50 characters of the query, always followed by an ellipsis
        'query_text': d['query_text'][:50] + '...',
        'n_gold': len(d['gold_uids']),
        'ret_ndcg@10': ndcg10['retriever'],
        'rerank_ndcg@10': ndcg10['reranker'],
        'improvement': ndcg10['reranker'] - ndcg10['retriever'],
    })

detailed_df = pd.DataFrame(summary_rows)

print(f"Total detailed results: {len(detailed_df)}")
display(detailed_df.head(10))

In [None]:
# The ten queries with the largest nDCG@10 gain from reranking
print("\nQueries where reranker improved most:")
top_improvements = detailed_df.nlargest(n=10, columns='improvement')
display(top_improvements)

In [None]:
# The ten queries with the largest nDCG@10 loss from reranking
print("\nQueries where reranker hurt most:")
worst_regressions = detailed_df.nsmallest(n=10, columns='improvement')
display(worst_regressions)

In [None]:
# Side-by-side inspection of a single assessed query
def show_query_detail(query_idx: int):
    """Print the retriever and reranker rankings for one assessed query.

    ``query_idx`` is a position in ``all_detailed_results``. The output lists
    the query's identifiers and gold UIDs, the top 10 entries of each ranking
    (positives marked with ``[+]``), and the nDCG at cutoffs 1, 5 and 10 for
    both stages together with their difference.
    """
    record = all_detailed_results[query_idx]

    print(f"Post ID: {record['post_id']}")
    print(f"Criterion: {record['criterion_id']}")
    print(f"Query: {record['query_text']}")
    print(f"Gold UIDs: {record['gold_uids']}")

    # Both rankings share one layout; only the heading and the score key differ
    sections = (
        ("RETRIEVER RANKING:", record['retriever_ranking'], 'score'),
        ("RERANKER RANKING:", record['reranker_ranking'], 'reranker_score'),
    )
    for heading, ranking, score_key in sections:
        print()
        print(heading)
        print("-" * 80)
        for rank, cand in enumerate(ranking[:10], 1):
            marker = "[ ]"
            if cand['is_positive']:
                marker = "[+]"
            print(f"{rank:2d}. {marker} {cand['sent_uid']} (score: {cand[score_key]:.4f})")
            print(f"     {cand['text']}")

    print()
    print("METRICS:")
    for k in (1, 5, 10):
        ndcg = record['metrics'][f'ndcg@{k}']
        ret, rerank = ndcg['retriever'], ndcg['reranker']
        print(f"  nDCG@{k}: Retriever={ret:.4f}, Reranker={rerank:.4f}, Delta={rerank-ret:+.4f}")

In [None]:
# Drill into the single query with the largest nDCG@10 gain
best_idx = detailed_df.loc[:, 'improvement'].idxmax()
print("QUERY WITH BEST RERANKER IMPROVEMENT:", "=" * 80, sep="\n")
show_query_detail(best_idx)

In [None]:
# Drill into the single query with the largest nDCG@10 loss
worst_idx = detailed_df.loc[:, 'improvement'].idxmin()
print("QUERY WITH WORST RERANKER REGRESSION:", "=" * 80, sep="\n")
show_query_detail(worst_idx)

## 10. Save Results

In [None]:
# All artefacts go to <project_root>/outputs/5fold_results (created on demand)
output_dir = project_root / "outputs" / "5fold_results"
output_dir.mkdir(parents=True, exist_ok=True)

# Mean/std of every metric across folds
with (output_dir / "aggregated_metrics.json").open("w") as handle:
    json.dump(aggregated, handle, indent=2)

# One row per fold (the fold label is written as the index column)
fold_df.to_csv(output_dir / "fold_metrics.csv")

# One summary row per assessed query
detailed_df.to_csv(output_dir / "detailed_results.csv", index=False)

# Complete per-query records, including both rankings
with (output_dir / "full_detailed_results.json").open("w") as handle:
    json.dump(all_detailed_results, handle, indent=2)

print(f"Results saved to: {output_dir}")
print("  - aggregated_metrics.json")
print("  - fold_metrics.csv")
print("  - detailed_results.csv")
print("  - full_detailed_results.json")

## 11. Analysis Summary

In [None]:
# Summary statistics
print("\n" + "="*80)
print("ANALYSIS SUMMARY")
print("="*80)

# 1. Fold-averaged nDCG@10 of both stages and the change between them
print(f"\n1. Overall Performance:")
print(f"   - Retriever nDCG@10: {aggregated['ret_ndcg@10_mean']:.4f} +/- {aggregated['ret_ndcg@10_std']:.4f}")
print(f"   - Reranker nDCG@10:  {aggregated['rerank_ndcg@10_mean']:.4f} +/- {aggregated['rerank_ndcg@10_std']:.4f}")
improvement = aggregated['rerank_ndcg@10_mean'] - aggregated['ret_ndcg@10_mean']
pct_improvement = improvement / aggregated['ret_ndcg@10_mean'] * 100
# The '+' sign flag prints the correct sign for regressions as well
# (a hard-coded "+" prefix would render a negative change as "+-0.0123")
print(f"   - Improvement: {improvement:+.4f} ({pct_improvement:+.1f}%)")

# 2. How many queries moved in each direction on nDCG@10
print(f"\n2. Query Analysis:")
improved = (detailed_df['improvement'] > 0).sum()
unchanged = (detailed_df['improvement'] == 0).sum()
regressed = (detailed_df['improvement'] < 0).sum()
print(f"   - Improved: {improved} ({improved/len(detailed_df)*100:.1f}%)")
print(f"   - Unchanged: {unchanged} ({unchanged/len(detailed_df)*100:.1f}%)")
print(f"   - Regressed: {regressed} ({regressed/len(detailed_df)*100:.1f}%)")

# 3. Recap of the hyperparameters that produced these numbers
print(f"\n3. Best Config (Trial 33):")
print(f"   - Loss: BCE + Pairwise-Softplus + Lambda")
print(f"   - Weights: w_point={BEST_PARAMS['w_point']:.3f}, w_pair={BEST_PARAMS['w_pair']:.3f}, w_list={BEST_PARAMS['w_list']:.3f}")
print(f"   - Learning rate: {BEST_PARAMS['learning_rate']:.2e}")
print(f"   - LoRA: r={BEST_PARAMS['lora_r']}, alpha={BEST_PARAMS['lora_alpha']}")

print(f"\n4. Next Steps:")
print(f"   - Investigate queries where reranker regressed")
print(f"   - Analyze error patterns by criterion type")
print(f"   - Consider per-criterion fine-tuning")