# S-C Evidence Pipeline - Stage 2: Reranking (No Post-processing)

This notebook fine-tunes (LoRA) and evaluates the jina-reranker-v3 reranking stage with 5-fold cross-validation, using the retrieval candidates produced by Stage 1.

**Run this notebook in the Jina reranker conda environment.**

## Prerequisites
- Run `sc_retrieval_pipeline.ipynb` first to generate retrieval candidates
- Retrieval candidates (`retrieval_candidates.pkl`) and their metadata (`config_metadata.json`) should be in `outputs/retrieval_candidates/`

## Best Config (Trial 33, nDCG@10=0.8205)
- **Retriever:** NV-Embed-v2
- **Reranker:** jina-reranker-v3 (LoRA fine-tuned, Qwen3-based)
- **Training:** no_evidence=True (includes queries with no positive evidence)
- **Loss:** BCE + Pairwise-Softplus + Lambda

In [1]:
import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Set
from tqdm.auto import tqdm

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW

# Add project root to path
project_root = Path("..")
sys.path.insert(0, str(project_root))

from final_sc_review.metrics.ranking import ndcg_at_k, recall_at_k, mrr_at_k, map_at_k
from final_sc_review.reranker.losses import HybridRerankerLoss

  from .autonotebook import tqdm as notebook_tqdm


## 1. Configuration

In [2]:
# Best Trial 33 hyperparameters (consumed by train_reranker)
BEST_PARAMS = {
    # Optimisation schedule
    'batch_size': 1,
    'num_epochs': 1,
    'learning_rate': 4.447467238603695e-05,
    'weight_decay': 8.769982161626777e-05,
    'grad_accum': 2,
    # Hybrid loss: which pointwise / pairwise / listwise terms to combine and their weights
    'pointwise_type': 'bce',
    'pairwise_type': 'pairwise_softplus',
    'listwise_type': 'lambda',
    'w_list': 1.0755666826190335,
    'w_pair': 1.8398728897689836,
    'w_point': 0.813832693617893,
    'temperature': 0.9342605824607415,
    'sigma': 1.5735217400312576,
    'margin': 0.7247599691970003,
    'max_pairs': 100,
    # LoRA adapter shape
    'lora_r': 16,
    'lora_alpha': 16,
    'lora_dropout': 0.05,
}

# Pipeline config
RERANKER_MODEL_ID = "jinaai/jina-reranker-v3"
TOP_K_RETRIEVER = 20  # Candidates from retriever
N_FOLDS = 5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Reproducibility: training shuffles with the global NumPy RNG and initialises new
# weights with the torch RNG, so seed both before any model is built.
# NOTE(review): some GPU kernels may still be non-deterministic; exact run-to-run
# equality is not guaranteed by seeding alone.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Input/Output paths
INPUT_DIR = project_root / "outputs" / "retrieval_candidates"
OUTPUT_DIR = project_root / "outputs" / "5fold_results"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Device: {DEVICE}")
print(f"Reranker: {RERANKER_MODEL_ID}")
print(f"Top-K Retriever: {TOP_K_RETRIEVER}")
print(f"N Folds: {N_FOLDS}")

Device: cuda
Reranker: jinaai/jina-reranker-v3
Top-K Retriever: 20
N Folds: 5


## 2. Load Retrieval Candidates

In [3]:
# Read the metadata that Stage 1 wrote next to the retrieval candidates
metadata_path = INPUT_DIR / "config_metadata.json"
config_metadata = json.loads(metadata_path.read_text())

# Echo every recorded setting so the run log documents the retrieval setup
print("Retrieval Configuration:")
for key in config_metadata:
    value = config_metadata[key]
    print(f"  - {key}: {value}")

Retrieval Configuration:
  - retriever_name: nv-embed-v2
  - top_k_retriever: 20
  - n_folds: 5
  - total_posts: 1477
  - total_queries: 14770


In [4]:
# Load retrieval candidates (using pickle for efficiency)
# Note: This is internal pipeline data generated by sc_retrieval_pipeline.ipynb.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# ever point this at a file produced by that notebook.
with open(INPUT_DIR / "retrieval_candidates.pkl", "rb") as f:
    all_fold_data = pickle.load(f)

# Fail fast: the cross-validation loop indexes all_fold_data by 'fold_1'..'fold_N'
# and reads 'train_data' / 'val_data'. Catch a missing or malformed fold now
# rather than after earlier folds have already been trained.
expected_folds = [f"fold_{i + 1}" for i in range(N_FOLDS)]
missing_folds = [name for name in expected_folds if name not in all_fold_data]
if missing_folds:
    raise KeyError(
        f"retrieval_candidates.pkl is missing folds required by N_FOLDS={N_FOLDS}: {missing_folds}"
    )
malformed_folds = [
    name for name in expected_folds
    if not {'train_data', 'val_data'} <= set(all_fold_data[name])
]
if malformed_folds:
    raise KeyError(
        f"Folds without both 'train_data' and 'val_data': {malformed_folds}"
    )

print(f"\nLoaded {len(all_fold_data)} folds:")
for fold_name, fold_data in all_fold_data.items():
    print(f"  - {fold_name}: {len(fold_data['train_data'])} train, {len(fold_data['val_data'])} val samples")


Loaded 5 folds:
  - fold_1: 10810 train, 2700 val samples
  - fold_2: 10790 train, 2720 val samples
  - fold_3: 10810 train, 2700 val samples
  - fold_4: 10770 train, 2740 val samples
  - fold_5: 10860 train, 2650 val samples


## 3. Define Training and Assessment Functions

In [5]:
def train_reranker(train_data, params, verbose=True):
    """Fine-tune a freshly loaded, LoRA-adapted reranker on one fold's training queries.

    Each query is one training step: its first ``TOP_K_RETRIEVER`` candidates are
    scored in a single forward pass and the hybrid loss is computed over that list.

    Args:
        train_data: Sequence of query dicts. Only ``'query'`` and ``'candidates'``
            are read; each candidate is read for ``'text'`` and ``'label'``.
        params: Hyperparameter dict. Keys read here: ``lora_r``, ``lora_alpha``,
            ``lora_dropout``, ``pointwise_type``, ``pairwise_type``,
            ``listwise_type``, ``w_point``, ``w_pair``, ``w_list``,
            ``temperature``, ``sigma``, ``margin``, ``learning_rate``,
            ``weight_decay``, ``num_epochs``, ``grad_accum``.
        verbose: If True, print the number of steps and the average loss.

    Returns:
        ``(model, tokenizer)`` - the trained PEFT-wrapped model (on ``DEVICE``,
        still in train mode) and its tokenizer.
    """
    # Load a fresh tokenizer and model so nothing carries over between folds
    tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_ID, trust_remote_code=True)
    # Set padding token if not defined (required for batch processing)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with num_labels=1 for regression-style reranking
    model = AutoModelForSequenceClassification.from_pretrained(
        RERANKER_MODEL_ID,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        num_labels=1
    )
    # The sequence-classification head needs pad_token_id to handle padded batches
    model.config.pad_token_id = tokenizer.pad_token_id

    # Apply LoRA - target modules for jina-reranker-v3 (Qwen3-based)
    # Qwen3 uses: q_proj, k_proj, v_proj, o_proj (attention) + gate_proj, up_proj, down_proj (MLP)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    lora_config = LoraConfig(
        r=params['lora_r'],
        lora_alpha=params['lora_alpha'],
        lora_dropout=params['lora_dropout'],
        target_modules=target_modules,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(model, lora_config)
    model = model.to(DEVICE)

    # Create loss function
    # NOTE(review): params also carries 'max_pairs', which is not forwarded here.
    # Confirm whether HybridRerankerLoss accepts it or whether the tuned value is unused.
    loss_fn = HybridRerankerLoss(
        pointwise_type=params['pointwise_type'],
        pairwise_type=params['pairwise_type'],
        listwise_type=params['listwise_type'],
        w_point=params['w_point'],
        w_pair=params['w_pair'],
        w_list=params['w_list'],
        temperature=params['temperature'],
        sigma=params['sigma'],
        margin=params['margin'],
    )

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])

    grad_accum = params['grad_accum']

    # Training
    model.train()
    total_loss = 0
    step = 0

    # Work on a shallow copy so the caller's list order is never disturbed
    train_queries = list(train_data)

    for epoch in range(params['num_epochs']):
        # Reshuffle every epoch so multi-epoch runs do not replay one fixed order
        np.random.shuffle(train_queries)

        for query_data in train_queries:
            query_text = query_data['query']
            candidates = query_data['candidates'][:TOP_K_RETRIEVER]

            # Ranking losses need at least two candidates to compare
            if len(candidates) < 2:
                continue

            # Prepare inputs: one (query, sentence) pair per candidate
            texts = [[query_text, c['text']] for c in candidates]
            labels = torch.tensor([c['label'] for c in candidates], dtype=torch.float32, device=DEVICE)

            inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            # Forward: logits come back as (n_candidates, 1); drop the label dimension
            outputs = model(**inputs)
            scores = outputs.logits.squeeze(-1)

            # Loss, scaled so the accumulated gradient averages over the window
            raw_loss = loss_fn(scores, labels)
            (raw_loss / grad_accum).backward()

            step += 1
            if step % grad_accum == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Track the unscaled loss for reporting
            total_loss += raw_loss.item()

    # Apply gradients left in a trailing, incomplete accumulation window; without
    # this the last (step % grad_accum) queries would never update the weights.
    if step % grad_accum != 0:
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / step if step > 0 else 0
    if verbose:
        print(f"  Training done. Steps: {step}, Avg loss: {avg_loss:.4f}")

    return model, tokenizer

In [6]:
def assess_reranker(model, tokenizer, val_data, verbose=True,
                    k_values: Tuple[int, ...] = (1, 5, 10, 20)):
    """Assess reranker on validation data and compare it with the retriever order.

    Queries with fewer than two candidates, or with no gold sentences, are
    skipped and do not contribute to any metric.

    Args:
        model: Sequence-classification model returning one logit per input pair.
        tokenizer: Tokenizer matching ``model``.
        val_data: Iterable of query dicts; read for ``'query'``, ``'candidates'``,
            ``'gold_uids'``, ``'post_id'`` and ``'criterion_id'``. Each candidate
            is read for ``'sent_uid'``, ``'text'`` and ``'score'``.
        verbose: If True, show a progress bar.
        k_values: Cut-offs at which nDCG / Recall / MRR / MAP are computed.

    Returns:
        - metrics: dict with aggregated metrics (``'{ret|rerank}_{metric}@{k}'``
          means over scored queries, NaN when no query was scored) plus
          ``'n_queries'``, the number of scored queries
        - detailed_results: list of per-query results for investigation
    """
    model.eval()

    # (suffix, function) pairs; every function is called as fn(gold, ranking, k)
    metric_fns = (
        ('ndcg', ndcg_at_k),
        ('recall', recall_at_k),
        ('mrr', mrr_at_k),
        ('map', map_at_k),
    )

    all_results = {k: [] for k in k_values}
    detailed_results = []

    with torch.no_grad():
        for query_data in tqdm(val_data, desc="Assessing", disable=not verbose):
            query_text = query_data['query']
            candidates = query_data['candidates'][:TOP_K_RETRIEVER]
            gold_uids = set(query_data['gold_uids'])

            if len(candidates) < 2:
                continue

            # Skip queries with no positives for metric calculation
            if not gold_uids:
                continue

            # Score candidates: one (query, sentence) pair per candidate
            texts = [[query_text, c['text']] for c in candidates]
            inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

            outputs = model(**inputs)
            # Convert to float32 before numpy (bfloat16 not supported by numpy)
            scores = outputs.logits.squeeze(-1).float().cpu().numpy()

            # Retriever-only ranking is simply the order in which candidates arrive
            retriever_ranking = [c['sent_uid'] for c in candidates]

            # Rerank by descending score. A stable sort keeps tied candidates in
            # retriever order, so ties are broken deterministically.
            ranked_indices = np.argsort(-scores, kind="stable")
            reranked_candidates = [candidates[i] for i in ranked_indices]
            reranked_ids = [c['sent_uid'] for c in reranked_candidates]
            reranked_scores = [float(scores[i]) for i in ranked_indices]

            # Compute metrics for each k
            query_metrics = {}
            for k in k_values:
                row = {}
                for prefix, ranking in (('ret', retriever_ranking), ('rerank', reranked_ids)):
                    for suffix, metric_fn in metric_fns:
                        row[f'{prefix}_{suffix}'] = metric_fn(gold_uids, ranking, k)
                all_results[k].append(row)

                query_metrics[f'ndcg@{k}'] = {'retriever': row['ret_ndcg'], 'reranker': row['rerank_ndcg']}
                query_metrics[f'recall@{k}'] = {'retriever': row['ret_recall'], 'reranker': row['rerank_recall']}

            # Store detailed result for investigation (text truncated to 100 chars)
            detailed_results.append({
                'post_id': query_data['post_id'],
                'criterion_id': query_data['criterion_id'],
                'query_text': query_text,
                'gold_uids': list(gold_uids),
                'retriever_ranking': [
                    {'sent_uid': c['sent_uid'], 'text': c['text'][:100], 'score': c['score'],
                     'is_positive': c['sent_uid'] in gold_uids}
                    for c in candidates
                ],
                'reranker_ranking': [
                    {'sent_uid': c['sent_uid'],
                     'text': c['text'][:100],
                     'reranker_score': score,
                     'is_positive': c['sent_uid'] in gold_uids}
                    for c, score in zip(reranked_candidates, reranked_scores)
                ],
                'metrics': query_metrics,
            })

    # Aggregate metrics. Keys are always present (NaN when nothing was scored) so
    # callers can index them and every fold exposes the same key set.
    metrics = {}
    for k in k_values:
        rows = all_results[k]
        for prefix in ('ret', 'rerank'):
            for suffix, _ in metric_fns:
                name = f'{prefix}_{suffix}'
                metrics[f'{name}@{k}'] = np.mean([r[name] for r in rows]) if rows else float('nan')

    # One detailed record is stored per scored query
    metrics['n_queries'] = len(detailed_results)

    return metrics, detailed_results

## 4. Run 5-Fold Cross-Validation

In [7]:
# Store results: one metrics dict per fold, plus per-query records from every fold
fold_metrics = []
all_detailed_results = []

for fold_idx in range(N_FOLDS):
    fold_name = f'fold_{fold_idx + 1}'
    fold_data = all_fold_data[fold_name]

    print(f"\n{'='*60}")
    print(f"FOLD {fold_idx + 1}/{N_FOLDS}")
    print(f"{'='*60}")

    train_data = fold_data['train_data']
    val_data = fold_data['val_data']

    print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

    # Train reranker (a fresh model is loaded for every fold)
    print("\nTraining reranker...")
    model, tokenizer = train_reranker(train_data, BEST_PARAMS, verbose=True)

    # Assess on the held-out part of this fold
    print("\nAssessing...")
    metrics, detailed = assess_reranker(model, tokenizer, val_data, verbose=True)

    # Add fold info to detailed results
    for d in detailed:
        d['fold'] = fold_idx + 1

    fold_metrics.append(metrics)
    all_detailed_results.extend(detailed)

    # Print fold results. The gain uses the '+' sign spec so a regression prints
    # as "(-0.0123)" instead of the malformed "(+-0.0123)".
    ndcg_gain = metrics['rerank_ndcg@10'] - metrics['ret_ndcg@10']
    print(f"\nFold {fold_idx + 1} Results:")
    print(f"  Queries: {metrics['n_queries']}")
    print(f"  Retriever nDCG@10: {metrics['ret_ndcg@10']:.4f}")
    print(f"  Reranker nDCG@10:  {metrics['rerank_ndcg@10']:.4f} ({ndcg_gain:+.4f})")

    # Clean up: drop this fold's model before the next one is loaded
    del model, tokenizer
    torch.cuda.empty_cache()


FOLD 1/5
Training samples: 10810, Validation samples: 2700

Training reranker...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Training done. Steps: 10810, Avg loss: 0.1247

Assessing...


Assessing:   0%|          | 0/2700 [00:00<?, ?it/s]

Assessing:   2%|▏         | 47/2700 [00:00<00:06, 413.95it/s]

Assessing:   4%|▍         | 117/2700 [00:00<00:04, 518.23it/s]

Assessing:   6%|▋         | 172/2700 [00:00<00:05, 500.34it/s]

Assessing:   8%|▊         | 227/2700 [00:00<00:04, 500.13it/s]

Assessing:  10%|█         | 277/2700 [00:00<00:06, 375.88it/s]

Assessing:  12%|█▏        | 329/2700 [00:00<00:06, 391.04it/s]

Assessing:  14%|█▍        | 384/2700 [00:00<00:05, 417.00it/s]

Assessing:  16%|█▌        | 437/2700 [00:01<00:05, 420.87it/s]

Assessing:  18%|█▊        | 481/2700 [00:01<00:06, 369.53it/s]

Assessing:  19%|█▉        | 521/2700 [00:01<00:05, 369.90it/s]

Assessing:  21%|██▏       | 574/2700 [00:01<00:05, 398.45it/s]

Assessing:  23%|██▎       | 627/2700 [00:01<00:05, 402.76it/s]

Assessing:  25%|██▍       | 669/2700 [00:01<00:05, 376.89it/s]

Assessing:  27%|██▋       | 732/2700 [00:01<00:04, 419.41it/s]

Assessing:  29%|██▉       | 790/2700 [00:01<00:04, 448.99it/s]

Assessing:  31%|███       | 836/2700 [00:01<00:04, 443.57it/s]

Assessing:  33%|███▎      | 884/2700 [00:02<00:04, 431.61it/s]

Assessing:  34%|███▍      | 928/2700 [00:02<00:04, 417.77it/s]

Assessing:  36%|███▌      | 970/2700 [00:02<00:04, 377.82it/s]

Assessing:  37%|███▋      | 1011/2700 [00:02<00:04, 384.74it/s]

Assessing:  39%|███▉      | 1051/2700 [00:02<00:04, 382.99it/s]

Assessing:  40%|████      | 1090/2700 [00:02<00:04, 351.63it/s]

Assessing:  42%|████▏     | 1141/2700 [00:02<00:04, 372.08it/s]

Assessing:  44%|████▍     | 1189/2700 [00:02<00:03, 398.46it/s]

Assessing:  46%|████▌     | 1234/2700 [00:03<00:03, 399.27it/s]

Assessing:  48%|████▊     | 1301/2700 [00:03<00:03, 454.46it/s]

Assessing:  51%|█████     | 1379/2700 [00:03<00:02, 525.44it/s]

Assessing:  53%|█████▎    | 1441/2700 [00:03<00:02, 527.10it/s]

Assessing:  57%|█████▋    | 1526/2700 [00:03<00:01, 598.85it/s]

Assessing:  59%|█████▉    | 1587/2700 [00:03<00:02, 549.02it/s]

Assessing:  61%|██████    | 1643/2700 [00:03<00:02, 387.27it/s]

Assessing:  63%|██████▎   | 1699/2700 [00:04<00:02, 413.75it/s]

Assessing:  65%|██████▍   | 1746/2700 [00:04<00:02, 339.62it/s]

Assessing:  66%|██████▋   | 1792/2700 [00:04<00:02, 355.58it/s]

Assessing:  68%|██████▊   | 1832/2700 [00:04<00:02, 335.85it/s]

Assessing:  70%|███████   | 1891/2700 [00:04<00:02, 375.25it/s]

Assessing:  73%|███████▎  | 1961/2700 [00:04<00:01, 444.95it/s]

Assessing:  74%|███████▍  | 2009/2700 [00:04<00:01, 403.26it/s]

Assessing:  76%|███████▌  | 2053/2700 [00:04<00:01, 401.70it/s]

Assessing:  78%|███████▊  | 2098/2700 [00:05<00:01, 399.95it/s]

Assessing:  79%|███████▉  | 2144/2700 [00:05<00:01, 412.46it/s]

Assessing:  83%|████████▎ | 2237/2700 [00:05<00:00, 524.48it/s]

Assessing:  86%|████████▋ | 2329/2700 [00:05<00:00, 608.89it/s]

Assessing:  89%|████████▊ | 2391/2700 [00:05<00:00, 591.44it/s]

Assessing:  91%|█████████ | 2451/2700 [00:05<00:00, 546.22it/s]

Assessing:  93%|█████████▎| 2507/2700 [00:05<00:00, 466.78it/s]

Assessing:  95%|█████████▍| 2556/2700 [00:05<00:00, 442.59it/s]

Assessing:  96%|█████████▋| 2602/2700 [00:06<00:00, 416.21it/s]

Assessing:  98%|█████████▊| 2645/2700 [00:06<00:00, 410.95it/s]

Assessing: 100%|█████████▉| 2693/2700 [00:06<00:00, 409.21it/s]

Assessing: 100%|██████████| 2700/2700 [00:06<00:00, 428.25it/s]





Fold 1 Results:
  Queries: 255
  Retriever nDCG@10: 0.6954
  Reranker nDCG@10:  0.7692 (+0.0738)

FOLD 2/5
Training samples: 10790, Validation samples: 2720

Training reranker...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Training done. Steps: 10790, Avg loss: 0.1288

Assessing...


Assessing:   0%|          | 0/2720 [00:00<?, ?it/s]

Assessing:   1%|▏         | 37/2720 [00:00<00:08, 302.69it/s]

Assessing:   3%|▎         | 91/2720 [00:00<00:06, 404.45it/s]

Assessing:   6%|▌         | 151/2720 [00:00<00:05, 461.15it/s]

Assessing:   7%|▋         | 198/2720 [00:00<00:06, 411.19it/s]

Assessing:  10%|▉         | 263/2720 [00:00<00:05, 463.17it/s]

Assessing:  11%|█▏        | 311/2720 [00:00<00:05, 448.53it/s]

Assessing:  14%|█▍        | 381/2720 [00:00<00:04, 501.56it/s]

Assessing:  16%|█▌        | 432/2720 [00:01<00:05, 404.88it/s]

Assessing:  18%|█▊        | 476/2720 [00:01<00:05, 391.85it/s]

Assessing:  19%|█▉        | 517/2720 [00:01<00:05, 392.46it/s]

Assessing:  21%|██        | 571/2720 [00:01<00:05, 405.52it/s]

Assessing:  23%|██▎       | 619/2720 [00:01<00:05, 412.73it/s]

Assessing:  24%|██▍       | 661/2720 [00:01<00:05, 368.33it/s]

Assessing:  26%|██▌       | 699/2720 [00:01<00:05, 359.52it/s]

Assessing:  27%|██▋       | 736/2720 [00:01<00:05, 349.31it/s]

Assessing:  28%|██▊       | 772/2720 [00:01<00:05, 337.41it/s]

Assessing:  30%|███       | 827/2720 [00:02<00:05, 370.27it/s]

Assessing:  32%|███▏      | 871/2720 [00:02<00:05, 369.45it/s]

Assessing:  34%|███▎      | 914/2720 [00:02<00:04, 383.10it/s]

Assessing:  35%|███▌      | 953/2720 [00:02<00:04, 373.25it/s]

Assessing:  36%|███▋      | 991/2720 [00:02<00:05, 313.63it/s]

Assessing:  38%|███▊      | 1024/2720 [00:02<00:05, 302.89it/s]

Assessing:  39%|███▉      | 1067/2720 [00:02<00:05, 313.55it/s]

Assessing:  41%|████      | 1109/2720 [00:02<00:05, 320.72it/s]

Assessing:  43%|████▎     | 1169/2720 [00:03<00:04, 377.67it/s]

Assessing:  44%|████▍     | 1208/2720 [00:03<00:04, 358.89it/s]

Assessing:  46%|████▌     | 1245/2720 [00:03<00:04, 333.53it/s]

Assessing:  48%|████▊     | 1297/2720 [00:03<00:03, 379.52it/s]

Assessing:  50%|████▉     | 1357/2720 [00:03<00:03, 420.97it/s]

Assessing:  52%|█████▏    | 1417/2720 [00:03<00:02, 450.86it/s]

Assessing:  54%|█████▍    | 1481/2720 [00:03<00:02, 482.82it/s]

Assessing:  56%|█████▋    | 1535/2720 [00:03<00:02, 485.15it/s]

Assessing:  58%|█████▊    | 1584/2720 [00:04<00:02, 423.61it/s]

Assessing:  61%|██████    | 1660/2720 [00:04<00:02, 494.08it/s]

Assessing:  63%|██████▎   | 1712/2720 [00:04<00:02, 398.34it/s]

Assessing:  65%|██████▍   | 1756/2720 [00:04<00:02, 396.00it/s]

Assessing:  68%|██████▊   | 1847/2720 [00:04<00:01, 501.68it/s]

Assessing:  70%|███████   | 1916/2720 [00:04<00:01, 530.06it/s]

Assessing:  73%|███████▎  | 1997/2720 [00:04<00:01, 583.25it/s]

Assessing:  76%|███████▋  | 2076/2720 [00:04<00:01, 610.33it/s]

Assessing:  79%|███████▊  | 2139/2720 [00:05<00:01, 559.89it/s]

Assessing:  81%|████████  | 2207/2720 [00:05<00:00, 570.77it/s]

Assessing:  86%|████████▌ | 2336/2720 [00:05<00:00, 714.73it/s]

Assessing:  89%|████████▉ | 2427/2720 [00:05<00:00, 736.39it/s]

Assessing:  92%|█████████▏| 2502/2720 [00:05<00:00, 697.26it/s]

Assessing:  95%|█████████▍| 2573/2720 [00:05<00:00, 509.98it/s]

Assessing:  97%|█████████▋| 2631/2720 [00:05<00:00, 493.16it/s]

Assessing:  99%|█████████▉| 2689/2720 [00:06<00:00, 494.82it/s]

Assessing: 100%|██████████| 2720/2720 [00:06<00:00, 449.15it/s]





Fold 2 Results:
  Queries: 245
  Retriever nDCG@10: 0.7115
  Reranker nDCG@10:  0.7664 (+0.0549)

FOLD 3/5
Training samples: 10810, Validation samples: 2700

Training reranker...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Training done. Steps: 10810, Avg loss: 0.1224

Assessing...


Assessing:   0%|          | 0/2700 [00:00<?, ?it/s]

Assessing:   2%|▏         | 51/2700 [00:00<00:05, 443.21it/s]

Assessing:   4%|▎         | 96/2700 [00:00<00:06, 392.20it/s]

Assessing:   5%|▌         | 148/2700 [00:00<00:06, 420.99it/s]

Assessing:   7%|▋         | 191/2700 [00:00<00:06, 394.49it/s]

Assessing:   9%|▉         | 239/2700 [00:00<00:06, 396.28it/s]

Assessing:  10%|█         | 279/2700 [00:00<00:06, 350.01it/s]

Assessing:  12%|█▏        | 327/2700 [00:00<00:06, 380.66it/s]

Assessing:  14%|█▍        | 377/2700 [00:00<00:05, 388.62it/s]

Assessing:  16%|█▌        | 437/2700 [00:01<00:05, 419.20it/s]

Assessing:  18%|█▊        | 480/2700 [00:01<00:05, 378.34it/s]

Assessing:  19%|█▉        | 526/2700 [00:01<00:05, 379.82it/s]

Assessing:  21%|██        | 565/2700 [00:01<00:06, 350.73it/s]

Assessing:  22%|██▏       | 601/2700 [00:01<00:05, 351.70it/s]

Assessing:  24%|██▎       | 641/2700 [00:01<00:05, 361.44it/s]

Assessing:  25%|██▌       | 678/2700 [00:01<00:06, 292.53it/s]

Assessing:  26%|██▋       | 710/2700 [00:01<00:06, 295.63it/s]

Assessing:  27%|██▋       | 742/2700 [00:02<00:06, 290.19it/s]

Assessing:  29%|██▉       | 777/2700 [00:02<00:06, 293.16it/s]

Assessing:  30%|███       | 816/2700 [00:02<00:05, 314.79it/s]

Assessing:  32%|███▏      | 861/2700 [00:02<00:05, 341.85it/s]

Assessing:  34%|███▍      | 912/2700 [00:02<00:04, 384.18it/s]

Assessing:  37%|███▋      | 999/2700 [00:02<00:03, 493.58it/s]

Assessing:  39%|███▉      | 1056/2700 [00:02<00:03, 487.61it/s]

Assessing:  41%|████      | 1105/2700 [00:02<00:03, 442.70it/s]

Assessing:  43%|████▎     | 1150/2700 [00:03<00:03, 421.77it/s]

Assessing:  46%|████▌     | 1231/2700 [00:03<00:02, 497.58it/s]

Assessing:  47%|████▋     | 1282/2700 [00:03<00:02, 476.50it/s]

Assessing:  49%|████▉     | 1330/2700 [00:03<00:03, 430.89it/s]

Assessing:  52%|█████▏    | 1391/2700 [00:03<00:02, 475.96it/s]

Assessing:  53%|█████▎    | 1440/2700 [00:03<00:03, 404.09it/s]

Assessing:  55%|█████▍    | 1483/2700 [00:03<00:02, 409.49it/s]

Assessing:  57%|█████▋    | 1547/2700 [00:03<00:02, 443.56it/s]

Assessing:  59%|█████▉    | 1593/2700 [00:04<00:02, 442.65it/s]

Assessing:  61%|██████    | 1639/2700 [00:04<00:02, 377.16it/s]

Assessing:  62%|██████▏   | 1684/2700 [00:04<00:02, 382.38it/s]

Assessing:  64%|██████▍   | 1729/2700 [00:04<00:02, 397.46it/s]

Assessing:  66%|██████▌   | 1771/2700 [00:04<00:02, 399.17it/s]

Assessing:  69%|██████▉   | 1871/2700 [00:04<00:01, 528.32it/s]

Assessing:  72%|███████▏  | 1934/2700 [00:04<00:01, 534.11it/s]

Assessing:  74%|███████▎  | 1988/2700 [00:04<00:01, 514.41it/s]

Assessing:  76%|███████▋  | 2061/2700 [00:04<00:01, 564.80it/s]

Assessing:  79%|███████▉  | 2137/2700 [00:05<00:00, 590.75it/s]

Assessing:  83%|████████▎ | 2247/2700 [00:05<00:00, 687.98it/s]

Assessing:  86%|████████▌ | 2316/2700 [00:05<00:00, 580.53it/s]

Assessing:  88%|████████▊ | 2377/2700 [00:05<00:00, 539.34it/s]

Assessing:  90%|█████████ | 2437/2700 [00:05<00:00, 536.33it/s]

Assessing:  92%|█████████▏| 2492/2700 [00:05<00:00, 434.35it/s]

Assessing:  94%|█████████▍| 2542/2700 [00:05<00:00, 433.67it/s]

Assessing:  96%|█████████▌| 2592/2700 [00:06<00:00, 433.60it/s]

Assessing:  99%|█████████▊| 2666/2700 [00:06<00:00, 490.31it/s]

Assessing: 100%|██████████| 2700/2700 [00:06<00:00, 432.95it/s]





Fold 3 Results:
  Queries: 250
  Retriever nDCG@10: 0.6870
  Reranker nDCG@10:  0.7607 (+0.0737)

FOLD 4/5
Training samples: 10770, Validation samples: 2740

Training reranker...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Training done. Steps: 10770, Avg loss: 0.1227

Assessing...


Assessing:   0%|          | 0/2740 [00:00<?, ?it/s]

Assessing:   1%|▏         | 36/2740 [00:00<00:08, 315.86it/s]

Assessing:   4%|▍         | 107/2740 [00:00<00:05, 505.70it/s]

Assessing:   6%|▌         | 158/2740 [00:00<00:06, 385.26it/s]

Assessing:   7%|▋         | 199/2740 [00:00<00:07, 359.69it/s]

Assessing:   9%|▊         | 239/2740 [00:00<00:06, 369.16it/s]

Assessing:  10%|█         | 277/2740 [00:00<00:08, 298.69it/s]

Assessing:  11%|█▏        | 313/2740 [00:00<00:07, 312.89it/s]

Assessing:  13%|█▎        | 356/2740 [00:01<00:07, 329.94it/s]

Assessing:  14%|█▍        | 391/2740 [00:01<00:07, 298.14it/s]

Assessing:  16%|█▌        | 431/2740 [00:01<00:07, 309.81it/s]

Assessing:  17%|█▋        | 471/2740 [00:01<00:07, 320.08it/s]

Assessing:  18%|█▊        | 504/2740 [00:01<00:07, 309.29it/s]

Assessing:  20%|█▉        | 536/2740 [00:01<00:07, 301.64it/s]

Assessing:  21%|██        | 567/2740 [00:01<00:08, 257.23it/s]

Assessing:  22%|██▏       | 594/2740 [00:01<00:08, 258.13it/s]

Assessing:  23%|██▎       | 630/2740 [00:02<00:07, 281.50it/s]

Assessing:  25%|██▍       | 679/2740 [00:02<00:06, 327.89it/s]

Assessing:  26%|██▌       | 713/2740 [00:02<00:07, 285.51it/s]

Assessing:  27%|██▋       | 743/2740 [00:02<00:07, 277.07it/s]

Assessing:  28%|██▊       | 779/2740 [00:02<00:06, 283.18it/s]

Assessing:  30%|██▉       | 811/2740 [00:02<00:06, 282.17it/s]

Assessing:  31%|███       | 840/2740 [00:02<00:07, 269.84it/s]

Assessing:  32%|███▏      | 875/2740 [00:02<00:06, 276.79it/s]

Assessing:  34%|███▍      | 926/2740 [00:02<00:05, 324.81it/s]

Assessing:  35%|███▌      | 971/2740 [00:03<00:05, 345.84it/s]

Assessing:  37%|███▋      | 1011/2740 [00:03<00:04, 355.81it/s]

Assessing:  39%|███▊      | 1061/2740 [00:03<00:04, 375.29it/s]

Assessing:  40%|████      | 1109/2740 [00:03<00:04, 401.54it/s]

Assessing:  42%|████▏     | 1150/2740 [00:03<00:04, 370.47it/s]

Assessing:  43%|████▎     | 1190/2740 [00:03<00:04, 366.16it/s]

Assessing:  46%|████▌     | 1249/2740 [00:03<00:03, 409.81it/s]

Assessing:  47%|████▋     | 1291/2740 [00:03<00:03, 395.04it/s]

Assessing:  49%|████▉     | 1337/2740 [00:04<00:03, 406.51it/s]

Assessing:  50%|█████     | 1378/2740 [00:04<00:03, 404.81it/s]

Assessing:  52%|█████▏    | 1419/2740 [00:04<00:03, 368.17it/s]

Assessing:  53%|█████▎    | 1457/2740 [00:04<00:04, 299.85it/s]

Assessing:  55%|█████▍    | 1497/2740 [00:04<00:04, 308.92it/s]

Assessing:  56%|█████▌    | 1537/2740 [00:04<00:03, 308.33it/s]

Assessing:  58%|█████▊    | 1577/2740 [00:04<00:03, 324.64it/s]

Assessing:  60%|█████▉    | 1637/2740 [00:04<00:02, 392.79it/s]

Assessing:  61%|██████▏   | 1679/2740 [00:05<00:02, 398.33it/s]

Assessing:  63%|██████▎   | 1727/2740 [00:05<00:02, 411.49it/s]

Assessing:  65%|██████▍   | 1770/2740 [00:05<00:02, 378.14it/s]

Assessing:  66%|██████▌   | 1809/2740 [00:05<00:02, 374.79it/s]

Assessing:  67%|██████▋   | 1848/2740 [00:05<00:02, 372.18it/s]

Assessing:  70%|██████▉   | 1917/2740 [00:05<00:01, 454.79it/s]

Assessing:  72%|███████▏  | 1964/2740 [00:05<00:01, 412.50it/s]

Assessing:  74%|███████▎  | 2019/2740 [00:05<00:01, 444.90it/s]

Assessing:  75%|███████▌  | 2065/2740 [00:05<00:01, 417.18it/s]

Assessing:  77%|███████▋  | 2117/2740 [00:06<00:01, 424.18it/s]

Assessing:  79%|███████▉  | 2161/2740 [00:06<00:01, 388.24it/s]

Assessing:  81%|████████  | 2217/2740 [00:06<00:01, 417.79it/s]

Assessing:  83%|████████▎ | 2270/2740 [00:06<00:01, 434.09it/s]

Assessing:  84%|████████▍ | 2315/2740 [00:06<00:01, 394.19it/s]

Assessing:  90%|█████████ | 2473/2740 [00:06<00:00, 658.39it/s]

Assessing:  93%|█████████▎| 2540/2740 [00:06<00:00, 642.43it/s]

Assessing:  95%|█████████▌| 2605/2740 [00:06<00:00, 551.34it/s]

Assessing:  97%|█████████▋| 2663/2740 [00:07<00:00, 549.63it/s]

Assessing:  99%|█████████▉| 2720/2740 [00:07<00:00, 536.19it/s]

Assessing: 100%|██████████| 2740/2740 [00:07<00:00, 379.07it/s]





Fold 4 Results:
  Queries: 285
  Retriever nDCG@10: 0.6644
  Reranker nDCG@10:  0.6924 (+0.0280)

FOLD 5/5
Training samples: 10860, Validation samples: 2650

Training reranker...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Training done. Steps: 10860, Avg loss: 0.1232

Assessing...


Assessing:   0%|          | 0/2650 [00:00<?, ?it/s]

Assessing:   1%|          | 22/2650 [00:00<00:13, 193.33it/s]

Assessing:   2%|▏         | 51/2650 [00:00<00:10, 241.54it/s]

Assessing:   4%|▍         | 119/2650 [00:00<00:06, 402.53it/s]

Assessing:   7%|▋         | 189/2650 [00:00<00:05, 479.69it/s]

Assessing:   9%|▉         | 242/2650 [00:00<00:05, 476.48it/s]

Assessing:  11%|█         | 294/2650 [00:00<00:05, 468.63it/s]

Assessing:  13%|█▎        | 342/2650 [00:00<00:05, 452.65it/s]

Assessing:  15%|█▍        | 393/2650 [00:00<00:05, 446.19it/s]

Assessing:  17%|█▋        | 438/2650 [00:01<00:06, 364.27it/s]

Assessing:  18%|█▊        | 481/2650 [00:01<00:06, 358.09it/s]

Assessing:  20%|█▉        | 523/2650 [00:01<00:05, 372.63it/s]

Assessing:  21%|██▏       | 564/2650 [00:01<00:05, 375.10it/s]

Assessing:  23%|██▎       | 619/2650 [00:01<00:05, 403.15it/s]

Assessing:  25%|██▌       | 670/2650 [00:01<00:04, 415.67it/s]

Assessing:  27%|██▋       | 713/2650 [00:01<00:05, 375.69it/s]

Assessing:  28%|██▊       | 752/2650 [00:01<00:05, 342.16it/s]

Assessing:  30%|███       | 802/2650 [00:02<00:05, 369.31it/s]

Assessing:  32%|███▏      | 849/2650 [00:02<00:04, 376.03it/s]

Assessing:  34%|███▍      | 897/2650 [00:02<00:04, 397.57it/s]

Assessing:  35%|███▌      | 938/2650 [00:02<00:04, 395.92it/s]

Assessing:  37%|███▋      | 979/2650 [00:02<00:04, 338.58it/s]

Assessing:  38%|███▊      | 1015/2650 [00:02<00:04, 330.73it/s]

Assessing:  40%|███▉      | 1050/2650 [00:02<00:05, 287.90it/s]

Assessing:  42%|████▏     | 1101/2650 [00:02<00:04, 337.88it/s]

Assessing:  44%|████▍     | 1170/2650 [00:03<00:03, 407.18it/s]

Assessing:  47%|████▋     | 1242/2650 [00:03<00:02, 475.07it/s]

Assessing:  49%|████▉     | 1292/2650 [00:03<00:02, 466.40it/s]

Assessing:  51%|█████     | 1340/2650 [00:03<00:03, 390.87it/s]

Assessing:  53%|█████▎    | 1397/2650 [00:03<00:02, 428.48it/s]

Assessing:  54%|█████▍    | 1443/2650 [00:03<00:03, 378.37it/s]

Assessing:  56%|█████▋    | 1497/2650 [00:03<00:02, 401.44it/s]

Assessing:  58%|█████▊    | 1540/2650 [00:03<00:03, 365.98it/s]

Assessing:  60%|██████    | 1596/2650 [00:04<00:02, 398.65it/s]

Assessing:  62%|██████▏   | 1638/2650 [00:04<00:02, 354.14it/s]

Assessing:  64%|██████▍   | 1697/2650 [00:04<00:02, 406.32it/s]

Assessing:  67%|██████▋   | 1763/2650 [00:04<00:01, 445.32it/s]

Assessing:  72%|███████▏  | 1904/2650 [00:04<00:01, 686.71it/s]

Assessing:  75%|███████▍  | 1987/2650 [00:04<00:00, 696.61it/s]

Assessing:  78%|███████▊  | 2061/2650 [00:04<00:01, 537.66it/s]

Assessing:  80%|████████  | 2123/2650 [00:05<00:00, 555.47it/s]

Assessing:  82%|████████▏ | 2185/2650 [00:05<00:00, 549.46it/s]

Assessing:  86%|████████▌ | 2269/2650 [00:05<00:00, 595.79it/s]

Assessing:  88%|████████▊ | 2332/2650 [00:05<00:00, 568.39it/s]

Assessing:  91%|█████████ | 2408/2650 [00:05<00:00, 589.23it/s]

Assessing:  93%|█████████▎| 2469/2650 [00:05<00:00, 569.94it/s]

Assessing:  95%|█████████▌| 2528/2650 [00:05<00:00, 452.07it/s]

Assessing:  98%|█████████▊| 2591/2650 [00:05<00:00, 465.86it/s]

Assessing: 100%|██████████| 2650/2650 [00:06<00:00, 440.46it/s]





Fold 5 Results:
  Queries: 241
  Retriever nDCG@10: 0.7192
  Reranker nDCG@10:  0.7483 (+0.0291)


## 5. Aggregate Results Across Folds

In [8]:
# Compute mean and std across folds
if not fold_metrics:
    raise ValueError("fold_metrics is empty - run the cross-validation cell before aggregating")

aggregated = {}
for key in fold_metrics[0].keys():
    if key == 'n_queries':
        # Query counts are totalled across folds rather than averaged
        aggregated[key] = sum(m[key] for m in fold_metrics)
    else:
        values = [m[key] for m in fold_metrics]
        aggregated[f'{key}_mean'] = np.mean(values)
        aggregated[f'{key}_std'] = np.std(values)  # np.std default (ddof=0): population std over folds

# Table layout: every data cell is rendered as "0.0000+/-0.000" (14 chars) preceded by one space,
# so the header columns and separators are derived from the same widths to stay aligned.
metric_names = ['ndcg', 'recall', 'mrr', 'map']
cutoffs = [1, 5, 10, 20]
label_width = 14
cell_width = len("0.0000+/-0.000")
table_width = label_width + len(cutoffs) * (cell_width + 1)


def _print_metric_rows(label, prefix):
    """Print one 'mean+/-std' row per metric, reading `aggregated` keys that start with `prefix`."""
    for metric_name in metric_names:
        row = f"{label} {metric_name.upper()}".ljust(label_width)
        for k in cutoffs:
            mean = aggregated[f'{prefix}_{metric_name}@{k}_mean']
            std = aggregated[f'{prefix}_{metric_name}@{k}_std']
            row += f" {mean:.4f}+/-{std:.3f}"
        print(row)


# Print comprehensive results table
print("\n" + "="*80)
print("5-FOLD CROSS-VALIDATION RESULTS")
print(f"Model: NV-Embed-v2 + {RERANKER_MODEL_ID.split('/')[-1]} (Trial 33 config, no_evidence=True)")
print("="*80)

print(f"\nTotal queries assessed: {aggregated['n_queries']}")

header = "\n" + "Metric".ljust(label_width)
for k in cutoffs:
    header += " " + f"@{k}".rjust(cell_width)
print(header)
print("-" * table_width)

# Retriever metrics
_print_metric_rows("Ret", "ret")

print("-" * table_width)

# Reranker metrics
_print_metric_rows("Rerank", "rerank")

print("-" * table_width)

# Improvement: difference of the cross-fold means, plus the change relative to the retriever mean
print("\nImprovement (Reranker - Retriever):")
for metric_name in metric_names:
    row = f"{metric_name.upper():<12}"
    for k in cutoffs:
        ret_mean = aggregated[f'ret_{metric_name}@{k}_mean']
        rerank_mean = aggregated[f'rerank_{metric_name}@{k}_mean']
        diff = rerank_mean - ret_mean
        # Guard the relative change against a zero retriever baseline
        pct = (diff / ret_mean * 100) if ret_mean > 0 else 0
        row += f" {diff:+.4f} ({pct:+.1f}%)"
    print(row)


5-FOLD CROSS-VALIDATION RESULTS
Model: NV-Embed-v2 + jina-reranker-v3 (Trial 33 config, no_evidence=True)

Total queries assessed: 1276

Metric                    @1           @5          @10          @20
-----------------------------------------------------------------
Ret NDCG       0.5222+/-0.032 0.6689+/-0.025 0.6955+/-0.019 0.7128+/-0.017
Ret RECALL     0.4849+/-0.023 0.7952+/-0.032 0.8739+/-0.018 0.9393+/-0.018
Ret MRR        0.5222+/-0.032 0.6355+/-0.025 0.6455+/-0.023 0.6499+/-0.023
Ret MAP        0.5222+/-0.032 0.6208+/-0.025 0.6330+/-0.023 0.6382+/-0.023
-----------------------------------------------------------------
Rerank NDCG    0.5991+/-0.033 0.7238+/-0.032 0.7474+/-0.028 0.7578+/-0.025
Rerank RECALL  0.5635+/-0.030 0.8319+/-0.030 0.9008+/-0.023 0.9393+/-0.018
Rerank MRR     0.5991+/-0.033 0.6949+/-0.031 0.7038+/-0.030 0.7063+/-0.029
Rerank MAP     0.5991+/-0.033 0.6826+/-0.034 0.6936+/-0.032 0.6969+/-0.031
--------------------------------------------------------------

In [9]:
# Build a per-fold table (one row per fold, labelled "Fold 1", "Fold 2", ...)
fold_labels = [f"Fold {fold_no}" for fold_no in range(1, len(fold_metrics) + 1)]
fold_df = pd.DataFrame(fold_metrics, index=fold_labels)

# Only the headline columns are displayed: nDCG@10 / Recall@10 for both stages and the query count
key_metrics = [
    'ret_ndcg@10',
    'rerank_ndcg@10',
    'ret_recall@10',
    'rerank_recall@10',
    'n_queries',
]
print("\nPer-Fold Metrics:")
display(fold_df.loc[:, key_metrics].round(4))


Per-Fold Metrics:


Unnamed: 0,ret_ndcg@10,rerank_ndcg@10,ret_recall@10,rerank_recall@10,n_queries
Fold 1,0.6954,0.7692,0.8889,0.9281,255
Fold 2,0.7115,0.7664,0.8643,0.8942,245
Fold 3,0.687,0.7607,0.891,0.9197,250
Fold 4,0.6644,0.6924,0.8432,0.8636,285
Fold 5,0.7192,0.7483,0.8823,0.8982,241


## 6. Detailed Results for Investigation

In [10]:
# Convert to DataFrame for easier analysis
def _query_preview(text, max_chars=50):
    """Return `text` cut to `max_chars` characters; '...' is appended only when something was cut."""
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + '...'


# One summary row per query; the full rankings stay in all_detailed_results
detailed_df = pd.DataFrame([
    {
        'fold': d['fold'],
        'post_id': d['post_id'],
        'criterion_id': d['criterion_id'],
        'query_text': _query_preview(d['query_text']),
        'n_gold': len(d['gold_uids']),
        'ret_ndcg@10': d['metrics']['ndcg@10']['retriever'],
        'rerank_ndcg@10': d['metrics']['ndcg@10']['reranker'],
        # Positive values mean the reranker beat the retriever on nDCG@10 for this query
        'improvement': d['metrics']['ndcg@10']['reranker'] - d['metrics']['ndcg@10']['retriever'],
    }
    for d in all_detailed_results
])

print(f"Total detailed results: {len(detailed_df)}")
display(detailed_df.head(10))

Total detailed results: 1276


Unnamed: 0,fold,post_id,criterion_id,query_text,n_gold,ret_ndcg@10,rerank_ndcg@10,improvement
0,1,s_993_557,A.7,Feelings of worthlessness or excessive or inap...,1,1.0,1.0,0.0
1,1,s_1404_1048,A.4,Insomnia or hypersomnia nearly every day....,1,0.63093,1.0,0.36907
2,1,s_2815_1660,A.1,"Depressed mood most of the day, nearly every d...",1,0.0,0.0,0.0
3,1,s_258_584,A.7,Feelings of worthlessness or excessive or inap...,1,0.5,1.0,0.5
4,1,s_577_1186,A.2,Markedly diminished interest or pleasure in al...,1,1.0,1.0,0.0
5,1,s_852_229,A.4,Insomnia or hypersomnia nearly every day....,3,0.967468,1.0,0.032532
6,1,s_1551_330,A.5,Psychomotor agitation or retardation nearly ev...,1,1.0,1.0,0.0
7,1,s_2904_522,A.2,Markedly diminished interest or pleasure in al...,2,0.877215,0.919721,0.042505
8,1,s_2890_305,A.7,Feelings of worthlessness or excessive or inap...,1,0.0,0.0,0.0
9,1,s_1404_585,A.2,Markedly diminished interest or pleasure in al...,1,0.5,0.63093,0.13093


In [11]:
# Ten queries with the largest nDCG@10 gain from reranking
print("\nQueries where reranker improved most:")
top_improvements = detailed_df.nlargest(n=10, columns='improvement')
display(top_improvements)


Queries where reranker improved most:


Unnamed: 0,fold,post_id,criterion_id,query_text,n_gold,ret_ndcg@10,rerank_ndcg@10,improvement
168,1,s_1499_1113,A.9,Recurrent thoughts of death (not just fear of ...,1,0.0,1.0,1.0
931,4,s_2180_39,A.10,Special case criterion for additional clinical...,1,0.0,1.0,1.0
1104,5,s_313_493,A.2,Markedly diminished interest or pleasure in al...,1,0.0,1.0,1.0
735,3,s_1271_760,A.1,"Depressed mood most of the day, nearly every d...",1,0.289065,1.0,0.710935
246,1,s_2958_177,A.1,"Depressed mood most of the day, nearly every d...",1,0.30103,1.0,0.69897
558,3,s_1404_500,A.2,Markedly diminished interest or pleasure in al...,1,0.30103,1.0,0.69897
1204,5,s_220_84,A.10,Special case criterion for additional clinical...,1,0.30103,1.0,0.69897
173,1,s_2415_1024,A.1,"Depressed mood most of the day, nearly every d...",1,0.315465,1.0,0.684535
360,2,s_2953_602,A.1,"Depressed mood most of the day, nearly every d...",1,0.315465,1.0,0.684535
645,3,s_2625_1,A.5,Psychomotor agitation or retardation nearly ev...,1,0.315465,1.0,0.684535


In [12]:
# Ten queries with the largest nDCG@10 loss from reranking
print("\nQueries where reranker hurt most:")
worst_regressions = detailed_df.nsmallest(n=10, columns='improvement')
display(worst_regressions)


Queries where reranker hurt most:


Unnamed: 0,fold,post_id,criterion_id,query_text,n_gold,ret_ndcg@10,rerank_ndcg@10,improvement
848,4,s_1551_30,A.8,"Diminished ability to think or concentrate, or...",1,1.0,0.333333,-0.666667
1080,5,s_1404_890,A.6,Fatigue or loss of energy nearly every day....,1,1.0,0.333333,-0.666667
522,3,s_2350_66,A.7,Feelings of worthlessness or excessive or inap...,1,1.0,0.356207,-0.643793
1096,5,s_2023_576,A.8,"Diminished ability to think or concentrate, or...",1,1.0,0.356207,-0.643793
249,1,s_1328_30,A.3,Significant weight loss when not dieting or we...,1,1.0,0.386853,-0.613147
386,2,s_356_186,A.10,Special case criterion for additional clinical...,1,1.0,0.386853,-0.613147
903,4,s_1072_1187,A.1,"Depressed mood most of the day, nearly every d...",1,1.0,0.386853,-0.613147
987,4,s_3040_931,A.7,Feelings of worthlessness or excessive or inap...,1,1.0,0.386853,-0.613147
1040,5,s_2886_101,A.6,Fatigue or loss of energy nearly every day....,1,1.0,0.386853,-0.613147
90,1,s_2593_426,A.10,Special case criterion for additional clinical...,1,1.0,0.430677,-0.569323


In [13]:
# Examine a specific query in detail
def show_query_detail(query_idx, top_k=10):
    """Show detailed retriever vs reranker comparison for a specific query.

    Prints the query's identifiers and gold UIDs, the first `top_k` candidates of
    the retriever ranking and of the reranker ranking, and the nDCG@1/5/10 of both
    stages with their difference.

    Args:
        query_idx: Index of the record to show in the notebook-level
            `all_detailed_results` list.
        top_k: How many leading candidates to list per ranking. Defaults to 10,
            the cutoff that was previously hard-coded.

    Returns:
        None. Everything is written to stdout.
    """
    d = all_detailed_results[query_idx]

    def _print_ranking(title, candidates, score_key):
        """Print the first `top_k` candidates: rank, gold marker, UID, score, then the text."""
        print(title)
        print("-" * 80)
        for i, c in enumerate(candidates[:top_k], 1):
            # "[+]" flags candidates whose 'is_positive' entry is truthy
            marker = "[+]" if c['is_positive'] else "[ ]"
            print(f"{i:2d}. {marker} {c['sent_uid']} (score: {c[score_key]:.4f})")
            print(f"     {c['text']}")

    print(f"Post ID: {d['post_id']}")
    print(f"Criterion: {d['criterion_id']}")
    print(f"Query: {d['query_text']}")
    print(f"Gold UIDs: {d['gold_uids']}")
    print()

    # The two rankings keep their scores under different keys
    _print_ranking("RETRIEVER RANKING:", d['retriever_ranking'], 'score')

    print()
    _print_ranking("RERANKER RANKING:", d['reranker_ranking'], 'reranker_score')

    print()
    print("METRICS:")
    for k in [1, 5, 10]:
        ret = d['metrics'][f'ndcg@{k}']['retriever']
        rerank = d['metrics'][f'ndcg@{k}']['reranker']
        print(f"  nDCG@{k}: Retriever={ret:.4f}, Reranker={rerank:.4f}, Delta={rerank-ret:+.4f}")

In [14]:
# Drill into the single query with the largest nDCG@10 gain.
# idxmax() returns the row label, which is then used to index all_detailed_results.
best_idx = detailed_df.loc[:, 'improvement'].idxmax()
print("QUERY WITH BEST RERANKER IMPROVEMENT:", "=" * 80, sep="\n")
show_query_detail(best_idx)

QUERY WITH BEST RERANKER IMPROVEMENT:
Post ID: s_1499_1113
Criterion: A.9
Query: Recurrent thoughts of death (not just fear of dying), recurrent suicidal ideation without a specific plan, or a suicide attempt or a specific plan for committing suicide.
Gold UIDs: ['s_1499_1113_24']

RETRIEVER RANKING:
--------------------------------------------------------------------------------
 1. [ ] s_1499_1113_0 (score: 0.2029)
     I just feel like absolute shit.
 2. [ ] s_1499_1113_1 (score: 0.1974)
     I've been depressed for as long as I can remember - even as early as Elementary school I've shown sy
 3. [ ] s_1499_1113_38 (score: 0.1945)
     I just feel like I'm spending my life waiting to get better when that may never happen, and what the
 4. [ ] s_1499_1113_12 (score: 0.1673)
     And that's why my depression is that much worse.
 5. [ ] s_1499_1113_35 (score: 0.1669)
     But I procrastinate and I feel like, what, going and talking to someone and taking pills is going to
 6. [ ] s_1499_

In [15]:
# Drill into the single query with the largest nDCG@10 loss.
# idxmin() returns the row label, which is then used to index all_detailed_results.
worst_idx = detailed_df.loc[:, 'improvement'].idxmin()
print("QUERY WITH WORST RERANKER REGRESSION:", "=" * 80, sep="\n")
show_query_detail(worst_idx)

QUERY WITH WORST RERANKER REGRESSION:
Post ID: s_1551_30
Criterion: A.8
Query: Diminished ability to think or concentrate, or indecisiveness, nearly every day (either by subjective account or as observed by others).
Gold UIDs: ['s_1551_30_9']

RETRIEVER RANKING:
--------------------------------------------------------------------------------
 1. [+] s_1551_30_9 (score: 0.2382)
     Incapable of asking for help.
 2. [ ] s_1551_30_2 (score: 0.2184)
     Back then I didnt care or think about it much, I felt expected to work but had no drive or motivatio
 3. [ ] s_1551_30_11 (score: 0.2120)
     I feel like Ive missed out on so much, most of my youth feels wasted and I find myself feeling hopel
 4. [ ] s_1551_30_0 (score: 0.2113)
     Depression and injuries kept me from working for years, worried about my future Im 22, past couple y
 5. [ ] s_1551_30_8 (score: 0.2093)
     Wish I didnt but I did, I was alone and incapable of making the right decisions.
 6. [ ] s_1551_30_10 (score: 0.2063)

## 7. Save Results

In [16]:
# Cross-fold aggregated metrics (means, stds and total query count)
with (OUTPUT_DIR / "aggregated_metrics.json").open("w") as agg_file:
    json.dump(aggregated, agg_file, indent=2)

# Per-fold metrics table, written with its "Fold N" index
fold_df.to_csv(OUTPUT_DIR / "fold_metrics.csv")

# Per-query summary table, written without the index
detailed_df.to_csv(OUTPUT_DIR / "detailed_results.csv", index=False)

# Complete per-query records, rankings included
with (OUTPUT_DIR / "full_detailed_results.json").open("w") as full_file:
    json.dump(all_detailed_results, full_file, indent=2)

# List what was written, in the order it was saved
print(f"Results saved to: {OUTPUT_DIR}")
for saved_name in (
    "aggregated_metrics.json",
    "fold_metrics.csv",
    "detailed_results.csv",
    "full_detailed_results.json",
):
    print(f"  - {saved_name}")

Results saved to: ../outputs/5fold_results
  - aggregated_metrics.json
  - fold_metrics.csv
  - detailed_results.csv
  - full_detailed_results.json


## 8. Analysis Summary

In [17]:
# Summary statistics
print("\n" + "="*80)
print("ANALYSIS SUMMARY")
print("="*80)

ret_ndcg10 = aggregated['ret_ndcg@10_mean']
rerank_ndcg10 = aggregated['rerank_ndcg@10_mean']

print("\n1. Overall Performance:")
print(f"   - Retriever nDCG@10: {ret_ndcg10:.4f} +/- {aggregated['ret_ndcg@10_std']:.4f}")
print(f"   - Reranker nDCG@10:  {rerank_ndcg10:.4f} +/- {aggregated['rerank_ndcg@10_std']:.4f}")
improvement = rerank_ndcg10 - ret_ndcg10
# Same zero-baseline guard as the aggregate table's relative-change column
pct_improvement = (improvement / ret_ndcg10 * 100) if ret_ndcg10 > 0 else 0
# The sign comes from the format spec, so a regression prints "-0.0123" rather than "+-0.0123"
print(f"   - Improvement: {improvement:+.4f} ({pct_improvement:+.1f}%)")

print("\n2. Query Analysis:")
improved = (detailed_df['improvement'] > 0).sum()
unchanged = (detailed_df['improvement'] == 0).sum()
regressed = (detailed_df['improvement'] < 0).sum()
n_detailed = len(detailed_df)
for outcome_label, outcome_count in (
    ("Improved", improved),
    ("Unchanged", unchanged),
    ("Regressed", regressed),
):
    # Report 0.0% instead of dividing by zero when there are no per-query rows
    share = (outcome_count / n_detailed * 100) if n_detailed else 0.0
    print(f"   - {outcome_label}: {outcome_count} ({share:.1f}%)")

print("\n3. Best Config (Trial 33):")
print("   - Loss: BCE + Pairwise-Softplus + Lambda")
print(f"   - Weights: w_point={BEST_PARAMS['w_point']:.3f}, w_pair={BEST_PARAMS['w_pair']:.3f}, w_list={BEST_PARAMS['w_list']:.3f}")
print(f"   - Learning rate: {BEST_PARAMS['learning_rate']:.2e}")
print(f"   - LoRA: r={BEST_PARAMS['lora_r']}, alpha={BEST_PARAMS['lora_alpha']}")

print("\n4. Next Steps:")
print("   - Investigate queries where reranker regressed")
print("   - Analyze error patterns by criterion type")
print("   - Consider per-criterion fine-tuning")


ANALYSIS SUMMARY

1. Overall Performance:
   - Retriever nDCG@10: 0.6955 +/- 0.0193
   - Reranker nDCG@10:  0.7474 +/- 0.0284
   - Improvement: +0.0519 (+7.5%)

2. Query Analysis:
   - Improved: 366 (28.7%)
   - Unchanged: 716 (56.1%)
   - Regressed: 194 (15.2%)

3. Best Config (Trial 33):
   - Loss: BCE + Pairwise-Softplus + Lambda
   - Weights: w_point=0.814, w_pair=1.840, w_list=1.076
   - Learning rate: 4.45e-05
   - LoRA: r=16, alpha=16

4. Next Steps:
   - Investigate queries where reranker regressed
   - Analyze error patterns by criterion type
   - Consider per-criterion fine-tuning
