In [1]:
import requests
import csv
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.metrics import ndcg_score
import matplotlib.pyplot as plt
import time

# Configuration
EMBEDDING_SERVICE_URL = "http://localhost:8004"
DATASET = "argsme"
TOP_K = 1000


In [2]:
def calculate_ir_metrics_corrected(gt, pred_docs, pred_scores, top_k):
    """
    Calculate Information Retrieval metrics CORRECTLY.
    
    This replaces the flawed sklearn.average_precision_score usage
    with proper rank-based IR metrics.
    
    Args:
        gt: Ground truth relevant documents {doc_id: relevance_score}
        pred_docs: List of predicted document IDs in ranked order
        pred_scores: List of prediction scores (same order as pred_docs)
        top_k: Number of documents to consider
    
    Returns:
        Dictionary of metrics: ap, precision, recall, ndcg
    """
    if not gt or not pred_docs:
        return {'ap': 0.0, 'precision': 0.0, 'recall': 0.0, 'ndcg': 0.0}
    
    # Limit to top_k
    pred_docs = pred_docs[:top_k]
    pred_scores = pred_scores[:top_k]
    
    # Calculate hits (binary relevance)
    hits = [1 if doc in gt else 0 for doc in pred_docs]
    
    # Calculate Average Precision (AP) using rank-based method
    # This is the CORRECT way for IR evaluation
    ap = 0.0
    num_relevant_retrieved = 0
    for i, hit in enumerate(hits):
        if hit:
            num_relevant_retrieved += 1
            precision_at_i = num_relevant_retrieved / (i + 1)
            ap += precision_at_i
    
    if num_relevant_retrieved > 0:
        ap /= len(gt)  # Normalize by total relevant documents
    
    # Calculate Precision@k
    precision = sum(hits) / top_k
    
    # Calculate Recall@k
    recall = sum(hits) / len(gt) if len(gt) > 0 else 0.0
    
    # Calculate NDCG@k using graded relevance
    true_rels = [gt.get(doc, 0) for doc in pred_docs]
    
    if sum(true_rels) > 0:
        # Create ideal ranking for NDCG calculation
        ideal_rels = sorted(gt.values(), reverse=True)[:top_k]
        if len(ideal_rels) < top_k:
            ideal_rels.extend([0] * (top_k - len(ideal_rels)))
        
        ndcg = ndcg_score([ideal_rels], [true_rels])
    else:
        ndcg = 0.0
    
    return {
        'ap': ap,
        'precision': precision,
        'recall': recall,
        'ndcg': ndcg
    }


In [3]:
def load_queries(path):
    """Load queries from TSV file"""
    queries = []
    with open(path, encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            queries.append({"query_id": row["query_id"], "query": row["processed_text"]})
    return queries

def load_qrels(path):
    """Load qrels from TSV file"""
    qrels = defaultdict(dict)
    with open(path, encoding="utf-8") as f:
        next(f)  # Skip header
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            qid, docid, rel = parts[:3]
            qrels[qid][docid] = int(rel)
    return qrels

def search_vector_store(query, top_k=TOP_K):
    """Search using vector store method (embedding + vector store)"""
    # Step 1: Get embedding
    embed_response = requests.post(f"{EMBEDDING_SERVICE_URL}/embed", json={"text": query})
    embed_response.raise_for_status()
    embedding = embed_response.json()["embedding"]
    
    # Step 2: Search with vector store
    VECTOR_STORE_URL = "http://localhost:8007/search"
    search_payload = {
        "dataset": DATASET,
        "query_vector": embedding,
        "top_k": top_k,
        "index_type": "auto"
    }
    search_response = requests.post(VECTOR_STORE_URL, json=search_payload)
    search_response.raise_for_status()
    return search_response.json()["results"]


In [4]:
# Load data and run CORRECTED evaluation with VECTOR STORE
queries_path = "data/vectors/argsme/processed/ARGSME_cleaned_queries.tsv"
qrels_path = "data/vectors/argsme/qrels.tsv"

queries = load_queries(queries_path)
qrels = load_qrels(qrels_path)

print(f"Loaded {len(queries)} queries and {len(qrels)} qrels")

# Limit to first 20 queries for testing
queries = queries[:20]
print(f"Evaluating first {len(queries)} queries...")

# Evaluate vector store with CORRECTED metrics
print("\nVECTOR STORE EVALUATION - CORRECTED METRICS")
print("=" * 60)

metrics = {"AP": [], "Precision": [], "Recall": [], "NDCG": []}
failed_queries = []
total_time = 0

for q in tqdm(queries, desc="Evaluating queries"):
    qid = q["query_id"]
    query_text = q["query"]
    gt = qrels.get(qid, {})
    
    if not gt:
        continue
        
    print(f"\nQuery ID: {qid}")
    print(f"Query: {query_text[:100]}...")
    print(f"Relevant docs: {len(gt)}")
    
    try:
        start_time = time.time()
        results = search_vector_store(query_text, TOP_K)
        search_time = time.time() - start_time
        total_time += search_time
        
        # Vector store returns [(doc_id, score), ...]
        pred_docs = [str(r[0]) for r in results]
        pred_scores = [r[1] for r in results]
        
        # Calculate CORRECTED metrics
        metrics_result = calculate_ir_metrics_corrected(gt, pred_docs, pred_scores, TOP_K)
        
        metrics["AP"].append(metrics_result['ap'])
        metrics["Precision"].append(metrics_result['precision'])
        metrics["Recall"].append(metrics_result['recall'])
        metrics["NDCG"].append(metrics_result['ndcg'])
        
        print(f"   Search time: {search_time:.3f}s")
        print(f"   Precision@{TOP_K}: {metrics_result['precision']:.4f}")
        print(f"   Recall@{TOP_K}: {metrics_result['recall']:.4f}")
        print(f"   AP: {metrics_result['ap']:.4f}")
        print(f"   NDCG@{TOP_K}: {metrics_result['ndcg']:.4f}")
        print(f"   Top 5 Retrieved: {pred_docs[:5]}")
        print(f"   Relevant Retrieved: {[doc for doc in pred_docs[:5] if doc in gt]}")
        
    except Exception as e:
        print(f"   Error: {e}")
        metrics["AP"].append(0)
        metrics["Precision"].append(0)
        metrics["Recall"].append(0)
        metrics["NDCG"].append(0)
        failed_queries.append(qid)


Loaded 50 queries and 50 qrels
Evaluating first 20 queries...

VECTOR STORE EVALUATION - CORRECTED METRICS


Evaluating queries:   0%|          | 0/20 [00:00<?, ?it/s]


Query ID: 1
Query: teacher get tenure user heard country give teacher tenure others dont interested reason tenure user ...
Relevant docs: 23
   Search time: 4.240s
   Precision@1000: 0.0170
   Recall@1000: 0.7391
   AP: 0.4370
   NDCG@1000: 0.7505
   Top 5 Retrieved: ['Sc065954f-A6deb09b6', 'S51530f3f-A6ac2dcba', 'S51530f3f-Ac5b10bae', 'S51530f3f-Ae32a4a1b', 'Sb0680508-A304d661e']
   Relevant Retrieved: ['Sc065954f-A6deb09b6', 'S51530f3f-Ae32a4a1b', 'Sb0680508-A304d661e']

Query ID: 2
Query: vaping ecigarettes safe consider switch smoking vaping user wonder extent vaping safer new risk may ...
Relevant docs: 45
   Search time: 4.224s
   Precision@1000: 0.0240
   Recall@1000: 0.5333
   AP: 0.0866
   NDCG@1000: 0.5164
   Top 5 Retrieved: ['Sfa7e9c9a-A5ce266e9', 'Se435a482-A84859719', 'S65de0e0f-A9228bfd0', 'Sb83fa829-Aac67a0a6', 'Sa5657fa3-A59222b81']
   Relevant Retrieved: ['Se435a482-A84859719', 'S65de0e0f-A9228bfd0', 'Sb83fa829-Aac67a0a6']

Query ID: 3
Query: insider trading allow si

In [5]:
# Calculate and display final results
avg_metrics = {
    "MAP": np.mean(metrics["AP"]),
    "Precision": np.mean(metrics["Precision"]),
    "Recall": np.mean(metrics["Recall"]),
    "NDCG": np.mean(metrics["NDCG"]),
    "AvgTime": total_time / len(queries) if queries else 0
}

print(f"\nVECTOR STORE - CORRECTED FINAL RESULTS:")
print("=" * 60)
print(f"   MAP: {avg_metrics['MAP']:.4f}")
print(f"   Precision@{TOP_K}: {avg_metrics['Precision']:.4f}")
print(f"   Recall@{TOP_K}: {avg_metrics['Recall']:.4f}")
print(f"   NDCG@{TOP_K}: {avg_metrics['NDCG']:.4f}")
print(f"   Average Time: {avg_metrics['AvgTime']:.3f}s")

if failed_queries:
    print(f"   Failed queries: {len(failed_queries)}")
    print(f"   Failed IDs: {failed_queries}")

# Comparison note
print("\n" + "=" * 60)
print("COMPARISON: CORRECTED vs ORIGINAL FLAWED METRICS")
print("=" * 60)
print("\nIMPORTANT NOTE:")
print("Previous evaluations likely showed much lower scores")
print("due to incorrect use of sklearn.average_precision_score.")
print("")
print("These CORRECTED metrics show true IR performance.")
print("")
print("Action Required: Update all other evaluation notebooks")
print("with this corrected calculate_ir_metrics_corrected() function.")



VECTOR STORE - CORRECTED FINAL RESULTS:
   MAP: 0.1448
   Precision@1000: 0.0338
   Recall@1000: 0.5327
   NDCG@1000: 0.5306
   Average Time: 4.211s

COMPARISON: CORRECTED vs ORIGINAL FLAWED METRICS

IMPORTANT NOTE:
Previous evaluations likely showed much lower scores
due to incorrect use of sklearn.average_precision_score.

These CORRECTED metrics show true IR performance.

Action Required: Update all other evaluation notebooks
with this corrected calculate_ir_metrics_corrected() function.
