# Agentic Search - Evaluation Notebook

This notebook evaluates the Agentic Search system using IR metrics.

## INFO 624: Intelligent Search and Language Models

### Evaluation Metrics (Week 8 Concepts):
- Precision@k
- Recall@k
- Mean Reciprocal Rank (MRR)
- Answer Quality (RAGAS-style)

In [None]:
# Setup
# Prepend the parent directory to the module search path; the `src.*`
# imports in later cells presumably rely on this. The path is relative,
# so it depends on the kernel's working directory.
import sys
sys.path.insert(0, '..')

# Load environment variables from the .env file one directory up
from dotenv import load_dotenv
load_dotenv('../.env')

# Standard-library helpers used by the evaluation cells
import asyncio
import time
from typing import List, Dict
import json

## 1. Evaluation Metrics Implementation

In [None]:
def precision_at_k(relevant: List[int], retrieved: List[int], k: int) -> float:
    """
    Calculate Precision@k for a single ranked result list.

    Precision@k = (# relevant in top k) / k

    The denominator is always ``k``, even when fewer than ``k`` documents
    were retrieved, so a short result list is penalised. An ID that appears
    more than once in the top k is counted once.

    Args:
        relevant: IDs of the documents judged relevant.
        retrieved: Document IDs in ranked order, best first.
        k: Cut-off rank.

    Returns:
        Precision in [0.0, 1.0]; 0.0 when ``k`` is zero or negative.
    """
    # A non-positive cut-off selects nothing. Without this guard a negative k
    # would slice from the end of the list and produce a negative score.
    if k <= 0:
        return 0.0

    top_k = retrieved[:k]
    relevant_in_k = len(set(top_k) & set(relevant))
    return relevant_in_k / k


def recall_at_k(relevant: List[int], retrieved: List[int], k: int) -> float:
    """
    Calculate Recall@k for a single ranked result list.

    Recall@k = (# distinct relevant in top k) / (# distinct relevant)

    Args:
        relevant: IDs of the documents judged relevant. Duplicate IDs are
            collapsed so they cannot inflate the denominator.
        retrieved: Document IDs in ranked order, best first.
        k: Cut-off rank.

    Returns:
        Recall in [0.0, 1.0]; 0.0 when there are no relevant documents or
        when ``k`` is zero or negative.
    """
    # Both numerator and denominator work on distinct IDs; mixing a set-based
    # hit count with len(relevant) would cap recall below 1.0 on duplicates.
    relevant_ids = set(relevant)

    # A negative k would otherwise slice from the end of the ranking.
    if not relevant_ids or k <= 0:
        return 0.0

    hits = relevant_ids.intersection(retrieved[:k])
    return len(hits) / len(relevant_ids)


def mean_reciprocal_rank(relevant: List[int], retrieved: List[int]) -> float:
    """
    Reciprocal rank of the first relevant hit in one ranked list.

    RR = 1 / (1-based rank of the earliest retrieved ID found in `relevant`)

    Returns 0.0 when none of the retrieved IDs is relevant. The function
    scores a single ranking; it does not average over several queries.
    """
    reciprocal_ranks = (
        1.0 / rank
        for rank, candidate in enumerate(retrieved, start=1)
        if candidate in relevant
    )
    # The generator is lazy, so scanning stops at the first relevant hit.
    return next(reciprocal_ranks, 0.0)


def average_precision(relevant: List[int], retrieved: List[int]) -> float:
    """
    Calculate Average Precision for a single ranked result list.

    AP = sum(P@k * rel(k)) / |distinct relevant|

    Each relevant document is credited only at its first occurrence in the
    ranking. A repeated ID still occupies a rank position but earns no
    further credit, which keeps the result within [0.0, 1.0].

    Args:
        relevant: IDs of the documents judged relevant. Duplicate IDs are
            collapsed before counting.
        retrieved: Document IDs in ranked order, best first.

    Returns:
        Average precision in [0.0, 1.0]; 0.0 when there are no relevant
        documents.
    """
    relevant_ids = set(relevant)
    if not relevant_ids:
        return 0.0

    score = 0.0
    credited = set()  # relevant IDs that have already contributed

    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id in relevant_ids and doc_id not in credited:
            credited.add(doc_id)
            # Precision at this rank = distinct relevant hits so far / rank
            score += len(credited) / rank

    return score / len(relevant_ids)


# Smoke-test the metrics on a small hand-made ranking
relevant = [1, 3, 5, 7]
retrieved = list(range(1, 11))

p_at_5 = precision_at_k(relevant, retrieved, 5)
r_at_5 = recall_at_k(relevant, retrieved, 5)
reciprocal_rank = mean_reciprocal_rank(relevant, retrieved)
avg_precision = average_precision(relevant, retrieved)

print(
    f"Precision@5: {p_at_5:.3f}",
    f"Recall@5: {r_at_5:.3f}",
    f"MRR: {reciprocal_rank:.3f}",
    f"AP: {avg_precision:.3f}",
    sep="\n",
)

## 2. Answer Quality Evaluation

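This section implements RAGAS-style answer-quality metrics: an LLM judge scores each answer's faithfulness to the supplied context and its relevance to the query, each on a 0–1 scale.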

In [None]:
from langchain_openai import ChatOpenAI
from src.utils.config import get_settings

# Load the project settings once for this notebook; the evaluator functions
# defined in this cell read `openai_model` and `openai_api_key` from it.
settings = get_settings()

async def evaluate_faithfulness(answer: str, context: str) -> float:
    """
    Ask the LLM to judge whether the answer is faithful to the context.

    Faithfulness = claims in answer supported by context / total claims

    The judgement is delegated entirely to the model: it is prompted to
    reply with a single number, which is then parsed.

    Args:
        answer: The generated answer to assess.
        context: The supporting text; only its first 2000 characters are
            included in the prompt.

    Returns:
        A score in [0.0, 1.0]. The neutral value 0.5 is returned when the
        reply cannot be parsed as a number or falls outside that range.
    """
    llm = ChatOpenAI(
        model=settings.openai_model,
        api_key=settings.openai_api_key,
        temperature=0,
    )

    prompt = f"""Evaluate if the following answer is faithful to the given context.
    
Context:
{context[:2000]}

Answer:
{answer}

Score the faithfulness from 0 to 1, where:
- 1.0 = All claims in the answer are supported by the context
- 0.5 = Some claims are supported, some are not
- 0.0 = No claims are supported by the context

Respond with just a number between 0 and 1."""

    response = await llm.ainvoke(prompt)

    # Catch only parse failures: ValueError for non-numeric text, and
    # AttributeError/TypeError if the content is not a plain string.
    # A bare `except:` would also swallow KeyboardInterrupt and hide bugs.
    try:
        score = float(response.content.strip())
    except (AttributeError, TypeError, ValueError):
        return 0.5

    # Treat an out-of-range reply like any other malformed one. The chained
    # comparison is also False for NaN, so NaN falls back to neutral too.
    if not 0.0 <= score <= 1.0:
        return 0.5
    return score


async def evaluate_relevance(answer: str, query: str) -> float:
    """
    Ask the LLM to judge whether the answer is relevant to the query.

    The judgement is delegated entirely to the model: it is prompted to
    reply with a single number, which is then parsed.

    Args:
        answer: The generated answer to assess.
        query: The user query the answer is supposed to address.

    Returns:
        A score in [0.0, 1.0]. The neutral value 0.5 is returned when the
        reply cannot be parsed as a number or falls outside that range.
    """
    llm = ChatOpenAI(
        model=settings.openai_model,
        api_key=settings.openai_api_key,
        temperature=0,
    )

    prompt = f"""Evaluate if the following answer is relevant to the query.
    
Query: {query}

Answer:
{answer}

Score the relevance from 0 to 1, where:
- 1.0 = The answer directly and completely addresses the query
- 0.5 = The answer partially addresses the query
- 0.0 = The answer is not relevant to the query

Respond with just a number between 0 and 1."""

    response = await llm.ainvoke(prompt)

    # Catch only parse failures: ValueError for non-numeric text, and
    # AttributeError/TypeError if the content is not a plain string.
    # A bare `except:` would also swallow KeyboardInterrupt and hide bugs.
    try:
        score = float(response.content.strip())
    except (AttributeError, TypeError, ValueError):
        return 0.5

    # Treat an out-of-range reply like any other malformed one. The chained
    # comparison is also False for NaN, so NaN falls back to neutral too.
    if not 0.0 <= score <= 1.0:
        return 0.5
    return score

## 3. Benchmark Test Queries

In [None]:
# Benchmark queries, each tagged with its expected query type and the
# sources it is expected to draw on
test_queries = [
    dict(
        query="What is RAG in AI?",
        type="simple",
        expected_sources=["web", "academic"],
    ),
    dict(
        query="Compare BM25 and dense retrieval for question answering",
        type="complex",
        expected_sources=["academic", "web"],
    ),
    dict(
        query="How does BERT improve search ranking compared to TF-IDF, and what are the computational tradeoffs?",
        type="multi_hop",
        expected_sources=["academic", "web"],
    ),
]

query_total = len(test_queries)
print(f"Prepared {query_total} test queries")

In [None]:
from src.agent import run_search

async def run_evaluation(queries: List[Dict]) -> List[Dict]:
    """Run each test query through the search agent and collect run statistics.

    Queries are executed one after another. For every query a progress line
    is printed before the run and a one-line summary after it.

    Args:
        queries: Test cases; each must provide a 'query' string and a 'type'
            label (the expected query type).

    Returns:
        One dict per query with the keys 'query', 'expected_type',
        'actual_type', 'quality_score', 'iterations', 'elapsed_seconds',
        'web_results', 'vector_results', 'arxiv_results' and 'answer_length'.
    """
    results = []

    for q in queries:
        print(f"\nEvaluating: {q['query'][:50]}...")

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = await run_search(q['query'])
        elapsed = time.perf_counter() - start_time

        # Extract metrics. `or` fallbacks are used instead of .get() defaults
        # because a key that is present but None would bypass the default and
        # break len() / numeric formatting below.
        answer = result.get('final_answer') or result.get('draft_answer') or ''
        quality = result.get('overall_quality') or 0
        iterations = result.get('iteration_count') or 0
        query_type = result.get('query_type') or 'unknown'

        # Count results by source
        web_count = len(result.get('web_results') or [])
        vector_count = len(result.get('vector_results') or [])
        arxiv_count = len(result.get('arxiv_results') or [])

        results.append({
            'query': q['query'],
            'expected_type': q['type'],
            'actual_type': query_type,
            'quality_score': quality,
            'iterations': iterations,
            'elapsed_seconds': elapsed,
            'web_results': web_count,
            'vector_results': vector_count,
            'arxiv_results': arxiv_count,
            'answer_length': len(answer),
        })

        # Print the same type value that was recorded, so the log line and
        # the returned record cannot disagree.
        print(f"  Type: {query_type} | Quality: {quality:.2f} | Time: {elapsed:.1f}s")

    return results

# Run evaluation (uncomment to execute)
# eval_results = await run_evaluation(test_queries)

## 4. Results Analysis

In [None]:
import pandas as pd

# Hard-coded illustrative figures for demonstration (not from a live run)
sample_results = [
    dict(query='What is RAG?', quality_score=0.85, elapsed_seconds=5.2, iterations=1),
    dict(query='Compare BM25 and dense retrieval', quality_score=0.78, elapsed_seconds=8.1, iterations=2),
    dict(query='BERT vs TF-IDF tradeoffs', quality_score=0.72, elapsed_seconds=12.3, iterations=2),
]

df = pd.DataFrame(sample_results)

# Column means reported in the summary
avg_quality, avg_seconds, avg_iterations = (
    df[column].mean()
    for column in ('quality_score', 'elapsed_seconds', 'iterations')
)

print(
    "Evaluation Summary:",
    f"  Average Quality: {avg_quality:.2f}",
    f"  Average Time: {avg_seconds:.1f}s",
    f"  Average Iterations: {avg_iterations:.1f}",
    sep="\n",
)

## 5. Comparison: Classical vs Neural Retrieval

This section illustrates course concepts with a conceptual comparison of retrieval methods; the table is a qualitative summary, not a measured result.

In [None]:
# Qualitative, hand-written comparison of retrieval methods (one list per column)
comparison = dict(
    Method=['TF-IDF', 'BM25', 'Dense (BERT)', 'Hybrid'],
    Type=['Sparse', 'Sparse', 'Dense', 'Both'],
    Semantic=['No', 'No', 'Yes', 'Yes'],
    Speed=['Fast', 'Fast', 'Slower', 'Medium'],
    Week=['4', '6', '5', '11'],
)

df_comparison = pd.DataFrame(comparison)
table_text = df_comparison.to_string(index=False)
print(
    "\nRetrieval Methods Comparison (Course Alignment):",
    table_text,
    sep="\n",
)