# TF-IDF Evaluation Notebook

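This notebook evaluates the retrieval quality of the TF-IDF system by sending each dataset query to the running TF-IDF query service and scoring the ranked results against the dataset's relevance judgments.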

## Metrics Calculated:
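- **Mean Average Precision (MAP)**: Mean of the per-query average precision (AP) over all evaluated queries
- **Mean Reciprocal Rank (MRR)**: Mean, over all evaluated queries, of the reciprocal rank of the first relevant document
- **Precision@10**: Fraction of the top 10 retrieved documents that are relevant
- **Recall@10**: Fraction of all relevant documents that appear in the top 10 retrieved documents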

## Prerequisites:
- TF-IDF query service running on port 8004
- Enhanced text cleaning service running on port 8003
- Pre-trained TF-IDF models in the models directory

In [None]:
# Import required libraries
import asyncio
import logging
import time
import ir_datasets
import numpy as np
import httpx
import json
from typing import List, Dict, Any
from tqdm import tqdm

# Import evaluation engine
from evaluation_engine import IRMetrics, SearchEvaluator

# Setup logging
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Module-level logger; the helper functions defined in later cells log
# progress, warnings and errors through it.
logger = logging.getLogger(__name__)

print("✅ Libraries imported successfully")

In [None]:
# Configuration
# Both settings can be overridden with an environment variable of the same
# name, so the notebook can run on another machine without editing this cell.
# When the variable is unset, the original hard-coded value is used.
import os

# A trailing slash is stripped so that appending "/health" or "/search" never
# produces a double slash.
TFIDF_QUERY_SERVICE_URL = os.environ.get(
    "TFIDF_QUERY_SERVICE_URL", "http://localhost:8004"
).rstrip("/")
MODEL_BASE_PATH = os.environ.get(
    "MODEL_BASE_PATH",
    "/Users/raafatmhanna/Desktop/custom-search-engine/backend/models",
)

print(f"🔧 TF-IDF Service URL: {TFIDF_QUERY_SERVICE_URL}")
print(f"📁 Model Base Path: {MODEL_BASE_PATH}")

In [None]:
# Check service availability
async def check_service_status():
    """Return True when the TF-IDF query service answers its health check.

    Sends a GET request to ``{TFIDF_QUERY_SERVICE_URL}/health``.

    Returns:
        bool: True if the response status code is 200; False if the status
        differs or the request could not be completed.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{TFIDF_QUERY_SERVICE_URL}/health")
    except (httpx.HTTPError, httpx.InvalidURL, OSError) as exc:
        # Only request-level failures mean "service unavailable". A bare
        # ``except`` would also swallow KeyboardInterrupt and
        # asyncio.CancelledError, and would hide why the check failed.
        logging.getLogger(__name__).warning("TF-IDF health check failed: %s", exc)
        return False
    return response.status_code == 200

# Check if service is running
service_available = await check_service_status()
if service_available:
    print("✅ TF-IDF query service is available")
else:
    print("❌ TF-IDF query service is NOT available")
    print("   Please start the service with: python services/query_processing/tfidf_query_processor.py")

In [None]:
# Load ANTIQUE dataset queries and relevance judgments
def load_dataset_queries_and_qrels():
    """Read the ``antique/train`` queries and qrels through ``ir_datasets``.

    Returns:
        tuple: ``(queries, qrels)`` where ``queries`` is a list of
        ``{'query_id': ..., 'text': ...}`` dicts in dataset iteration order
        and ``qrels`` maps ``query_id -> {doc_id: relevance}``.
    """
    logger.info("Loading ANTIQUE dataset queries and qrels...")

    antique = ir_datasets.load('antique/train')

    # One plain dict per query record.
    queries = [
        {'query_id': record.query_id, 'text': record.text}
        for record in antique.queries_iter()
    ]

    # Group judgments by query; a later judgment for the same
    # (query_id, doc_id) pair overwrites an earlier one.
    qrels = {}
    for judgment in antique.qrels_iter():
        per_query = qrels.setdefault(judgment.query_id, {})
        per_query[judgment.doc_id] = judgment.relevance

    logger.info(f"Loaded {len(queries)} queries and qrels for {len(qrels)} queries")
    return queries, qrels

# Load the dataset once; `queries` and `qrels` are read by the evaluation
# cells below.
queries, qrels = load_dataset_queries_and_qrels()
print(
    f"📚 Loaded {len(queries)} queries",
    f"📊 Loaded qrels for {len(qrels)} queries",
    sep="\n",
)

In [None]:
# Function to query TF-IDF service
async def query_tfidf_service(query: str, top_k: int = 1000):
    """POST a search request to the TF-IDF service and return ranked doc IDs.

    Args:
        query: Raw query text sent to the service.
        top_k: Value sent as the request's ``top_k`` field.

    Returns:
        list: The ``doc_id`` of every entry in the response's ``results``
        list, in response order. An empty list is returned (and the error is
        logged) when the request fails or the response cannot be processed.
    """
    try:
        search_url = f"{TFIDF_QUERY_SERVICE_URL}/search"
        payload = {
            "query": query,
            "top_k": top_k,
            "similarity_threshold": 0.0,
            "use_enhanced_cleaning": True
        }
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(search_url, json=payload)
            # Non-2xx responses raise here and are handled below.
            response.raise_for_status()
            hits = response.json()["results"]

            # Keep the service's ordering: position in the list is the rank.
            ranked_ids = []
            for hit in hits:
                ranked_ids.append(hit["doc_id"])
            return ranked_ids

    except httpx.RequestError as e:
        logger.error(f"Service request error for query '{query}': {e}")
        return []
    except Exception as e:
        logger.error(f"Error processing query '{query}': {e}")
        return []

print("✅ Query function defined")

In [None]:
# Test with a sample query
if service_available:
    sample_query = "machine learning algorithms"
    print(f"🧪 Testing with sample query: '{sample_query}'...")
    
    sample_results = await query_tfidf_service(sample_query, top_k=5)
    print(f"📄 Retrieved {len(sample_results)} documents")
    
    if sample_results:
        print("✅ Sample query successful")
        print(f"   Top 3 results: {sample_results[:3]}")
    else:
        print("❌ Sample query failed")
else:
    print("⚠️ Skipping sample query test - service not available")

In [None]:
# Initialize evaluation metrics
# `metrics` provides the per-query metric functions called by
# evaluate_single_query (precision_at_k, recall_at_k, average_precision,
# reciprocal_rank).
metrics = IRMetrics()
# NOTE(review): `evaluator` is not referenced by any other cell visible in
# this notebook - confirm whether it is still needed.
evaluator = SearchEvaluator()

print("✅ Evaluation engines initialized")

In [None]:
# Function to evaluate a single query
async def evaluate_single_query(
    query: Dict[str, str],
    query_qrels: Dict[str, float],
    min_relevance: float = 3,
):
    """Evaluate a single query against the TF-IDF service and return its metrics.

    Args:
        query: Dict with ``'query_id'`` and ``'text'`` keys.
        query_qrels: Mapping ``doc_id -> graded relevance`` for this query.
        min_relevance: Lowest grade that counts as relevant for the binary
            metrics. ANTIQUE judgments are graded 1-4, where grades 1 and 2
            mark answers that do not address the question, so the dataset's
            recommended binary cut-off is 3. The previous ``relevance > 0``
            test treated every judged document as relevant, which inflates
            all four metrics. For integer grades, pass ``min_relevance=1``
            to reproduce that old behaviour.

    Returns:
        dict: ``'precision_at_10'``, ``'recall_at_10'``,
        ``'average_precision'`` and ``'reciprocal_rank'``. All four are 0.0
        when the service returned no documents or when the query has no
        document graded at or above ``min_relevance``. Such zero rows are
        still averaged in by ``evaluate_tfidf_system``.
    """
    query_id = query['query_id']
    query_text = query['text']

    # Built fresh on every call so callers can never share a mutated dict.
    zero_metrics = {
        'precision_at_10': 0.0,
        'recall_at_10': 0.0,
        'average_precision': 0.0,
        'reciprocal_rank': 0.0
    }

    # Get search results from TF-IDF service
    retrieved_docs = await query_tfidf_service(query_text, top_k=1000)

    if not retrieved_docs:
        logger.warning(f"No results for query {query_id}")
        return zero_metrics

    # Binarise the graded judgments: relevant means grade >= min_relevance.
    relevant_docs = [
        doc_id for doc_id, rel in query_qrels.items() if rel >= min_relevance
    ]

    if not relevant_docs:
        logger.warning(f"No relevant documents for query {query_id}")
        return zero_metrics

    # Calculate metrics
    return {
        'precision_at_10': metrics.precision_at_k(retrieved_docs, relevant_docs, 10),
        'recall_at_10': metrics.recall_at_k(retrieved_docs, relevant_docs, 10),
        'average_precision': metrics.average_precision(retrieved_docs, relevant_docs),
        'reciprocal_rank': metrics.reciprocal_rank(retrieved_docs, relevant_docs)
    }

print("✅ Single query evaluation function defined")

In [None]:
# Main evaluation function
async def evaluate_tfidf_system():
    """Run every judged query through the service and average the metrics.

    Reads the notebook-level ``service_available``, ``queries`` and ``qrels``.
    Queries with no (or empty) relevance judgments are skipped; queries whose
    evaluation raises are logged and skipped.

    Returns:
        dict: The mean of each per-query metric, plus ``'map'`` (mean average
        precision), ``'mrr'`` (mean reciprocal rank) and
        ``'num_queries_evaluated'``. An empty dict when no query was
        evaluated successfully.

    Raises:
        RuntimeError: If ``service_available`` is falsy.
    """
    if not service_available:
        raise RuntimeError("TF-IDF service is not available. Please start the service first.")

    logger.info("Starting TF-IDF system evaluation...")

    per_query_metrics = []

    for query in tqdm(queries, desc="Evaluating queries"):
        query_id = query['query_id']

        # Missing or empty judgments: nothing to score against.
        judgments = qrels.get(query_id)
        if not judgments:
            continue

        try:
            per_query_metrics.append(await evaluate_single_query(query, judgments))

            # Log progress every 50 successfully evaluated queries.
            if len(per_query_metrics) % 50 == 0:
                logger.info(f"Evaluated {len(per_query_metrics)} queries...")

        except Exception as e:
            logger.error(f"Error evaluating query {query_id}: {e}")
            continue

    if not per_query_metrics:
        logger.error("No queries were successfully evaluated")
        return {}

    # Average each metric over the evaluated queries; the metric names are
    # taken from the first result row.
    aggregated = {}
    for name in per_query_metrics[0]:
        observed = [row[name] for row in per_query_metrics if name in row]
        if observed:
            aggregated[name] = sum(observed) / len(observed)
        else:
            aggregated[name] = 0.0

    # MAP and MRR are the means of AP and RR computed above.
    aggregated['map'] = aggregated.get('average_precision', 0.0)
    aggregated['mrr'] = aggregated.get('reciprocal_rank', 0.0)
    aggregated['num_queries_evaluated'] = len(per_query_metrics)

    return aggregated

print("✅ Main evaluation function defined")

In [None]:
# Run the evaluation
if service_available:
    print("🚀 Starting evaluation...")
    start_time = time.time()
    
    # Run evaluation
    results = await evaluate_tfidf_system()
    
    evaluation_time = time.time() - start_time
    print(f"✅ Evaluation completed in {evaluation_time:.2f} seconds")
    
else:
    print("❌ Cannot run evaluation - service not available")
    results = {}

In [None]:
# Display detailed results
def print_detailed_results(results):
    """Print a formatted report of the aggregated evaluation metrics.

    Args:
        results: Dict of aggregated metrics. The keys read are
            ``'num_queries_evaluated'``, ``'map'``, ``'mrr'``,
            ``'precision_at_10'`` and ``'recall_at_10'``; a missing key is
            reported as 0. A falsy value prints only the banner and a
            "no results" notice.

    Returns:
        None. All output goes to stdout.
    """
    banner = "=" * 60

    def pick_label(score, tiers, fallback):
        # Tiers are listed from highest threshold to lowest; first match wins.
        for floor, label in tiers:
            if score >= floor:
                return label
        return fallback

    print("\n" + banner)
    print("TF-IDF EVALUATION RESULTS")
    print(banner)

    if not results:
        print("❌ No results available")
        return

    print(f"📊 Evaluated on {results.get('num_queries_evaluated', 0)} queries")
    print("\n📈 CORE METRICS:")
    print(f"   Mean Average Precision (MAP): {results.get('map', 0.0):.4f}")
    print(f"   Mean Reciprocal Rank (MRR):  {results.get('mrr', 0.0):.4f}")
    print(f"   Precision@10:                {results.get('precision_at_10', 0.0):.4f}")
    print(f"   Recall@10:                   {results.get('recall_at_10', 0.0):.4f}")

    print("\n📋 INTERPRETATION:")
    map_tiers = (
        (0.3, "   MAP: Excellent performance ✅"),
        (0.2, "   MAP: Good performance 👍"),
        (0.1, "   MAP: Fair performance ⚠️"),
    )
    print(pick_label(results.get('map', 0.0), map_tiers, "   MAP: Poor performance ❌"))

    mrr_tiers = (
        (0.5, "   MRR: Excellent first relevant result ranking ✅"),
        (0.3, "   MRR: Good first relevant result ranking 👍"),
        (0.2, "   MRR: Fair first relevant result ranking ⚠️"),
    )
    print(pick_label(
        results.get('mrr', 0.0),
        mrr_tiers,
        "   MRR: Poor first relevant result ranking ❌",
    ))

    print(banner)

# Print results
# `results` is set by the evaluation cell; it is an empty dict when the
# evaluation did not run or produced no results, in which case the report
# shows the "No results available" notice.
print_detailed_results(results)

In [None]:
# Save results to file (skipped when there is nothing to save)
if not results:
    print("⚠️ No results to save")
else:
    # Machine-readable copy of the aggregated metrics.
    output_file = "tfidf_evaluation_results.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"💾 Results saved to {output_file}")

    # Human-readable copy; print() supplies each line's trailing newline.
    output_text_file = "tfidf_evaluation_results.txt"
    with open(output_text_file, 'w') as f:
        print("TF-IDF EVALUATION RESULTS", file=f)
        print("=" * 60, file=f)
        print(f"Evaluated on {results.get('num_queries_evaluated', 0)} queries", file=f)
        print("\nCORE METRICS:", file=f)
        print(f"Mean Average Precision (MAP): {results.get('map', 0.0):.4f}", file=f)
        print(f"Mean Reciprocal Rank (MRR):  {results.get('mrr', 0.0):.4f}", file=f)
        print(f"Precision@10:                {results.get('precision_at_10', 0.0):.4f}", file=f)
        print(f"Recall@10:                   {results.get('recall_at_10', 0.0):.4f}", file=f)
    print(f"📄 Human-readable results saved to {output_text_file}")

## Summary

This notebook evaluated the TF-IDF system using proper IR evaluation metrics:

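- **MAP (Mean Average Precision)**: Mean of the per-query average precision over all evaluated queries
- **MRR (Mean Reciprocal Rank)**: Mean reciprocal rank of the first relevant document, i.e. how high the system ranks its first relevant result
- **Precision@10**: Fraction of the top 10 retrieved documents that are relevant
- **Recall@10**: Fraction of all relevant documents that appear in the top 10 retrieved documents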

The evaluation uses the ANTIQUE dataset (`antique/train` split) and processes queries through the TF-IDF service with enhanced query processing enabled (lemmatization, stemming, and spell checking).