In [13]:
# ===================================================================
# Evaluation Notebook for Antique Embedding Model (No FAISS)
# Calculates MAP (Mean Average Precision) on Antique Test Set
# Uses cosine similarity directly with embeddings
# ===================================================================

# STEP 1: Install Required Libraries
!pip install sentence-transformers torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install ir-datasets joblib requests scipy tqdm nltk numpy pandas scikit-learn
!pip install ir-measures

# STEP 2: Import Libraries
import ir_datasets
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import joblib
from tqdm import tqdm
import ir_measures
from ir_measures import *
import torch
import gc
import os
from typing import List, Dict, Any
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
import time
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print("✅ Libraries imported successfully!")
print(f"🔥 CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")

# STEP 3: Load Unified Text Cleaning Service (SAME AS TRAINING)
class UnifiedTextCleaningService:
    """Identical to training notebook to ensure consistency"""

    def __init__(self):
        from nltk.stem import PorterStemmer
        from nltk.corpus import stopwords
        import nltk

        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text: str,
                   remove_stopwords: bool = True,
                   apply_stemming: bool = True,
                   apply_lemmatization: bool = False) -> str:
        """Identical cleaning to training notebook"""
        if not text or not isinstance(text, str):
            return ""

        # Basic cleaning
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9\s\.,!?;:()-]', '', text)
        text = text.strip()

        # Tokenization
        tokens = [word.strip() for word in text.split() if word.strip()]

        # Remove stopwords
        if remove_stopwords and self.stop_words:
            tokens = [token for token in tokens if token not in self.stop_words]

        # Stemming
        if apply_stemming and self.stemmer:
            tokens = [self.stemmer.stem(token) for token in tokens]

        return " ".join(tokens)

text_cleaner = UnifiedTextCleaningService()
print("✅ Unified text cleaner initialized (same as training notebook)!"),

# STEP 4: Load Model and Embeddings
def load_embedding_model_and_data(model_path: str,
                                 embeddings_path: str,
                                 metadata_path: str):
    """Load all components needed for evaluation"""
    print("📂 Loading embedding model and data...")
    # Load SentenceTransformer model
    print("🚀 Loading SentenceTransformer model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    print(f"✅ Model loaded on {device}")
    # Load embeddings matrix
    print("📊 Loading embeddings matrix...")
    embeddings_matrix = joblib.load("/content/drive/MyDrive/downloads/antique_embeddings_matrix.joblib")
    print(f"✅ Embeddings matrix loaded with shape")

    # Load document metadata
    print("📄 Loading document metadata...")
    metadata = joblib.load("/content/drive/MyDrive/downloads/antique_embedding_document_metadata.joblib")
    documents = metadata['documents']
    document_order = metadata['document_order']
    docid_to_index = {doc_id: idx for idx, doc_id in enumerate(document_order)}

    cleaned_texts = metadata['cleaned_texts']

    print(f"✅ Metadata loaded with {len(documents):,} documents")
    print(f"✅ Document order aligned: {len(document_order):,}")

    # Verify alignment
    assert len(documents) == len(document_order) == len(cleaned_texts) == embeddings_matrix.shape[0]
    print("✅ Perfect alignment verified!")

    # Create docid to index mapping
    docid_to_index = {doc_id: idx for idx, doc_id in enumerate(document_order)}

    return model, embeddings_matrix, documents, document_order, docid_to_index

# Update these paths to match your saved files
MODEL_PATH = "/content/gdrive/My Drive/downloads/antique_embedding_model"
EMBEDDINGS_PATH = "/content/gdrive/My Drive/downloads/antique_embeddings_matrix.joblib"
METADATA_PATH = "/content/gdrive/My Drive/downloads/antique_embedding_document_metadata.joblib"

# Load the model and data
model, embeddings_matrix, documents, document_order, docid_to_index = load_embedding_model_and_data(
    MODEL_PATH, EMBEDDINGS_PATH, METADATA_PATH
)

# STEP 5: Load Antique Test Set
def load_antique_test_set():
    """Load test queries and qrels from Antique dataset"""
    print("📚 Loading Antique test set...")

    # Load test dataset
    dataset = ir_datasets.load('antique/test')

    # Extract test queries and qrels
    test_queries = []
    test_qrels = {}

    print("Loading test queries...")
    for query in dataset.queries_iter():
        test_queries.append({
            'query_id': query.query_id,
            'text': query.text
        })

    print("Loading test qrels...")
    for qrel in dataset.qrels_iter():
        if qrel.query_id not in test_qrels:
            test_qrels[qrel.query_id] = {}
        test_qrels[qrel.query_id][qrel.doc_id] = qrel.relevance

    print(f"✅ Test set loaded successfully!")
    print(f"   🔍 Test queries: {len(test_queries):,}")
    print(f"   🎯 Qrels: {len(test_qrels):,}")

    return test_queries, test_qrels

test_queries, test_qrels = load_antique_test_set()

# STEP 6: Run Evaluation with Cosine Similarity
def evaluate_with_cosine_similarity(model, embeddings_matrix, documents, document_order,
                                  docid_to_index, test_queries, test_qrels, top_k=100):
    """Evaluate embedding model using cosine similarity"""

    print("🧪 Evaluating embedding model on test set (cosine similarity)...")

    # Process each query and collect results
    run_results = []
    query_times = []

    for query in tqdm(test_queries, desc="Processing queries"):
        query_id = query['query_id']
        query_text = query['text']

        # Skip queries without relevant documents
        if query_id not in test_qrels:
            continue

        start_time = time.time()

        # Clean query with same method as training (less aggressive for queries)
        cleaned_query = text_cleaner.clean_text(
            query_text,
            remove_stopwords=False,  # Keep stopwords for queries
            apply_stemming=True,
            apply_lemmatization=False
        )

        # Generate query embedding
        with torch.no_grad():
            query_embedding = model.encode(
                [cleaned_query],
                convert_to_numpy=True,
                normalize_embeddings=True
            )

        # Calculate cosine similarity with all documents
        similarities = cosine_similarity(query_embedding, embeddings_matrix)[0]

        # Get top-k results
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Convert to run format for ir_measures
        for rank, idx in enumerate(top_indices, 1):
            doc_id = document_order[idx]
            score = similarities[idx]
            run_results.append({
                'query_id': query_id,
                'doc_id': doc_id,
                'score': float(score),
                'rank': rank
            })

        query_times.append(time.time() - start_time)

    # Convert to pandas DataFrame for ir_measures
    run_df = pd.DataFrame(run_results)

    # Convert qrels to proper format
    qrel_list = []
    for query_id, docs in test_qrels.items():
        for doc_id, rel in docs.items():
            qrel_list.append({
                'query_id': query_id,
                'doc_id': doc_id,
                'relevance': rel
            })
    qrels_df = pd.DataFrame(qrel_list)

    # Calculate metrics
    print("\n📊 Evaluation Results:")
    metrics = ir_measures.calc_aggregate(
        [AP@100, P@10, P@100, R@100, nDCG@100],
        qrels_df,
        run_df
    )

    avg_query_time = np.mean(query_times) * 1000  # in milliseconds

    # Print results
    print(f"🔍 Queries evaluated: {len([q for q in test_queries if q['query_id'] in test_qrels]):,}")
    print(f"⏱️  Average query time: {avg_query_time:.2f} ms")
    print(f"📊 MAP@100: {metrics[AP@100]:.4f}")
    print(f"📊 P@10: {metrics[P@10]:.4f}")
    print(f"📊 P@100: {metrics[P@100]:.4f}")
    print(f"📊 R@100: {metrics[R@100]:.4f}")
    print(f"📊 nDCG@100: {metrics[nDCG@100]:.4f}")

    return metrics

# Run evaluation
metrics = evaluate_with_cosine_similarity(
    model, embeddings_matrix, documents, document_order,
    docid_to_index, test_queries, test_qrels
)

# STEP 7: Save Evaluation Results
def save_evaluation_results(metrics, output_path="antique_embedding_evaluation.json"):
    """Save evaluation results to JSON file"""
    results = {
        'model': 'all-MiniLM-L6-v2',
        'dataset': 'antique/test',
        'metrics': {
            'MAP@100': float(metrics[AP@100]),
            'P@10': float(metrics[P@10]),
            'P@100': float(metrics[P@100]),
            'R@100': float(metrics[R@100]),
            'nDCG@100': float(metrics[nDCG@100])
        },
        'configuration': {
            'preprocessing': 'unified_text_cleaning',
            'query_cleaning': {
                'remove_stopwords': False,
                'apply_stemming': True,
                'apply_lemmatization': False
            },
            'evaluation_method': 'cosine_similarity',
            'evaluation_top_k': 100,
            'normalized_embeddings': True
        }
    }

    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"✅ Evaluation results saved to {output_path}")

save_evaluation_results(metrics)

# STEP 8: Example Query with Detailed Results
def show_example_query(model, embeddings_matrix, documents, document_order, test_queries, test_qrels):
    """Show detailed results for an example query"""
    print("\n🔍 Example Query Analysis:")

    # Find a query with several relevant documents
    example_query = None
    for query in test_queries:
        if query['query_id'] in test_qrels and len(test_qrels[query['query_id']]) >= 3:
            example_query = query
            break

    if not example_query:
        print("No suitable example query found")
        return

    query_id = example_query['query_id']
    query_text = example_query['text']

    print(f"Query ID: {query_id}")
    print(f"Query Text: '{query_text}'")

    # Clean query
    cleaned_query = text_cleaner.clean_text(
        query_text,
        remove_stopwords=False,
        apply_stemming=True,
        apply_lemmatization=False
    )
    print(f"Cleaned Query: '{cleaned_query}'")

    # Get relevant documents from qrels
    relevant_docs = test_qrels[query_id]
    print(f"\n🎯 Relevant Documents ({len(relevant_docs)}):")
    for doc_id, rel in sorted(relevant_docs.items(), key=lambda x: -x[1]):
        print(f"  - {doc_id} (relevance={rel})")

    # Generate embedding and calculate similarities
    with torch.no_grad():
        query_embedding = model.encode(
            [cleaned_query],
            convert_to_numpy=True,
            normalize_embeddings=True
        )

    similarities = cosine_similarity(query_embedding, embeddings_matrix)[0]
    top_indices = np.argsort(similarities)[::-1][:10]  # Top 10

    print("\n🔍 Top 10 Results:")
    for rank, idx in enumerate(top_indices, 1):
        doc_id = document_order[idx]
        is_relevant = doc_id in relevant_docs
        rel_score = relevant_docs.get(doc_id, 0)
        similarity = similarities[idx]
        doc_text = documents[idx]['text'][:100] + "..."

        print(f"{rank}. {doc_id} (score={similarity:.4f}) {'✅' if is_relevant else '❌'} rel={rel_score}")
        print(f"   {doc_text}\n")

show_example_query(model, embeddings_matrix, documents, document_order, test_queries, test_qrels)

print("\n✅ Evaluation completed!")

Looking in indexes: https://download.pytorch.org/whl/cu121


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Libraries imported successfully!
🔥 CUDA available: False
✅ Unified text cleaner initialized (same as training notebook)!
📂 Loading embedding model and data...
🚀 Loading SentenceTransformer model...
✅ Model loaded on cpu
📊 Loading embeddings matrix...
✅ Embeddings matrix loaded with shape
📄 Loading document metadata...
✅ Metadata loaded with 403,194 documents
✅ Document order aligned: 403,194
✅ Perfect alignment verified!
📚 Loading Antique test set...
Loading test queries...
Loading test qrels...
✅ Test set loaded successfully!
   🔍 Test queries: 200
   🎯 Qrels: 200
🧪 Evaluating embedding model on test set (cosine similarity)...


Processing queries: 100%|██████████| 200/200 [02:25<00:00,  1.37it/s]



📊 Evaluation Results:
🔍 Queries evaluated: 200
⏱️  Average query time: 723.51 ms
📊 MAP@100: 0.1489
📊 P@10: 0.3405
📊 P@100: 0.1057
📊 R@100: 0.3299
📊 nDCG@100: 0.3396
✅ Evaluation results saved to antique_embedding_evaluation.json

🔍 Example Query Analysis:
Query ID: 3990512
Query Text: 'how can we get concentration onsomething?'
Cleaned Query: 'how can we get concentr onsomething?'

🎯 Relevant Documents (36):
  - 3990512_1 (relevance=4)
  - 332116_1 (relevance=4)
  - 3270641_1 (relevance=4)
  - 3980621_2 (relevance=4)
  - 3990512_0 (relevance=3)
  - 1900286_7 (relevance=3)
  - 3154563_3 (relevance=3)
  - 2800339_2 (relevance=3)
  - 311770_2 (relevance=3)
  - 4087614_7 (relevance=3)
  - 667270_1 (relevance=3)
  - 2036065_1 (relevance=2)
  - 3265991_12 (relevance=2)
  - 1772359_2 (relevance=2)
  - 3579785_0 (relevance=2)
  - 3239083_1 (relevance=2)
  - 3974525_0 (relevance=2)
  - 1173778_3 (relevance=2)
  - 4359089_0 (relevance=2)
  - 1173778_0 (relevance=2)
  - 3916430_1 (relevance=2)
 