# Comprehensive TF-IDF Evaluation on ANTIQUE Dataset

This notebook performs comprehensive evaluation of the TF-IDF model on the ANTIQUE dataset, calculating:
- **Mean Average Precision (MAP)**
- **Mean Reciprocal Rank (MRR)**
- **Precision@100**
- **Recall@100**
- **F1-Score@100**
- **Additional analysis and breakdowns**

The evaluation uses the models and data generated from the ANTIQUE TF-IDF Complete Implementation notebook.

## 1. Setup and Installation

In [1]:
# Install required packages
!pip install joblib numpy pandas scikit-learn tqdm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
import os
import time
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

print('✓ Libraries imported successfully')

✓ Libraries imported successfully


## 2. Data Loading and Verification

In [3]:
# Define paths
BASE_PATH = '/content/drive/MyDrive/tfidf-optimized/'
DATA_PATH = '/content/drive/MyDrive/downloads/'

# Check if files exist
required_files = [
    'tfidf_vectorizer.joblib',
    'tfidf_matrix.joblib',
    'doc_ids.joblib',
    'queries_df_cleaned.joblib',
    'text_cleaner.joblib'
]

print('Checking required files...')
for file in required_files:
    file_path = os.path.join(BASE_PATH, file)
    if os.path.exists(file_path):
        print(f'✓ Found: {file}')
    else:
        print(f'✗ Missing: {file}')

# Load qrels from data directory
qrels_path = os.path.join(DATA_PATH, 'qrels.tsv')
if os.path.exists(qrels_path):
    print(f'✓ Found: qrels.tsv')
else:
    print(f'✗ Missing: qrels.tsv')

print('\nFile verification complete!')

Checking required files...
✓ Found: tfidf_vectorizer.joblib
✓ Found: tfidf_matrix.joblib
✓ Found: doc_ids.joblib
✓ Found: queries_df_cleaned.joblib
✓ Found: text_cleaner.joblib
✓ Found: qrels.tsv

File verification complete!


In [14]:
# Load models and data
print('Loading TF-IDF models and data...')

# Define the simple_tokenizer function (required for loading joblib files)
def simple_tokenizer(text):
    """Basic tokenizer: lowercases, removes non-alphanumeric, splits by whitespace."""
    if text is None:
        return []
    text = str(text).lower()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text.split()

# Define the OptimizedAntiqueTextCleaner class (required for loading joblib files)
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

class OptimizedAntiqueTextCleaner:
    """
    An optimized text cleaner for the ANTIQUE dataset, combining steps
    for efficiency.
    """
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_text(self, text):
        if pd.isna(text) or text is None:
            return ""
        # Lowercasing
        text = text.lower()
        # Remove non-alphanumeric characters (keeping spaces)
        text = re.sub(r'[^a-z0-9\s]', '', text)
        # Tokenization
        tokens = text.split() # Using split as it's generally faster than nltk.word_tokenize for simple cases
        # Remove stop words and stem
        cleaned_tokens = [
            self.stemmer.stem(token) for token in tokens if token not in self.stop_words
        ]
        # Join tokens back into a string
        return " ".join(cleaned_tokens)


# Load TF-IDF components
tfidf_vectorizer = joblib.load(os.path.join(BASE_PATH, 'tfidf_vectorizer.joblib'))
tfidf_matrix = joblib.load(os.path.join(BASE_PATH, 'tfidf_matrix.joblib'))
doc_ids = joblib.load(os.path.join(BASE_PATH, 'doc_ids.joblib'))
text_cleaner = joblib.load(os.path.join(BASE_PATH, 'text_cleaner.joblib'))

# Load queries
queries_df = joblib.load(os.path.join(BASE_PATH, 'queries_df_cleaned.joblib'))

# Load qrels
qrels_df = pd.read_csv(qrels_path, sep='\t')

print(f'✓ TF-IDF vectorizer loaded')
print(f'✓ TF-IDF matrix loaded: {tfidf_matrix.shape}')
print(f'✓ Document IDs loaded: {len(doc_ids)}')
print(f'✓ Queries loaded: {len(queries_df)}')
print(f'✓ Qrels loaded: {len(qrels_df)}')

# Display sample data
print('\nSample queries:')
print(queries_df.head())

print('\nSample qrels:')
print(qrels_df.head())

Loading TF-IDF models and data...
✓ TF-IDF vectorizer loaded
✓ TF-IDF matrix loaded: (402025, 150000)
✓ Document IDs loaded: 402025
✓ Queries loaded: 2426
✓ Qrels loaded: 27422

Sample queries:
   query_id                                               text  \
0   3097310  What causes severe swelling and pain in the kn...   
1   3910705  why don't they put parachutes underneath airpl...   
2    237390                how to clean alloy cylinder heads ?   
3   2247892                          how do i get them whiter?   
4   1078492                    What is Cloud 9 and 7th Heaven?   

                                  cleaned_query  
0               what caus sever swell pain knee  
1  whi not put parachut underneath airplan seat  
2                   how clean alloy cylind head  
3                                how get whiter  
4                             what cloud heaven  

Sample qrels:
   query_id     doc_id  relevance
0   2531329  2531329_0          4
1   2531329  2531329_5    

## 3. Data Preprocessing and Preparation

In [15]:
# Prepare relevance judgments
print('Preparing relevance judgments...')

# Create relevance judgments dictionary
relevance_judgments = defaultdict(set)
for _, row in qrels_df.iterrows():
    query_id = row['query_id']
    doc_id = row['doc_id']
    relevance = row['relevance']

    # Consider relevance >= 1 as relevant
    if relevance >= 1:
        relevance_judgments[query_id].add(doc_id)

# Filter queries that have relevance judgments
evaluated_queries = set(relevance_judgments.keys())
eval_queries = queries_df[queries_df['query_id'].isin(evaluated_queries)].copy()

print(f'Total queries: {len(queries_df)}')
print(f'Queries with relevance judgments: {len(eval_queries)}')
print(f'Total relevance judgments: {len(qrels_df)}')
print(f'Unique relevant documents: {len(set(qrels_df["doc_id"].values))}')

# Statistics about relevance judgments
rel_per_query = [len(relevance_judgments[qid]) for qid in eval_queries['query_id']]
print(f'\nAverage relevant docs per query: {np.mean(rel_per_query):.2f}')
print(f'Max relevant docs per query: {np.max(rel_per_query)}')
print(f'Min relevant docs per query: {np.min(rel_per_query)}')

print('\nEvaluation data prepared successfully!')

Preparing relevance judgments...
Total queries: 2426
Queries with relevance judgments: 2426
Total relevance judgments: 27422
Unique relevant documents: 27422

Average relevant docs per query: 11.30
Max relevant docs per query: 490
Min relevant docs per query: 2

Evaluation data prepared successfully!


## 4. Search Functions

In [16]:
# Define search functions
def search_documents(query_text, tfidf_vectorizer, tfidf_matrix, doc_ids, top_k=1000):
    """
    Search documents using TF-IDF cosine similarity.

    Args:
        query_text: Cleaned query text
        tfidf_vectorizer: Trained TF-IDF vectorizer
        tfidf_matrix: Document-term matrix
        doc_ids: List of document IDs
        top_k: Number of top documents to return

    Returns:
        List of (doc_id, score) tuples
    """
    if not query_text or not query_text.strip():
        return []

    # Transform query to TF-IDF vector
    query_vector = tfidf_vectorizer.transform([query_text])

    # Calculate cosine similarities
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get top-k results
    if top_k < len(doc_ids):
        top_indices = np.argpartition(similarities, -top_k)[-top_k:]
        top_indices = top_indices[np.argsort(-similarities[top_indices])]
    else:
        top_indices = np.argsort(-similarities)

    # Return results with scores > 0
    results = [(doc_ids[i], similarities[i]) for i in top_indices if similarities[i] > 0]
    return results

# Test search function
print('Testing search function...')
test_query = eval_queries.iloc[0]['cleaned_query']
test_results = search_documents(test_query, tfidf_vectorizer, tfidf_matrix, doc_ids, top_k=5)
print(f'Test query: "{test_query}"')
print(f'Found {len(test_results)} results')
for i, (doc_id, score) in enumerate(test_results):
    print(f'  {i+1}. Doc {doc_id}: {score:.4f}')

print('\n✓ Search function working correctly!')

Testing search function...
Test query: "what caus sever swell pain knee"
Found 5 results
  1. Doc 3133211_0: 0.6473
  2. Doc 2606613_8: 0.4549
  3. Doc 3241109_2: 0.3869
  4. Doc 2606613_3: 0.3757
  5. Doc 1574073_3: 0.3743

✓ Search function working correctly!


## 5. Evaluation Metrics

In [17]:
# Define evaluation metrics
def calculate_average_precision(retrieved_docs, relevant_docs):
    """
    Calculate Average Precision (AP) for a single query.

    Args:
        retrieved_docs: List of retrieved document IDs in ranked order
        relevant_docs: Set of relevant document IDs for the query

    Returns:
        Average Precision score
    """
    if not relevant_docs or len(relevant_docs) == 0:
        return 0.0

    if not retrieved_docs or len(retrieved_docs) == 0:
        return 0.0

    hits = 0
    sum_precisions = 0.0

    for i, doc_id in enumerate(retrieved_docs):
        if doc_id in relevant_docs:
            hits += 1
            precision_at_i = hits / (i + 1)
            sum_precisions += precision_at_i

    return sum_precisions / len(relevant_docs)

def calculate_precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Precision@k.
    """
    if k == 0 or len(retrieved_docs) == 0:
        return 0.0

    relevant_retrieved = 0
    for i, doc_id in enumerate(retrieved_docs[:k]):
        if doc_id in relevant_docs:
            relevant_retrieved += 1

    return relevant_retrieved / min(k, len(retrieved_docs))

def calculate_recall_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Recall@k.
    """
    if len(relevant_docs) == 0:
        return 0.0

    relevant_retrieved = 0
    for i, doc_id in enumerate(retrieved_docs[:k]):
        if doc_id in relevant_docs:
            relevant_retrieved += 1

    return relevant_retrieved / len(relevant_docs)

def calculate_reciprocal_rank(retrieved_docs, relevant_docs):
    """
    Calculate Reciprocal Rank for a single query.
    """
    for i, doc_id in enumerate(retrieved_docs):
        if doc_id in relevant_docs:
            return 1.0 / (i + 1)
    return 0.0

def calculate_f1_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate F1-Score@k.
    """
    precision = calculate_precision_at_k(retrieved_docs, relevant_docs, k)
    recall = calculate_recall_at_k(retrieved_docs, relevant_docs, k)

    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)

print('✓ Evaluation metrics defined successfully!')

✓ Evaluation metrics defined successfully!


## 6. Run Comprehensive Evaluation

In [18]:
# Run comprehensive evaluation
print('🚀 Starting comprehensive evaluation...')
print('=' * 60)

# Initialize result storage
evaluation_results = {
    'average_precisions': [],
    'reciprocal_ranks': [],
    'precision_at_k': {k: [] for k in [1, 5, 10, 20, 50, 100]},
    'recall_at_k': {k: [] for k in [1, 5, 10, 20, 50, 100]},
    'f1_at_k': {k: [] for k in [1, 5, 10, 20, 50, 100]},
    'query_details': []
}

# Track progress
start_time = time.time()
processed_queries = 0
total_queries = len(eval_queries)

print(f'Evaluating {total_queries} queries...')
print('Progress updates every 50 queries\n')

# Process each query
for idx, (_, query_row) in enumerate(tqdm(eval_queries.iterrows(), total=total_queries)):
    query_id = query_row['query_id']
    query_text = query_row['text']
    cleaned_query = query_row['cleaned_query']

    # Get relevant documents for this query
    relevant_docs = relevance_judgments[query_id]

    if len(relevant_docs) == 0:
        continue

    # Search using TF-IDF
    search_results = search_documents(
        cleaned_query,
        tfidf_vectorizer,
        tfidf_matrix,
        doc_ids,
        top_k=1000
    )

    # Extract document IDs from search results
    retrieved_docs = [doc_id for doc_id, score in search_results if score > 0]

    if len(retrieved_docs) == 0:
        continue

    # Calculate metrics
    # 1. Average Precision
    ap = calculate_average_precision(retrieved_docs, relevant_docs)
    evaluation_results['average_precisions'].append(ap)

    # 2. Reciprocal Rank
    rr = calculate_reciprocal_rank(retrieved_docs, relevant_docs)
    evaluation_results['reciprocal_ranks'].append(rr)

    # 3. Precision, Recall, and F1 at K
    for k in evaluation_results['precision_at_k'].keys():
        prec_k = calculate_precision_at_k(retrieved_docs, relevant_docs, k)
        rec_k = calculate_recall_at_k(retrieved_docs, relevant_docs, k)
        f1_k = calculate_f1_at_k(retrieved_docs, relevant_docs, k)

        evaluation_results['precision_at_k'][k].append(prec_k)
        evaluation_results['recall_at_k'][k].append(rec_k)
        evaluation_results['f1_at_k'][k].append(f1_k)

    # Store query details
    evaluation_results['query_details'].append({
        'query_id': query_id,
        'query_text': query_text,
        'cleaned_query': cleaned_query,
        'num_relevant': len(relevant_docs),
        'num_retrieved': len(retrieved_docs),
        'average_precision': ap,
        'reciprocal_rank': rr,
        'precision_at_100': calculate_precision_at_k(retrieved_docs, relevant_docs, 100),
        'recall_at_100': calculate_recall_at_k(retrieved_docs, relevant_docs, 100),
        'f1_at_100': calculate_f1_at_k(retrieved_docs, relevant_docs, 100)
    })

    processed_queries += 1

    # Progress update
    if processed_queries % 50 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / processed_queries
        remaining = (total_queries - processed_queries) * avg_time
        current_map = np.mean(evaluation_results["average_precisions"]) if evaluation_results["average_precisions"] else 0.0
        current_mrr = np.mean(evaluation_results["reciprocal_ranks"]) if evaluation_results["reciprocal_ranks"] else 0.0

        print(f'Progress: {processed_queries}/{total_queries} ({processed_queries/total_queries*100:.1f}%)')
        print(f'  Elapsed: {elapsed:.1f}s, Remaining: {remaining:.1f}s')
        print(f'  Current MAP: {current_map:.4f}, Current MRR: {current_mrr:.4f}')
        print()

total_time = time.time() - start_time
print(f'\nEvaluation completed in {total_time:.2f} seconds')
print(f'Successfully evaluated {processed_queries} queries')
print('=' * 60)

🚀 Starting comprehensive evaluation...
Evaluating 2426 queries...
Progress updates every 50 queries



  2%|▏         | 50/2426 [00:38<28:46,  1.38it/s]

Progress: 50/2426 (2.1%)
  Elapsed: 38.1s, Remaining: 1811.7s
  Current MAP: 0.0370, Current MRR: 0.1946



  4%|▍         | 100/2426 [01:11<24:25,  1.59it/s]

Progress: 100/2426 (4.1%)
  Elapsed: 71.4s, Remaining: 1661.3s
  Current MAP: 0.0474, Current MRR: 0.2142



  6%|▌         | 150/2426 [01:46<27:26,  1.38it/s]

Progress: 150/2426 (6.2%)
  Elapsed: 106.0s, Remaining: 1608.7s
  Current MAP: 0.0459, Current MRR: 0.2180



  8%|▊         | 200/2426 [02:23<24:32,  1.51it/s]

Progress: 200/2426 (8.2%)
  Elapsed: 143.9s, Remaining: 1602.1s
  Current MAP: 0.0447, Current MRR: 0.2093



 10%|█         | 250/2426 [03:00<26:58,  1.34it/s]

Progress: 250/2426 (10.3%)
  Elapsed: 180.1s, Remaining: 1567.8s
  Current MAP: 0.0532, Current MRR: 0.2177



 12%|█▏        | 300/2426 [03:34<23:10,  1.53it/s]

Progress: 300/2426 (12.4%)
  Elapsed: 214.2s, Remaining: 1518.0s
  Current MAP: 0.0572, Current MRR: 0.2224



 14%|█▍        | 350/2426 [04:07<23:41,  1.46it/s]

Progress: 350/2426 (14.4%)
  Elapsed: 247.9s, Remaining: 1470.7s
  Current MAP: 0.0585, Current MRR: 0.2230



 16%|█▋        | 400/2426 [04:40<21:03,  1.60it/s]

Progress: 400/2426 (16.5%)
  Elapsed: 280.2s, Remaining: 1419.0s
  Current MAP: 0.0592, Current MRR: 0.2165



 19%|█▊        | 450/2426 [05:13<22:46,  1.45it/s]

Progress: 450/2426 (18.5%)
  Elapsed: 313.1s, Remaining: 1375.0s
  Current MAP: 0.0633, Current MRR: 0.2292



 21%|██        | 500/2426 [05:46<20:20,  1.58it/s]

Progress: 500/2426 (20.6%)
  Elapsed: 346.5s, Remaining: 1334.9s
  Current MAP: 0.0630, Current MRR: 0.2240



 23%|██▎       | 550/2426 [06:19<19:42,  1.59it/s]

Progress: 550/2426 (22.7%)
  Elapsed: 379.1s, Remaining: 1293.1s
  Current MAP: 0.0626, Current MRR: 0.2237



 25%|██▍       | 600/2426 [06:52<21:18,  1.43it/s]

Progress: 600/2426 (24.7%)
  Elapsed: 412.5s, Remaining: 1255.2s
  Current MAP: 0.0661, Current MRR: 0.2321



 27%|██▋       | 650/2426 [07:25<18:06,  1.63it/s]

Progress: 650/2426 (26.8%)
  Elapsed: 445.2s, Remaining: 1216.3s
  Current MAP: 0.0650, Current MRR: 0.2315



 29%|██▉       | 700/2426 [07:57<19:26,  1.48it/s]

Progress: 700/2426 (28.9%)
  Elapsed: 477.7s, Remaining: 1178.0s
  Current MAP: 0.0643, Current MRR: 0.2299



 31%|███       | 750/2426 [08:31<18:32,  1.51it/s]

Progress: 750/2426 (30.9%)
  Elapsed: 511.4s, Remaining: 1142.7s
  Current MAP: 0.0620, Current MRR: 0.2260



 33%|███▎      | 800/2426 [09:05<18:13,  1.49it/s]

Progress: 800/2426 (33.0%)
  Elapsed: 545.4s, Remaining: 1108.6s
  Current MAP: 0.0608, Current MRR: 0.2235



 35%|███▌      | 850/2426 [09:39<16:45,  1.57it/s]

Progress: 850/2426 (35.0%)
  Elapsed: 579.4s, Remaining: 1074.3s
  Current MAP: 0.0606, Current MRR: 0.2240



 37%|███▋      | 900/2426 [10:12<15:43,  1.62it/s]

Progress: 900/2426 (37.1%)
  Elapsed: 612.2s, Remaining: 1038.0s
  Current MAP: 0.0592, Current MRR: 0.2201



 39%|███▉      | 950/2426 [10:49<17:26,  1.41it/s]

Progress: 950/2426 (39.2%)
  Elapsed: 649.9s, Remaining: 1009.7s
  Current MAP: 0.0584, Current MRR: 0.2210



 41%|████      | 1000/2426 [11:24<15:32,  1.53it/s]

Progress: 1000/2426 (41.2%)
  Elapsed: 684.3s, Remaining: 975.8s
  Current MAP: 0.0576, Current MRR: 0.2162



 43%|████▎     | 1050/2426 [12:00<16:21,  1.40it/s]

Progress: 1050/2426 (43.3%)
  Elapsed: 720.2s, Remaining: 943.7s
  Current MAP: 0.0565, Current MRR: 0.2134



 45%|████▌     | 1100/2426 [12:35<14:57,  1.48it/s]

Progress: 1100/2426 (45.3%)
  Elapsed: 755.0s, Remaining: 910.1s
  Current MAP: 0.0567, Current MRR: 0.2132



 47%|████▋     | 1150/2426 [13:09<14:23,  1.48it/s]

Progress: 1150/2426 (47.4%)
  Elapsed: 789.5s, Remaining: 876.0s
  Current MAP: 0.0576, Current MRR: 0.2173



 49%|████▉     | 1200/2426 [13:42<13:40,  1.49it/s]

Progress: 1200/2426 (49.5%)
  Elapsed: 822.9s, Remaining: 840.7s
  Current MAP: 0.0577, Current MRR: 0.2161



 52%|█████▏    | 1250/2426 [14:17<12:59,  1.51it/s]

Progress: 1250/2426 (51.5%)
  Elapsed: 857.2s, Remaining: 806.4s
  Current MAP: 0.0573, Current MRR: 0.2152



 54%|█████▎    | 1300/2426 [14:50<12:39,  1.48it/s]

Progress: 1300/2426 (53.6%)
  Elapsed: 890.8s, Remaining: 771.6s
  Current MAP: 0.0565, Current MRR: 0.2130



 56%|█████▌    | 1350/2426 [15:25<12:23,  1.45it/s]

Progress: 1350/2426 (55.6%)
  Elapsed: 925.8s, Remaining: 737.9s
  Current MAP: 0.0568, Current MRR: 0.2155



 58%|█████▊    | 1400/2426 [15:59<11:19,  1.51it/s]

Progress: 1400/2426 (57.7%)
  Elapsed: 960.0s, Remaining: 703.5s
  Current MAP: 0.0559, Current MRR: 0.2151



 60%|█████▉    | 1450/2426 [16:34<11:08,  1.46it/s]

Progress: 1450/2426 (59.8%)
  Elapsed: 994.5s, Remaining: 669.4s
  Current MAP: 0.0551, Current MRR: 0.2132



 62%|██████▏   | 1500/2426 [17:08<10:19,  1.49it/s]

Progress: 1500/2426 (61.8%)
  Elapsed: 1028.9s, Remaining: 635.2s
  Current MAP: 0.0548, Current MRR: 0.2131



 64%|██████▍   | 1550/2426 [17:43<09:52,  1.48it/s]

Progress: 1550/2426 (63.9%)
  Elapsed: 1063.7s, Remaining: 601.2s
  Current MAP: 0.0541, Current MRR: 0.2104



 66%|██████▌   | 1600/2426 [18:17<09:32,  1.44it/s]

Progress: 1600/2426 (66.0%)
  Elapsed: 1097.7s, Remaining: 566.7s
  Current MAP: 0.0547, Current MRR: 0.2114



 68%|██████▊   | 1650/2426 [18:52<08:50,  1.46it/s]

Progress: 1650/2426 (68.0%)
  Elapsed: 1132.5s, Remaining: 532.6s
  Current MAP: 0.0550, Current MRR: 0.2118



 70%|███████   | 1700/2426 [19:26<08:10,  1.48it/s]

Progress: 1700/2426 (70.1%)
  Elapsed: 1166.4s, Remaining: 498.1s
  Current MAP: 0.0550, Current MRR: 0.2103



 72%|███████▏  | 1750/2426 [20:00<07:26,  1.51it/s]

Progress: 1750/2426 (72.1%)
  Elapsed: 1200.6s, Remaining: 463.8s
  Current MAP: 0.0543, Current MRR: 0.2074



 74%|███████▍  | 1800/2426 [20:35<08:23,  1.24it/s]

Progress: 1800/2426 (74.2%)
  Elapsed: 1235.3s, Remaining: 429.6s
  Current MAP: 0.0546, Current MRR: 0.2077



 76%|███████▋  | 1850/2426 [21:15<06:46,  1.42it/s]

Progress: 1850/2426 (76.3%)
  Elapsed: 1275.1s, Remaining: 397.0s
  Current MAP: 0.0548, Current MRR: 0.2100



 78%|███████▊  | 1900/2426 [21:51<06:33,  1.34it/s]

Progress: 1900/2426 (78.3%)
  Elapsed: 1312.0s, Remaining: 363.2s
  Current MAP: 0.0546, Current MRR: 0.2086



 80%|████████  | 1950/2426 [22:28<06:09,  1.29it/s]

Progress: 1950/2426 (80.4%)
  Elapsed: 1348.8s, Remaining: 329.3s
  Current MAP: 0.0544, Current MRR: 0.2092



 82%|████████▏ | 2000/2426 [23:05<05:03,  1.40it/s]

Progress: 2000/2426 (82.4%)
  Elapsed: 1385.1s, Remaining: 295.0s
  Current MAP: 0.0545, Current MRR: 0.2100



 85%|████████▍ | 2050/2426 [23:42<04:30,  1.39it/s]

Progress: 2050/2426 (84.5%)
  Elapsed: 1422.1s, Remaining: 260.8s
  Current MAP: 0.0545, Current MRR: 0.2102



 87%|████████▋ | 2100/2426 [24:17<04:00,  1.36it/s]

Progress: 2100/2426 (86.6%)
  Elapsed: 1457.8s, Remaining: 226.3s
  Current MAP: 0.0541, Current MRR: 0.2094



 89%|████████▊ | 2150/2426 [24:53<03:13,  1.42it/s]

Progress: 2150/2426 (88.6%)
  Elapsed: 1493.2s, Remaining: 191.7s
  Current MAP: 0.0542, Current MRR: 0.2090



 91%|█████████ | 2200/2426 [25:30<02:44,  1.37it/s]

Progress: 2200/2426 (90.7%)
  Elapsed: 1530.3s, Remaining: 157.2s
  Current MAP: 0.0541, Current MRR: 0.2087



 93%|█████████▎| 2250/2426 [26:07<02:13,  1.32it/s]

Progress: 2250/2426 (92.7%)
  Elapsed: 1567.1s, Remaining: 122.6s
  Current MAP: 0.0543, Current MRR: 0.2102



 95%|█████████▍| 2300/2426 [26:43<01:28,  1.43it/s]

Progress: 2300/2426 (94.8%)
  Elapsed: 1603.6s, Remaining: 87.8s
  Current MAP: 0.0542, Current MRR: 0.2090



 97%|█████████▋| 2350/2426 [27:19<00:54,  1.39it/s]

Progress: 2350/2426 (96.9%)
  Elapsed: 1639.5s, Remaining: 53.0s
  Current MAP: 0.0536, Current MRR: 0.2071



 99%|█████████▉| 2400/2426 [27:55<00:19,  1.35it/s]

Progress: 2400/2426 (98.9%)
  Elapsed: 1675.5s, Remaining: 18.2s
  Current MAP: 0.0532, Current MRR: 0.2059



100%|██████████| 2426/2426 [28:14<00:00,  1.43it/s]


Evaluation completed in 1694.38 seconds
Successfully evaluated 2426 queries





## 7. Results and Analysis

In [19]:
# Display comprehensive results
print('=' * 80)
print('📊 COMPREHENSIVE EVALUATION RESULTS')
print('=' * 80)

# Core metrics
map_score = np.mean(evaluation_results['average_precisions'])
mrr_score = np.mean(evaluation_results['reciprocal_ranks'])
precision_100 = np.mean(evaluation_results['precision_at_k'][100])
recall_100 = np.mean(evaluation_results['recall_at_k'][100])
f1_100 = np.mean(evaluation_results['f1_at_k'][100])

print(f'🎯 CORE METRICS:')
print(f'   MAP (Mean Average Precision): {map_score:.4f}')
print(f'   MRR (Mean Reciprocal Rank): {mrr_score:.4f}')
print(f'   Precision@100: {precision_100:.4f}')
print(f'   Recall@100: {recall_100:.4f}')
print(f'   F1-Score@100: {f1_100:.4f}')

# Target assessment
print('\n🎯 TARGET ASSESSMENT:')
target_map = 0.2
if map_score >= target_map:
    print(f'   ✅ MAP TARGET ACHIEVED! {map_score:.4f} >= {target_map}')
else:
    print(f'   ❌ MAP target not reached: {map_score:.4f} < {target_map}')
    print(f'   📈 Improvement needed: {target_map - map_score:.4f} points')

# Precision at different cutoffs
print('\n📈 PRECISION AT K:')
for k in [1, 5, 10, 20, 50, 100]:
    if evaluation_results['precision_at_k'][k]:
        prec_k = np.mean(evaluation_results['precision_at_k'][k])
        print(f'   P@{k:3d}: {prec_k:.4f}')

# Recall at different cutoffs
print('\n📉 RECALL AT K:')
for k in [1, 5, 10, 20, 50, 100]:
    if evaluation_results['recall_at_k'][k]:
        rec_k = np.mean(evaluation_results['recall_at_k'][k])
        print(f'   R@{k:3d}: {rec_k:.4f}')

# F1-Score at different cutoffs
print('\n🔄 F1-SCORE AT K:')
for k in [1, 5, 10, 20, 50, 100]:
    if evaluation_results['f1_at_k'][k]:
        f1_k = np.mean(evaluation_results['f1_at_k'][k])
        print(f'   F1@{k:3d}: {f1_k:.4f}')

print('\n' + '=' * 80)

📊 COMPREHENSIVE EVALUATION RESULTS
🎯 CORE METRICS:
   MAP (Mean Average Precision): 0.0533
   MRR (Mean Reciprocal Rank): 0.2055
   Precision@100: 0.0187
   Recall@100: 0.1931
   F1-Score@100: 0.0321

🎯 TARGET ASSESSMENT:
   ❌ MAP target not reached: 0.0533 < 0.2
   📈 Improvement needed: 0.1467 points

📈 PRECISION AT K:
   P@  1: 0.1286
   P@  5: 0.0856
   P@ 10: 0.0664
   P@ 20: 0.0487
   P@ 50: 0.0291
   P@100: 0.0187

📉 RECALL AT K:
   R@  1: 0.0152
   R@  5: 0.0488
   R@ 10: 0.0741
   R@ 20: 0.1061
   R@ 50: 0.1537
   R@100: 0.1931

🔄 F1-SCORE AT K:
   F1@  1: 0.0258
   F1@  5: 0.0554
   F1@ 10: 0.0617
   F1@ 20: 0.0595
   F1@ 50: 0.0450
   F1@100: 0.0321



In [20]:
# Additional statistics
print('📊 DETAILED STATISTICS')
print('=' * 50)

print(f'Queries evaluated: {len(evaluation_results["average_precisions"])}')
print(f'Total queries with judgments: {len(eval_queries)}')

# MAP statistics
print('\n📈 MAP STATISTICS:')
print(f'   Mean: {np.mean(evaluation_results["average_precisions"]):.4f}')
print(f'   Median: {np.median(evaluation_results["average_precisions"]):.4f}')
print(f'   Std Dev: {np.std(evaluation_results["average_precisions"]):.4f}')
print(f'   Min: {np.min(evaluation_results["average_precisions"]):.4f}')
print(f'   Max: {np.max(evaluation_results["average_precisions"]):.4f}')

# MRR statistics
print('\n📉 MRR STATISTICS:')
print(f'   Mean: {np.mean(evaluation_results["reciprocal_ranks"]):.4f}')
print(f'   Median: {np.median(evaluation_results["reciprocal_ranks"]):.4f}')
print(f'   Std Dev: {np.std(evaluation_results["reciprocal_ranks"]):.4f}')
print(f'   Min: {np.min(evaluation_results["reciprocal_ranks"]):.4f}')
print(f'   Max: {np.max(evaluation_results["reciprocal_ranks"]):.4f}')

# Performance breakdown
print('\n🏆 PERFORMANCE BREAKDOWN:')
high_perf_queries = [ap for ap in evaluation_results['average_precisions'] if ap >= 0.5]
med_perf_queries = [ap for ap in evaluation_results['average_precisions'] if 0.2 <= ap < 0.5]
low_perf_queries = [ap for ap in evaluation_results['average_precisions'] if ap < 0.2]

total_evaluated = len(evaluation_results['average_precisions'])
print(f'   High Performance (AP >= 0.5): {len(high_perf_queries)} queries ({len(high_perf_queries)/total_evaluated*100:.1f}%)')
print(f'   Medium Performance (0.2 <= AP < 0.5): {len(med_perf_queries)} queries ({len(med_perf_queries)/total_evaluated*100:.1f}%)')
print(f'   Low Performance (AP < 0.2): {len(low_perf_queries)} queries ({len(low_perf_queries)/total_evaluated*100:.1f}%)')

# Zero performance queries
zero_perf_queries = [ap for ap in evaluation_results['average_precisions'] if ap == 0.0]
print(f'   Zero Performance (AP = 0.0): {len(zero_perf_queries)} queries ({len(zero_perf_queries)/total_evaluated*100:.1f}%)')

print('\n' + '=' * 50)

📊 DETAILED STATISTICS
Queries evaluated: 2426
Total queries with judgments: 2426

📈 MAP STATISTICS:
   Mean: 0.0533
   Median: 0.0095
   Std Dev: 0.1089
   Min: 0.0000
   Max: 1.0000

📉 MRR STATISTICS:
   Mean: 0.2055
   Median: 0.0357
   Std Dev: 0.3313
   Min: 0.0000
   Max: 1.0000

🏆 PERFORMANCE BREAKDOWN:
   High Performance (AP >= 0.5): 34 queries (1.4%)
   Medium Performance (0.2 <= AP < 0.5): 146 queries (6.0%)
   Low Performance (AP < 0.2): 2246 queries (92.6%)
   Zero Performance (AP = 0.0): 283 queries (11.7%)



In [21]:
# Analyze top and bottom performing queries
print('🔍 QUERY ANALYSIS')
print('=' * 50)

# Sort queries by performance
query_performance = sorted(evaluation_results['query_details'], key=lambda x: x['average_precision'], reverse=True)

# Top 5 performing queries
print('🏆 TOP 5 PERFORMING QUERIES:')
for i, query_info in enumerate(query_performance[:5]):
    print(f'{i+1}. Query ID: {query_info["query_id"]}')
    print(f'   AP: {query_info["average_precision"]:.4f}, RR: {query_info["reciprocal_rank"]:.4f}')
    print(f'   Text: "{query_info["query_text"][:80]}..."')
    print(f'   Relevant docs: {query_info["num_relevant"]}, P@100: {query_info["precision_at_100"]:.4f}')
    print()

# Bottom 5 performing queries
print('❌ BOTTOM 5 PERFORMING QUERIES:')
for i, query_info in enumerate(query_performance[-5:]):
    print(f'{i+1}. Query ID: {query_info["query_id"]}')
    print(f'   AP: {query_info["average_precision"]:.4f}, RR: {query_info["reciprocal_rank"]:.4f}')
    print(f'   Text: "{query_info["query_text"][:80]}..."')
    print(f'   Relevant docs: {query_info["num_relevant"]}, P@100: {query_info["precision_at_100"]:.4f}')
    print()

print('=' * 50)

🔍 QUERY ANALYSIS
🏆 TOP 5 PERFORMING QUERIES:
1. Query ID: 1225605
   AP: 1.0000, RR: 1.0000
   Text: "what are cell lines and a monolayer?..."
   Relevant docs: 2, P@100: 0.0200

2. Query ID: 1599582
   AP: 0.9667, RR: 1.0000
   Text: "What is the difference between a Masala Dosa and a Rawa Dosa?..."
   Relevant docs: 5, P@100: 0.0500

3. Query ID: 88316
   AP: 0.9306, RR: 1.0000
   Text: "what is the make of Alfa Romeo?..."
   Relevant docs: 6, P@100: 0.0600

4. Query ID: 2770978
   AP: 0.8822, RR: 1.0000
   Text: "How is Bali as a honeymoon destination as compared to Phuket?..."
   Relevant docs: 8, P@100: 0.0800

5. Query ID: 956761
   AP: 0.8762, RR: 1.0000
   Text: "what is the difference between a lesson and a lesson plan?..."
   Relevant docs: 5, P@100: 0.0500

❌ BOTTOM 5 PERFORMING QUERIES:
1. Query ID: 2500332
   AP: 0.0000, RR: 0.0000
   Text: "how do you change fractions into word form?..."
   Relevant docs: 4, P@100: 0.0000

2. Query ID: 3066868
   AP: 0.0000, RR: 0.0000
  

## 8. Summary and Recommendations

In [None]:
# Final summary
print('🏁 EVALUATION SUMMARY')
print('=' * 60)

print(f'✅ Successfully evaluated {processed_queries} queries')
print(f'✅ Comprehensive metrics calculated')
print(f'✅ Performance analysis completed')

print('\n📋 KEY FINDINGS:')
print(f'   • MAP: {map_score:.4f} (Target: {target_map})')
print(f'   • MRR: {mrr_score:.4f}')
print(f'   • Precision@100: {precision_100:.4f}')
print(f'   • Recall@100: {recall_100:.4f}')
print(f'   • F1@100: {f1_100:.4f}')

if map_score >= target_map:
    print('\n🎉 SUCCESS: MAP target achieved!')
    print('   The TF-IDF model meets the performance requirements.')
else:
    print('\n🔧 IMPROVEMENT OPPORTUNITIES:')
    print('   1. Fine-tune TF-IDF parameters (max_df, min_df, ngram_range)')
    print('   2. Enhance query preprocessing and expansion')
    print('   3. Implement BM25 scoring for better term weighting')
    print('   4. Add pseudo-relevance feedback')
    print('   5. Consider semantic embeddings (BERT, etc.)')
    print('   6. Optimize text cleaning for medical domain')

print('\n💾 EVALUATION COMPLETE!')
print('   All metrics have been calculated and analyzed.')
print('   Results are ready for further analysis or reporting.')
print('\n' + '=' * 60)

In [13]:
# Download NLTK resources if not already downloaded
import nltk
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print('✓ NLTK resources checked/downloaded.')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


✓ NLTK resources checked/downloaded.
