# Optimized ANTIQUE Dataset Processing and Embedding Generation

This notebook implements optimized processing for higher MAP scores:
1. **Better Model Selection**: Uses retrieval-optimized models
2. **Improved Text Processing**: Preserves semantic information
3. **Enhanced Embedding Strategy**: Query-document optimization
4. **Memory & Speed Optimization**: Efficient batch processing

## Step 1: Install Optimized Packages

In [None]:
# Install compatible packages for Colab
!pip install --upgrade pip
!pip install sentence-transformers>=2.2.2
!pip install transformers>=4.21.0
!pip install torch>=1.13.0
!pip install pandas numpy scikit-learn joblib nltk tqdm faiss-cpu beir datasets ir_datasets
!pip install huggingface_hub>=0.10.0

# Restart runtime after package installation
print("[INFO] Packages installed! Please restart runtime and run the next cell.")

## Step 1.5: Import Packages (Run After Restart)

In [None]:
import os
import re
import tarfile
import warnings
import zipfile
from collections import defaultdict

import faiss
import ir_datasets
import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Step 2: Download and Extract ANTIQUE Dataset

In [None]:
print("Downloading ANTIQUE dataset directly...")

# Handle to the ANTIQUE training split; the iterators below read from it.
dataset = ir_datasets.load('antique/train')

# Output folder for the exported TSV files (no error if it already exists).
os.makedirs('antique_dataset', exist_ok=True)

# --- Documents: one row per document, with an empty string when a record
# has no `text` attribute.
print("Saving documents...")
docs_data = [
    dict(doc_id=record.doc_id, text=getattr(record, 'text', ''))
    for record in tqdm(dataset.docs_iter(), desc="Loading documents")
]
docs_df = pd.DataFrame(docs_data)
docs_df.to_csv('antique_dataset/documents.tsv', sep='\t', index=False)

# --- Queries: one row per query.
print("Saving queries...")
queries_data = [
    dict(query_id=record.query_id, text=record.text)
    for record in tqdm(dataset.queries_iter(), desc="Loading queries")
]
queries_df = pd.DataFrame(queries_data)
queries_df.to_csv('antique_dataset/queries.tsv', sep='\t', index=False)

# --- Relevance judgments: one row per (query, document) judgment.
print("Saving relevance judgments...")
qrels_data = [
    dict(query_id=record.query_id, doc_id=record.doc_id, relevance=record.relevance)
    for record in tqdm(dataset.qrels_iter(), desc="Loading qrels")
]
qrels_df = pd.DataFrame(qrels_data)
qrels_df.to_csv('antique_dataset/qrels.tsv', sep='\t', index=False)

print("✅ Downloaded ANTIQUE dataset")

## Step 3: Smart Text Preprocessing (Preserves Semantics)

In [None]:
# Fetch the NLTK data used below before touching it; on a fresh runtime the
# corpora are not present and stopwords.words() / word_tokenize() /
# WordNetLemmatizer would raise LookupError.
# 'punkt' serves older NLTK releases and 'punkt_tab' newer ones; 'omw-1.4' is
# requested because some NLTK releases need it for WordNet lookups.
# nltk.download() reports a failed download instead of raising, so a resource
# that cannot be fetched does not stop the cell here.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# English stop words, minus negations / direction / intensity words that are
# deliberately kept because they can change the meaning of a question.
stop_words = set(stopwords.words('english'))
stop_words = stop_words - {'not', 'no', 'nor', 'against', 'up', 'down', 'over', 'under', 'more', 'most', 'very'}
lemmatizer = WordNetLemmatizer()

def smart_clean_text(text):
    """Normalize a raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lower-case; replace URLs, HTML-like tags, numbers and
    repeated ``!`` / ``?`` with placeholder words; drop every remaining
    non-letter character; tokenize; remove stop words and one-character
    tokens; lemmatize.

    The upper-case placeholders (YEAR, DECIMAL, NUMBER, EMPHASIS, QUESTION)
    are inserted after lower-casing, so they stay upper-case in the output.

    Args:
        text: The raw text. Anything that is not a ``str`` (None, NaN,
            numbers, lists, ...) is treated as missing.

    Returns:
        The cleaned text, or ``""`` for missing input.
    """
    # A str is never NaN, so the type check alone covers the missing-value
    # case and also avoids calling pd.isna() on list-like input, where the
    # result is an array whose truth value is ambiguous.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' url ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Decimals must be replaced before 4-digit years: otherwise "1999.99" is
    # split into a YEAR and a NUMBER and never recognized as a decimal.
    text = re.sub(r'\b\d+\.\d+\b', ' DECIMAL ', text)
    text = re.sub(r'\b\d{4}\b', ' YEAR ', text)
    text = re.sub(r'\b\d+\b', ' NUMBER ', text)
    text = re.sub(r'[!]{2,}', ' EMPHASIS ', text)
    text = re.sub(r'[?]{2,}', ' QUESTION ', text)
    # Everything that is not a letter or whitespace becomes a space.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    # isalpha() already excludes purely numeric tokens, so no separate
    # isdigit() check is needed.
    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) >= 2 and token.isalpha() and token not in stop_words
    ]
    return ' '.join(processed_tokens)

## Step 4: Embedding Generation

In [None]:
# Single source of truth for the encoder, so the log line cannot drift from
# the model that is actually loaded.
MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'

print(f"Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME, device=device)
print(f"Model loaded successfully on {device}")

# Prepare texts for embedding: cleaned texts and their ids are kept as
# parallel lists (position i of *_texts belongs to position i of *_ids).
print("\nPreparing texts for embedding...")
doc_texts = docs_df['text'].apply(smart_clean_text).tolist()
doc_ids = docs_df['doc_id'].tolist()
query_texts = queries_df['text'].apply(smart_clean_text).tolist()
query_ids = queries_df['query_id'].tolist()

def generate_embeddings_optimized(texts, batch_size=64):
    """Encode ``texts`` with the notebook-level ``model``.

    Args:
        texts: The texts to encode, passed straight to ``model.encode``.
        batch_size: Batch size forwarded to ``model.encode`` (default 64).

    Returns:
        The result of ``model.encode`` called with a progress bar,
        ``convert_to_numpy=True`` and ``normalize_embeddings=True``.
    """
    return model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

# Encode the cleaned documents first, then the cleaned queries.
doc_embeddings = generate_embeddings_optimized(doc_texts)
query_embeddings = generate_embeddings_optimized(query_texts)

# Report the shape of both embedding matrices as a quick sanity check.
print("\nEmbedding generation completed!")
for label, matrix in (("Document", doc_embeddings), ("Query", query_embeddings)):
    print(f"{label} embeddings shape: {matrix.shape}")

## Step 5: Retrieval Evaluation & MAP Calculation

In [None]:
# Retrieval depth per query.
TOP_K = 100
# Minimum label that counts as relevant. ANTIQUE judgments are graded 1-4 and
# the dataset documentation treats 1-2 as non-relevant and 3-4 as relevant for
# binary metrics such as MAP; a `> 0` test would count every judged document
# as relevant.
REL_THRESHOLD = 3


def _average_precision(ranked_doc_ids, relevant_ids):
    """Average precision of one ranked list against a set of relevant doc ids.

    Precision is accumulated at each rank that holds a relevant document and
    the sum is divided by the total number of relevant documents, so relevant
    documents that were not retrieved lower the score.

    Args:
        ranked_doc_ids: Document ids in ranked order, best first.
        relevant_ids: Set of document ids judged relevant for the query.

    Returns:
        The average precision as a float; 0.0 when ``relevant_ids`` is empty.
    """
    if not relevant_ids:
        return 0.0
    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(ranked_doc_ids, start=1):
        if doc_id in relevant_ids:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / len(relevant_ids)


# Inner-product index over the document embeddings.
index = faiss.IndexFlatIP(doc_embeddings.shape[1])
index.add(np.ascontiguousarray(doc_embeddings, dtype=np.float32))

# query_id -> {doc_id: relevance}, with ids normalized to str on both sides.
qrels_dict = defaultdict(dict)
for qid, did, rel in zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance']):
    qrels_dict[str(qid)][str(did)] = int(rel)

# One batched search for all queries; never ask for more hits than the index holds.
top_k = min(TOP_K, index.ntotal)
scores, indices = index.search(np.ascontiguousarray(query_embeddings, dtype=np.float32), top_k)

average_precisions = []
for query_id, ranked_rows in zip(query_ids, indices):
    # .get() keeps the defaultdict from growing entries for unjudged queries.
    judged = qrels_dict.get(str(query_id), {})
    relevant_ids = {did for did, rel in judged.items() if rel >= REL_THRESHOLD}
    if not relevant_ids:
        # AP is undefined without any relevant document; leave the query out
        # of the mean instead of counting it as a zero.
        continue
    # FAISS pads missing results with -1; skip them so they are not mistaken
    # for the last document via negative indexing.
    ranked_doc_ids = [str(doc_ids[row]) for row in ranked_rows if row >= 0]
    average_precisions.append(_average_precision(ranked_doc_ids, relevant_ids))

# Guard against an empty list so the result is 0.0 rather than NaN.
map_score = float(np.mean(average_precisions)) if average_precisions else 0.0
print(f"MAP Score: {map_score:.4f}")