# Day 5: Category-Specific Retrieval & Reranking
Advanced retrieval techniques for better accuracy

In [None]:
import sys
sys.path.append('../src')

from sentence_transformers import SentenceTransformer
from category_retrieval import CategoryRetriever, RerankedRetriever
from retrieval import RetrieverSystem

## Part 1: Category-Specific Retrieval

Build a separate FAISS index for each category to improve precision.

In [None]:
# Load the multilingual sentence-embedding model; the cells below call
# `model.encode(...)` to turn each query into a vector.
print("Loading model...")
model_name = 'paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
print("✅ Model loaded")

In [None]:
# Build the category-specific retriever from the saved index artifacts.
# The three paths are passed positionally, in this exact order.
print("Building category-specific indexes...")
category_index_files = (
    '../index/embeddings.npy',
    '../index/corpus_chunks.json',
    '../index/corpus_meta.json',
)
cat_retriever = CategoryRetriever(*category_index_files)
print("\n✅ Category retriever ready!")

## Part 2: Test Category Detection

In [None]:
# Sample Arabic queries for exercising category detection. The trailing
# comment on each entry is the category the query is meant to map to.
test_queries = [
    "كيف أحصل على رخصة ليموزين؟",  # transportation
    "ما هي خطوات تسجيل المقررات؟",  # education
    "كيف أطلب استشارة طبية؟",  # health
    "كيف أقدم عروض المناقصات؟",  # business
]

print("Testing category detection:\n")
for query in test_queries:
    detected = cat_retriever.detect_category(query)
    # One print call, two lines of output, followed by a blank line.
    print(f"Query: {query}", f"Detected: {detected}\n", sep="\n")

## Part 3: Compare Global vs Category-Specific Search

In [None]:
def _print_search_hits(hits):
    """Print rank, score, category, source file name and a preview per hit.

    Each hit is expected to be a dict with 'rank', 'score', 'chunk' and a
    'metadata' dict holding 'category' and 'source_file'. The preview is
    the first 100 characters of the chunk.
    """
    for hit in hits:
        meta = hit['metadata']
        # Keep only the last path component of the source file.
        file_name = meta['source_file'].split('/')[-1]
        print(f"[{hit['rank']}] Score: {hit['score']:.3f} | Category: {meta['category']}")
        print(f"    File: {file_name}")
        print(f"    Preview: {hit['chunk'][:100]}...\n")


query = "كيف أحصل على رخصة ليموزين؟"
# encode() is given a one-element list, so take the single resulting vector.
query_emb = model.encode([query])[0]

print(f"Query: {query}\n")

banner = "=" * 80

# Global search: category=None searches across all categories.
print(banner)
print("GLOBAL SEARCH (all categories)")
print(banner)
global_results = cat_retriever.search(query_emb, category=None, k=5)
_print_search_hits(global_results)

# Category-specific search, restricted to the category detected from the query.
# NOTE(review): if detect_category returns None, this presumably repeats the
# global search above — confirm against CategoryRetriever.
detected_cat = cat_retriever.detect_category(query)
print("\n" + banner)
print(f"CATEGORY-SPECIFIC SEARCH (category: {detected_cat})")
print(banner)
cat_results = cat_retriever.search(query_emb, category=detected_cat, k=5)
_print_search_hits(cat_results)

## Part 4: Cross-Encoder Reranking

Two-stage retrieval: a fast embedding search to collect candidates, followed by accurate cross-encoder reranking

In [None]:
# Build the reranking retriever from the same saved index artifacts.
# The three paths are passed positionally, in this exact order.
print("Building reranked retriever...")
rerank_index_files = (
    '../index/embeddings.npy',
    '../index/corpus_chunks.json',
    '../index/corpus_meta.json',
)
reranked_retriever = RerankedRetriever(*rerank_index_files)
print("\n✅ Reranked retriever ready!")

## Part 5: Compare With/Without Reranking

In [None]:
def _print_hit_details(hit):
    """Print the category, source file name and a 100-character preview of one hit."""
    meta = hit['metadata']
    print(f"    Category: {meta['category']}")
    # Keep only the last path component of the source file.
    print(f"    File: {meta['source_file'].split('/')[-1]}")
    print(f"    Preview: {hit['chunk'][:100]}...\n")


query = "كيف أحصل على رخصة ليموزين في قطر؟"
# encode() is given a one-element list, so take the single resulting vector.
query_emb = model.encode([query])[0]

print(f"Query: {query}\n")

rule = "=" * 80

# Baseline: top 5 hits by embedding similarity only.
print(rule)
print("WITHOUT RERANKING (embedding similarity only)")
print(rule)
basic_results = reranked_retriever.search(query_emb, k=5)
for hit in basic_results:
    print(f"[{hit['rank']}] Score: {hit['score']:.3f}")
    _print_hit_details(hit)

# Two-stage: request 20 initial candidates, keep the 5 best after reranking.
print("\n" + rule)
print("WITH RERANKING (cross-encoder)")
print(rule)
reranked_results = reranked_retriever.search_with_rerank(
    query,
    query_emb,
    initial_k=20,
    final_k=5,
)
for hit in reranked_results:
    # Show both scores so the effect of reranking is visible per hit.
    print(f"[{hit['rank']}] Rerank: {hit['rerank_score']:.3f} | Original: {hit['original_score']:.3f}")
    _print_hit_details(hit)

## Part 6: Test Multiple Queries

In [None]:
# Queries for the end-to-end demo: detect a category, then run the
# category-restricted search with reranking for each one.
test_queries = [
    "كيف أحصل على رخصة ليموزين؟",
    "ما هي خطوات تسجيل المقررات في جامعة قطر؟",
    "كيف أطلب استشارة طبية عاجلة؟",
    "كيف أقدم عروض المناقصات؟",
]

# Single source of truth for how many reranked hits are requested and
# reported, so the printed heading cannot drift from the `final_k` argument.
top_k = 3
# Loop-invariant separator line, built once.
separator = '=' * 80

for query in test_queries:
    print(f"\n{separator}")
    print(f"Query: {query}")
    print(separator)

    # Detect category
    detected_cat = reranked_retriever.detect_category(query)
    print(f"Detected category: {detected_cat}")

    # Search with reranking, restricted to the detected category.
    # `initial_k` is not passed here, so search_with_rerank's own default
    # candidate count applies.
    # encode() is given a one-element list, so take the single vector.
    query_emb = model.encode([query])[0]
    results = reranked_retriever.search_with_rerank(
        query, query_emb,
        category=detected_cat,
        final_k=top_k,
    )

    print(f"\nTop {top_k} results:")
    for hit in results:
        metadata = hit['metadata']
        print(f"  [{hit['rank']}] Rerank: {hit['rerank_score']:.3f}")
        print(f"      Category: {metadata['category']}")
        # Keep only the last path component of the source file.
        print(f"      File: {metadata['source_file'].split('/')[-1]}")

## ✅ Summary

### Category-Specific Retrieval
- Built a separate FAISS index per category
- Keyword-based category detection
- More precise results within category

### Cross-Encoder Reranking
- Two-stage retrieval (fast + accurate)
- Stage 1: Get 20 candidates with embeddings
- Stage 2: Rerank to top 5 with cross-encoder
- Better ranking of relevant results

### Benefits
- Improved precision
- Better handling of ambiguous queries
- More accurate top results