# Day 2: FAISS Retrieval Testing
Build a FAISS index and test retrieval quality.

In [None]:
import sys
# Add the sibling ../src directory to the import search path; this must run
# before the `retrieval` import below (presumably that module lives in
# ../src -- confirm against the repository layout).
sys.path.append('../src')

# Project-local retriever wrapper used to build, save and query the index.
from retrieval import RetrieverSystem
# Sentence-embedding model class used to encode the test queries.
from sentence_transformers import SentenceTransformer

## Step 1: Build Retriever

In [None]:
# Locations of the three artefacts the retriever is constructed from.
embeddings_path = '../index/embeddings.npy'
chunks_path = '../index/corpus_chunks.json'
meta_path = '../index/corpus_meta.json'

# Construct the retriever from the artefacts above.
retriever = RetrieverSystem(embeddings_path, chunks_path, meta_path)

# Write the index to disk next to the other artefacts.
retriever.save_index('../index/faiss.index')

## Step 2: Load Embedding Model

In [None]:
# Name of the sentence-transformers checkpoint used to embed the queries.
model_name = 'paraphrase-multilingual-mpnet-base-v2'

# Instantiate the encoder once; later cells reuse `model` for every query.
model = SentenceTransformer(model_name)
print("✅ Model loaded")

## Step 3: Test Retrieval with Real Queries

In [None]:
# Smoke-test retrieval with Arabic queries chosen to match the indexed data.
test_queries = [
    "كيف أحصل على رخصة ليموزين في قطر؟",  # limousine license
    "ما هي إجراءات تسجيل المقررات في جامعة قطر؟",  # QU course registration
    "كيف أطلب استشارة طبية؟",  # medical consultation
    "ما هي متطلبات تقديم العروض للمناقصات؟",  # tender submission
    "كيف أحصل على شهادة من وزارة المواصلات؟",  # MOT certificate
]

# Maximum number of characters of each chunk shown in the preview line.
preview_chars = 200

# Encode every query in one batched call rather than one model call per
# query; each row of the result is the embedding of the matching query.
query_embs = model.encode(test_queries)

for query, query_emb in zip(test_queries, query_embs):
    print(f"\n{'='*80}")
    print(f"QUERY: {query}")
    print('='*80)

    # Fetch the five best hits for this query.
    results = retriever.search(query_emb, k=5)

    # Show rank, score, category, source file name and a text preview per hit.
    for r in results:
        print(f"\n[Rank {r['rank']}] Score: {r['score']:.3f}")
        print(f"Category: {r['metadata']['category']}")
        # Keep only the part of the source path after the last '/'.
        print(f"Source: {r['metadata']['source_file'].split('/')[-1]}")
        chunk = r['chunk']
        # Append the ellipsis only when the preview really is truncated, so
        # short chunks are not shown as if text had been cut off.
        suffix = '...' if len(chunk) > preview_chars else ''
        print(f"Text: {chunk[:preview_chars]}{suffix}")

## Step 4: Evaluate Retrieval Quality

Manually check:
- Are top-3 results relevant?
- What's the typical score range?
- Do categories match query intent?

In [None]:
# Deep dive on a single query: tabulate the ten best hits in aligned columns.
query = "كيف أحصل على رخصة ليموزين؟"
query_emb = model.encode([query])[0]
results = retriever.search(query_emb, k=10)

# Header: the query itself, a title, the column names and a separator rule.
print(f"Query: {query}\n")
print("Top 10 Results:")
print(f"{'Rank':<6} {'Score':<8} {'Category':<15} {'File'}")
print("-" * 80)

for hit in results:
    meta = hit['metadata']
    # Keep only the text after the last '/' and cap it at 40 characters so
    # long file names do not stretch the table.
    short_name = meta['source_file'].rsplit('/', 1)[-1][:40]
    print(f"{hit['rank']:<6} {hit['score']:<8.3f} {meta['category']:<15} {short_name}")

## ✅ Checkpoint

FAISS index built and tested. Ready for LLM integration!