# Day 2: Generate Embeddings
Convert text chunks into vector embeddings for semantic search

In [None]:
import sys
sys.path.append('../src')

import json
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

## Step 1: Understanding Embeddings

Check that the embedding model scores semantically related sentences as more similar than unrelated ones.

In [None]:
# cosine_similarity is imported in this cell and reused by the later
# retrieval cells.
from sklearn.metrics.pairwise import cosine_similarity

# Multilingual sentence encoder shared by every cell below.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Three Arabic probe sentences: the first two concern vehicle services,
# the third is an unrelated university-admission request.
texts = [
    "كيف أحصل على رخصة ليموزين؟",  # How do I get a limousine license?
    "ما هي متطلبات تأجير السيارات؟",  # What are the car rental requirements?
    "أريد تقديم طلب قبول جامعي",  # I want to apply for university admission (different topic)
]

embeddings = model.encode(texts)
print(f"Shape: {embeddings.shape}")  # Should be (3, 768)

# Pairwise cosine similarity between the three sentence vectors.
sim_matrix = cosine_similarity(embeddings)

print("\nSimilarity matrix:", sim_matrix, sep="\n")
print("\n✅ First two should be more similar (both about transportation)")

## Step 2: Load Preprocessed Chunks

In [None]:
# Load the preprocessed corpus: the chunk texts and their metadata records.
with open('../index/corpus_chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

with open('../index/corpus_meta.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# The retrieval cells below read metadata[i] for the chunk at position i, so
# both files must be non-empty and the same length. Fail here with a clear
# message instead of an IndexError (or silently wrong metadata) later on.
if not chunks:
    raise ValueError("corpus_chunks.json contains no chunks")
if len(chunks) != len(metadata):
    raise ValueError(
        f"Chunk/metadata mismatch: {len(chunks)} chunks vs "
        f"{len(metadata)} metadata entries"
    )

print(f"Loaded {len(chunks)} chunks")
print("\nSample chunk:")
# Show only the first 200 characters of the first chunk.
print(chunks[0][:200])

## Step 3: Generate Embeddings for All Chunks

In [None]:
# Encode the corpus in fixed-size batches. tqdm tracks progress per batch,
# so the encoder's own progress bar is switched off.
print("Generating embeddings...")

batch_size = 32
batch_starts = range(0, len(chunks), batch_size)
embeddings_list = [
    model.encode(chunks[start:start + batch_size], show_progress_bar=False)
    for start in tqdm(batch_starts)
]

# Stack the per-batch arrays vertically into a single matrix.
embeddings = np.vstack(embeddings_list)

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Expected: ({len(chunks)}, 768)")

## Step 4: Save Embeddings

In [None]:
# Persist the embedding matrix to disk in NumPy's binary .npy format.
with open('../index/embeddings.npy', 'wb') as npy_file:
    np.save(npy_file, embeddings)
print("✅ Saved embeddings.npy")

## Step 5: Quick Quality Test

In [None]:
# Sanity check: embed one query and list the five closest corpus chunks.
test_query = "كيف أحصل على خدمة الليموزين؟"
query_embedding = model.encode([test_query])[0]

# cosine_similarity takes 2-D inputs; row 0 of the result holds the query's
# score against every chunk embedding.
similarities = cosine_similarity([query_embedding], embeddings)[0]
# Reverse the ascending argsort, then keep the first five (best match first).
top_5_idx = np.argsort(similarities)[::-1][:5]

print(f"Test query: {test_query}")
print("\nTop 5 most similar chunks:")
for idx in top_5_idx:
    meta = metadata[idx]
    # Show only the final path component of the source file.
    file_name = meta['source_file'].rsplit('/', 1)[-1]
    print(f"\n{'='*60}")
    print(f"Score: {similarities[idx]:.3f}")
    print(f"Category: {meta['category']}")
    print(f"File: {file_name}")
    print(f"Text: {chunks[idx][:150]}...")

## Step 6: Test Multiple Queries

In [None]:
# Spot-check retrieval across several topics, printing only the single best
# match for each query.
test_queries = [
    "كيف أحصل على رخصة ليموزين؟",  # limousine license
    "ما هي متطلبات تسجيل المقررات الجامعية؟",  # university course registration
    "كيف أطلب استشارة طبية؟",  # medical consultation
    "ما هي إجراءات تقديم العروض للمناقصات؟",  # tender submission
    "كيف أحصل على شهادة تأكيد استلام الطلب؟",  # CRA certificate
]

divider = '=' * 80
for query in test_queries:
    # Embed the query on its own and score it against every chunk.
    query_vec = model.encode([query])[0]
    scores = cosine_similarity([query_vec], embeddings)[0]
    best = scores.argmax()
    best_meta = metadata[best]

    print(f"\n{divider}")
    print(f"Query: {query}")
    print(f"Top match ({scores[best]:.3f}):")
    print(f"  Category: {best_meta['category']}")
    print(f"  File: {best_meta['source_file'].rsplit('/', 1)[-1]}")
    print(f"  Text: {chunks[best][:150]}...")

## ✅ Checkpoint

Embeddings generated and saved. Ready for FAISS indexing!