# 02 - Generate Embeddings

Generate embeddings for all document chunks using the sentence-transformers model `paraphrase-multilingual-mpnet-base-v2`, then save them to `../index/embeddings.npy`.

In [None]:
import sys
sys.path.append('../src')

import json
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

## Load Chunks

In [None]:
# Read the corpus chunks and their metadata from the JSON files in the index
# directory. Both files are decoded as UTF-8.
with open('../index/corpus_chunks.json', 'r', encoding='utf-8') as chunks_file:
    chunks = json.loads(chunks_file.read())

with open('../index/corpus_meta.json', 'r', encoding='utf-8') as meta_file:
    metadata = json.loads(meta_file.read())

# Report both sizes side by side so a mismatch between the files is visible.
print(f"Loaded {len(chunks)} chunks")
print(f"Loaded {len(metadata)} metadata entries")

## Load Embedding Model

In [None]:
# Instantiate the sentence-transformers model used for every encode() call
# in the rest of this notebook.
print("Loading embedding model...")
model_name = 'paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
print("✅ Model loaded!")

## Generate Embeddings

In [None]:
# Encode every chunk in fixed-size batches and stack the per-batch results
# into one array, so that row i of `embeddings` belongs to chunks[i].
if not chunks:
    # np.vstack([]) would raise an opaque "need at least one array" ValueError;
    # raise the same exception type early with an actionable message instead.
    raise ValueError("No chunks to embed; check ../index/corpus_chunks.json")

print("Generating embeddings...")

batch_size = 32
# Batches are encoded in input order, so stacking them preserves chunk order.
# The model's own progress bar is disabled because tqdm already tracks batches.
embeddings = np.vstack([
    model.encode(chunks[start:start + batch_size], show_progress_bar=False)
    for start in tqdm(range(0, len(chunks), batch_size))
])

# Ask the model for its output width instead of hard-coding it, so the
# "Expected" line stays correct if a different model is loaded.
embedding_dim = model.get_sentence_embedding_dimension()

print(f"\nEmbeddings shape: {embeddings.shape}")
print(f"Expected: ({len(chunks)}, {embedding_dim})")

## Save Embeddings

In [None]:
# Persist the embeddings array in NumPy's .npy format inside the index
# directory, alongside the chunk and metadata JSON files.
embeddings_path = '../index/embeddings.npy'
np.save(embeddings_path, embeddings)
print("✅ Saved embeddings.npy")

## Quick Test

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Sanity check: embed a single query with the same model and list the
# chunks whose embeddings are closest to it.
test_query = "ما هي شروط الحصول على رخصة العمل؟"
query_embedding = model.encode([test_query])[0]

# Score the query against every chunk embedding, then take the indices of
# the (up to) five highest scores, best match first.
similarities = cosine_similarity([query_embedding], embeddings)[0]
top_5_idx = np.argsort(similarities)[::-1][:5]

print(f"Test query: {test_query}")
print("\nTop 5 most similar chunks:")
for idx in top_5_idx:
    score = similarities[idx]
    print(f"\nScore: {score:.3f}")
    category = metadata[idx]['category']
    print(f"Category: {category}")
    # Only the first 150 characters are shown to keep the output short.
    preview = chunks[idx][:150]
    print(f"Text: {preview}...")

## Summary

- ✅ Embeddings generated for all chunks
- ✅ Saved to `../index/embeddings.npy`
- ✅ Quick test shows retrieval is working

Next: Build FAISS index (notebook 03)