## Information Retrieval Benchmarking on Legal Texts

### Evaluation Pipeline 

In [None]:
import os
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from openai import AzureOpenAI
import tiktoken
import google.generativeai as genai
from mistralai import Mistral
import random
from sentence_transformers import InputExample
import json
import pandas as pd
import random
from sentence_transformers import InputExample, SentenceTransformer
import torch
from transformers import AutoTokenizer
from farasa.segmenter import FarasaSegmenter


In [None]:
def embed_law_texts(law_texts: list) -> list:
    """Embed each law text with whichever ``embed_query`` is currently defined.

    Texts are embedded one at a time; the returned list has one embedding per
    input text, in the same order as ``law_texts``.
    """
    vectors = []
    for law_text in law_texts:
        vectors.append(embed_query(law_text))
    return vectors

def build_bm25_index(law_texts: list, preprocess: bool = False):
    """Build a BM25Okapi index over whitespace-tokenised law texts.

    Args:
        law_texts: Documents to index, one string per document.
        preprocess: When True, every text is first passed through the
            module-level ``preprocess_arabic`` helper before tokenising.
            Defaults to False, which indexes the raw text exactly as before.

    Returns:
        A ``BM25Okapi`` instance built from the tokenised documents.
    """
    if preprocess:
        # preprocess_arabic is looked up at call time, so it only has to be
        # defined before an index is built with preprocess=True.
        law_texts = [preprocess_arabic(text) for text in law_texts]
    tokenized_texts = [text.split() for text in law_texts]
    return BM25Okapi(tokenized_texts)

def bm25_search(query: str, bm25: "BM25Okapi", k=100, preprocess=False):
    """Rank all indexed documents for ``query`` with BM25.

    Args:
        query: Query text; tokenised by whitespace.
        bm25: Index to score against (anything exposing ``get_scores``).
        k: Number of top-ranked document indices to return.
        preprocess: When True, the query is passed through the module-level
            ``preprocess_arabic`` helper first, so that it matches an index
            built with ``build_bm25_index(..., preprocess=True)``.

    Returns:
        ``(top_indices, scores)``: the indices of the ``k`` highest-scoring
        documents in descending score order, and the score of every document.
    """
    if preprocess:
        query = preprocess_arabic(query)
    scores = bm25.get_scores(query.split())
    # argsort is ascending, so reverse it to put the best match first.
    ranked_indices = np.argsort(scores)[::-1]
    return ranked_indices[:k], scores

def semantic_search(query_embedding: np.ndarray, law_text_embeddings: list, top_k=100):
    """Rank documents by cosine similarity to a query embedding.

    Returns ``(top_indices, similarities)``: the indices of the ``top_k`` most
    similar documents (most similar first) and the similarity of every document.
    """
    # cosine_similarity expects 2-D inputs, so wrap the single query in a list
    # and take the only row of the result.
    query_matrix = [query_embedding]
    similarities = cosine_similarity(query_matrix, law_text_embeddings)[0]
    descending = np.argsort(similarities)[::-1]
    top_indices = descending[:top_k]
    return top_indices, similarities

def rrf_fusion(bm25_indices, semantic_indices, k=60):
    """Fuse two rankings with Reciprocal Rank Fusion (RRF).

    Every document receives ``1 / (k + rank)`` from each ranking it appears
    in, where ``rank`` is its 1-based position in that ranking; the
    contributions are summed and documents are returned best first.

    Args:
        bm25_indices: Document ids ordered best-first by the lexical ranker.
        semantic_indices: Document ids ordered best-first by the semantic ranker.
        k: Smoothing constant that damps the influence of top ranks.

    Returns:
        A list of document ids sorted by descending fused score. Documents
        with equal scores keep the order in which they were first seen.
    """
    fused_scores = defaultdict(float)

    for ranking in (bm25_indices, semantic_indices):
        # Ranks start at 1: a 0-based rank shifts every term by one position
        # and divides by zero when k == 0.
        for rank, idx in enumerate(ranking, start=1):
            fused_scores[idx] += 1 / (k + rank)

    fused_sorted = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [idx for idx, _ in fused_sorted]

def average_precision(relevant_docs, retrieved_docs):
    """Compute average precision of a ranked list against a relevant set.

    Precision is sampled at every position that holds a relevant document;
    the samples are summed and divided by the number of distinct relevant
    documents. Returns 0.0 when no relevant document was retrieved.
    """
    relevant_set = set(relevant_docs)
    hits = 0
    precision_at_hits = []

    for position, doc_id in enumerate(retrieved_docs, start=1):
        if doc_id not in relevant_set:
            continue
        hits += 1
        precision_at_hits.append(hits / position)

    if len(precision_at_hits) == 0:
        return 0.0

    return sum(precision_at_hits) / len(relevant_set)

def calculate_ndcg(relevant_docs, retrieved_docs, k):
    """Compute binary-relevance nDCG@k for a ranked list.

    Args:
        relevant_docs: Ids of the documents that are relevant to the query.
        retrieved_docs: Ranked document ids, best first.
        k: Cut-off; only the first ``k`` retrieved documents are scored.

    Returns:
        DCG@k divided by the ideal DCG@k, as a float in [0, 1] when
        ``retrieved_docs`` has no duplicates. Returns 0.0 when there are no
        relevant documents, ``k`` is not positive, or no relevant document
        appears in the top ``k``.
    """
    relevant = set(relevant_docs)
    if not relevant or k <= 0:
        return 0.0

    # Gain of 1 for a relevant document, discounted by log2(rank + 1).
    dcg = sum(
        1.0 / np.log2(rank + 1)
        for rank, doc in enumerate(retrieved_docs[:k], start=1)
        if doc in relevant
    )
    if dcg == 0:
        return 0.0

    # The ideal ranking places every relevant document first, so it depends
    # on how many relevant documents exist, not on how many were retrieved.
    # Deriving it from the retrieved hits overstates nDCG whenever a relevant
    # document is missing from the top k.
    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))

    return float(dcg / idcg)

### text-embedding-3-small (Azure OpenAI)

In [None]:
# Azure OpenAI connection settings for the embedding deployment.
endpoint = "https://eslsca-openai.openai.azure.com/"
deployment = "text-embedding-3-small"
# The API key is a secret and must not be committed with the notebook: read it
# from the environment instead. Any key that was previously stored in this
# file should be treated as compromised and rotated.
subscription_key = os.environ["AZURE_OPENAI_API_KEY"]

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
)

# Tokenizer and token budget used by the helpers below to measure and chunk
# inputs (presumably the model's per-request input limit; confirm).
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
MAX_TOKENS = 8191

In [None]:
def split_text(text):
    """Split ``text`` into pieces of at most ``MAX_TOKENS`` tokens each.

    The text is encoded with the module-level ``tokenizer``, the token ids
    are sliced into consecutive windows of ``MAX_TOKENS``, and every window is
    decoded back into a string.
    """
    token_ids = tokenizer.encode(text)
    return [
        tokenizer.decode(token_ids[start:start + MAX_TOKENS])
        for start in range(0, len(token_ids), MAX_TOKENS)
    ]

def generate_embeddings(text, model="text-embedding-3-small"):
    """Embed ``text`` with the module-level embeddings ``client``.

    Texts longer than ``MAX_TOKENS`` tokens are split with ``split_text``;
    each piece is embedded in its own request and the piece embeddings are
    averaged element-wise into one vector.

    Raises:
        ValueError: If ``text`` is not a string or is empty / whitespace only.
    """
    is_usable = isinstance(text, str) and bool(text.strip())
    if not is_usable:
        raise ValueError("Invalid input: input must be a non-empty string.")

    token_count = len(tokenizer.encode(text))
    text_chunks = split_text(text) if token_count > MAX_TOKENS else [text]

    # One request per chunk, in order.
    embeddings = [
        client.embeddings.create(input=[chunk], model=model).data[0].embedding
        for chunk in text_chunks
    ]

    return np.mean(embeddings, axis=0)

In [None]:
def extract_law_text_from_json(json_file):
    """Read a UTF-8 JSON file holding a list of law records and return the
    ``"Law_Text"`` value of every record, in file order."""
    with open(json_file, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    texts = []
    for record in records:
        texts.append(record["Law_Text"])
    return texts

In [None]:
# Embed every law text from the articles JSON and save the result to Drive.
json_file = "/content/drive/MyDrive/articles.json"  
law_texts = extract_law_text_from_json(json_file)
law_embeddings = []
skipped = []

# NOTE(review): `tqdm` is used here but is not imported in the visible part of
# this file; confirm it is imported before this cell runs.
for idx, text in tqdm(enumerate(law_texts), total=len(law_texts), desc="Generating embeddings"):
    try:
        embedding = generate_embeddings(text)
        law_embeddings.append(embedding)
    except Exception as e:
        # Best effort: report the failing entry and carry on with the rest.
        print(f"[Skipped {idx}] Reason: {e}")
        skipped.append(idx)

# NOTE(review): skipped entries add no row, so row j of the saved matrix is
# not law_texts[j] once anything has been skipped. `skipped` is only printed,
# not saved; persist it too if rows must be mapped back to texts later.
law_embeddings = np.array(law_embeddings)
np.save("/content/drive/MyDrive/law_embeddings.npy", law_embeddings)

print(f"Embeddings saved to 'law_embeddings.npy'. Shape: {law_embeddings.shape}")
print(f"Skipped {len(skipped)} entries.")


Generating embeddings:  13%|█▎        | 6297/49521 [15:53<56:35, 12.73it/s]  

[Skipped 6293] Reason: Invalid input: input must be a non-empty string.
[Skipped 6295] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  13%|█▎        | 6307/49521 [15:54<1:04:52, 11.10it/s]

[Skipped 6305] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  14%|█▍        | 7072/49521 [17:18<58:13, 12.15it/s]

[Skipped 7069] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  14%|█▍        | 7091/49521 [17:20<59:39, 11.86it/s]  

[Skipped 7089] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  14%|█▍        | 7120/49521 [17:23<1:01:08, 11.56it/s]

[Skipped 7119] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  18%|█▊        | 8904/49521 [21:23<1:04:52, 10.43it/s]

[Skipped 8902] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  34%|███▍      | 17030/49521 [45:37<49:02, 11.04it/s]  

[Skipped 17027] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  37%|███▋      | 18311/49521 [48:40<47:27, 10.96it/s]

[Skipped 18309] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  41%|████▏     | 20494/49521 [54:59<42:43, 11.32it/s]

[Skipped 20492] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  43%|████▎     | 21109/49521 [57:03<39:32, 11.98it/s]

[Skipped 21107] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  65%|██████▌   | 32196/49521 [1:33:04<27:29, 10.50it/s]

[Skipped 32194] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  68%|██████▊   | 33554/49521 [1:37:03<22:06, 12.04it/s]

[Skipped 33552] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  68%|██████▊   | 33866/49521 [1:37:36<24:13, 10.77it/s]

[Skipped 33864] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  79%|███████▊  | 38931/49521 [1:50:24<16:08, 10.94it/s]

[Skipped 38929] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  79%|███████▉  | 39001/49521 [1:50:31<16:36, 10.56it/s]

[Skipped 38999] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  94%|█████████▎| 46409/49521 [2:11:22<04:22, 11.84it/s]

[Skipped 46407] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  99%|█████████▉| 49058/49521 [2:16:40<00:38, 11.93it/s]

[Skipped 49056] Reason: Invalid input: input must be a non-empty string.


Generating embeddings:  99%|█████████▉| 49075/49521 [2:16:41<00:39, 11.26it/s]

[Skipped 49072] Reason: Invalid input: input must be a non-empty string.


Generating embeddings: 100%|██████████| 49521/49521 [2:17:27<00:00,  6.00it/s]


Embeddings saved to 'law_embeddings.npy'. Shape: (49502, 1536)
Skipped 19 entries.


In [None]:
# Display the embedding matrix built in the previous cell.
law_embeddings

array([[ 0.05179457, -0.01040705,  0.02915129, ..., -0.005088  ,
         0.01804145,  0.00913625],
       [-0.01238826, -0.00120428,  0.03797311, ..., -0.00015582,
        -0.01076218,  0.0131496 ],
       [-0.00831607, -0.02329179,  0.04631175, ..., -0.03018928,
        -0.02894909, -0.01863683],
       ...,
       [-0.01761828,  0.01699308,  0.06670143, ..., -0.02312207,
         0.00127346,  0.01466653],
       [ 0.00454035, -0.01636438,  0.08503358, ...,  0.00263818,
        -0.03769924,  0.01017994],
       [ 0.02106914, -0.032288  ,  0.06716913, ..., -0.00746424,
        -0.03560144, -0.00385592]])

In [None]:
# Azure OpenAI client for the benchmark run. The API key is a secret and is
# read from the environment rather than stored in the notebook; any key that
# was previously committed here should be treated as compromised and rotated.
client = AzureOpenAI(
    azure_endpoint="https://eslsca-openai.openai.azure.com/",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-05-01-preview",
)

# Tokenizer and token budget that embed_query uses to reject over-long input.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
MAX_TOKENS = 8191

def embed_query(text: str, model="text-embedding-3-small") -> np.ndarray:
    """Embed one text with the module-level embeddings ``client``.

    Raises:
        ValueError: If ``text`` is not a string, or if it encodes to more than
            ``MAX_TOKENS`` tokens (no chunking is attempted here).
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    n_tokens = len(tokenizer.encode(text))
    if n_tokens > MAX_TOKENS:
        raise ValueError("Query too long!")

    response = client.embeddings.create(input=[text], model=model)
    vector = response.data[0].embedding
    return np.array(vector)
# Benchmark: BM25 vs. semantic vs. hybrid (RRF) retrieval on the law sample.
df_laws = pd.read_excel('/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx')
with open('/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)

# Every law text is embedded with embed_query (via embed_law_texts), so a text
# that exceeds MAX_TOKENS raises instead of being chunked.
law_texts = df_laws['Law Text'].tolist()
law_text_embeddings = embed_law_texts(law_texts)
bm25 = build_bm25_index(law_texts)

# Law row i is treated as the only relevant document for benchmark entry i.
# NOTE(review): this relies on the JSON entries and the Excel rows being in
# the same order; confirm against the data files.
ground_truth = {i: [i] for i in range(len(law_texts))}  

# Per-question metric accumulators, one list per retriever and metric.
bm25_ndcg10 = []
bm25_ndcg100 = []
bm25_map = []

semantic_ndcg10 = []
semantic_ndcg100 = []
semantic_map = []

hybrid_ndcg10 = []
hybrid_ndcg100 = []
hybrid_map = []

for idx, qa in enumerate(questions_answers):
    questions = qa['Questions']

    for question in questions:
        query_embedding = embed_query(question)

        # BM25 (lexical) ranking
        bm25_top_indices, bm25_scores = bm25_search(question, bm25, k=100)

        # Semantic (embedding cosine similarity) ranking
        semantic_top_indices, semantic_scores = semantic_search(query_embedding, law_text_embeddings, top_k=100)

        # Hybrid ranking: reciprocal rank fusion of the two top-100 lists
        hybrid_top_indices = rrf_fusion(bm25_top_indices, semantic_top_indices)

        # Entries whose index has no ground-truth key get an empty relevant
        # set, which scores 0 on every metric below.
        relevant_docs = ground_truth.get(idx, [])

        
        bm25_ndcg10.append(calculate_ndcg(relevant_docs, bm25_top_indices, 10))
        bm25_ndcg100.append(calculate_ndcg(relevant_docs, bm25_top_indices, 100))
        bm25_map.append(average_precision(relevant_docs, bm25_top_indices))

        semantic_ndcg10.append(calculate_ndcg(relevant_docs, semantic_top_indices, 10))
        semantic_ndcg100.append(calculate_ndcg(relevant_docs, semantic_top_indices, 100))
        semantic_map.append(average_precision(relevant_docs, semantic_top_indices))

        hybrid_ndcg10.append(calculate_ndcg(relevant_docs, hybrid_top_indices, 10))
        hybrid_ndcg100.append(calculate_ndcg(relevant_docs, hybrid_top_indices, 100))
        hybrid_map.append(average_precision(relevant_docs, hybrid_top_indices))




# Report the mean of each metric over all questions.
print("=== Final Scores ===")
print(f"BM25 -> nDCG@10: {np.mean(bm25_ndcg10):.4f}, nDCG@100: {np.mean(bm25_ndcg100):.4f}, MAP: {np.mean(bm25_map):.4f}")
print(f"Semantic -> nDCG@10: {np.mean(semantic_ndcg10):.4f}, nDCG@100: {np.mean(semantic_ndcg100):.4f}, MAP: {np.mean(semantic_map):.4f}")
print(f"Hybrid -> nDCG@10: {np.mean(hybrid_ndcg10):.4f}, nDCG@100: {np.mean(hybrid_ndcg100):.4f}, MAP: {np.mean(hybrid_map):.4f}")

=== Final Scores ===
BM25 -> nDCG@10: 0.7556, nDCG@100: 0.7731, MAP: 0.7185
Semantic -> nDCG@10: 0.7920, nDCG@100: 0.8058, MAP: 0.7483
Hybrid -> nDCG@10: 0.8386, nDCG@100: 0.8500, MAP: 0.8053


### Arabic Morphology (Farasa)

In [6]:
# Module-level Farasa segmenter, created once and used by preprocess_arabic.
farasa_segmenter = FarasaSegmenter(interactive=True)

def preprocess_arabic(text: str) -> str:
    """Segment ``text`` with the module-level Farasa segmenter.

    Anything that is not a string yields an empty string instead of an error.
    """
    if isinstance(text, str):
        return farasa_segmenter.segment(text)
    return ""

# Azure OpenAI client for the Farasa comparison run. The API key is a secret
# and is read from the environment rather than stored in the notebook; any
# key that was previously committed here should be treated as compromised
# and rotated.
client = AzureOpenAI(
    azure_endpoint="https://eslsca-openai.openai.azure.com/",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-05-01-preview",
)

# Tokenizer and token budget that embed_query uses to reject over-long input.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
MAX_TOKENS = 8191

def embed_query(text: str, model="text-embedding-3-small") -> np.ndarray:
    """Embed one text with the module-level embeddings ``client``.

    Raises:
        ValueError: If ``text`` is not a string, or if it encodes to more than
            ``MAX_TOKENS`` tokens (no chunking is attempted here).
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    n_tokens = len(tokenizer.encode(text))
    if n_tokens > MAX_TOKENS:
        raise ValueError("Query too long!")

    response = client.embeddings.create(input=[text], model=model)
    vector = response.data[0].embedding
    return np.array(vector)


# Benchmark: raw BM25 vs. Farasa-preprocessed BM25 vs. semantic vs. hybrid.
df_laws = pd.read_excel('/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx')
with open('/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)

law_texts = df_laws['Law Text'].tolist()

law_text_embeddings = embed_law_texts(law_texts)

# NOTE(review): these calls (and the bm25_search calls below) pass a
# `preprocess` keyword; confirm that the build_bm25_index / bm25_search
# definitions in scope when this cell runs accept it.
bm25_raw = build_bm25_index(law_texts, preprocess=False)
bm25_preprocessed = build_bm25_index(law_texts, preprocess=True)

# Law row i is treated as the only relevant document for benchmark entry i.
ground_truth = {i: [i] for i in range(len(law_texts))}

# Per-question metric accumulators for each retriever.
raw_ndcg10, raw_ndcg100, raw_map = [], [], []
proc_ndcg10, proc_ndcg100, proc_map = [], [], []
semantic_ndcg10, semantic_ndcg100, semantic_map = [], [], []
hybrid_ndcg10, hybrid_ndcg100, hybrid_map = [], [], []

for idx, qa in enumerate(questions_answers):
    questions = qa['Questions']
    for question in questions:
        query_embedding = embed_query(question)
        relevant_docs = ground_truth.get(idx, [])

        # BM25 over the raw text
        raw_indices, _ = bm25_search(question, bm25_raw, preprocess=False)
        raw_ndcg10.append(calculate_ndcg(relevant_docs, raw_indices, 10))
        raw_ndcg100.append(calculate_ndcg(relevant_docs, raw_indices, 100))
        raw_map.append(average_precision(relevant_docs, raw_indices))

        # BM25 over the preprocessed text
        proc_indices, _ = bm25_search(question, bm25_preprocessed, preprocess=True)
        proc_ndcg10.append(calculate_ndcg(relevant_docs, proc_indices, 10))
        proc_ndcg100.append(calculate_ndcg(relevant_docs, proc_indices, 100))
        proc_map.append(average_precision(relevant_docs, proc_indices))

        # Semantic ranking; the embeddings are built from the unprocessed text
        semantic_indices, _ = semantic_search(query_embedding, law_text_embeddings)
        semantic_ndcg10.append(calculate_ndcg(relevant_docs, semantic_indices, 10))
        semantic_ndcg100.append(calculate_ndcg(relevant_docs, semantic_indices, 100))
        semantic_map.append(average_precision(relevant_docs, semantic_indices))

        # Hybrid ranking fuses the preprocessed-BM25 list with the semantic list
        hybrid_indices = rrf_fusion(proc_indices, semantic_indices)
        hybrid_ndcg10.append(calculate_ndcg(relevant_docs, hybrid_indices, 10))
        hybrid_ndcg100.append(calculate_ndcg(relevant_docs, hybrid_indices, 100))
        hybrid_map.append(average_precision(relevant_docs, hybrid_indices))

# Report the mean of each metric over all questions.
print("=== Final Scores ===")
print(f"BM25 (Raw)       -> nDCG@10: {np.mean(raw_ndcg10):.4f}, nDCG@100: {np.mean(raw_ndcg100):.4f}, MAP: {np.mean(raw_map):.4f}")
print(f"BM25 (Processed) -> nDCG@10: {np.mean(proc_ndcg10):.4f}, nDCG@100: {np.mean(proc_ndcg100):.4f}, MAP: {np.mean(proc_map):.4f}")
print(f"Semantic         -> nDCG@10: {np.mean(semantic_ndcg10):.4f}, nDCG@100: {np.mean(semantic_ndcg100):.4f}, MAP: {np.mean(semantic_map):.4f}")
print(f"Hybrid           -> nDCG@10: {np.mean(hybrid_ndcg10):.4f}, nDCG@100: {np.mean(hybrid_ndcg100):.4f}, MAP: {np.mean(hybrid_map):.4f}")




=== Final Scores ===
BM25 (Raw)       -> nDCG@10: 0.7551, nDCG@100: 0.7726, MAP: 0.7178
BM25 (Processed) -> nDCG@10: 0.8041, nDCG@100: 0.8180, MAP: 0.7715
Semantic         -> nDCG@10: 0.7924, nDCG@100: 0.8061, MAP: 0.7488
Hybrid           -> nDCG@10: 0.8626, nDCG@100: 0.8702, MAP: 0.8304


### GTE

In [3]:
# Load the GTE multilingual tokenizer and model. trust_remote_code=True lets
# transformers execute model code shipped with the checkpoint repository.
MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"
tokenizer  = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)
# Inference only: eval mode, placed on the GPU when one is available.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
).eval().to("cuda" if torch.cuda.is_available() else "cpu")

# Token budget used by embed_text for chunking and truncation; kept 10 below
# the tokenizer's reported maximum (presumably headroom for special tokens;
# confirm).
MAX_TOKENS = tokenizer.model_max_length - 10 



def embed_text(text: str) -> np.ndarray:
    """Embed one text (question or law) with the module-level model.

    Texts with more than ``MAX_TOKENS`` tokens (special tokens not counted) are
    split with ``split_text``. Each piece is encoded, its token states are
    averaged under the attention mask, and the piece vectors are averaged
    into a single embedding.
    """
    token_count = len(tokenizer.encode(text, add_special_tokens=False))
    if token_count > MAX_TOKENS:
        pieces = split_text(text)
    else:
        pieces = [text]

    piece_vectors = []
    with torch.no_grad():
        for piece in pieces:
            encoded = tokenizer(
                piece,
                truncation=True,
                max_length=MAX_TOKENS,
                return_tensors="pt"
            ).to(model.device)
            hidden = model(**encoded).last_hidden_state
            # Mask broadcast over the hidden dimension so that masked-out
            # positions contribute nothing to the sum.
            weights = encoded.attention_mask.unsqueeze(-1).float()
            # Clamp to avoid dividing by zero for an all-masked input.
            token_total = weights.sum(dim=1).clamp(min=1)
            mean_vec = ((hidden * weights).sum(dim=1) / token_total).squeeze(0)
            piece_vectors.append(mean_vec.cpu().numpy())

    stacked = np.stack(piece_vectors, axis=0)
    return np.mean(stacked, axis=0)

# Benchmark with GTE embeddings: BM25 vs. semantic vs. hybrid (RRF).
excel_path = "BM_Egypt_Law_Samples_500.xlsx"
df = pd.read_excel(excel_path)
law_texts = df["Law Text"].astype(str).tolist()

# Embed every law text and stack the vectors into one matrix (one row per law).
# NOTE(review): `tqdm` is not imported in the visible part of this file.
law_embs = []
for txt in tqdm(law_texts, desc="Embedding laws"):
    law_embs.append(embed_text(txt))
law_embs = np.vstack(law_embs)

# BM25 index over whitespace tokens.
tokenized_laws = [text.split() for text in law_texts]
bm25 = BM25Okapi(tokenized_laws)

json_path = "EG_Legislations_BenchMark_1000.json"
with open(json_path, "r", encoding="utf-8") as f:
    bm = json.load(f)

# Flatten the benchmark into (question, entry index) pairs; the entry index
# is used below as the id of the single relevant law.
queries = []
for idx, obj in enumerate(bm):
    for q in obj["Questions"]:
        queries.append((q, idx))

results = {
    "bm25_ndcg10": [], "bm25_ndcg100": [], "bm25_map": [],
    "sem_ndcg10": [],  "sem_ndcg100": [],  "sem_map": [],
    "hybrid_ndcg10": [], "hybrid_ndcg100": [], "hybrid_map": []
}

# NOTE(review): the calls below pass `top_k` to bm25_search, omit the index /
# embedding-matrix arguments, and treat the return value as a bare list of
# indices. That does not match the bm25_search / semantic_search definitions
# visible earlier in this file. `ndcg_at_k` and `calculate_ap` are also not
# defined in the visible source. Confirm which helper definitions this cell
# was run against.
for query, gt_idx in tqdm(queries, desc="Evaluating queries"):
    q_emb = embed_text(query)
    b_top = bm25_search(query, top_k=100)
    s_top = semantic_search(q_emb, top_k=100)
    h_top = rrf_fusion(b_top, s_top)

    rel_docs = [gt_idx]

    results["bm25_ndcg10"].append(ndcg_at_k(rel_docs, b_top, 10))
    results["bm25_ndcg100"].append(ndcg_at_k(rel_docs, b_top, 100))
    results["bm25_map"].append(calculate_ap(rel_docs, b_top))
    
    results["sem_ndcg10"].append(ndcg_at_k(rel_docs, s_top, 10))
    results["sem_ndcg100"].append(ndcg_at_k(rel_docs, s_top, 100))
    results["sem_map"].append(calculate_ap(rel_docs, s_top))
    results["hybrid_ndcg10"].append(ndcg_at_k(rel_docs, h_top, 10))
    results["hybrid_ndcg100"].append(ndcg_at_k(rel_docs, h_top, 100))
    results["hybrid_map"].append(calculate_ap(rel_docs, h_top))

# Replace NaN scores with 0 so that they count as misses in the means below.
for k, vals in results.items():
    results[k] = np.nan_to_num(vals)

print("\n=== Final Scores ===")
print(f"BM25      → nDCG@10: {np.mean(results['bm25_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['bm25_ndcg100']):.4f}, MAP: {np.mean(results['bm25_map']):.4f}")
print(f"Semantic  → nDCG@10: {np.mean(results['sem_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['sem_ndcg100']):.4f}, MAP: {np.mean(results['sem_map']):.4f}")
print(f"Hybrid    → nDCG@10: {np.mean(results['hybrid_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['hybrid_ndcg100']):.4f}, MAP: {np.mean(results['hybrid_map']):.4f}")


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Embedding laws: 100%|██████████| 500/500 [02:34<00:00,  3.23it/s]
Evaluating queries: 100%|██████████| 1000/1000 [01:11<00:00, 14.0


=== Final Scores ===
BM25      → nDCG@10: 0.7564, nDCG@100: 0.7738, MAP: 0.7195
Semantic  → nDCG@10: 0.6551, nDCG@100: 0.6818, MAP: 0.6092
Hybrid    → nDCG@10: 0.7637, nDCG@100: 0.7865, MAP: 0.7273





### BGE-M3

In [None]:
# Benchmark with BGE-M3 embeddings: BM25 vs. semantic vs. hybrid (RRF).
MODEL_NAME = "BAAI/bge-m3"
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebinding `tokenizer`, `model` and MAX_TOKENS makes the previously defined
# embed_text use this checkpoint, because it reads those module-level names.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

MAX_TOKENS = tokenizer.model_max_length - 10 
excel_path = "BM_Egypt_Law_Samples_500.xlsx"
df         = pd.read_excel(excel_path)
law_texts  = df["Law Text"].astype(str).tolist()

# Embed every law text and stack the vectors into one matrix (one row per law).
# NOTE(review): `embed_text` is not redefined in this cell and `tqdm` is not
# imported in the visible part of this file; both must already be in scope.
law_embs = []
for txt in tqdm(law_texts, desc="Embedding laws"):
    law_embs.append(embed_text(txt))
law_embs = np.vstack(law_embs)

# BM25 index over whitespace tokens.
tokenized_laws = [text.split() for text in law_texts]
bm25 = BM25Okapi(tokenized_laws)

json_path = "EG_Legislations_BenchMark_1000.json"
with open(json_path, "r", encoding="utf-8") as f:
    bm = json.load(f)

# Flatten the benchmark into (question, entry index) pairs; the entry index
# is used below as the id of the single relevant law.
queries = []
for idx, obj in enumerate(bm):
    for q in obj["Questions"]:
        queries.append((q, idx))

results = {
    "bm25_ndcg10": [], "bm25_ndcg100": [], "bm25_map": [],
    "sem_ndcg10": [],  "sem_ndcg100": [],  "sem_map": [],
    "hybrid_ndcg10": [], "hybrid_ndcg100": [], "hybrid_map": []
}

# NOTE(review): bm25_search / semantic_search are called here with `top_k`
# only and are expected to return a bare index list, which does not match the
# definitions visible earlier in this file. `ndcg_at_k` and `calculate_ap`
# are not defined in the visible source. Confirm the helpers in scope.
for query, gt_idx in tqdm(queries, desc="Evaluating queries"):
    q_emb = embed_text(query)
    b_top = bm25_search(query, top_k=100)
    s_top = semantic_search(q_emb, top_k=100)
    h_top = rrf_fusion(b_top, s_top)

    rel_docs = [gt_idx]

    results["bm25_ndcg10"].append(ndcg_at_k(rel_docs, b_top, 10))
    results["bm25_ndcg100"].append(ndcg_at_k(rel_docs, b_top, 100))
    results["bm25_map"].append(calculate_ap(rel_docs, b_top))
    results["sem_ndcg10"].append(ndcg_at_k(rel_docs, s_top, 10))
    results["sem_ndcg100"].append(ndcg_at_k(rel_docs, s_top, 100))
    results["sem_map"].append(calculate_ap(rel_docs, s_top))
    results["hybrid_ndcg10"].append(ndcg_at_k(rel_docs, h_top, 10))
    results["hybrid_ndcg100"].append(ndcg_at_k(rel_docs, h_top, 100))
    results["hybrid_map"].append(calculate_ap(rel_docs, h_top))

# Replace NaN scores with 0 so that they count as misses in the means below.
for k, vals in results.items():
    results[k] = np.nan_to_num(vals)
print("\n=== Final Scores ===")
print(f"BM25      → nDCG@10: {np.mean(results['bm25_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['bm25_ndcg100']):.4f}, MAP: {np.mean(results['bm25_map']):.4f}")
print(f"Semantic  → nDCG@10: {np.mean(results['sem_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['sem_ndcg100']):.4f}, MAP: {np.mean(results['sem_map']):.4f}")
print(f"Hybrid    → nDCG@10: {np.mean(results['hybrid_ndcg10']):.4f}, "
      f"nDCG@100: {np.mean(results['hybrid_ndcg100']):.4f}, MAP: {np.mean(results['hybrid_map']):.4f}")


Embedding laws: 100%|██████████| 500/500 [05:54<00:00,  1.41it/s]
Evaluating queries: 100%|██████████| 1000/1000 [02:59<00:00,  5.58it/s]


=== Final Scores ===
BM25      → nDCG@10: 0.7564, nDCG@100: 0.7738, MAP: 0.7195
Semantic  → nDCG@10: 0.7872, nDCG@100: 0.8017, MAP: 0.7558
Hybrid    → nDCG@10: 0.8126, nDCG@100: 0.8312, MAP: 0.7841





### Gemini

In [None]:
# Configure the Gemini SDK. The API key is a secret and is read from the
# environment rather than stored in the notebook; any key that was previously
# committed here should be treated as compromised and rotated.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# Embedding model name passed to genai.embed_content by embed_query.
model = "models/text-embedding-004"

def split_text(text, max_chars=8000):
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Each chunk is cut at the last space inside the ``max_chars`` window when
    there is one; otherwise it is cut hard at ``max_chars``. The separating
    space stays at the start of the following chunk, so concatenating the
    chunks reproduces ``text`` exactly.

    Args:
        text: The string to split.
        max_chars: Maximum chunk length; must be positive.

    Returns:
        A list of non-empty chunks (empty for an empty ``text``).

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer.")

    chunks = []
    while len(text) > max_chars:
        split_point = text.rfind(" ", 0, max_chars)
        # -1 means there is no space in the window. 0 means the only space is
        # the leading one left over from the previous cut; cutting there would
        # emit an empty chunk and never shrink `text`, so the loop would not
        # terminate. Cut hard at max_chars in both cases.
        if split_point <= 0:
            split_point = max_chars
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks

def embed_query(text):
    """Embed ``text`` through ``genai.embed_content`` with the module-level ``model``.

    The text is split with ``split_text``; each chunk is embedded in its own
    request (followed by a 2-second pause) and the chunk embeddings are
    averaged. Every chunk is sent with task_type "RETRIEVAL_DOCUMENT".

    Raises:
        RuntimeError: If embedding any chunk fails.
    """
    vectors = []
    for chunk in split_text(text):
        try:
            response = genai.embed_content(
                model=model,
                content=chunk,
                task_type="RETRIEVAL_DOCUMENT",
            )
            vectors.append(response['embedding'])
            # Pause between requests (presumably to stay under the API rate
            # limit; confirm).
            time.sleep(2)
        except Exception as e:
            raise RuntimeError(f"Embedding failed: {e}")
    stacked = np.stack(vectors, axis=0)
    return np.mean(stacked, axis=0)


# Benchmark with Gemini embeddings: BM25 vs. semantic vs. hybrid (RRF).
df_laws = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/BM_Egypt_Law_Samples_500.xlsx')
with open('/content/drive/MyDrive/Colab Notebooks/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)

law_texts = df_laws['Law Text'].tolist()
law_text_embeddings = embed_law_texts(law_texts)
bm25 = build_bm25_index(law_texts)

# Law row i is treated as the only relevant document for benchmark entry i.
ground_truth = {i: [i] for i in range(len(law_texts))} 


# nDCG is collected at every cut-off k = 10, 20, ..., 100.
bm25_ndcg = {k: [] for k in range(10, 101, 10)}
semantic_ndcg = {k: [] for k in range(10, 101, 10)}
hybrid_ndcg = {k: [] for k in range(10, 101, 10)}

bm25_map = []
semantic_map = []
hybrid_map = []

# NOTE(review): `ndcg_at_k` and `calculate_ap` are not defined in the visible
# part of this file; confirm which definitions this cell was run against.
for idx, qa in enumerate(questions_answers):
    questions = qa['Questions']
    for question in questions:
        query_embedding = embed_query(question)

        bm25_top_indices, bm25_scores = bm25_search(question, bm25, k=100)

        semantic_top_indices, semantic_scores = semantic_search(query_embedding, law_text_embeddings, top_k=100)

        hybrid_top_indices = rrf_fusion(bm25_top_indices, semantic_top_indices)

        relevant_docs = ground_truth.get(idx, [])

        for k in range(10, 101, 10):
            bm25_ndcg[k].append(ndcg_at_k(relevant_docs, bm25_top_indices, k=k))
            semantic_ndcg[k].append(ndcg_at_k(relevant_docs, semantic_top_indices, k=k))
            hybrid_ndcg[k].append(ndcg_at_k(relevant_docs, hybrid_top_indices, k=k))

        bm25_map.append(calculate_ap(relevant_docs, bm25_top_indices))
        semantic_map.append(calculate_ap(relevant_docs, semantic_top_indices))
        hybrid_map.append(calculate_ap(relevant_docs, hybrid_top_indices))

def handle_nan_score(score):
    """Return ``score`` unchanged, or 0.0 when it is NaN."""
    if np.isnan(score):
        return 0.0
    return score

# Summary table. Each nDCG column is the NaN-ignoring mean, over the cut-offs
# k = 10..100, of the per-k mean nDCG, so it is one averaged figure and not
# nDCG at a single k. Each MAP column is the NaN-ignoring mean over questions.
metrics = {
    "BM25_nDCG": [np.nanmean([np.nanmean(bm25_ndcg[k]) for k in range(10, 101, 10)])],
    "Semantic_nDCG": [np.nanmean([np.nanmean(semantic_ndcg[k]) for k in range(10, 101, 10)])],
    "Hybrid_nDCG": [np.nanmean([np.nanmean(hybrid_ndcg[k]) for k in range(10, 101, 10)])],
    "BM25_MAP": [np.nanmean(bm25_map)],
    "Semantic_MAP": [np.nanmean(semantic_map)],
    "Hybrid_MAP": [np.nanmean(hybrid_map)],
}

results_df = pd.DataFrame(metrics)
print(results_df)

   BM25_nDCG  Semantic_nDCG  Hybrid_nDCG  BM25_MAP  Semantic_MAP  Hybrid_MAP
0   0.768336        0.02818     0.330348  0.718475      0.011345    0.189314


### Mistral

In [None]:
# Mistral embeddings client. The API key is a secret and is read from the
# environment rather than stored in the notebook; any key that was previously
# committed here should be treated as compromised and rotated.
api_key = os.environ["MISTRAL_API_KEY"]
# Embedding model name passed to client.embeddings.create by embed_query.
model = "mistral-embed"
client = Mistral(api_key=api_key)

def embed_query(text):
    """Embed ``text`` with the module-level Mistral ``client`` and ``model``.

    The text is split with ``split_text``; each chunk is embedded in its own
    request (followed by a 2-second pause) and the chunk embeddings are
    averaged into one vector.

    Raises:
        RuntimeError: If embedding any chunk fails.
    """
    vectors = []
    for chunk in split_text(text):
        try:
            response = client.embeddings.create(model=model, inputs=chunk)
            vectors.append(response.data[0].embedding)
            # Pause between requests (presumably to stay under the API rate
            # limit; confirm).
            time.sleep(2)
        except Exception as e:
            raise RuntimeError(f"Embedding failed: {e}")
    stacked = np.stack(vectors, axis=0)
    return np.mean(stacked, axis=0)


# Benchmark with Mistral embeddings: BM25 vs. semantic vs. hybrid (RRF).
df_laws = pd.read_excel('/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx')
with open('/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)

law_texts = df_laws['Law Text'].tolist()
law_text_embeddings = embed_law_texts(law_texts)
bm25 = build_bm25_index(law_texts)

# Law row i is treated as the only relevant document for benchmark entry i.
ground_truth = {i: [i] for i in range(len(law_texts))} 

# nDCG is collected at every cut-off k = 10, 20, ..., 100.
bm25_ndcg = {k: [] for k in range(10, 101, 10)}
semantic_ndcg = {k: [] for k in range(10, 101, 10)}
hybrid_ndcg = {k: [] for k in range(10, 101, 10)}

bm25_map = []
semantic_map = []
hybrid_map = []

# NOTE(review): `ndcg_at_k` and `calculate_ap` are not defined in the visible
# part of this file; confirm which definitions this cell was run against.
for idx, qa in enumerate(questions_answers):
    questions = qa['Questions']
    for question in questions:
        query_embedding = embed_query(question)

        bm25_top_indices, bm25_scores = bm25_search(question, bm25, k=100)

        semantic_top_indices, semantic_scores = semantic_search(query_embedding, law_text_embeddings, top_k=100)

        hybrid_top_indices = rrf_fusion(bm25_top_indices, semantic_top_indices)

        relevant_docs = ground_truth.get(idx, [])

        for k in range(10, 101, 10):
            bm25_ndcg[k].append(ndcg_at_k(relevant_docs, bm25_top_indices, k=k))
            semantic_ndcg[k].append(ndcg_at_k(relevant_docs, semantic_top_indices, k=k))
            hybrid_ndcg[k].append(ndcg_at_k(relevant_docs, hybrid_top_indices, k=k))

        bm25_map.append(calculate_ap(relevant_docs, bm25_top_indices))
        semantic_map.append(calculate_ap(relevant_docs, semantic_top_indices))
        hybrid_map.append(calculate_ap(relevant_docs, hybrid_top_indices))

def handle_nan_score(score):
    """Return ``score`` unchanged, or 0.0 when it is NaN."""
    if np.isnan(score):
        return 0.0
    return score

# Summary table. Each nDCG column is the NaN-ignoring mean, over the cut-offs
# k = 10..100, of the per-k mean nDCG, so it is one averaged figure and not
# nDCG at a single k. Each MAP column is the NaN-ignoring mean over questions.
metrics = {
    "BM25_nDCG": [np.nanmean([np.nanmean(bm25_ndcg[k]) for k in range(10, 101, 10)])],
    "Semantic_nDCG": [np.nanmean([np.nanmean(semantic_ndcg[k]) for k in range(10, 101, 10)])],
    "Hybrid_nDCG": [np.nanmean([np.nanmean(hybrid_ndcg[k]) for k in range(10, 101, 10)])],
    "BM25_MAP": [np.nanmean(bm25_map)],
    "Semantic_MAP": [np.nanmean(semantic_map)],
    "Hybrid_MAP": [np.nanmean(hybrid_map)],
}

results_df = pd.DataFrame(metrics)
print(results_df)

   BM25_nDCG  Semantic_nDCG  Hybrid_nDCG  BM25_MAP  Semantic_MAP  Hybrid_MAP
0   0.768336       0.807883     0.848969  0.718475      0.757073    0.807296


### E5-Large

In [3]:
# Load the multilingual E5-large tokenizer and model used by embed_query.
# The model is not moved to a device or switched to eval mode here.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large")

def embed_query(text: str) -> np.ndarray:
    """Embed one text with the module-level E5 tokenizer and model.

    The text is prefixed with "query: ", truncated to 512 tokens, and the
    last-layer token states are averaged over the sequence dimension.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    prefixed = "query: " + text
    inputs = tokenizer(prefixed, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    pooled = outputs.last_hidden_state.mean(dim=1)
    # The batch holds a single text, so return its only row.
    return pooled.cpu().numpy()[0]


# Benchmark with E5-large embeddings: BM25 vs. semantic vs. hybrid (RRF).
df_laws = pd.read_excel('/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx')
with open('/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)

# Law texts are embedded through embed_query as well (via embed_law_texts), so
# they receive the same "query: " prefix and 512-token truncation as questions.
law_texts = df_laws['Law Text'].tolist()
law_text_embeddings = embed_law_texts(law_texts)
bm25 = build_bm25_index(law_texts)
# Law row i is treated as the only relevant document for benchmark entry i.
ground_truth = {i: [i] for i in range(len(law_texts))}

bm25_ndcg10, bm25_ndcg100, bm25_map = [], [], []
semantic_ndcg10, semantic_ndcg100, semantic_map = [], [], []
hybrid_ndcg10, hybrid_ndcg100, hybrid_map = [], [], []

# NOTE(review): `ndcg_at_k` (called here with a fourth `total_docs` argument)
# and `calculate_ap` are not defined in the visible part of this file; confirm
# which definitions this cell was run against.
for idx, qa in enumerate(questions_answers):
    questions = qa['Questions']
    for question in questions:
        query_embedding = embed_query(question)
        bm25_top_indices, _ = bm25_search(question, bm25, k=100)
        semantic_top_indices, _ = semantic_search(query_embedding, law_text_embeddings, top_k=100)
        hybrid_top_indices = rrf_fusion(bm25_top_indices, semantic_top_indices)
        relevant_docs = ground_truth.get(idx, [])
        total_docs = len(law_texts)
        bm25_ndcg10.append(ndcg_at_k(relevant_docs, bm25_top_indices, 10, total_docs))
        bm25_ndcg100.append(ndcg_at_k(relevant_docs, bm25_top_indices, 100, total_docs))
        bm25_map.append(calculate_ap(relevant_docs, bm25_top_indices))
        semantic_ndcg10.append(ndcg_at_k(relevant_docs, semantic_top_indices, 10, total_docs))
        semantic_ndcg100.append(ndcg_at_k(relevant_docs, semantic_top_indices, 100, total_docs))
        semantic_map.append(calculate_ap(relevant_docs, semantic_top_indices))
        hybrid_ndcg10.append(ndcg_at_k(relevant_docs, hybrid_top_indices, 10, total_docs))
        hybrid_ndcg100.append(ndcg_at_k(relevant_docs, hybrid_top_indices, 100, total_docs))
        hybrid_map.append(calculate_ap(relevant_docs, hybrid_top_indices))

# Report the mean of each metric over all questions.
print("=== Final Scores ===")
print(f"BM25 -> nDCG@10: {np.mean(bm25_ndcg10):.4f}, nDCG@100: {np.mean(bm25_ndcg100):.4f}, MAP: {np.mean(bm25_map):.4f}")
print(f"Semantic -> nDCG@10: {np.mean(semantic_ndcg10):.4f}, nDCG@100: {np.mean(semantic_ndcg100):.4f}, MAP: {np.mean(semantic_map):.4f}")
print(f"Hybrid -> nDCG@10: {np.mean(hybrid_ndcg10):.4f}, nDCG@100: {np.mean(hybrid_ndcg100):.4f}, MAP: {np.mean(hybrid_map):.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

=== Final Scores ===
BM25 -> nDCG@10: 0.7551, nDCG@100: 0.7726, MAP: 0.7178
Semantic -> nDCG@10: 0.9212, nDCG@100: 0.9235, MAP: 0.8986
Hybrid -> nDCG@10: 0.8672, nDCG@100: 0.8751, MAP: 0.8378


### Sentence Transformers with AraBERT

In [None]:
# NOTE(review): the tokenizer is AraBERT's, but the embedding model loaded
# below is a different checkpoint (paraphrase-multilingual-mpnet-base-v2).
# Confirm that this pairing is intended.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2') 

def encode_with_arabert(texts):
    """Embed *texts* with the module-level SentenceTransformer ``model``.

    Args:
        texts: The strings to embed; passed unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode(texts)`` (per the
        sentence-transformers documentation, a NumPy array of one embedding
        per input when called with default arguments).

    Note:
        Despite the function name, the AraBERT tokenizer takes no part in the
        result: ``model.encode`` receives the raw strings and does its own
        tokenization.
    """
    # Earlier revisions also ran the AraBERT `tokenizer` over the full input
    # (padded/truncated to 512 tokens) and then discarded the tensors. That
    # was pure overhead, so it is gone. `encode` already runs without gradient
    # tracking, so no explicit `torch.no_grad()` wrapper is needed either.
    return model.encode(texts)

# Split the training records into parallel lists: questions[i] and law_texts[i]
# come from the same record.
questions = [record['question'] for record in train_data_json]
law_texts = [record['law_text'] for record in train_data_json]

# Embed both sides with the same encoder.
question_embeddings = encode_with_arabert(questions)
law_text_embeddings = encode_with_arabert(law_texts)

# Persist both embedding sets to Google Drive, questions first.
for _npy_path, _vectors in (
    ("/content/drive/MyDrive/question_embeddings.npy", question_embeddings),
    ("/content/drive/MyDrive/law_text_embeddings.npy", law_text_embeddings),
):
    np.save(_npy_path, _vectors)
print("Embeddings saved to 'question_embeddings.npy' and 'law_text_embeddings.npy'")

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Embeddings saved to 'question_embeddings.npy' and 'law_text_embeddings.npy'


In [None]:
# Number of question / law-text pairs evaluated in this cell. Every slice and
# the ground-truth mapping below derive from this single value.
ST_EVAL_SIZE = 500
# Length of the ranked candidate list handed to the metric functions.
ST_RANK_DEPTH = 100

# Reload the embeddings saved to Drive and keep only the evaluation subset.
question_embeddings = np.load("/content/drive/MyDrive/question_embeddings.npy")[:ST_EVAL_SIZE]
law_text_embeddings = np.load("/content/drive/MyDrive/law_text_embeddings.npy")[:ST_EVAL_SIZE]

# NOTE(review): df_laws, law_texts and questions_answers are loaded here but
# the scoring loop below never reads them; they are kept because later cells
# may rely on these globals -- confirm before removing.
df_laws = pd.read_excel('/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx')
law_texts = df_laws['Law Text'].tolist()[:ST_EVAL_SIZE]
with open('/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json', 'r', encoding='utf-8') as f:
    questions_answers = json.load(f)
questions_answers = questions_answers[:ST_EVAL_SIZE]

# Each query i has exactly one relevant document: the law text at index i.
# NOTE(review): this presumes the saved question and law-text embeddings are
# row-aligned -- verify against the cell that produced the .npy files.
ground_truth = {i: [i] for i in range(ST_EVAL_SIZE)}

# Per-query metric accumulators.
semantic_ndcg10 = []
semantic_ndcg100 = []
semantic_map = []

for i, q_embed in enumerate(question_embeddings):
    relevant_docs = ground_truth.get(i, [])

    # Cosine similarity of this query against every law-text embedding,
    # then the indices of the ST_RANK_DEPTH most similar ones, best first.
    similarities = cosine_similarity([q_embed], law_text_embeddings)[0]
    ranked_indices = np.argsort(similarities)[::-1][:ST_RANK_DEPTH]
    # NOTE(review): an earlier cell calls ndcg_at_k with a fourth `total_docs`
    # argument; confirm which definition is in effect when this cell runs.
    semantic_ndcg10.append(ndcg_at_k(relevant_docs, ranked_indices, k=10))
    semantic_ndcg100.append(ndcg_at_k(relevant_docs, ranked_indices, k=100))
    semantic_map.append(calculate_ap(relevant_docs, ranked_indices))

# nanmean skips any NaN entries in the metric lists when averaging.
print("\n=== Semantic Search Results For Sentence Transformer ===")
print(f"nDCG@10:  {np.nanmean(semantic_ndcg10):.4f}")
print(f"nDCG@100: {np.nanmean(semantic_ndcg100):.4f}")
print(f"MAP:      {np.nanmean(semantic_map):.4f}")



=== Semantic Search Results For Sentence Transformer ===
nDCG@10:  0.2613
nDCG@100: 0.2988
MAP:      0.2258
