In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



---


**Load and inspect legal documents from JSON**

This cell defines utility functions to normalize text and load a structured JSON dataset of legal documents. Each document is processed to extract `id`, `full_text`, `articles`, and `metadata`. The cell also builds a quick summary DataFrame for inspection and prints a preview of the first document.

---


In [None]:
import json
import re
from pathlib import Path
from typing import Any, Dict, List
import pandas as pd

def normalize_str(s: Any) -> str:
    """
    Convert input to string, remove leading/trailing whitespace,
    and collapse multiple spaces/tabs while preserving newlines.
    Returns an empty string if input is None.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = re.sub(r"[ \t]+", " ", s)
    s = s.strip()
    return s

def load_and_validate_json(path: str) -> List[Dict[str, Any]]:
    """
    Load a JSON file containing a list of legal documents.
    Normalize and structure each document into a dictionary with:
      - id: canonical link or generated id
      - full_text: main text content
      - articles: list of dicts with title & text
      - metadata: other document attributes
    Raises:
        FileNotFoundError: if the JSON file does not exist
        ValueError: if the top-level JSON is not a list
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")
    
    raw = json.loads(p.read_text(encoding="utf-8"))

    if not isinstance(raw, list):
        raise ValueError("Expected top-level JSON array (list of documents).")

    docs = []
    for idx, item in enumerate(raw):
        canonical = normalize_str(
            item.get("canonical_link") or item.get("short_link") or f"doc_{idx}"
        )
        doc_id = canonical
        full_text = normalize_str(item.get("text", ""))

        # Process articles within the document
        articles = []
        for art in item.get("articles", []):
            title = normalize_str(art.get("title", ""))
            text = normalize_str(art.get("text", ""))
            articles.append({"title": title, "text": text})

        metadata = {
            "json_link": normalize_str(item.get("json_link", "")),
            "short_link": normalize_str(item.get("short_link", "")),
            "canonical_link": canonical,
            "issue_at": normalize_str(item.get("issue_at", "")),
            "approve_at": normalize_str(item.get("approve_at", "")),
            "signed_by": normalize_str(item.get("signed_by", "")),
            "publication": normalize_str(item.get("publication", "")),
        }

        docs.append({
            "id": doc_id,
            "full_text": full_text,
            "articles": articles,
            "metadata": metadata
        })

    return docs

# ---------------- Run loader and show summary ----------------
docs = load_and_validate_json(
    "/content/drive/MyDrive/Project/my-qanoon-data.json"
)

# Build a summary DataFrame for inspection
rows = []
for d in docs:
    rows.append({
        "id": d["id"],
        "n_articles": len(d["articles"]),
        "text_len_chars": len(d["full_text"]),
        "canonical_link": d["metadata"]["canonical_link"],
        "approve_at": d["metadata"]["approve_at"]
    })

df = pd.DataFrame(rows)
print(df)

# Display first document for quick inspection
first = docs[0]
print("=== First document preview ===")
print("ID:", first["id"])
print("Full text (first 400 chars):")
print(first["full_text"][:400])
print("\nArticles:")
for a in first["articles"]:
    print("-", a["title"], "->", a["text"][:150])

print("\nFirst doc dictionary:")
print(first)


                                           id  n_articles  text_len_chars  \
0         https://qanoon.om/p/2025/rd2025100/           2             327   
1         https://qanoon.om/p/2025/rd2025099/           2             285   
2         https://qanoon.om/p/2025/rd2025098/           2             280   
3         https://qanoon.om/p/2025/rd2025097/           2             328   
4         https://qanoon.om/p/2025/rd2025096/           3             226   
...                                       ...         ...             ...   
4818      https://qanoon.om/p/1974/rd1974004/          15             144   
4819      https://qanoon.om/p/1974/rd1974002/           3             143   
4820      https://qanoon.om/p/1974/rd1974002/           3             143   
4821  https://qanoon.om/p/1972/legacy1973003/          18              56   
4822  https://qanoon.om/p/1972/legacy1972009/          13              54   

                               canonical_link                       approve



---


**RAG Chunking Pipeline**

This cell implements a pipeline to split legal documents into manageable chunks for retrieval-augmented generation (RAG). It handles:

1. **Article-based chunks** with clause detection and contextual overlap.
2. **Text-based chunks** for long free text sections.
3. Includes helpers to extract decree info, article numbers, and preamble lines.
   The output is a list of structured chunks containing both the text and metadata for downstream processing.

---



In [None]:
import json
import re
from typing import List, Dict, Tuple

JSON_PATH = "/content/drive/MyDrive/Project/my-qanoon-data.json"

# -----------------------------
# Load JSON data
# -----------------------------
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

assert isinstance(data, list), "Top-level JSON must be a list of laws"

# -----------------------------
# Helper Functions
# -----------------------------

def extract_decree_info(canonical_link: str) -> Tuple[str, str]:
    """
    Extract the decree year and number from a canonical link.
    Returns (year, number) or (None, None) if not found.
    Example: "/2025/rd2025100" -> ("2025", "100")
    """
    m = re.search(r"/(\d{4})/rd(\d+)", canonical_link or "")
    if not m:
        return None, None
    year = m.group(1)
    raw = m.group(2)
    number = raw[len(year):] if raw.startswith(year) else raw
    return year, number

def extract_article_number(title: str) -> str:
    """
    Extracts the article number from a title string, e.g., "(3)" -> "3".
    """
    if not title:
        return ""
    m = re.search(r"\(([^)]+)\)", title)
    return m.group(1) if m else ""

def build_short_preamble(text: str, max_lines: int = 4) -> str:
    """
    Builds a short preamble from the law text by capturing key introductory lines.
    Lines starting with 'نحن', 'بعد الاطلاع', 'وعلى' or containing 'الاتفاقية' are included.
    Stops at 'وبناء' or after max_lines.
    """
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    result = []
    capturing = False

    for line in lines:
        if line.startswith("نحن"):
            result.append(line)
            capturing = True
            continue
        if capturing:
            if line.startswith("وبناء"):
                break
            if (
                line.startswith("بعد الاطلاع")
                or line.startswith("وعلى")
                or "الاتفاقية" in line
            ):
                result.append(line)
        if len(result) >= max_lines:
            break
    return "\n".join(result)

def split_text_into_chunks(text: str, max_lines: int = 4, overlap_lines: int = 1) -> List[str]:
    """
    Split long text into chunks of max_lines, optionally overlapping previous lines.
    Returns a list of text chunks.
    """
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    chunks = []
    i = 0
    while i < len(lines):
        chunk_lines = lines[i:i+max_lines]
        if i != 0 and overlap_lines > 0:
            chunk_lines = lines[i-overlap_lines:i] + chunk_lines
        chunks.append('\n'.join(chunk_lines))
        i += max_lines
    return chunks

def split_long_article(text: str) -> List[Tuple[str, str]]:
    """
    Split a long article into clauses using numbering patterns.
    Returns a list of tuples (clause_number, clause_text).
    """
    if not text:
        return [(None, "")]
    pattern = r"\n?([0-9٠-٩]+)\s*[–\-]\s*"
    parts = re.split(pattern, text)
    if len(parts) <= 1:
        return [(None, text.strip())]
    clauses = []
    it = iter(parts)
    first = next(it).strip()
    if first:
        clauses.append((None, first))
    for num, body in zip(it, it):
        clauses.append((num, body.strip()))
    return clauses

# -----------------------------
# Article Chunk Builder
# -----------------------------

def build_chunks_for_article(law: Dict, article: Dict, overlap_lines: int = 2) -> List[Dict]:
    """
    Build structured chunks for a single article, including:
      - Law context metadata
      - Short preamble
      - Article text broken into clauses with optional overlap
    Returns a list of dictionaries with 'text' and 'metadata'.
    """
    year, decree_number = extract_decree_info(law.get("canonical_link"))
    article_number = extract_article_number(article.get("title", ""))

    context = f"""[سياق تشريعي]
نوع الوثيقة: مرسوم سلطاني
الرقم: {decree_number}
السنة: {year}
{law.get('issue_at', '').strip()}
{law.get('publication', '').strip()}
""".strip()

    preamble = f"""[ديباجة مختصرة]
{build_short_preamble(law.get('text', ''))}
""".strip()

    clauses = split_long_article(article.get("text", ""))
    chunks = []
    previous_lines = []

    for clause_no, clause_text in clauses:
        clause_lines = clause_text.splitlines()
        if previous_lines:
            overlap_text = '\n'.join(previous_lines[-overlap_lines:])
            clause_lines = [f"[ديباجة مختصرة – استمرار الجزء السابق]"] + [overlap_text] + clause_lines
        clause_text_with_overlap = '\n'.join(clause_lines).strip()
        clause_label = f" – الفقرة ({clause_no})" if clause_no else ""
        legal_text = f"""[ النص القانونی / ماهية: مادة]

{article.get('title', '').strip()}{clause_label}:
{clause_text_with_overlap}
""".strip()
        full_text = '\n\n'.join([context, preamble, legal_text])
        chunks.append({
            "text": full_text,
            "metadata": {
                "canonical_link": law.get("canonical_link"),
                "decree_year": year,
                "decree_number": decree_number,
                "article_number": article_number,
                "clause_number": clause_no,
                "is_overlap": bool(previous_lines),
            }
        })
        previous_lines = clause_lines

    return chunks

# -----------------------------
# Text Field Chunk Builder
# -----------------------------

def build_chunks_for_text(law: Dict, max_lines: int = 4, overlap_lines: int = 1) -> List[Dict]:
    """
    Split the 'text' field of the law into chunks with optional overlap.
    Returns a list of dictionaries with 'text' and metadata.
    """
    text_field = law.get("text", "")
    if not text_field:
        return []
    text_chunks = split_text_into_chunks(text_field, max_lines=max_lines, overlap_lines=overlap_lines)
    chunks = []
    for idx, chunk in enumerate(text_chunks):
        chunks.append({
            "text": f"""[ماهیت: نص]{chunk}""",
            "metadata": {
                "canonical_link": law.get("canonical_link"),
                "chunk_index": idx,
                "is_overlap": idx > 0
            }
        })
    return chunks

# -----------------------------
# Build All Chunks
# -----------------------------

all_chunks: List[Dict] = []
for law in data:
    # 1. Article chunks
    for article in law.get("articles", []):
        all_chunks.extend(build_chunks_for_article(law, article))
    # 2. Text chunks
    all_chunks.extend(build_chunks_for_text(law))

print(f"Total chunks generated: {len(all_chunks)}")

# -----------------------------
# Sanity Check: Print sample chunks
# -----------------------------

print(all_chunks[0]["text"])
print("-"*40)
print(all_chunks[10000]["text"])
print("-"*40)
print(all_chunks[1000]["text"])


Total chunks generated: 66789
[سياق تشريعي]
نوع الوثيقة: مرسوم سلطاني
الرقم: 100
السنة: 2025
صدر في: ٢٨ من جمادى الأولى سنة ١٤٤٧ هـ
نشر في عدد الجريدة الرسمية رقم (١٦٢٣) الصادر في ٢٣ من نوفمبر ٢٠٢٥م.

[ديباجة مختصرة]
نحن هيثم بن طارق سلطان عمان
بعد الاطلاع على النظام الأساسي للدولة،
وعلى الاتفاقية بين حكومة سلطنة عمان وحكومة جمهورية العراق حول الإعفاء المتبادل من التأشيرة لحاملي جوازات السفر الدبلوماسية والخاصة والخدمة، الموقعة في مدينة صلالة بتاريخ ٣ من سبتمبر ٢٠٢٥ م،

[ النص القانونی / ماهية: مادة]

المادة الأولى:
التصديق على الاتفاقية المشار إليها، وفقا للصيغة المرفقة.
----------------------------------------
[سياق تشريعي]
نوع الوثيقة: مرسوم سلطاني
الرقم: 025
السنة: 2020
صدر في: ٢٤ من رجب سنة ١٤٤١هـ

[ديباجة مختصرة]
نحن هيثم بن طارق سلطان عمان
بعد الاطلاع على النظام الأساسي للدولة الصادر بالمرسوم السلطاني رقم ١٠١ / ٩٦،
وعلى الاتفاقية الدولية بشأن المسؤولية المدنية عن أضرار التلوث بوقود السفن الزيتي لعام ٢٠٠١،

[ النص القانونی / ماهية: مادة]

المادة (٤)
الاستثناءات – الفقرة (٢):
[ديب

In [6]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1



---

**Embedding and FAISS Indexing for Legal Chunks**

This cell performs preprocessing, embedding, and indexing of the legal text chunks for retrieval. Steps include:

1. **Arabic text normalization** to standardize characters.
2. **Chunk metadata preparation** (type and overlap flags).
3. **Embedding chunks** using a pre-trained Arabic SentenceTransformer.
4. **FAISS index creation and training** with IVF + Flat index for efficient inner-product search (cosine similarity).
5. **Batch insertion of embeddings** into the index.
6. **Saving the FAISS index and metadata** for later retrieval.

---



In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
import re

# -------------------- Arabic Text Normalization --------------------
def normalize_arabic(text: str) -> str:
    """
    Standardize Arabic text for embedding:
    - Remove tatweel (ـ)
    - Normalize alef variants (إأآا -> ا)
    - Convert final ya (ى) to ي, and taa marbuta (ة) to ه
    - Collapse multiple whitespace to single space
    """
    text = text.replace("ـ", "")
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ة", "ه", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# -------------------- Prepare Chunks --------------------
for chunk in all_chunks:
    # normalize text for consistent embeddings
    chunk['text'] = normalize_arabic(chunk['text'])

    # ensure 'chunk_type' exists
    if 'chunk_type' not in chunk['metadata']:
        if '[النص القانوني]' in chunk['text'] or '[ماهية: مادة]' in chunk['text']:
            chunk['metadata']['chunk_type'] = 'مادة'
        else:
            chunk['metadata']['chunk_type'] = 'نص'

    # ensure 'is_overlap' exists
    if 'is_overlap' not in chunk['metadata']:
        chunk['metadata']['is_overlap'] = False

print(f"Total chunks ready for embedding: {len(all_chunks)}")

# -------------------- Load Embedding Model --------------------
model = SentenceTransformer("Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2")

# -------------------- Prepare FAISS Index --------------------
sample_emb = model.encode([all_chunks[0]['text']], convert_to_numpy=True)
embedding_dim = sample_emb.shape[-1]

nlist = 500  # number of IVF clusters
quantizer = faiss.IndexFlatIP(embedding_dim)  # inner-product = cosine similarity
index = faiss.IndexIVFFlat(quantizer, embedding_dim, nlist, faiss.METRIC_INNER_PRODUCT)

# -------------------- Train FAISS --------------------
sample_size = min(5000, len(all_chunks))
sample_embs = model.encode([c['text'] for c in all_chunks[:sample_size]], convert_to_numpy=True)
sample_embs = sample_embs / np.linalg.norm(sample_embs, axis=1, keepdims=True)  # normalize embeddings
index.train(sample_embs.astype(np.float32))

# -------------------- Add Embeddings in Batches --------------------
batch_size = 500
metadata_store = []

for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i+batch_size]
    batch_texts = [c['text'] for c in batch]
    batch_embs = model.encode(batch_texts, convert_to_numpy=True)
    batch_embs = batch_embs / np.linalg.norm(batch_embs, axis=1, keepdims=True)
    index.add(batch_embs.astype(np.float32))
    metadata_store.extend([c['metadata'] for c in batch])
    print(f"Added batch {i//batch_size + 1} / {len(all_chunks)//batch_size + 1}")

# -------------------- Save FAISS Index and Metadata --------------------
faiss.write_index(index, "/content/drive/MyDrive/Project/Vector_DB/law_chunks_matryoshka.index")

with open("/content/drive/MyDrive/Project/Vector_DB/metadata_store_matryoshka.pkl", "wb") as f:
    pickle.dump(metadata_store, f)

print("FAISS index and metadata saved successfully.")


Total chunks ready for embedding: 66789


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Added batch 1 / 134
Added batch 2 / 134
Added batch 3 / 134
Added batch 4 / 134
Added batch 5 / 134
Added batch 6 / 134
Added batch 7 / 134
Added batch 8 / 134
Added batch 9 / 134
Added batch 10 / 134
Added batch 11 / 134
Added batch 12 / 134
Added batch 13 / 134
Added batch 14 / 134
Added batch 15 / 134
Added batch 16 / 134
Added batch 17 / 134
Added batch 18 / 134
Added batch 19 / 134
Added batch 20 / 134
Added batch 21 / 134
Added batch 22 / 134
Added batch 23 / 134
Added batch 24 / 134
Added batch 25 / 134
Added batch 26 / 134
Added batch 27 / 134
Added batch 28 / 134
Added batch 29 / 134
Added batch 30 / 134
Added batch 31 / 134
Added batch 32 / 134
Added batch 33 / 134
Added batch 34 / 134
Added batch 35 / 134
Added batch 36 / 134
Added batch 37 / 134
Added batch 38 / 134
Added batch 39 / 134
Added batch 40 / 134
Added batch 41 / 134
Added batch 42 / 134
Added batch 43 / 134
Added batch 44 / 134
Added batch 45 / 134
Added batch 46 / 134
Added batch 47 / 134
Added batch 48 / 134
A


---

**Query Legal Chunks via FAISS**

This cell loads a pre-trained FAISS index and metadata for legal text chunks. It normalizes a query, encodes it using the Arabic SentenceTransformer model, and retrieves the top-k most similar chunks. Each result shows a text preview and associated metadata.

---



In [None]:
!pip install faiss-cpu

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
import re

# -------------------- Load FAISS index and metadata --------------------
save_path_index = "/content/drive/MyDrive/Project/Vector_DB/law_chunks_matryoshka.index"
save_path_meta  = "/content/drive/MyDrive/Project/Vector_DB/metadata_store_matryoshka.pkl"

index = faiss.read_index(save_path_index)
with open(save_path_meta, "rb") as f:
    metadata_store = pickle.load(f)

# Load embedding model
model = SentenceTransformer("Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2")

# -------------------- Prepare Query --------------------
query = "مع من وقعت سلطنة عمان الاتفاقية حول الإعفاء المتبادل من التأشيرات، وفي أي مدينة تم التوقيع؟"

# -------------------- Arabic Normalization --------------------
def normalize_arabic(text: str) -> str:
    """
    Minimal normalization for chunks:
    - Remove tatweel (ـ)
    - Normalize alef variants (إأآا -> ا)
    - Convert final ya (ى) to ي, taa marbuta (ة) to ه
    - Collapse multiple whitespace
    """
    text = text.replace("ـ", "")
    text = re.sub(r"[إأآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ة", "ه", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Normalize query similarly to chunks
query_norm = normalize_arabic(query)

# -------------------- Encode Query --------------------
query_emb = model.encode([query_norm], convert_to_numpy=True)
query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)

# -------------------- Perform FAISS Search --------------------
k = 10  # number of top results
D, I = index.search(query_emb.astype(np.float32), k)

# -------------------- Display Results --------------------
print("\n--- Top results ---")
for dist, idx in zip(D[0], I[0]):
    print(f"Distance: {dist:.4f}")
    print(f"Text preview: {all_chunks[idx]['text'][:200]}...")
    print(f"Metadata: {metadata_store[idx]}")
    print("-"*50)



--- Top results ---
Distance: 0.7849
Text preview: [ماهیت: نص]وعلى الاتفاقية بين حكومة سلطنة عمان، وحكومة جمهورية الهند حول الإعفاء المتبادل من التأشيرة لحاملي جوازات السفر الدبلوماسية والخاصة والخدمة والرسمية، الموقعة في مدينة مسقط بتاريخ ١١ من فبراي...
Metadata: {'canonical_link': 'https://qanoon.om/p/2020/rd2020047/', 'chunk_index': 1, 'is_overlap': True, 'chunk_type': 'نص'}
--------------------------------------------------
Distance: 0.7720
Text preview: [ماهیت: نص]وعلى الاتفاقية بين حكومة سلطنة عمان وحكومة جمهورية بيلاروس حول الإعفاء المتبادل من التأشيرات، الموقعة في مدينة مينسك بتاريخ ٦ من أكتوبر ٢٠٢٥م،
وبناء على ما تقتضيه المصلحة العامة،
رسمنا بما ...
Metadata: {'canonical_link': 'https://qanoon.om/p/2025/rd2025098/', 'chunk_index': 1, 'is_overlap': True, 'chunk_type': 'نص'}
--------------------------------------------------
Distance: 0.7687
Text preview: [ماهیت: نص]وعلى الاتفاقية بين حكومة سلطنة عمان وحكومة جمهورية بولندا حول الإعفاء المتبادل من التأشيرة لحاملي جوازات السفر ال

In [10]:
D

array([[0.78490704, 0.7719861 , 0.7687116 , 0.7685536 , 0.75512844,
        0.7448718 , 0.7393601 , 0.73517203, 0.73517203, 0.72865516]],
      dtype=float32)

In [11]:
I

array([[ 9491,    11,  1224, 15629,  5663,   155,   247,  1017,  1013,
          427]])

In [None]:
I

array([[21296, 49061,  4583, 26899, 13164, 26358,  7428, 25602, 12169,
        18891]])


---


**Reranking Top FAISS Candidates with Cross-Encoder**

This cell reranks the top-k candidates returned by FAISS using a Cross-Encoder model. Steps include:

1. Load a pre-trained Cross-Encoder for Arabic legal text.
2. Normalize candidate texts and pair them with the query.
3. Compute Cross-Encoder relevance scores.
4. Compute FAISS ranks and combine them with Cross-Encoder ranks using a weighted formula.
5. Select and display the top-n final results with detailed metadata.

---



In [None]:
from sentence_transformers import CrossEncoder
import numpy as np
from scipy.stats import rankdata

# -------------------- Parameters --------------------
top_k = 50   # number of initial FAISS candidates
top_n = 5    # number of final top results
alpha = 0.7  # weight of Cross-Encoder relative to FAISS (0-1)

# -------------------- Load Cross-Encoder Model --------------------
cross_model = CrossEncoder("Omartificial-Intelligence-Space/ARA-Reranker-V1")

# -------------------- Prepare Candidate Chunks --------------------
candidate_idxs = I[0][:top_k]
candidate_texts = [all_chunks[idx]['text'] for idx in candidate_idxs]
candidate_meta = [metadata_store[idx] for idx in candidate_idxs]
candidate_distances = D[0][:top_k]  # FAISS similarity scores

# -------------------- Normalize Candidate Texts --------------------
candidate_texts_norm = [normalize_arabic(text) for text in candidate_texts]

# -------------------- Compute Cross-Encoder Scores --------------------
pairs = [[query_norm, text] for text in candidate_texts_norm]
cross_scores = cross_model.predict(pairs, show_progress_bar=True)
cross_scores = np.array(cross_scores, dtype=float)

# -------------------- Compute FAISS Ranks --------------------
# higher distance = higher similarity, so descending order
sorted_idx = np.argsort(-candidate_distances)
faiss_ranks = np.empty_like(sorted_idx)
for rank, idx in enumerate(sorted_idx):
    faiss_ranks[idx] = rank + 1  # 1-based ranking

# Alternative tie-safe ranking:
# faiss_ranks = rankdata(-candidate_distances, method='min')

# -------------------- Compute Cross-Encoder Ranks --------------------
cross_ranks = rankdata(-cross_scores, method='min')  # higher score = better rank

# -------------------- Weighted Combined Rank --------------------
combined_rank = alpha * cross_ranks + (1 - alpha) * faiss_ranks

# -------------------- Select Top-N Results --------------------
order = np.argsort(combined_rank)
selected = order[:top_n]

# -------------------- Prepare Final Results --------------------
reranked_results = []
for rank, i in enumerate(selected, start=1):
    reranked_results.append({
        "rank": rank,
        "text": candidate_texts[i],
        "metadata": candidate_meta[i],
        "faiss_distance": candidate_distances[i],
        "cross_score": float(cross_scores[i]),
        "faiss_rank": int(faiss_ranks[i]),
        "cross_rank": int(cross_ranks[i]),
        "combined_rank": float(combined_rank[i])
    })

# -------------------- Display Reranked Results --------------------
print(f"\n===== RERANKED TOP {top_n} RESULTS (Weighted Rank, Stable FAISS) =====\n")
for item in reranked_results:
    print(f"[Rank {item['rank']}] Combined Rank: {item['combined_rank']:.2f} | "
          f"FAISS Rank: {item['faiss_rank']} | Cross Rank: {item['cross_rank']} | "
          f"Cross Score: {item['cross_score']:.4f} | FAISS Distance: {item['faiss_distance']:.4f}")
    print(f"Text Preview: {item['text'][:300].replace(chr(10), ' ')}...")
    print(f"Metadata: {item['metadata']}")
    print("-"*80)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


===== RERANKED TOP 5 RESULTS (Weighted Rank, Stable FAISS) =====

[Rank 1] Combined Rank: 2.20 | FAISS Rank: 5 | Cross Rank: 1 | Cross Score: 0.9854 | FAISS Distance: 0.7551
Text Preview: [ماهیت: نص]وعلى الاتفاقية بين حكومة سلطنة عمان وحكومة الجمهورية العربية السورية حول الإعفاء المتبادل من التأشيرة لحاملي جوازات السفر الدبلوماسية والخاصة والخدمة، الموقعة في مدينة مسقط بتاريخ ٢١ من مارس ٢٠٢١م، وبناء على ما تقتضيه المصلحة العامة. رسمنا بما هو آت...
Metadata: {'canonical_link': 'https://qanoon.om/p/2021/rd2021035/', 'chunk_index': 1, 'is_overlap': True, 'chunk_type': 'نص'}
--------------------------------------------------------------------------------
[Rank 2] Combined Rank: 2.70 | FAISS Rank: 2 | Cross Rank: 3 | Cross Score: 0.9648 | FAISS Distance: 0.7720
Text Preview: [ماهیت: نص]وعلى الاتفاقية بين حكومة سلطنة عمان وحكومة جمهورية بيلاروس حول الإعفاء المتبادل من التأشيرات، الموقعة في مدينة مينسك بتاريخ ٦ من أكتوبر ٢٠٢٥م، وبناء على ما تقتضيه المصلحة العامة، رسمنا بما هو آت...
Metadata:

In [41]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.6.1-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.1



---


**LLM Legal Answer Generation with Ollama**

This cell sends a legal question and reranked retrieval results to an LLM (Ollama) for generating a structured, document-backed answer. It:

1. Loads API credentials from a `.env` file.
2. Mounts Google Drive to access project files.
3. Prepares a context prompt by combining the top-k reranked results.
4. Sends a detailed instruction prompt in Arabic to the LLM for a precise, professional, and source-cited answer.
5. Returns the LLM’s generated answer for display.

---


In [None]:
import os
from dotenv import load_dotenv
from ollama import Client
from google.colab import drive

# -------------------- Mount Google Drive --------------------
drive.mount('/content/drive')
env_path = "/content/drive/MyDrive/Project/.env"
load_dotenv(env_path)

# -------------------- Load API credentials --------------------
API_KEY = os.getenv("OLLAMA_API_KEY")
MODEL = os.getenv("OLLAMA_MODEL")

print("API_KEY:", API_KEY[:4], "...")  # show only first 4 chars
print("MODEL:", MODEL)

# -------------------- Initialize Ollama client --------------------
client = Client(
    host="https://ollama.com",
    headers={"Authorization": f"Bearer {API_KEY}"}
)

# -------------------- Function to query LLM --------------------
def llm_answer(question: str, reranked_results, top_k: int = 5) -> str:
    """
    Send a legal question and reranked results to the LLM.
    
    Parameters:
    - question: str, the user's legal question (Arabic)
    - reranked_results: list of dicts, output from weighted FAISS + Cross-Encoder reranking
    - top_k: number of top results to include in context
    
    Returns:
    - LLM-generated answer (str) with source citations.
    """
    # Select top-k documents
    selected_docs = reranked_results[:top_k]

    # Build context from reranked results
    context_blocks = []
    for item in selected_docs:
        block = (
            f"المرتبة: {item['rank']}\n"
            f"درجة الصلة: {item['combined_rank']:.4f}\n"
            f"المصدر: {item['metadata']}\n"
            f"النص:\n{item['text']}"
        )
        context_blocks.append(block)

    context = "\n\n======================\n\n".join(context_blocks)

    # Construct the prompt with detailed instructions
    prompt = (
        "أنت مساعد قانوني متخصص للإجابة على الأسئلة القانونية بدقة واحترافية. "
        "اتبع التعليمات التالية بدقة:\n"
        "1. اقرأ جميع الإجابات المسترجعة المرتبة حسب الصلة.\n"
        "2. حدد الإجابة التي تجيب فعلياً على السؤال، وليس فقط الأعلى تشابهًا أو درجة صلة. "
        "قبل اختيار أي إجابة، تحقق أن المعلومات فيها تجيب بشكل دقيق على السؤال، حتى لو كان لها درجة صلة أقل. "
        "إذا لم تجد أي إجابة دقيقة، أجب: 'متأسف، لم أجد أي إجابة مناسبة.'\n"
        "3. إذا كانت هناك أكثر من إجابة صحيحة، دمج المعلومات منها لصياغة رد رسمي وواضح، مدمج، وجاهز للقراءة. "
        "تجنب اختصار زائد الذي قد يحذف المعلومات الأساسية.\n"
        "4. أدرج المصادر لكل معلومة باستخدام بيانات التعريف (metadata) لكل إجابة، دائمًا بهذا التنسيق:\n"
        "   المصدر: [URL] (المرتبة: [Rank]، درجة الصلة: [Relevance Score])\n"
        "5. تجنب الهذيان أو إضافة معلومات غير موجودة في الإجابات المسترجعة.\n\n"
        "=== السياق ===\n"
        f"{context}\n\n"
        "=== السؤال ===\n"
        f"{question}\n\n"
        "=== الإجابة ===\n"
        "=== المصادر ===\n"
    )

    # Send prompt to Ollama LLM
    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )

    return response["message"]["content"]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
API_KEY: 3ab5 ...
MODEL: deepseek-v3.1:671b-cloud


In [46]:
question = "مع من وقعت سلطنة عمان الاتفاقية حول الإعفاء المتبادل من التأشيرات، وفي أي مدينة تم التوقيع؟"


answer = llm_answer(question, reranked_results, top_k=10)
print(answer)


بناء على المعلومات المسترجعة، تتضمن الإجابات تفاصيل عن عدة اتفاقيات وقعتها سلطنة عمان. الإجابة التي تعالج السؤال بشكل دقيق هي تلك التي تذكر الطرف الآخر ومدينة التوقيع. فيما يلي التفاصيل المستخلصة من المصادر المتاحة:

وقعّت سلطنة عمان اتفاقيات للإعفاء المتبادل من التأشيرات مع الحكومات التالية في المدن المذكورة:
*   **الجمهورية العربية السورية**: تم التوقيع في مدينة مسقط بتاريخ ٢١ مارس ٢٠٢١م.
*   **جمهورية بيلاروس**: تم التوقيع في مدينة مينسك بتاريخ ٦ أكتوبر ٢٠٢٥م.
*   **جمهورية الهند**: تم التوقيع في مدينة مسقط بتاريخ ١١ فبراير ٢٠١٨م.
*   **جمهورية المالديف**: تم التوقيع في مدينة مسقط بتاريخ ١١ ديسمبر ٢٠٢٤م.
*   **روسيا الاتحادية**: تم التوقيع في مدينة موسكو بتاريخ ٢٢ أبريل ٢٠٢٥م.

=== المصادر ===
المصدر: https://qanoon.om/p/2021/rd2021035/ (المرتبة: 1، درجة الصلة: 2.2000)
المصدر: https://qanoon.om/p/2025/rd2025098/ (المرتبة: 2، درجة الصلة: 2.7000)
المصدر: https://qanoon.om/p/2020/rd2020047/ (المرتبة: 3، درجة الصلة: 3.8000)
المصدر: https://qanoon.om/p/2025/rd2025019/ (المرتبة: 4، درجة ا