In [1]:
# RAG Pipeline (Offline Version, No OpenAI Required)
# Notebook: 01_rag_no_openai.ipynb
# ------------------------------------------------------
# This notebook implements a full Arabic-focused RAG pipeline
# that uses ONLY local / open-source tools:
# - sentence-transformers (e5 multilingual)
# - HuggingFace M2M100 for translation
# - langdetect for language detection
# - FAISS for vector search
# - CrossEncoder (MS MARCO) for reranking
# - No OpenAI API used

In [2]:
# ------------------------------------------------------
# Cell 1 — Install dependencies
# ------------------------------------------------------
!pip install sentence-transformers faiss-cpu langdetect arabic-reshaper transformers torch --quiet


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# ------------------------------------------------------
# Cell 2 — Imports
# ------------------------------------------------------
import os
import re
import glob
import json
from pathlib import Path


import numpy as np
import faiss
from langdetect import detect
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

  from .autonotebook import tqdm as notebook_tqdm
  warn(
  warn(


In [4]:
# ------------------------------------------------------
# Cell 3 — Config
# ------------------------------------------------------
DATA_DIR = "../data"
INDEX_DIR = "../index_no_openai"
os.makedirs(INDEX_DIR, exist_ok=True)


EMBED_MODEL = "intfloat/multilingual-e5-base"
CHUNK_SIZE = 900
CHUNK_OVERLAP = 150


CATEGORIES = ["business","culture","education","health","housing","info","justice","transportation"]

In [5]:
# ------------------------------------------------------
# Cell 4 — File Loader
# ------------------------------------------------------

def list_text_files():
    files = sorted(glob.glob(os.path.join(DATA_DIR, "**/*.txt"), recursive=True))
    return files

files = list_text_files()
print(f"Loaded {len(files)} text files.")


Loaded 34 text files.


In [10]:
# ------------------------------------------------------
# Cell 5 — Read Documents into Memory
# ------------------------------------------------------

documents = []
for f in files:
    rel = os.path.relpath(f, DATA_DIR)
    parts = rel.split(os.sep)
    category = parts[0].lower() if len(parts) > 1 else "info"
    with open(f, 'r', encoding='utf-8') as fh:
        txt = fh.read().strip()
    documents.append({
        "path": f,
        "category": category,
        "filename": os.path.basename(f),
        "text": txt
    })

print("Categories:", set(d['category'] for d in documents))


Categories: {'culture', 'health', 'business', 'housing', 'transportation', 'education', 'info', 'justice'}


In [11]:
# ------------------------------------------------------
# Cell 6 — Paragraph + Smart Chunking
# ------------------------------------------------------

def para_split(text):
    paras = re.split(r"\n\s*\n", text)
    return [p.strip() for p in paras if p.strip()]

def smart_chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP, min_size=200):
    paras = para_split(text)
    chunks = []
    buffer = ""

    for p in paras:
        if len(buffer) + len(p) + 1 <= chunk_size:
            buffer = (buffer + "\n\n" + p).strip() if buffer else p
        else:
            if len(buffer) >= min_size:
                chunks.append(buffer)
            buffer = p

    if buffer and len(buffer) >= min_size:
        chunks.append(buffer)

    # Sliding window if needed
    final_chunks = []
    for ch in chunks:
        if len(ch) <= chunk_size:
            final_chunks.append(ch)
        else:
            start = 0
            while start < len(ch):
                end = start + chunk_size
                final_chunks.append(ch[start:end])
                if end >= len(ch):
                    break
                start = end - overlap

    return final_chunks


In [34]:
# ------------------------------------------------------
# Cell 7 — Build Corpus Chunks
# ------------------------------------------------------

corpus_chunks = []
corpus_meta = []

for doc in documents:
    chunks = smart_chunk_text(doc['text'])
    for i, ch in enumerate(chunks):
        corpus_chunks.append(ch)
        corpus_meta.append({
            "file": doc['filename'],
            "path": doc['path'],
            "category": doc['category'],
            "chunk_id": i
        })

print("Total chunks:", len(corpus_chunks))


Total chunks: 45


In [35]:
# ------------------------------------------------------
# Cell 8 — Load Embedding Model
# ------------------------------------------------------

model = SentenceTransformer(EMBED_MODEL)
print("Embedding model loaded.")

# FIX: Add keyword boosting
KEYWORDS = {
    "transportation": "مواصلات نقل مركبات سيارات ليموزين شحن طرود تراخيص سفر",
    "business": "تجارة شركات رخصة سجل تجاري تمويل شركة مستندات",
    "education": "جامعة طلاب مقررات تسجيل قبول كشوف درجات",
    "health": "طبي صحة تقرير حمد استشارة ترخيص ممارس",
    "justice": "محكمة دعوى مرافعة قضية قانون عدالة",
    "housing": "إسكان سند ملكية سكن منزل",
    "culture": "تصوير تلفزيون أفلام ترخيص إعلام",
    "info": "شارك استبيان استطلاع حكومي معلومات",
}

def prep(c, m):
    kw = KEYWORDS.get(m["category"], "")
    return (
        f"passage: document={m['file']} category={m['category']}\n"
        f"keywords: {kw}\n\n"
        f"{c}"
    )

prepared_chunks = [prep(c, m) for c, m in zip(corpus_chunks, corpus_meta)]


Embedding model loaded.


In [36]:
# ------------------------------------------------------
# Cell 9 — Embed Chunks
# ------------------------------------------------------


embeddings = model.encode(prepared_chunks, batch_size=64, show_progress_bar=True)
embeddings = embeddings.astype('float32')
print('Embeddings:', embeddings.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:17<00:00, 17.45s/it]

Embeddings: (45, 768)





In [37]:
# ------------------------------------------------------
# Cell 10 — Build Per-Category Indexes
# ------------------------------------------------------

indexes = {}
for cat in CATEGORIES + ["all"]:
    indexes[cat] = {
        "index": faiss.IndexFlatL2(embeddings.shape[1]),
        "ids": []
    }

for global_id, meta in enumerate(corpus_meta):
    vec = embeddings[global_id:global_id+1]
    cat = meta['category']
    indexes[cat]['index'].add(vec)
    indexes[cat]['ids'].append(global_id)
    indexes['all']['index'].add(vec)
    indexes['all']['ids'].append(global_id)

print("Indexes built.")


Indexes built.


In [38]:
# ------------------------------------------------------
# Cell 11 — Load Translation Model (M2M100)
# ------------------------------------------------------

# Ensure sentencepiece is installed for the tokenizer
!pip install sentencepiece --quiet

translator = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

print("Translator ready.")



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Translator ready.


In [39]:
# ------------------------------------------------------
# Cell 12 — Translation Helpers
# ------------------------------------------------------

def translate_to_ar(text):
    tokenizer.src_lang = "en"
    enc = tokenizer(text, return_tensors="pt")
    out = translator.generate(**enc, forced_bos_token_id=tokenizer.get_lang_id("ar"))
    return tokenizer.decode(out[0], skip_special_tokens=True)


def translate_to_en(text):
    tokenizer.src_lang = "ar"
    enc = tokenizer(text, return_tensors="pt")
    out = translator.generate(**enc, forced_bos_token_id=tokenizer.get_lang_id("en"))
    return tokenizer.decode(out[0], skip_special_tokens=True)


In [40]:
# ------------------------------------------------------
# Cell 13 — Arabic Normalization
# ------------------------------------------------------

import arabic_reshaper


def normalize_ar(text):
    text = re.sub(r"[ًٌٍَُِّْ]", "", text)  # remove diacritics
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي")
    return text.strip()


In [41]:
# ------------------------------------------------------
# Cell 14 — Category Detection (Rule-Based)
# ------------------------------------------------------


def detect_category(query):
    q = query
    if any(k in q for k in ['ليموزين','سيارة','مواصلات','شحن','طرود']): return 'transportation'
    if any(k in q for k in ['جامعة','مقررات','قبول','كشف','transcript']): return 'education'
    if any(k in q for k in ['طبيب','حمد','طبي','استشارة']): return 'health'
    if any(k in q for k in ['رخصة','سجل','تمويل','قرض','شركة']): return 'business'
    if any(k in q for k in ['ملكية','اسكان','سند']): return 'housing'
    if any(k in q for k in ['شارك','استبيان','استطلاع','مشاركة']): return 'info'
    if any(k in q for k in ['دعوى','مرافعة','محكمة']):  return 'justice'
    if any(k in q for k in ["limousine", "rent car", "car hire", "limo", "vehicle"]):  return "transportation"
    if any(k in q for k in ["university", "register", "courses"]):  return "education"
    return 'all'

In [42]:
# ------------------------------------------------------
# Cell 15 — Load Cross-Encoder Reranker
# ------------------------------------------------------


reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
print("Reranker loaded.")

Reranker loaded.


In [43]:
# ------------------------------------------------------
# Cell 16 — Retrieval Function (with Fix #2)
# ------------------------------------------------------

def retrieve(query, top_k=5, use_category=True):

    # 1 — Detect language
    lang = detect(query)
    original_query = query

    # 2 — Translate EN → AR
    if lang.startswith("en"):
        query = translate_to_ar(query)

    # 3 — Normalize Arabic
    query = normalize_ar(query)

    # 4 — Category detection
    cat = detect_category(query) if use_category else "all"
    index_obj = indexes.get(cat, indexes["all"])

    # 5 — Embed query
    q_embed = model.encode([f"query: {query}"], convert_to_numpy=True)
    q_embed = q_embed.astype("float32")

    # ------------------------------
    # FIX #2 — Retrieve more candidates
    # ------------------------------
    FAISS_K = 40   # instead of top_k (5)
    D, I = index_obj["index"].search(q_embed, FAISS_K)

    # 6 — Map to global chunk IDs
    candidates = []
    for local_id in I[0]:
        global_id = index_obj["ids"][local_id]
        candidates.append({
            "meta": corpus_meta[global_id],
            "text": corpus_chunks[global_id]
        })

    # 7 — Rerank with cross-encoder
    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)

    ranked = [
        x for _, x in sorted(
            zip(scores, candidates),
            key=lambda x: x[0],
            reverse=True
        )
    ]

    # return final best 3
    return ranked[:3]


In [44]:
# ------------------------------------------------------
# Cell 17 — Test
# ------------------------------------------------------

qs = [
    "How can I rent a limousine in Qatar?",  # English test
    "كيف اسجل في مقررات جامعة قطر؟",  # Arabic
    "ما هي رسوم تجديد رخصة مكتب سفر؟",
]

for q in qs:
    print("\n=== QUERY ===", q)
    results = retrieve(q, top_k=5)
    for r in results:
        print("FILE:", r['meta']['file'], "| CATEGORY:", r['meta']['category'])
        print(r['text'][:200].replace('\n', ' '), '...')



=== QUERY === How can I rent a limousine in Qatar?
FILE: transportation_fish_transport_permit.txt | CATEGORY: transportation
# طلب إصدار ترخيص وسيلة نقل أسماك  **مقدّم الخدمة:**   وزارة البلدية  **نوع الخدمة:**   خدمة ورقية – تقديم شخصي  ## وصف الخدمة يمكن لمالكي السفن وقوارب الصيد تقديم طلب إصدار ترخيص نقل أسماك داخل دولة  ...
FILE: transportation_ag_vehicle_circulation_request.txt | CATEGORY: transportation
# طلب تعميم على مركبة  **مقدّم الخدمة:**   النيابة العامة  **نوع الخدمة:**   خدمة إلكترونية – تقديم عبر الإنترنت  ## وصف الخدمة تتيح هذه الخدمة التعميم على مركبة عبر النيابة العامة.  ## الإرشادات - ال ...
FILE: transportation_qpost_cargo_service.txt | CATEGORY: transportation
# خدمة الشحن من "بريد قطر"  **مقدّم الخدمة:**   الشركة القطرية للخدمات البريدية (بريد قطر)  **نوع الخدمة:**   خدمة إلكترونية – تقديم عبر الإنترنت  ## وصف الخدمة تُقدِّم الشركةُ القطريةُ للخدمات البريد ...

=== QUERY === كيف اسجل في مقررات جامعة قطر؟
FILE: transportation_fish_transport_permit.txt | CATEGORY: 