In [1]:
# 02_rag_openai.ipynb
# OpenAI-enabled version of the RAG pipeline.
# Same structure as the offline notebook, but adds:
# - OpenAI translation
# - OpenAI LLM answer synthesis
# - Cross-encoder reranking of retrieved chunks (local sentence-transformers model)

In [2]:
# ------------------------------------------------------
# Cell 1 — Install dependencies
# ------------------------------------------------------
# Uses the %pip magic so the packages are installed into the environment of
# the running kernel.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# releases that differ from the ones this notebook was developed against.
%pip install sentence-transformers faiss-cpu langdetect arabic-reshaper transformers torch openai python-bidi --quiet

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# -----------------------------
# Cell 2 — Imports
# -----------------------------
import os
import re
import glob
import json
from pathlib import Path


import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from langdetect import detect
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [None]:
# -----------------------------
# Cell 3 — Config
# -----------------------------
# Root folder that list_files() scans recursively for *.txt documents; the
# first sub-folder of each file's relative path becomes its category.
DATA_DIR = "../data"
# Folder reserved for this notebook's index artefacts; created if missing.
# NOTE(review): no visible cell writes into it yet — confirm it is still needed.
INDEX_DIR = "../index_openai"
os.makedirs(INDEX_DIR, exist_ok=True)


# SentenceTransformer checkpoint used to embed both corpus chunks and queries.
EMBED_MODEL = "intfloat/multilingual-e5-base"
# Default `size` of smart_chunk(); compared against len(), i.e. characters.
CHUNK_SIZE = 900
# Default `overlap` argument of smart_chunk().
CHUNK_OVERLAP = 150
# Categories that get a dedicated FAISS index (an extra "all" index is built
# alongside them).
CATEGORIES = ["business","culture","education","health","housing","info","justice","transportation"]


# OpenAI client
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
oi = OpenAI()

Sample document categories: ['education', 'business', 'housing', 'health', 'justice', 'info', 'culture', 'transportation']


In [None]:
# -----------------------------
# Cell 4 — Language tools
# -----------------------------
def detect_lang(text):
    """Return the language code detected for ``text``, defaulting to ``"ar"``.

    Args:
        text: The string to classify. ``None``, empty and whitespace-only
            values are accepted and short-circuit to the default.

    Returns:
        The code returned by ``langdetect.detect`` (e.g. ``"en"``), or
        ``"ar"`` when there is nothing to analyse or detection fails.
    """
    # Nothing to analyse: skip the detector and use the pipeline default.
    if not text or not str(text).strip():
        return "ar"
    try:
        return detect(text)
    except Exception:
        # Detection is best-effort (it fails on e.g. digit-only input), but a
        # bare `except:` would also swallow KeyboardInterrupt / SystemExit.
        return "ar"


# Offline translation fallback: tokenizer and model are both loaded from the
# same M2M100 checkpoint.
_M2M_CHECKPOINT = "facebook/m2m100_418M"
m2m_tokenizer = M2M100Tokenizer.from_pretrained(_M2M_CHECKPOINT)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(_M2M_CHECKPOINT)


def translate_local(text, src="en", tgt="ar"):
    """Translate ``text`` from ``src`` to ``tgt`` with the local M2M100 model.

    Args:
        text: Source text to translate.
        src: Language code assigned to the tokenizer as the source language.
        tgt: Language code whose token id is forced as the first generated
            token.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    m2m_tokenizer.src_lang = src
    model_inputs = m2m_tokenizer(text, return_tensors="pt")
    target_token_id = m2m_tokenizer.get_lang_id(tgt)
    output_ids = m2m_model.generate(**model_inputs, forced_bos_token_id=target_token_id)
    decoded = m2m_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]


# OpenAI translation (preferred)
def translate_openai(text, tgt="ar", src=None):
    """Translate ``text`` into ``tgt`` with OpenAI, falling back to M2M100.

    Args:
        text: Text to translate.
        tgt: Target language code, interpolated into the prompt.
        src: Source language code, used only by the local fallback (the
            OpenAI prompt does not need it). When omitted it is inferred for
            this notebook's Arabic/English pipeline: ``"ar"`` when the target
            is English, ``"en"`` otherwise.

    Returns:
        The translated text.
    """
    try:
        resp = oi.responses.create(
            model="gpt-4.1-mini",
            input=f"Translate to {tgt}: {text}"
        )
        return resp.output_text.strip()
    except Exception as exc:
        # The fallback used to assume English input unconditionally, so a
        # failed Arabic -> English call degraded into an en -> en "translation".
        if src is None:
            src = "ar" if tgt == "en" else "en"
        # Make the degradation visible instead of swallowing it silently.
        print(f"OpenAI translation failed ({exc!r}); using local M2M100 ({src} -> {tgt}).")
        return translate_local(text, src, tgt)

In [None]:
# -----------------------------
# Cell 5 — Arabic normalization
# -----------------------------
def normalize_arabic(text):
    """Fold Arabic orthographic variants and strip everything but letters/digits.

    Steps, in order:
      1. Alef variants (hamza above/below, madda) become a bare alef.
      2. Alef maqsura and yeh-with-hamza become yeh; waw-with-hamza becomes waw.
      3. Diacritics (U+064B-U+0652) and tatweel (U+0640) are deleted.
      4. Arabic-Indic digits (U+0660-U+0669) are mapped to ASCII digits.
      5. Any remaining character that is not an ASCII letter/digit, an Arabic
         letter or whitespace is replaced by a single space.

    Args:
        text: Input string (Arabic, Latin or mixed).

    Returns:
        The normalized string with leading/trailing whitespace removed.
        Internal runs of whitespace are left as they are.
    """
    text = re.sub(r"[أإآا]", "ا", text)
    text = text.replace("ى", "ي").replace("ئ", "ي").replace("ؤ", "و")
    # Tatweel is pure elongation inside a word: delete it together with the
    # diacritics, otherwise the whitelist below turns it into a space and
    # splits the word into fragments.
    text = re.sub(r"[\u0640\u064B-\u0652]", "", text)
    # Keep numbers written with Arabic-Indic digits instead of dropping them.
    text = text.translate({0x0660 + digit: str(digit) for digit in range(10)})
    # Whitelist: ASCII alphanumerics, Arabic letters hamza..ghain
    # (U+0621-U+063A) and feh..yeh (U+0641-U+064A), and whitespace. The three
    # letters folded in step 2 fall inside these ranges but cannot occur here.
    text = re.sub(r"[^0-9A-Za-z\u0621-\u063A\u0641-\u064A\s]", " ", text)
    return text.strip()

In [None]:
# -----------------------------
# Cell 6 — Load files
# -----------------------------
def list_files():
    """Return the sorted paths of every ``.txt`` file found under ``DATA_DIR``.

    The search is recursive, so files in nested category folders are included.
    """
    txt_pattern = os.path.join(DATA_DIR, "**/*.txt")
    matches = glob.glob(txt_pattern, recursive=True)
    matches.sort()
    return matches


files = list_files()
print("Files found:", len(files))


# One record per file. The category is the first folder below DATA_DIR; files
# that sit directly in DATA_DIR are filed under "info".
documents = []
for path in files:
    relative_parts = os.path.relpath(path, DATA_DIR).split(os.sep)
    if len(relative_parts) > 1:
        category = relative_parts[0]
    else:
        category = "info"
    text = Path(path).read_text(encoding="utf-8").strip()
    documents.append({
        "file": os.path.basename(path),
        "path": path,
        "category": category,
        "text": text,
    })

Sample length: 1379
Chunks sample: 2


In [None]:
# -----------------------------
# Cell 7 — Chunking
# -----------------------------
def para_split(text):
    """Split ``text`` into paragraphs separated by one or more blank lines.

    Each paragraph is stripped of surrounding whitespace; pieces that are
    empty after stripping are dropped.
    """
    paragraphs = []
    for piece in re.split(r"\n\s*\n", text):
        piece = piece.strip()
        if piece:
            paragraphs.append(piece)
    return paragraphs


def smart_chunk(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Pack the paragraphs of ``text`` into chunks of at most ``size`` characters.

    Paragraphs (as returned by ``para_split``) are joined with a blank line
    and packed greedily. A paragraph longer than ``size`` is first cut into
    sliding windows of ``size`` characters in which consecutive windows share
    ``overlap`` characters; paragraph-aligned chunks are not overlapped.

    Args:
        text: Document text.
        size: Maximum chunk length in characters (must be >= 1).
        overlap: Characters shared between consecutive windows of an
            oversized paragraph. Clamped to ``[0, size - 1]``.

    Returns:
        A list of non-empty chunk strings, each at most ``size`` characters.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    overlap = max(0, min(overlap, size - 1))
    step = size - overlap  # always >= 1 thanks to the clamp above

    # Pass 1: make sure every piece fits in a chunk on its own.
    pieces = []
    for para in para_split(text):
        if len(para) <= size:
            pieces.append(para)
            continue
        for start in range(0, len(para), step):
            window = para[start:start + size].strip()
            if window:
                pieces.append(window)
            if start + size >= len(para):
                break  # this window already reached the end of the paragraph

    # Pass 2: greedy packing. The separator is part of the candidate, so it
    # counts toward the size limit.
    chunks, buf = [], ""
    for piece in pieces:
        candidate = f"{buf}\n\n{piece}" if buf else piece
        if len(candidate) <= size:
            buf = candidate
        else:
            # `buf` is non-empty here: a lone piece always fits, so an empty
            # chunk can no longer be emitted.
            chunks.append(buf)
            buf = piece
    if buf:
        chunks.append(buf)
    return chunks


# Flat, index-aligned lists: corpus_chunks[i] is described by corpus_meta[i].
corpus_chunks = []
corpus_meta = []


for doc in documents:
    doc_chunks = smart_chunk(doc["text"])
    corpus_chunks.extend(doc_chunks)
    # chunk_id is the position of the chunk inside its own document.
    corpus_meta.extend(
        {"file": doc["file"], "category": doc["category"], "chunk_id": chunk_id}
        for chunk_id in range(len(doc_chunks))
    )


print("Total chunks:", len(corpus_chunks))

Total chunks: 44


In [None]:
# -----------------------------
# Cell 8 — Embeddings
# -----------------------------
model = SentenceTransformer(EMBED_MODEL)
print("Loaded embedding model.")


BATCH = 64
# E5 checkpoints are trained with role prefixes: documents must start with
# "passage: " and queries with "query: ". The file name and category stay in
# the text as lightweight context for the encoder.
prepared = [
    f"passage: document: {m['file']} category: {m['category']}\n\n{c}"
    for c, m in zip(corpus_chunks, corpus_meta)
]
if not prepared:
    # Fail here with a clear message rather than later on `emb.shape[1]`.
    raise RuntimeError("No chunks to embed: check DATA_DIR and the chunking cell.")

# Unit-length document vectors make the L2 ranking of the FAISS index
# equivalent to a cosine-similarity ranking, whatever the query norm is.
emb = model.encode(
    prepared,
    batch_size=BATCH,
    show_progress_bar=True,
    normalize_embeddings=True,
)
emb = np.asarray(emb, dtype=np.float32)

Loading model: intfloat/multilingual-e5-large


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded


In [None]:
# -----------------------------
# Cell 9 — FAISS indexes
# -----------------------------
# One flat L2 index per known category plus a global "all" index. Each entry
# keeps "ids": the global chunk id stored at every local row of its index.
dim = emb.shape[1]
indexes = {cat: {"index": faiss.IndexFlatL2(dim), "ids": []} for cat in CATEGORIES}
indexes["all"] = {"index": faiss.IndexFlatL2(dim), "ids": []}


# Every chunk goes into "all" exactly once. Chunks with an unknown category
# used to be routed to "all" as their category and were then added to it a
# second time, which duplicated them in search results.
for i, meta in enumerate(corpus_meta):
    indexes["all"]["ids"].append(i)
    if meta["category"] in CATEGORIES:
        indexes[meta["category"]]["ids"].append(i)


# Add the vectors of each index in one batch, in the same order as its ids.
for entry in indexes.values():
    if entry["ids"]:
        entry["index"].add(np.ascontiguousarray(emb[entry["ids"]], dtype=np.float32))

In [None]:
# -----------------------------
# Cell 10 — Category detection
# -----------------------------
def detect_category(q):
    """Route a query to a category by Arabic keyword matching.

    Categories are checked in a fixed priority order and the first one with a
    keyword occurring as a substring of the query wins.

    The query and the keywords are passed through the same letter folding
    that ``normalize_arabic`` applies (alef variants to bare alef, alef
    maqsura / yeh-hamza to yeh, waw-hamza to waw). ``retrieve`` calls this
    function with already-normalized text, so without the folding keywords
    spelled with a hamza-alef or an alef maqsura could never match.

    Args:
        q: The query text, raw or normalized.

    Returns:
        One of the category names, or ``"all"`` when no keyword matches.
    """
    fold = str.maketrans({
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0622": "\u0627",  # alef with madda       -> alef
        "\u0649": "\u064A",  # alef maqsura          -> yeh
        "\u0626": "\u064A",  # yeh with hamza        -> yeh
        "\u0624": "\u0648",  # waw with hamza        -> waw
    })
    routes = (
        ("transportation", ["ليموزين","مواصلات","شحن","طرد"]),
        ("education", ["جامعة","مقررات","قبول","كشف"]),
        ("health", ["طب","حمد","صحي"]),
        ("business", ["سجل","ترخيص","تمويل","قرض"]),
        ("justice", ["محكمة","دعوى","مرافعة"]),
        ("housing", ["سند","إسكان"]),
        ("info", ["شارك","استبيان","مشاركة"]),
    )
    q = q.lower().translate(fold)
    for category, keywords in routes:
        if any(k.translate(fold) in q for k in keywords):
            return category
    return "all"

Batches: 100%|██████████| 1/1 [00:54<00:00, 54.83s/it]

Embeddings shape: (44, 1024)





In [None]:
# -----------------------------
# Cell 11 — Retrieval (with reranker)
# -----------------------------
# Multilingual mMARCO cross-encoder. The (query, chunk) pairs scored in
# retrieve() are Arabic, which the English-only MS MARCO MiniLM family handles
# poorly, and "cross-encoder/ms-marco-mminiLM-L-6-v2" does not appear to be a
# published checkpoint id.
reranker = CrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1")




def retrieve(query, top_k=5):
    """Retrieve and rerank the corpus chunks most relevant to ``query``.

    Pipeline: detect the language, translate English queries to Arabic,
    normalize, route to a category index, run a vector search, then reorder
    the hits with the cross-encoder.

    Args:
        query: User question, Arabic or English.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``{"meta": ..., "text": ...}`` dicts, best match first.
        Empty when ``top_k`` is not positive or nothing is indexed.
    """
    lang = detect_lang(query)

    # Step 1 — Translate to Arabic if English
    query_ar = translate_openai(query, tgt="ar") if lang == "en" else query
    query_ar = normalize_arabic(query_ar)

    # Step 2 — Category routing; an empty category index falls back to the
    # global one instead of yielding no candidates.
    cat = detect_category(query_ar)
    idx = indexes.get(cat, indexes["all"])
    if idx["index"].ntotal == 0:
        idx = indexes["all"]
    if top_k <= 0 or idx["index"].ntotal == 0:
        return []

    # Step 3 — Embedding search. Asking for more neighbours than the index
    # holds only produces -1 padding, so clamp k up front.
    k = min(top_k, idx["index"].ntotal)
    q_emb = np.asarray(model.encode(["query: " + query_ar]), dtype=np.float32)
    _, neighbours = idx["index"].search(q_emb, k)

    raw_results = []
    for local_i in neighbours[0]:
        if local_i < 0:  # FAISS marks missing neighbours with -1
            continue
        global_id = idx["ids"][local_i]
        raw_results.append({
            "meta": corpus_meta[global_id],
            "text": corpus_chunks[global_id],
        })
    if not raw_results:
        # Do not hand the cross-encoder an empty batch.
        return []

    # Step 4 — Cross-encoder rerank
    pairs = [(query_ar, r["text"]) for r in raw_results]
    scores = reranker.predict(pairs)

    reranked = sorted(zip(raw_results, scores), key=lambda x: x[1], reverse=True)
    return [r for r, _ in reranked]

In [None]:
# -----------------------------
# Cell 12 — LLM answer synthesis
# -----------------------------
def synthesize_answer(query, retrieved):
    """Ask the OpenAI model to answer ``query`` from the retrieved chunks.

    Args:
        query: The user's question, inserted into the prompt as given.
        retrieved: Iterable of dicts with a ``"text"`` key; the texts are
            joined with blank lines to form the prompt context.

    Returns:
        ``output_text`` of the OpenAI response.
    """
    chunks_text = "\n\n".join(r["text"] for r in retrieved)


    # Prompt (Arabic), roughly: "Answer the following question accurately
    # using the information in the texts. Question: ... Texts: ... Write the
    # answer in Modern Standard Arabic."
    # NOTE(review): an empty `retrieved` still sends a prompt with no context.
    prompt = f"""
    اجب عن السؤال التالي باستخدام المعلومات في النصوص بدقة.
   السؤال: {query}


    النصوص:
    {chunks_text}


    اكتب الجواب بالعربية الفصحى.
    """  


    resp = oi.responses.create(model="gpt-4.1-mini", input=prompt)
    return resp.output_text

In [None]:
# -----------------------------
# Cell 13 — Query handler
# -----------------------------
def answer(query):
    """Answer ``query`` end to end: retrieve, synthesize, translate if needed.

    The five best chunks are retrieved and the top three are passed to the
    LLM, which answers in Arabic. When the query is detected as English the
    Arabic answer is translated to English before being returned.
    """
    ranked = retrieve(query, top_k=5)
    top_context = ranked[:3]
    arabic_answer = synthesize_answer(query, top_context)

    asked_in_english = detect_lang(query) == "en"
    if not asked_in_english:
        return arabic_answer
    # English question: hand the answer back in English.
    return translate_openai(arabic_answer, tgt="en")

Indexes saved to ../index


In [None]:
# -----------------------------
# Cell 14 — Example
# -----------------------------
# Smoke test: an English question exercises translation in both directions.
example_query = "How do I rent a limousine in Qatar?"
print(answer(example_query))