In [None]:
import json


# Location of the merged Q&A dataset; the code below slices it and calls
# .items() on each element, so it is expected to be a JSON array of objects.
file_path = "/content/merged_final.json"

with open(file_path, "r", encoding="utf-8") as source_file:
    data = json.loads(source_file.read())

print("Number of records:", len(data))

# Show the first two records field by field as a quick sanity check.
preview_records = data[:2]
for position, entry in enumerate(preview_records, start=1):
    print(f"\n--- Record {position} ---")
    for field_name in entry:
        print(f"{field_name}: {entry[field_name]}")


In [None]:
def make_doc_text(rec):
    """Render one record as a tagged, newline-separated text document.

    Each of the fields ``Topic``, ``question``, ``answer``, ``Example`` and
    ``Source`` that is present and truthy contributes one line of the form
    ``[Label] value``, in that fixed order. Missing or empty fields are
    skipped.

    Args:
        rec: Mapping holding the record's fields.

    Returns:
        The joined lines with surrounding whitespace stripped, or ``""`` when
        none of the fields has a value.
    """
    # (record key, label shown in the document), in output order.
    labelled_fields = (
        ("Topic", "Topic"),
        ("question", "Question"),
        ("answer", "Answer"),
        ("Example", "Example"),
        ("Source", "Source"),
    )
    lines = [
        f"[{label}] {rec[key]}"
        for key, label in labelled_fields
        if rec.get(key)
    ]
    return "\n".join(lines).strip()


# Preview the embedding text built from the first two records.
# Iterating over a slice instead of indexing data[0] and data[1] directly
# keeps this cell from raising IndexError when the dataset holds fewer than
# two records.
for i, rec in enumerate(data[:2]):
    doc_text = make_doc_text(rec)
    print(f"\n===== Document {i+1} ready for embedding =====\n")
    print(doc_text)


In [None]:
import json



output_path = "/content/zatca_docs.jsonl"

# (metadata key, record key) pairs copied into each document's metadata;
# a field missing from the record is stored as an empty string.
metadata_fields = (
    ("topic", "Topic"),
    ("source", "Source"),
    ("date", "Date"),
    ("question", "question"),
    ("answer", "answer"),
)

# One JSON object per line: the embedding text plus the metadata needed to
# show or match the record later. ensure_ascii=False keeps non-ASCII text
# readable in the file instead of \u escapes.
with open(output_path, "w", encoding="utf-8") as out_file:
    for rec in data:
        entry = {
            "text": make_doc_text(rec),
            "metadata": {
                meta_key: rec.get(rec_key, "")
                for meta_key, rec_key in metadata_fields
            },
        }
        print(json.dumps(entry, ensure_ascii=False), file=out_file)

print("✅ File rebuilt successfully:", output_path)


In [None]:
!pip -q install faiss-cpu sentence-transformers ujson

import ujson, numpy as np, faiss
from sentence_transformers import SentenceTransformer
from pathlib import Path

# Input corpus and the output locations of the retrieval store.
DOCS_PATH  = "/content/zatca_docs.jsonl"
STORE_DIR  = "/content/rag_store"
INDEX_PATH = f"{STORE_DIR}/faiss_index.bin"
META_PATH  = f"{STORE_DIR}/metadata.jsonl"

Path(STORE_DIR).mkdir(parents=True, exist_ok=True)

# Read the corpus through a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() in a comprehension never
# closed it). Blank lines are skipped.
with open(DOCS_PATH, "r", encoding="utf-8") as docs_file:
    docs = [ujson.loads(line) for line in docs_file if line.strip()]
print("Number of documents:", len(docs))

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embeddings are L2-normalised, so the inner-product index below ranks by
# cosine similarity.
texts = [d["text"] for d in docs]
embs = model.encode(texts, normalize_embeddings=True).astype("float32")
index = faiss.IndexFlatIP(embs.shape[1])
index.add(embs)
faiss.write_index(index, INDEX_PATH)

# Metadata is written in the same order the vectors were added, so line i
# of the metadata file describes vector i of the index.
with open(META_PATH, "w", encoding="utf-8") as w:
    for d in docs:
        w.write(ujson.dumps(d, ensure_ascii=False) + "\n")

print(" Save completed:")
print(" Index:", INDEX_PATH)
print(" Metadata:", META_PATH)



In [None]:
import ujson, itertools
META_PATH = "/content/rag_store/metadata.jsonl"

# Parse only the first three lines of the metadata file; islice stops
# reading after them instead of loading the whole file.
with open(META_PATH, "r", encoding="utf-8") as meta_file:
    rows = [ujson.loads(raw_line) for raw_line in itertools.islice(meta_file, 3)]

# Confirm each sampled row carries the question/answer metadata.
for line_no, row in enumerate(rows, 1):
    md = row.get("metadata", {})
    print(f"\n--- Line {line_no} ---")
    print("keys in metadata:", list(md))
    for field in ("question", "answer"):
        print(f"{field}:", md.get(field))


In [None]:
import re, ujson, unicodedata, difflib, faiss, numpy as np
from sentence_transformers import SentenceTransformer

INDEX_PATH = "/content/rag_store/faiss_index.bin"
META_PATH  = "/content/rag_store/metadata.jsonl"

index = faiss.read_index(INDEX_PATH)

# Load the per-vector metadata through a context manager so the file handle
# is closed once the lines are parsed (the bare open() in a comprehension
# left it open). Blank lines are skipped.
with open(META_PATH, "r", encoding="utf-8") as meta_file:
    metas = [ujson.loads(line) for line in meta_file if line.strip()]

# Queries are encoded with this model in search_smart below.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def norm_ar(s: str) -> str:
    """Normalise Arabic text (with embedded Latin letters/digits) for matching.

    Steps, in order: NFKC normalisation, trimming and lower-casing; removal
    of invisible direction/joiner marks; removal of diacritics and tatweel;
    folding of alef variants, ta marbuta and alef maqsura to a single form;
    replacement of every other character with a space; whitespace collapse
    and a final trim.

    Two defects of the previous version are fixed here:

    * Punctuation is turned into spaces *after* the initial ``strip()``, so
      a trailing ``؟`` or ``?`` used to leave a trailing space and two
      questions differing only in punctuation never compared equal. The
      result is now stripped at the end.
    * Text was lower-cased but ``a-z`` was missing from the keep-list, so
      every Latin-only string collapsed to the same whitespace value. Latin
      letters are now preserved in lower case.

    Args:
        s: Text to normalise; ``None`` and ``""`` are accepted.

    Returns:
        The normalised string, ``""`` for empty input or when nothing but
        punctuation/whitespace remains.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s).strip().lower()
    # Invisible marks: ZWNJ, RLM, LRM and the directional isolate controls.
    s = re.sub(r"[\u200c\u200f\u200e\u2066-\u2069]", "", s)
    # Diacritics and tatweel are deleted outright (not replaced by a space)
    # so they do not split words.
    s = re.sub(r"[ًٌٍَُِّْـ]", "", s)
    # Fold spelling variants that are commonly interchanged.
    s = s.replace("أ","ا").replace("إ","ا").replace("آ","ا").replace("ة","ه").replace("ى","ي")
    # Keep Arabic letters, lower-case Latin letters, ASCII digits and
    # whitespace; anything else becomes a word separator.
    s = re.sub(r"[^ء-يa-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def _make_hit(mode, score, rec):
    """Build the result dict returned by search_smart for one stored record."""
    md = rec["metadata"]
    return {
        "mode": mode,
        "score": float(score),
        "topic": md.get("topic", ""),
        "source": md.get("source", ""),
        "answer": md.get("answer", ""),
        "preview": rec["text"][:220] + "...",
    }


def search_smart(query, top_k=3, fuzzy_threshold=0.92):
    """Look up ``query`` by exact, then fuzzy, then semantic matching.

    Uses the module-level ``metas`` (records with ``text`` and ``metadata``),
    ``model`` (sentence encoder) and ``index`` (FAISS index).

    1. Exact: the normalised query equals a stored question's normalised
       text. Returns that single record with score 1.0.
    2. Fuzzy: the stored question with the highest ``difflib`` similarity
       ratio is returned alone if the ratio reaches ``fuzzy_threshold``.
    3. Semantic: otherwise the ``top_k`` nearest vectors from the index.

    The lexical stages (1 and 2) are skipped when the normalised query is
    blank, and stored questions that normalise to blank are ignored.
    Previously two blank normalisations compared equal and produced a false
    "exact" hit, e.g. for a punctuation-only query.

    Args:
        query: User question text.
        top_k: Number of neighbours requested in the semantic stage.
        fuzzy_threshold: Minimum similarity ratio (0..1) for a fuzzy hit.

    Returns:
        A list of dicts with keys ``mode`` ("exact", "fuzzy" or "semantic"),
        ``score``, ``topic``, ``source``, ``answer`` and ``preview`` (the
        first 220 characters of the document text followed by "...").
    """
    qn = norm_ar(query)

    if qn.strip():
        best = None
        best_sc = 0.0
        for m in metas:
            q_meta = m.get("metadata", {}).get("question", "")
            if not q_meta:
                continue
            qm = norm_ar(q_meta)
            if not qm.strip():
                # Nothing comparable is left after normalisation.
                continue
            if qm == qn:
                return [_make_hit("exact", 1.0, m)]
            sc = difflib.SequenceMatcher(a=qn, b=qm).ratio()
            if sc > best_sc:
                best_sc, best = sc, m
        if best is not None and best_sc >= fuzzy_threshold:
            return [_make_hit("fuzzy", best_sc, best)]

    # Semantic fallback: encode the raw query the same way the corpus was
    # encoded (normalised float32 vectors) and search the index.
    qv = model.encode([query], normalize_embeddings=True).astype("float32")
    D, I = index.search(qv, top_k)
    out = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1:
            # Result slots the index could not fill are labelled -1.
            continue
        out.append(_make_hit("semantic", score, metas[idx]))
    return out

# Smoke-test the retriever with one Arabic question and print, for every
# hit, which stage matched it, its score, its topic and the stored answer.
demo_hits = search_smart("هل يمكن التحول بين الطرق التقديرية والحسابات", top_k=3)
for hit in demo_hits:
    report = (
        ("\nMode:", hit["mode"]),
        ("Score:", round(hit["score"],4)),
        ("Topic:", hit["topic"]),
        ("Answer:", hit["answer"]),
    )
    for label, shown in report:
        print(label, shown)


In [None]:
import faiss, ujson

# Reload the saved store from disk to confirm both files are readable.
index = faiss.read_index("/content/rag_store/faiss_index.bin")

# Use a context manager so the metadata file handle is closed after parsing
# (the bare open() in a comprehension left it open). Blank lines are skipped.
with open("/content/rag_store/metadata.jsonl", "r", encoding="utf-8") as meta_file:
    metas = [ujson.loads(line) for line in meta_file if line.strip()]

print("✅ Files loaded successfully!")
# The two counts below should match: one metadata record per stored vector.
print("Number of records:", len(metas))
print("Number of vectors in the index:", index.ntotal)



In [None]:
import shutil

# Pack the whole store directory into /content/rag_store.zip
# (make_archive appends the ".zip" extension to the base name).
archive_path = shutil.make_archive(
    base_name="/content/rag_store",
    format="zip",
    root_dir="/content/rag_store",
)

print(" ZIP file created successfully: rag_store.zip")



In [None]:
from google.colab import files
# Hand the zipped store to the Colab download helper.
zip_path = "/content/rag_store.zip"
files.download(zip_path)
