<a href="https://colab.research.google.com/github/Renmsd/portfoilo/blob/main/Gen%20Ai/LLM/RAG/reading_from_pdf_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pypdf sentence-transformers chromadb

Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3

In [None]:
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb, re, math
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [None]:
PDF_PATH = "/content/7606_0_2025-06-18_13-35-36_Ar.pdf"   # <-- set path

# 1) Extract text per page (keeps page numbers for citations)
def extract_pages(pdf_path):
    """Extract the text of every page of a PDF, one record per page.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <cleaned text>}``
        dicts in page order. Pages without extractable text get ``""``.
    """
    import unicodedata  # local import keeps this cell self-contained

    reader = PdfReader(pdf_path)
    pages = []
    for page_no, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        # The text layer of Arabic PDFs can come out as presentation-form code
        # points (U+FB50-U+FEFF) instead of the base letters people type.
        # NFKC folds them back so stored chunks match ordinary Arabic strings.
        # Run it before the whitespace collapse: NFKC also turns no-break
        # spaces into plain spaces.
        text = unicodedata.normalize("NFKC", text)
        text = re.sub(r"\s+", " ", text).strip()
        pages.append({"page": page_no, "text": text})
    return pages

pages = extract_pages(PDF_PATH)

# 2) Chunking (token-agnostic, word-based; simple & robust)
def chunk_page_text(text, max_words=350, overlap=60):
    """Split ``text`` into overlapping word windows.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text, or ``max_words < 1``, yields ``[]``.
    """
    words = text.split()
    if not words or max_words < 1:
        return []

    # Clamp the stride to [1, max_words]: a stride below 1 would never
    # advance, and a stride above max_words (negative overlap) would silently
    # drop the words between two windows.
    step = max(1, min(max_words, max_words - overlap))

    out = []
    for start in range(0, len(words), step):
        out.append(" ".join(words[start:start + max_words]))
        # Stop once a window reaches the last word; any later window would
        # only repeat words already contained in this one.
        if start + max_words >= len(words):
            break
    return out

# Three parallel lists, one entry per chunk, in the shape Chroma expects:
# the chunk text, its metadata, and a unique "<page>-<chunk>" id.
docs, metadatas, ids = [], [], []
for page_rec in pages:
    page_no = page_rec["page"]
    page_chunks = chunk_page_text(page_rec["text"], max_words=350, overlap=60)
    for chunk_no, chunk_text in enumerate(page_chunks):
        ids.append(f"{page_no:04d}-{chunk_no:04d}")
        docs.append(chunk_text)
        metadatas.append({"source": PDF_PATH, "page": page_no, "chunk": chunk_no})

# 3) Embeddings + Vector store (Chroma, local persistence)
emb_model = "intfloat/multilingual-e5-small"  # good for AR/EN
embed_fn = SentenceTransformerEmbeddingFunction(model_name=emb_model)
client = chromadb.PersistentClient(path="./chroma_db")   # folder for persistence
coll = client.get_or_create_collection(
    name="pdf_kb",
    embedding_function=embed_fn,
    metadata={"hnsw:space": "cosine"},  # cosine distance: smaller = closer
)

# (Re)ingest. Remove every chunk previously stored for this PDF first, so a
# re-run with different chunking cannot leave stale ids behind in the
# persistent store (deleting only the current ids would miss them).
coll.delete(where={"source": PDF_PATH})

if docs:
    # E5 models expect "passage: " on indexed text and "query: " on questions.
    # Embed the prefixed text explicitly, but store the unprefixed chunk so
    # retrieved documents stay clean.
    passage_vectors = embed_fn([f"passage: {d}" for d in docs])
    coll.upsert(
        ids=ids,
        documents=docs,
        metadatas=metadatas,
        embeddings=passage_vectors,
    )
else:
    # e.g. a scanned PDF without a text layer: nothing to embed or store.
    print(f"No extractable text found in {PDF_PATH}; nothing was indexed.")

# 4) Retrieval helper
def retrieve(query, k=5, where=None):
    """Return the ``k`` chunks of ``coll`` closest to ``query``.

    Args:
        query: Question text; it is sent with the E5-style ``"query: "`` prefix.
        k: Number of results requested from Chroma.
        where: Optional metadata filter passed through to ``coll.query``.

    Returns:
        A list of ``{"id", "text", "meta", "score"}`` dicts in the order Chroma
        returns them. ``score`` is the distance, so smaller means closer.
    """
    res = coll.query(query_texts=[f"query: {query}"], n_results=k, where=where)
    # Each field holds one list per query text; exactly one query was sent.
    columns = (
        res["ids"][0],
        res["documents"][0],
        res["metadatas"][0],
        res["distances"][0],
    )
    return [
        {"id": hit_id, "text": hit_text, "meta": hit_meta, "score": hit_dist}
        for hit_id, hit_text, hit_meta, hit_dist in zip(*columns)
    ]

# 5) Try a query
# Smoke test with an English question: request up to 4 hits and print, for each,
# the page number, the first 180 characters of the chunk, and its distance score.
hits = retrieve("What does the report say about total revenues in Q1 2025?", k=4)
for h in hits:
    print(f"[p.{h['meta']['page']}] {h['text'][:180]}…  (score={h['score']:.3f})")

[p.2] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ﺷﺭﻛﺔ( )ﻣﺳﺎﻫﻣﺔ ( ﻣﺭﺍﺟﻌﺔ ﻏﻳﺭ )ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ ﻓﻲ ﺍﻟﻣﻧﺗﻬﻳﺔ ﺃﺷﻬﺭ ﺍﻟﺛﻼﺛﺔ ١٣ﻟﻔﺗﺭﺓ ٥٢٠٢ﻣﺎﺭﺱ ﺍﻟﺻﻔﺣﺎﺕ ﺍﻟﻣﺳﺗﻘﻝ ﺍﻟﻣﺭﺍﺟﻊ ﻓﺣﺹ ﺗﻘﺭﻳﺭ ١ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻲ ﺍﻟﻣﺭﻛ…  (score=0.225)
[p.1] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ( ﻣﺭﺍﺟﻌﺔ ﻏﻳﺭ )ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ ﻓﻲ ﺍﻟﻣﻧﺗﻬﻳﺔ ﺃﺷﻬﺭ ﺍﻟﺛﻼﺛﺔ ١٣ﻟﻔﺗﺭﺓ ٥٢٠٢ﻣﺎﺭﺱ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ ﻓﺣﺹ ﺣﻭﻝ ﻭﺗﻘﺭﻳﺭ…  (score=0.230)
[p.15] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ٣١ ٠١ ﺍﻟﻘﻁﺎﻋﻳﺔ ﻭﺍﻟﺗﻘﺎﺭﻳﺭ ﺍﻹﻳﺭﺍﺩﺍﺕ( ﺗﺗﻣﺔ) ﻋﻘﻭﺩ ﻣﻁﻠﻭﺑﺎ…  (score=0.231)
[p.9] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ٧ ٢( ﺗﺗﻣﺔ )ﺍﻹﻋﺩﺍﺩ ﺃﺳﺱ ٢-٣ ﺍﻟﺣﺎﻟﻳﺔ ﺍﻟﺗﻘﺭﻳﺭ ﻓﺗﺭﺓ ﻓﻲ ﺍﻟ…  (score=0.231)


In [None]:
# Same smoke test with an Arabic question (roughly: "How much were the contract liabilities?").
# Rebinds the global `hits`, which the next cell re-prints.
hits = retrieve("كم كانت مطلوبات العقود ", k=4)
for h in hits:
    print(f"[p.{h['meta']['page']}] {h['text'][:180]}…  (score={h['score']:.3f})")

[p.7] )٦١١٫٤١٢٫٥٣ ( )٧٨٣٫٦٦٢٫٨٩٣ ( ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ ﺍﻟﻧﻘﺩ ﻲﻓ ﺍﻟﺯﻳﺎﺩﺓ (ﺍﻟﻧﻘﺹ) ﻲﺎﻓﺻ )٠٧٣٫٩١١٫٧٣ ( ٠٩٠٫٩٦٩٫٢٦ ﺓﺍﻟﻔﺗﺭ ﺑﺩﺍﻳﺔ ﻲﻓ ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ ﻧﻘﺩ ٨٧٣٫٦٢٣٫٠٠٧٫١ ٩٩٣٫٨٨٤٫٠٥٤١ ﺍﻟﻔﺗﺭﺓ ﻧﻬﺎﻳﺔ ﻲﻓ ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ …  (score=0.149)
[p.4] ٢ ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻲ ﺍﻟﻣﺭﻛﺯ ﻗﺎﺋﻣﺔ ﺟﻣﻳﻊ()ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺇﻳﺿﺎﺡ ﻓﻲ ١٣ﻛﻣﺎ ﻣﺎﺭﺱ ﻓﻲ ١٣ﻛﻣﺎ ﺩﻳﺳﻣﺑﺭ ٥٢٠٢ ٤٢٠٢ ﺍﻟﻣﻭﺟﻭﺩﺍﺕ (ﻣﺭﺍﺟﻌ…  (score=0.155)
[p.13] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ١١ ٦ ﺇﻳﺟﺎﺭﻳﺔ ﻭﻣﻁﻠﻭﺑﺎﺕ ﺍﻻﺳﺗﺧﺩﺍﻡ ﺣﻖ ﻣﻭﺟﻭﺩﺍﺕ ٦-١ ﺍﻻﺳﺗﺧﺩ…  (score=0.158)
[p.15] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ٣١ ٠١ ﺍﻟﻘﻁﺎﻋﻳﺔ ﻭﺍﻟﺗﻘﺎﺭﻳﺭ ﺍﻹﻳﺭﺍﺩﺍﺕ( ﺗﺗﻣﺔ) ﻋﻘﻭﺩ ﻣﻁﻠﻭﺑﺎ…  (score=0.166)


In [None]:
import sys

# Re-print the hits with readable Arabic.
# An encode('utf-8').decode('utf-8') round-trip returns the identical string, so
# it cannot change what is displayed. The odd look comes from the code points
# themselves: the snippets consist of Arabic presentation forms rather than the
# base letters. NFKC normalization maps them back to base letters.
# NOTE(review): word and digit order in the snippets also looks reversed
# (visual order); NFKC does not address that -- confirm against the PDF.
import unicodedata

for h in hits:
    snippet = unicodedata.normalize("NFKC", h['text'])[:180]
    print(f"[p.{h['meta']['page']}] {snippet}…  (score={h['score']:.3f})")

[p.7] )٦١١٫٤١٢٫٥٣ ( )٧٨٣٫٦٦٢٫٨٩٣ ( ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ ﺍﻟﻧﻘﺩ ﻲﻓ ﺍﻟﺯﻳﺎﺩﺓ (ﺍﻟﻧﻘﺹ) ﻲﺎﻓﺻ )٠٧٣٫٩١١٫٧٣ ( ٠٩٠٫٩٦٩٫٢٦ ﺓﺍﻟﻔﺗﺭ ﺑﺩﺍﻳﺔ ﻲﻓ ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ ﻧﻘﺩ ٨٧٣٫٦٢٣٫٠٠٧٫١ ٩٩٣٫٨٨٤٫٠٥٤١ ﺍﻟﻔﺗﺭﺓ ﻧﻬﺎﻳﺔ ﻲﻓ ﻳﻣﺎﺛﻠﻪ ﻭﻣﺎ …  (score=0.149)
[p.4] ٢ ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻲ ﺍﻟﻣﺭﻛﺯ ﻗﺎﺋﻣﺔ ﺟﻣﻳﻊ()ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺇﻳﺿﺎﺡ ﻓﻲ ١٣ﻛﻣﺎ ﻣﺎﺭﺱ ﻓﻲ ١٣ﻛﻣﺎ ﺩﻳﺳﻣﺑﺭ ٥٢٠٢ ٤٢٠٢ ﺍﻟﻣﻭﺟﻭﺩﺍﺕ (ﻣﺭﺍﺟﻌ…  (score=0.155)
[p.13] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ١١ ٦ ﺇﻳﺟﺎﺭﻳﺔ ﻭﻣﻁﻠﻭﺑﺎﺕ ﺍﻻﺳﺗﺧﺩﺍﻡ ﺣﻖ ﻣﻭﺟﻭﺩﺍﺕ ٦-١ ﺍﻻﺳﺗﺧﺩ…  (score=0.158)
[p.15] ﻧﺎﺱ ﻁﻳﺭﺍﻥ ﺷﺭﻛﺔ ( ﻣﺳﺎﻫﻣﺔ ﺷﺭﻛﺔ) ﺣﻭﻝ ﺇﻳﺿﺎﺣﺎﺕ ﺍﻟﻣﻭﺟﺯﺓ ﺍﻷﻭﻟﻳﺔ ﺍﻟﻣﺎﻟﻳﺔ ﺍﻟﻣﻌﻠﻭﻣﺎﺕ (ﺫﻟﻙ ﻏﻳﺭ ﻳﺫﻛﺭ ﻟﻡ ﻣﺎ ﺍﻟﺳﻌﻭﺩﻳﺔ ﺑﺎﻟﺭﻳﺎﻻﺕ ﺍﻟﻣﺑﺎﻟﻎ ﺟﻣﻳﻊ ) ٣١ ٠١ ﺍﻟﻘﻁﺎﻋﻳﺔ ﻭﺍﻟﺗﻘﺎﺭﻳﺭ ﺍﻹﻳﺭﺍﺩﺍﺕ( ﺗﺗﻣﺔ) ﻋﻘﻭﺩ ﻣﻁﻠﻭﺑﺎ…  (score=0.166)


In [None]:
!pip install caas-jupyter-tools

[31mERROR: Could not find a version that satisfies the requirement caas-jupyter-tools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for caas-jupyter-tools[0m[31m
[0m

In [None]:
# PDF → Table rows (verbatim + normalized) extractor
# - Reads the uploaded Arabic PDF
# - Extracts tables per page using pdfplumber (if pdfplumber cannot be imported, falls back to PyPDF page text only, with no table rows)
# - Produces a cell-level CSV (one record per table cell) with both verbatim and digit-normalized text
# - Adds useful metadata: page, table_idx, row_idx, col_idx, context_before (first 200 characters of the page text)
#
# Files saved (relative to the current working directory):
# - pdf_table_rows.csv
# - pdf_page_text.csv
#
# The code will also preview a few extracted rows.

import os, re, json
from typing import List, Dict, Any
from IPython.display import display
import pandas as pd


PDF_PATH = "7606_0_2025-06-18_13-35-36_Ar.pdf"

# Digit normalization (Arabic-Indic to ASCII); also normalize decimal/group separators
# Translation table: Arabic-Indic digits -> ASCII digits, Arabic thousands and
# decimal separators -> "," and ".".
TRANS = str.maketrans("٠١٢٣٤٥٦٧٨٩٬٫", "0123456789,.")


def normalize_nums(s: str) -> str:
    """Return ``s`` with Arabic-Indic digits and separators converted to ASCII.

    ``None`` becomes ``""``; any other value is converted with ``str()`` first.
    """
    return "" if s is None else str(s).translate(TRANS)

def try_import_pdfplumber():
    """Return the ``pdfplumber`` module, or ``None`` if importing it fails."""
    try:
        import pdfplumber as plumber_module  # type: ignore
    except Exception:
        return None
    return plumber_module

def extract_with_pdfplumber(pdf_path: str):
    """Extract every table cell of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF to open.

    Returns:
        ``(rows, pages_meta)``. ``rows`` holds one dict per table cell (empty
        cells included) with its page/table/row/column indices, the verbatim
        and digit-normalized cell text, the whole row joined with ``" | "``
        (verbatim and normalized), and the first 200 characters of the page
        text as ``context_before``. ``pages_meta`` holds one
        ``{"page", "text"}`` dict per page with whitespace collapsed.
        Returns ``(None, None)`` when pdfplumber cannot be imported.
    """
    plumber = try_import_pdfplumber()
    if not plumber:
        return None, None

    cell_records = []
    page_records = []
    with plumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            # Flattened page text; its head doubles as context for each cell.
            page_text = re.sub(r"\s+", " ", (page.extract_text() or "").strip())
            page_records.append({"page": page_no, "text": page_text})

            # Table detection is best-effort: a failure means "no tables here".
            try:
                tables = page.extract_tables()
            except Exception:
                tables = []

            for table_no, table in enumerate(tables or []):
                # A table is a list of rows; a row is a list of cells (str/None).
                for row_no, row in enumerate(table):
                    cleaned = ["" if cell is None else str(cell).strip() for cell in row]
                    joined = " | ".join(cleaned)
                    joined_norm = normalize_nums(joined)
                    # One record per cell, each carrying the full row as well.
                    for col_no, cell_text in enumerate(cleaned):
                        cell_records.append({
                            "page": page_no,
                            "table_idx": table_no,
                            "row_idx": row_no,
                            "col_idx": col_no,
                            "cell_text_verbatim": cell_text,
                            "cell_text_normalized": normalize_nums(cell_text),
                            "row_join_verbatim": joined,
                            "row_join_normalized": joined_norm,
                            "context_before": page_text[:200],
                        })
    return cell_records, page_records

# Fallback: extract only page text using PyPDF if pdfplumber isn't available
def extract_with_pypdf(pdf_path: str):
    """Text-only extraction with PyPDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``([], pages_meta)``: no table rows, plus one ``{"page", "text"}`` dict
        per page (1-based page number, whitespace collapsed). Returns
        ``(None, None)`` when pypdf cannot be imported.
    """
    try:
        from pypdf import PdfReader
    except Exception:
        return None, None

    page_records = [
        {
            "page": number,
            "text": re.sub(r"\s+", " ", page.extract_text() or "").strip(),
        }
        for number, page in enumerate(PdfReader(pdf_path).pages, start=1)
    ]
    return [], page_records

# Structured extraction first; ``rows is None`` means pdfplumber could not be
# imported, in which case only page text is collected via PyPDF.
rows, pages_meta = extract_with_pdfplumber(PDF_PATH)
if rows is None:
    rows, pages_meta = extract_with_pypdf(PDF_PATH)

# Page texts are written unconditionally (for context / troubleshooting).
df_pages = pd.DataFrame(pages_meta if pages_meta else [])
df_pages.to_csv("pdf_page_text.csv", index=False, encoding="utf-8")

# With no extracted cells, write an empty frame that still carries the schema.
df_rows = (
    pd.DataFrame(rows)
    if rows
    else pd.DataFrame(columns=[
        "page", "table_idx", "row_idx", "col_idx",
        "cell_text_verbatim", "cell_text_normalized",
        "row_join_verbatim", "row_join_normalized", "context_before",
    ])
)
df_rows.to_csv("pdf_table_rows.csv", index=False, encoding="utf-8")

# Show the first rows in the notebook.
display(df_rows.head(40))

# Small summary; as the cell's last expression it is echoed by the notebook.
total_cells = int(df_rows.shape[0])
summary = {
    "pdf_path": PDF_PATH,
    "num_pages": len(df_pages),
    "num_table_cells": total_cells,
    "num_rows_previewed": int(min(40, total_cells)),
    "outputs": {
        "rows_csv": "pdf_table_rows.csv",
        "pages_csv": "pdf_page_text.csv",
    },
}
summary

Unnamed: 0,page,table_idx,row_idx,col_idx,cell_text_verbatim,cell_text_normalized,row_join_verbatim,row_join_normalized,context_before


{'pdf_path': '7606_0_2025-06-18_13-35-36_Ar.pdf',
 'num_pages': 18,
 'num_table_cells': 0,
 'num_rows_previewed': 0,
 'outputs': {'rows_csv': 'pdf_table_rows.csv',
  'pages_csv': 'pdf_page_text.csv'}}

In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Robust PDF table extractor with fallback (no caas_jupyter_tools required)
# - Tries pdfplumber for structured tables.
# - If none found, falls back to text-based "row" extraction from PyPDF:
#     * Keep lines that look like table rows (>=2 numbers or many delimiters)
#     * Split cells by multiple spaces / tabs / pipes
#     * Preserve verbatim Arabic, plus a digit-normalized version
# - Saves (relative to the current working directory):
#    pdf_table_rows.csv          (structured if possible, else fallback rows)
#    pdf_table_rows_fallback.csv (always saved for debugging)
#    pdf_page_text.csv
#
# Notes:
# - No dependency on caas_jupyter_tools.
# - Should work even if pdfplumber is missing.
# - Designed for Arabic/English mixed PDFs with numeric tables.
#
import os, re, csv
import pandas as pd

PDF_PATH = "7606_0_2025-06-18_13-35-36_Ar.pdf"

# Arabic-Indic → ASCII, also normalize separators (٬٫ → , .)
TRANS = str.maketrans("٠١٢٣٤٥٦٧٨٩٬٫", "0123456789,.")
def normalize_nums(s: str) -> str:
    """Return ``s`` with Arabic-Indic digits and separators converted to ASCII.

    ``None`` becomes ``""``. Any other value is converted with ``str()``
    first, so a non-string cell (e.g. an int) is translated instead of
    raising, and ``0`` yields ``"0"`` rather than ``""``.
    """
    if s is None:
        return ""
    return str(s).translate(TRANS)

def try_pdfplumber_tables(pdf_path):
    """Extract table rows and cells from a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF to open.

    Returns:
        ``(rows, pages_meta)``. For every table row, ``rows`` gets one
        row-level record (``col_idx == -1``, empty cell text) followed by one
        record per cell. Every record carries the page/table/row indices, the
        row joined with ``" | "`` (verbatim and digit-normalized), and the
        first 200 characters of the page text as ``context_before``.
        ``pages_meta`` holds one ``{"page", "text"}`` dict per page with
        whitespace collapsed. Returns ``([], [])`` when pdfplumber cannot be
        imported.
    """
    try:
        import pdfplumber
    except Exception:
        return [], []

    records = []
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            flat_text = re.sub(r"\s+", " ", (page.extract_text() or "").strip())
            page_texts.append({"page": page_no, "text": flat_text})
            snippet = flat_text[:200]

            # Table detection is best-effort: a failure means "no tables here".
            try:
                tables = page.extract_tables() or []
            except Exception:
                tables = []

            for table_no, table in enumerate(tables):
                for row_no, raw_row in enumerate(table):
                    cells = ["" if c is None else str(c).strip() for c in raw_row]
                    norm_cells = [normalize_nums(c) for c in cells]
                    joined = " | ".join(cells)
                    joined_norm = " | ".join(norm_cells)

                    # Row-level entry first (col_idx -1), then one per cell.
                    entries = [(-1, "", "")]
                    entries.extend(
                        (col_no, cell, norm)
                        for col_no, (cell, norm) in enumerate(zip(cells, norm_cells))
                    )
                    for col_no, verbatim, normalized in entries:
                        records.append({
                            "page": page_no,
                            "table_idx": table_no,
                            "row_idx": row_no,
                            "col_idx": col_no,
                            "cell_text_verbatim": verbatim,
                            "cell_text_normalized": normalized,
                            "row_join_verbatim": joined,
                            "row_join_normalized": joined_norm,
                            "context_before": snippet,
                        })
    return records, page_texts

def extract_text_pages(pdf_path):
    """Extract line-preserving page text with PyPDF (fallback path).

    Newlines and column gaps are kept because the text-based table heuristics
    in this cell detect and split columns on tabs and runs of 2+ spaces.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <text>}`` dicts,
        or ``[]`` when pypdf cannot be imported.
    """
    try:
        from pypdf import PdfReader
    except Exception:
        return []
    reader = PdfReader(pdf_path)
    pages = []
    for p_idx, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        # Treat CRLF as ONE line break; mapping "\r" alone would turn every
        # "\r\n" into a blank line and split each line into its own block.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        # Form feeds / vertical tabs carry no column information.
        text = re.sub(r"[\f\v]", " ", text)
        # Shrink long space runs to exactly two spaces and leave tabs alone,
        # so column gaps stay recognizable as delimiters downstream.
        text = re.sub(r" {2,}", "  ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        pages.append({"page": p_idx, "text": text.strip()})
    return pages

def looks_like_table_line(line: str) -> bool:
    """Heuristic: a line is 'tabular' if it contains >=2 numbers or many separators.

    Args:
        line: A single line of page text.

    Returns:
        ``True`` when the digit-normalized line holds at least two numbers, or
        the raw line holds at least two separator hits (pipe, bullet, dash or
        tab characters, plus runs of 2+ whitespace characters).
    """
    norm = normalize_nums(line)
    # `*` rather than `?`: a grouped number such as 1,234,567.89 is ONE number.
    # With `?` it matched as "1,234" and "567.89", so a single amount was
    # enough to flag a line as tabular.
    num_count = len(re.findall(r"\d+(?:[.,]\d+)*", norm))
    sep_count = len(re.findall(r"[|·•\-–—\t]", line)) + len(re.findall(r"\s{2,}", line))
    return (num_count >= 2) or (sep_count >= 2)

def split_cells(line: str):
    """Split a line into cells: on ``|`` if present, else on tabs / 2+ whitespace.

    Pipe-split cells are stripped but empty ones are kept; whitespace-split
    cells are stripped and empty ones dropped. If fewer than two cells result,
    the whole stripped line is returned as a single cell.
    """
    if "|" in line:
        cells = [piece.strip() for piece in line.split("|")]
    else:
        stripped = (piece.strip() for piece in re.split(r"\s{2,}|\t", line))
        cells = [piece for piece in stripped if piece]

    if len(cells) < 2:
        return [line.strip()]
    return cells

def build_fallback_rows(pages_meta):
    """Build table-like row and cell records from plain page text.

    Each page's text is split into blocks on blank lines. A block counts as a
    table when at least two of its lines, and at least half of them, look
    tabular according to ``looks_like_table_line``. Only those tabular lines
    are emitted.

    Args:
        pages_meta: Iterable of ``{"page", "text"}`` dicts whose ``text`` still
            contains newlines.

    Returns:
        A list of records. For every tabular line there is one row-level
        record (``col_idx == -1``, empty cell text) followed by one record per
        cell. ``table_idx`` is a 0-based counter of detected tables within the
        page, so it is stable across runs and unique per page.
    """
    rows = []
    for page in pages_meta:
        p = page["page"]
        text = page["text"]
        context = text[:200].replace("\n", " ")
        # Per-page counter. hash() of a string is salted per process, so a
        # hash-derived index changes between runs and can collide.
        table_idx = 0
        for block in text.split("\n\n"):
            lines = [l.strip() for l in block.split("\n") if l.strip()]
            tab_lines = [l for l in lines if looks_like_table_line(l)]
            if len(tab_lines) < max(2, len(lines) // 2):
                continue  # not enough tabular lines to treat as a table

            for r_idx, ln in enumerate(tab_lines):
                cells = split_cells(ln)
                norm_cells = [normalize_nums(c) for c in cells]
                row_verbatim = " | ".join(cells)
                row_norm = " | ".join(norm_cells)

                # Row-level entry first (col_idx -1), then one entry per cell.
                entries = [(-1, "", "")]
                entries.extend(
                    (c_idx, cell, norm)
                    for c_idx, (cell, norm) in enumerate(zip(cells, norm_cells))
                )
                for c_idx, cell, norm in entries:
                    rows.append({
                        "page": p, "table_idx": table_idx, "row_idx": r_idx, "col_idx": c_idx,
                        "cell_text_verbatim": cell,
                        "cell_text_normalized": norm,
                        "row_join_verbatim": row_verbatim,
                        "row_join_normalized": row_norm,
                        "context_before": context,
                    })
            table_idx += 1
    return rows

# Shared schema so that empty results still produce a CSV with a header row.
ROW_COLUMNS = [
    "page", "table_idx", "row_idx", "col_idx",
    "cell_text_verbatim", "cell_text_normalized",
    "row_join_verbatim", "row_join_normalized", "context_before",
]

# 1) Try structured extraction
rows_struct, pages_meta = try_pdfplumber_tables(PDF_PATH)

# 2) Line-preserving page text for the fallback.
#    The pdfplumber page text above has all whitespace (newlines included)
#    collapsed to single spaces, so every page is one long line and
#    build_fallback_rows() can never find a block with >=2 tabular lines.
#    Feed the fallback the PyPDF text instead, which keeps line breaks.
#    Cost: the PDF is parsed a second time.
text_pages = extract_text_pages(PDF_PATH)
if not pages_meta:
    pages_meta = text_pages

# Save page texts
pd.DataFrame(pages_meta, columns=["page", "text"]).to_csv(
    "pdf_page_text.csv", index=False, encoding="utf-8"
)

# 3) Fallback rows from line-preserving text (flattened text as a last resort)
rows_fallback = build_fallback_rows(text_pages or pages_meta)

# Prefer structured rows if any; else use fallback
rows_final = rows_struct if rows_struct else rows_fallback

# Persist both for debugging
pd.DataFrame(rows_fallback, columns=ROW_COLUMNS).to_csv(
    "pdf_table_rows_fallback.csv", index=False, encoding="utf-8"
)
pd.DataFrame(rows_final, columns=ROW_COLUMNS).to_csv(
    "pdf_table_rows.csv", index=False, encoding="utf-8"
)

# As the cell's last expression, the summary is echoed by the notebook.
summary = {
    "pages": len(pages_meta),
    "rows_structured": len(rows_struct),
    "rows_fallback": len(rows_fallback),
    "rows_final": len(rows_final),
    "outputs": {
        "rows_csv": "pdf_table_rows.csv",
        "rows_fallback_csv": "pdf_table_rows_fallback.csv",
        "pages_csv": "pdf_page_text.csv",
    }
}
summary


{'pages': 18,
 'rows_structured': 103,
 'rows_fallback': 0,
 'rows_final': 103,
 'outputs': {'rows_csv': 'pdf_table_rows.csv',
  'rows_fallback_csv': 'pdf_table_rows_fallback.csv',
  'pages_csv': 'pdf_page_text.csv'}}