VectifyAI · rejojer · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,16 +1,6 @@
 .ipynb_checkpoints
 __pycache__
-files
-index
-temp/*
-chroma-collections.parquet
-chroma-embeddings.parquet
 .DS_Store
 .env*
 .venv/
-notebook
-SDK/*
-log/*
 logs/
-parts/*
-json_results/*
diff --git a/README.md b/README.md
@@ -105,7 +105,7 @@ The PageIndex service is available as a ChatGPT-style [chat platform](https://ch
 
 PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.
 
-Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
+Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents/results).
 
 ```jsonc
 ...

diff --git a/tests/pdfs/2023-annual-report-truncated.pdf → ...ocuments/2023-annual-report-truncated.pdf b/tests/pdfs/2023-annual-report-truncated.pdf → ...ocuments/2023-annual-report-truncated.pdf
diff --git a/tests/pdfs/2023-annual-report.pdf → examples/documents/2023-annual-report.pdf b/tests/pdfs/2023-annual-report.pdf → examples/documents/2023-annual-report.pdf
diff --git a/tests/pdfs/PRML.pdf → examples/documents/PRML.pdf b/tests/pdfs/PRML.pdf → examples/documents/PRML.pdf
diff --git a/...on Best Interest_Interpretive release.pdf → ...on Best Interest_Interpretive release.pdf b/...on Best Interest_Interpretive release.pdf → ...on Best Interest_Interpretive release.pdf
diff --git a/...egulation Best Interest_proposed rule.pdf → ...egulation Best Interest_proposed rule.pdf b/...egulation Best Interest_proposed rule.pdf → ...egulation Best Interest_proposed rule.pdf
diff --git a/examples/documents/attention-residuals.pdf b/examples/documents/attention-residuals.pdf
diff --git a/tests/pdfs/earthmover.pdf → examples/documents/earthmover.pdf b/tests/pdfs/earthmover.pdf → examples/documents/earthmover.pdf
diff --git a/tests/pdfs/four-lectures.pdf → examples/documents/four-lectures.pdf b/tests/pdfs/four-lectures.pdf → examples/documents/four-lectures.pdf
diff --git a/tests/pdfs/q1-fy25-earnings.pdf → examples/documents/q1-fy25-earnings.pdf b/tests/pdfs/q1-fy25-earnings.pdf → examples/documents/q1-fy25-earnings.pdf
diff --git a/...23-annual-report-truncated_structure.json → ...23-annual-report-truncated_structure.json b/...23-annual-report-truncated_structure.json → ...23-annual-report-truncated_structure.json
diff --git a/...results/2023-annual-report_structure.json → ...results/2023-annual-report_structure.json b/...results/2023-annual-report_structure.json → ...results/2023-annual-report_structure.json
diff --git a/tests/results/PRML_structure.json → ...les/documents/results/PRML_structure.json b/tests/results/PRML_structure.json → ...les/documents/results/PRML_structure.json
diff --git a/...erest_Interpretive release_structure.json → ...erest_Interpretive release_structure.json b/...erest_Interpretive release_structure.json → ...erest_Interpretive release_structure.json
diff --git a/...est Interest_proposed rule_structure.json → ...est Interest_proposed rule_structure.json b/...est Interest_proposed rule_structure.json → ...est Interest_proposed rule_structure.json
diff --git a/tests/results/earthmover_structure.json → ...cuments/results/earthmover_structure.json b/tests/results/earthmover_structure.json → ...cuments/results/earthmover_structure.json
diff --git a/tests/results/four-lectures_structure.json → ...ents/results/four-lectures_structure.json b/tests/results/four-lectures_structure.json → ...ents/results/four-lectures_structure.json
diff --git a/...s/results/q1-fy25-earnings_structure.json → ...s/results/q1-fy25-earnings_structure.json b/...s/results/q1-fy25-earnings_structure.json → ...s/results/q1-fy25-earnings_structure.json
diff --git a/examples/openai_agents_demo.py b/examples/openai_agents_demo.py
@@ -18,10 +18,10 @@
 """
 import os
 import sys
+import json
 import asyncio
 import concurrent.futures
 import requests
-from pathlib import Path
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
@@ -32,9 +32,10 @@
 from pageindex import PageIndexClient
 import pageindex.utils as utils
 
+_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
 PDF_URL = "https://arxiv.org/pdf/2603.15031"
-PDF_PATH = "tests/pdfs/attention-residuals.pdf"
-WORKSPACE = "./pageindex_workspace"
+PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
+WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")
 
 AGENT_SYSTEM_PROMPT = """
 You are PageIndex, a document QA assistant.
@@ -147,16 +148,16 @@ async def _run():
 print("=" * 60)
 print("Step 1: Indexing PDF and inspecting tree structure")
 print("=" * 60)
-_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
-if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:
+doc_id = next((did for did, doc in client.documents.items()
+                if doc.get('doc_name') == os.path.basename(PDF_PATH)), None)
+if doc_id:
     print(f"\nLoaded cached doc_id: {doc_id}")
 else:
     doc_id = client.index(PDF_PATH)
-    _id_cache.parent.mkdir(parents=True, exist_ok=True)
-    _id_cache.write_text(doc_id)
     print(f"\nIndexed. doc_id: {doc_id}")
 print("\nTree Structure (top-level sections):")
-utils.print_tree(client.documents[doc_id]["structure"])
+structure = json.loads(client.get_document_structure(doc_id))
+utils.print_tree(structure)
 
 # ── Step 2: Document Metadata ──────────────────────────────────────────────────
 print("\n" + "=" * 60)

diff --git a/examples/workspace/12345678-abcd-4321-abcd-123456789abc.json b/examples/workspace/12345678-abcd-4321-abcd-123456789abc.json
diff --git a/examples/workspace/_meta.json b/examples/workspace/_meta.json
@@ -0,0 +1,9 @@
+{
+  "12345678-abcd-4321-abcd-123456789abc": {
+    "type": "pdf",
+    "doc_name": "attention-residuals.pdf",
+    "doc_description": "This document introduces \"Attention Residuals\" (AttnRes) and its scalable variant \"Block AttnRes,\" novel mechanisms for replacing fixed residual accumulation in neural networks with learned, input-dependent depth-wise attention, addressing limitations of standard residual connections while optimizing memory, computation, and scalability for large-scale training and inference.",
+    "page_count": 21,
+    "path": "../documents/attention-residuals.pdf"
+  }
+}
diff --git a/pageindex/client.py b/pageindex/client.py
@@ -5,10 +5,15 @@
 import concurrent.futures
 from pathlib import Path
 
+import PyPDF2
+
 from .page_index import page_index
 from .page_index_md import md_to_tree
 from .retrieve import get_document, get_document_structure, get_page_content
-from .utils import ConfigLoader
+from .utils import ConfigLoader, remove_fields
+
+META_INDEX = "_meta.json"
+
 
 class PageIndexClient:
     """
@@ -39,6 +44,9 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str =
 
     def index(self, file_path: str, mode: str = "auto") -> str:
         """Index a document. Returns a document_id."""
+        # Persist a canonical absolute path so workspace reloads do not
+        # reinterpret caller-relative paths against the workspace directory.
+        file_path = os.path.abspath(os.path.expanduser(file_path))
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"File not found: {file_path}")
 
@@ -58,13 +66,22 @@ def index(self, file_path: str, mode: str = "auto") -> str:
                 if_add_node_id='yes',
                 if_add_doc_description='yes'
             )
+            # Extract per-page text so queries don't need the original PDF
+            pages = []
+            with open(file_path, 'rb') as f:
+                pdf_reader = PyPDF2.PdfReader(f)
+                for i, page in enumerate(pdf_reader.pages, 1):
+                    pages.append({'page': i, 'content': page.extract_text() or ''})
+
             self.documents[doc_id] = {
                 'id': doc_id,
-                'path': file_path,
                 'type': 'pdf',
-                'structure': result['structure'],
+                'path': file_path,
                 'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', '')
+                'doc_description': result.get('doc_description', ''),
+                'page_count': len(pages),
+                'structure': result['structure'],
+                'pages': pages,
             }
 
         elif mode == "md" or (mode == "auto" and is_md):
@@ -87,11 +104,12 @@ def index(self, file_path: str, mode: str = "auto") -> str:
                 result = asyncio.run(coro)
             self.documents[doc_id] = {
                 'id': doc_id,
-                'path': file_path,
                 'type': 'md',
-                'structure': result['structure'],
+                'path': file_path,
                 'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', '')
+                'doc_description': result.get('doc_description', ''),
+                'line_count': result.get('line_count', 0),
+                'structure': result['structure'],
             }
         else:
             raise ValueError(f"Unsupported file format for: {file_path}")
@@ -101,32 +119,106 @@ def index(self, file_path: str, mode: str = "auto") -> str:
             self._save_doc(doc_id)
         return doc_id
 
+    @staticmethod
+    def _make_meta_entry(doc: dict) -> dict:
+        """Build a lightweight meta entry from a document dict.
+
+        The entry carries only 'type', 'doc_name', 'doc_description' and
+        'path' (each defaulting to '' when absent), plus 'page_count' for
+        PDFs or 'line_count' for Markdown. Heavy fields such as 'structure'
+        and 'pages' are never copied into it.
+        """
+        entry = {
+            'type': doc.get('type', ''),
+            'doc_name': doc.get('doc_name', ''),
+            'doc_description': doc.get('doc_description', ''),
+            'path': doc.get('path', ''),
+        }
+        # The size field depends on the document type; any other type gets none.
+        if doc.get('type') == 'pdf':
+            entry['page_count'] = doc.get('page_count')
+        elif doc.get('type') == 'md':
+            entry['line_count'] = doc.get('line_count')
+        return entry
+
+    @staticmethod
+    def _read_json(path) -> dict | None:
+        """Read a JSON file, returning None if it cannot be read or parsed.
+
+        A warning naming the file is printed on failure. The return value is
+        whatever ``json.load`` produces, so callers that require a JSON
+        object must check the type themselves.
+        """
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (ValueError, OSError) as e:
+            # ValueError covers json.JSONDecodeError as well as the
+            # UnicodeDecodeError raised for files that are not valid UTF-8,
+            # so a damaged file is skipped instead of aborting the caller.
+            print(f"Warning: corrupt {Path(path).name}: {e}")
+            return None
+
     def _save_doc(self, doc_id: str):
+        """Write one document to <workspace>/<doc_id>.json and update the meta index.
+
+        After saving, 'structure' and 'pages' are removed from the in-memory
+        entry so that only lightweight metadata stays resident.
+        """
+        # Shallow copy: the key reassignment below applies to the saved copy only.
+        doc = self.documents[doc_id].copy()
+        # Strip text from structure nodes — redundant with pages (PDF only)
+        if doc.get('structure') and doc.get('type') == 'pdf':
+            doc['structure'] = remove_fields(doc['structure'], fields=['text'])
         path = self.workspace / f"{doc_id}.json"
         with open(path, "w", encoding="utf-8") as f:
-            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)
+            json.dump(doc, f, ensure_ascii=False, indent=2)
+        self._save_meta(doc_id, self._make_meta_entry(doc))
+        # Drop heavy fields; will lazy-load on demand
+        self.documents[doc_id].pop('structure', None)
+        self.documents[doc_id].pop('pages', None)
 
-    def _load_workspace(self):
-        loaded = 0
+    def _rebuild_meta(self) -> dict:
+        """Scan individual doc JSON files and return a meta dict.
+
+        Every ``*.json`` file in the workspace except the meta index itself
+        is read; files that fail to load, are empty, or are not JSON objects
+        are skipped. The result maps each file stem to its meta entry.
+        """
+        meta = {}
         for path in self.workspace.glob("*.json"):
-            try:
-                with open(path, "r", encoding="utf-8") as f:
-                    doc = json.load(f)
-                self.documents[path.stem] = doc
-                loaded += 1
-            except (json.JSONDecodeError, OSError) as e:
-                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
-        if loaded:
-            print(f"Loaded {loaded} document(s) from workspace.")
+            if path.name == META_INDEX:
+                continue
+            doc = self._read_json(path)
+            if doc and isinstance(doc, dict):
+                meta[path.stem] = self._make_meta_entry(doc)
+        return meta
+
+    def _read_meta(self) -> dict | None:
+        """Read and validate _meta.json.
+
+        Returns the parsed index, or None when the file is missing,
+        unreadable, or not a JSON object.
+        """
+        meta_path = self.workspace / META_INDEX
+        # A workspace that has not been indexed yet simply has no index file;
+        # return early so _read_json does not report it as corrupt.
+        if not meta_path.exists():
+            return None
+        meta = self._read_json(meta_path)
+        if meta is not None and not isinstance(meta, dict):
+            print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
+            return None
+        return meta
+
+    def _save_meta(self, doc_id: str, entry: dict):
+        """Insert or replace one entry in the meta index and rewrite the file."""
+        # Start from the index on disk; if it is missing, invalid or empty,
+        # rebuild it from the per-document JSON files instead.
+        meta = self._read_meta() or self._rebuild_meta()
+        meta[doc_id] = entry
+        meta_path = self.workspace / META_INDEX
+        with open(meta_path, "w", encoding="utf-8") as f:
+            json.dump(meta, f, ensure_ascii=False, indent=2)
+
+    def _load_workspace(self):
+        """Populate self.documents with metadata-only entries from the workspace.
+
+        Only the fields stored in the meta index are loaded here, plus an
+        'id' key set to the document id.
+        """
+        meta = self._read_meta()
+        if meta is None:
+            # No usable index: derive one from the per-document JSON files.
+            # The rebuilt index is used in memory only; it is not written here.
+            meta = self._rebuild_meta()
+            if meta:
+                print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
+        for doc_id, entry in meta.items():
+            doc = dict(entry, id=doc_id)
+            # Relative paths are resolved against the workspace directory.
+            if doc.get('path') and not os.path.isabs(doc['path']):
+                doc['path'] = str((self.workspace / doc['path']).resolve())
+            self.documents[doc_id] = doc
+
+    def _ensure_doc_loaded(self, doc_id: str):
+        """Load full document JSON on demand (structure, pages, etc.).
+
+        Does nothing when the document is unknown, when its 'structure' is
+        already in memory, or when the per-document file cannot be read as a
+        non-empty JSON object.
+        """
+        doc = self.documents.get(doc_id)
+        if not doc or doc.get('structure') is not None:
+            return
+        full = self._read_json(self.workspace / f"{doc_id}.json")
+        # Anything other than a JSON object cannot hold 'structure' or
+        # 'pages'; bail out rather than fail on .get() below.
+        if not isinstance(full, dict) or not full:
+            return
+        doc['structure'] = full.get('structure', [])
+        # 'pages' is only attached when the file has a non-empty list.
+        if full.get('pages'):
+            doc['pages'] = full['pages']
 
     def get_document(self, doc_id: str) -> str:
         """Return document metadata JSON."""
         return get_document(self.documents, doc_id)
 
     def get_document_structure(self, doc_id: str) -> str:
         """Return document tree structure JSON (without text fields)."""
+        # With a workspace, saved documents keep only metadata in memory;
+        # load 'structure' from disk before delegating.
+        if self.workspace:
+            self._ensure_doc_loaded(doc_id)
         return get_document_structure(self.documents, doc_id)
 
     def get_page_content(self, doc_id: str, pages: str) -> str:
         """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
+        # With a workspace, saved documents keep only metadata in memory;
+        # load 'structure' and any cached 'pages' from disk before delegating.
+        if self.workspace:
+            self._ensure_doc_loaded(doc_id)
         return get_page_content(self.documents, doc_id, pages)
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
@@ -1095,11 +1095,13 @@ async def page_index_builder():
                 # Create a clean structure without unnecessary fields for description generation
                 clean_structure = create_clean_structure_for_description(structure)
                 doc_description = generate_doc_description(clean_structure, model=opt.model)
+                structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
                 return {
                     'doc_name': get_pdf_name(doc),
                     'doc_description': doc_description,
                     'structure': structure,
                 }
+        structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
         return {
             'doc_name': get_pdf_name(doc),
             'structure': structure,

diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
@@ -243,7 +243,8 @@ def clean_tree_for_output(tree_nodes):
 async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
     with open(md_path, 'r', encoding='utf-8') as f:
         markdown_content = f.read()
-
+    line_count = markdown_content.count('\n') + 1
+
     print(f"Extracting nodes from markdown...")
     node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
 
@@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
 
     if if_add_node_summary == 'yes':
         # Always include text for summary generation
-        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
 
         print(f"Generating summaries for each node...")
         tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
 
         if if_add_node_text == 'no':
             # Remove text after summary generation if not requested
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
 
         if if_add_doc_description == 'yes':
             print(f"Generating document description...")
@@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
             return {
                 'doc_name': os.path.splitext(os.path.basename(md_path))[0],
                 'doc_description': doc_description,
+                'line_count': line_count,
                 'structure': tree_structure,
             }
     else:
         # No summaries needed, format based on text preference
         if if_add_node_text == 'yes':
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
         else:
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
 
     return {
         'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+        'line_count': line_count,
         'structure': tree_structure,
     }
 
@@ -303,7 +306,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
 
     # MD_NAME = 'Detect-Order-Construct'
     MD_NAME = 'cognitive-load'
-    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
+    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')
 
 
     MODEL="gpt-4.1"

diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py
@@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:
 
 
 def _count_pages(doc_info: dict) -> int:
-    """Return total page count for a document."""
-    if doc_info.get('type') == 'pdf':
-        return get_number_of_pages(doc_info['path'])
-    # For MD, find max line_num across all nodes
-    max_line = 0
-    def _traverse(nodes):
-        nonlocal max_line
-        for node in nodes:
-            ln = node.get('line_num', 0)
-            if ln and ln > max_line:
-                max_line = ln
-            if node.get('nodes'):
-                _traverse(node['nodes'])
-    _traverse(doc_info.get('structure', []))
-    return max_line
+    """Return total page count for a PDF document.
+
+    Uses the stored 'page_count' when it is set, then the number of cached
+    'pages', and otherwise falls back to get_number_of_pages on
+    doc_info['path'].
+    """
+    if doc_info.get('page_count'):
+        return doc_info['page_count']
+    if doc_info.get('pages'):
+        return len(doc_info['pages'])
+    return get_number_of_pages(doc_info['path'])
 
 
 def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
-    """Extract text for specific PDF pages (1-indexed), opening the PDF once."""
+    """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
+    cached_pages = doc_info.get('pages')
+    if cached_pages:
+        page_map = {p['page']: p['content'] for p in cached_pages}
+        return [
+            {'page': p, 'content': page_map[p]}
+            for p in page_nums if p in page_map
+        ]
     path = doc_info['path']
     with open(path, 'rb') as f:
         pdf_reader = PyPDF2.PdfReader(f)
@@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
     if doc_info.get('type') == 'pdf':
         result['page_count'] = _count_pages(doc_info)
     else:
-        result['line_count'] = _count_pages(doc_info)
+        result['line_count'] = doc_info.get('line_count', 0)
     return json.dumps(result)