Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
.ipynb_checkpoints
__pycache__
files
index
temp/*
chroma-collections.parquet
chroma-embeddings.parquet
.DS_Store
.env*
.venv/
notebook
SDK/*
log/*
logs/
parts/*
json_results/*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ The PageIndex service is available as a ChatGPT-style [chat platform](https://ch

PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.

Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents/results).

```jsonc
...
Expand Down
File renamed without changes.
Binary file added examples/documents/attention-residuals.pdf
Binary file not shown.
File renamed without changes.
File renamed without changes.
17 changes: 9 additions & 8 deletions examples/openai_agents_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
"""
import os
import sys
import json
import asyncio
import concurrent.futures
import requests
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

Expand All @@ -32,9 +32,10 @@
from pageindex import PageIndexClient
import pageindex.utils as utils

_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
PDF_URL = "https://arxiv.org/pdf/2603.15031"
PDF_PATH = "tests/pdfs/attention-residuals.pdf"
WORKSPACE = "./pageindex_workspace"
PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")

AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
Expand Down Expand Up @@ -147,16 +148,16 @@ async def _run():
print("=" * 60)
print("Step 1: Indexing PDF and inspecting tree structure")
print("=" * 60)
_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:
doc_id = next((did for did, doc in client.documents.items()
if doc.get('doc_name') == os.path.basename(PDF_PATH)), None)
if doc_id:
print(f"\nLoaded cached doc_id: {doc_id}")
else:
doc_id = client.index(PDF_PATH)
_id_cache.parent.mkdir(parents=True, exist_ok=True)
_id_cache.write_text(doc_id)
print(f"\nIndexed. doc_id: {doc_id}")
print("\nTree Structure (top-level sections):")
utils.print_tree(client.documents[doc_id]["structure"])
structure = json.loads(client.get_document_structure(doc_id))
utils.print_tree(structure)

# ── Step 2: Document Metadata ──────────────────────────────────────────────────
print("\n" + "=" * 60)
Expand Down
274 changes: 274 additions & 0 deletions examples/workspace/12345678-abcd-4321-abcd-123456789abc.json

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions examples/workspace/_meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"12345678-abcd-4321-abcd-123456789abc": {
"type": "pdf",
"doc_name": "attention-residuals.pdf",
"doc_description": "This document introduces \"Attention Residuals\" (AttnRes) and its scalable variant \"Block AttnRes,\" novel mechanisms for replacing fixed residual accumulation in neural networks with learned, input-dependent depth-wise attention, addressing limitations of standard residual connections while optimizing memory, computation, and scalability for large-scale training and inference.",
"page_count": 21,
"path": "../documents/attention-residuals.pdf"
}
}
130 changes: 111 additions & 19 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,15 @@
import concurrent.futures
from pathlib import Path

import PyPDF2

from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader
from .utils import ConfigLoader, remove_fields

META_INDEX = "_meta.json"


class PageIndexClient:
"""
Expand Down Expand Up @@ -39,6 +44,9 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str =

def index(self, file_path: str, mode: str = "auto") -> str:
"""Index a document. Returns a document_id."""
# Persist a canonical absolute path so workspace reloads do not
# reinterpret caller-relative paths against the workspace directory.
file_path = os.path.abspath(os.path.expanduser(file_path))
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")

Expand All @@ -58,13 +66,22 @@ def index(self, file_path: str, mode: str = "auto") -> str:
if_add_node_id='yes',
if_add_doc_description='yes'
)
# Extract per-page text so queries don't need the original PDF
pages = []
with open(file_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for i, page in enumerate(pdf_reader.pages, 1):
pages.append({'page': i, 'content': page.extract_text() or ''})

self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'pdf',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'page_count': len(pages),
'structure': result['structure'],
'pages': pages,
}

elif mode == "md" or (mode == "auto" and is_md):
Expand All @@ -87,11 +104,12 @@ def index(self, file_path: str, mode: str = "auto") -> str:
result = asyncio.run(coro)
self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'md',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'line_count': result.get('line_count', 0),
'structure': result['structure'],
}
else:
raise ValueError(f"Unsupported file format for: {file_path}")
Expand All @@ -101,32 +119,106 @@ def index(self, file_path: str, mode: str = "auto") -> str:
self._save_doc(doc_id)
return doc_id

@staticmethod
def _make_meta_entry(doc: dict) -> dict:
"""Build a lightweight meta entry from a document dict."""
entry = {
'type': doc.get('type', ''),
'doc_name': doc.get('doc_name', ''),
'doc_description': doc.get('doc_description', ''),
'path': doc.get('path', ''),
}
if doc.get('type') == 'pdf':
entry['page_count'] = doc.get('page_count')
elif doc.get('type') == 'md':
entry['line_count'] = doc.get('line_count')
return entry

@staticmethod
def _read_json(path) -> dict | None:
"""Read a JSON file, returning None on any error."""
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: corrupt {Path(path).name}: {e}")
return None

def _save_doc(self, doc_id: str):
    """Write one document to ``<workspace>/<doc_id>.json`` and index it.

    The in-memory entry is shallow-copied, the copy is written to disk,
    its lightweight summary is recorded through ``_save_meta``, and the
    ``structure`` and ``pages`` fields are then dropped from the in-memory
    entry.

    Args:
        doc_id: Key of the document in ``self.documents``.

    Raises:
        KeyError: If ``doc_id`` is not in ``self.documents``.
    """
    doc = self.documents[doc_id].copy()
    # Strip text from structure nodes — redundant with pages (PDF only).
    # NOTE(review): assumes remove_fields returns a new structure rather
    # than mutating the one still referenced by self.documents — confirm.
    if doc.get('structure') and doc.get('type') == 'pdf':
        doc['structure'] = remove_fields(doc['structure'], fields=['text'])
    path = self.workspace / f"{doc_id}.json"
    with open(path, "w", encoding="utf-8") as f:
        # Exactly one dump per file: a second dump on the same handle
        # would append another JSON document and make the file unparseable.
        json.dump(doc, f, ensure_ascii=False, indent=2)
    self._save_meta(doc_id, self._make_meta_entry(doc))
    # Drop heavy fields; will lazy-load on demand
    self.documents[doc_id].pop('structure', None)
    self.documents[doc_id].pop('pages', None)

def _load_workspace(self):
loaded = 0
def _rebuild_meta(self) -> dict:
    """Rebuild the meta index by scanning the per-document JSON files.

    Every ``*.json`` file in the workspace except the meta index itself is
    read through ``_read_json``; files that fail to load or do not hold a
    non-empty JSON object are ignored.

    Returns:
        A dict mapping each file stem (the document id) to its lightweight
        meta entry. ``self.documents`` is not modified.
    """
    meta = {}
    for path in self.workspace.glob("*.json"):
        # The index file lives beside the documents but is not one of them.
        if path.name == META_INDEX:
            continue
        doc = self._read_json(path)
        if doc and isinstance(doc, dict):
            meta[path.stem] = self._make_meta_entry(doc)
    return meta

def _read_meta(self) -> dict | None:
    """Load the workspace meta index, rejecting anything that is not an object.

    Returns:
        The decoded index when it is a dict, otherwise ``None`` (the file
        could not be loaded, or its top-level JSON value is not an object).
    """
    loaded = self._read_json(self.workspace / META_INDEX)
    if loaded is None or isinstance(loaded, dict):
        return loaded
    print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
    return None

def _save_meta(self, doc_id: str, entry: dict):
    """Insert or replace one entry in the meta index and rewrite the file.

    Args:
        doc_id: Document id used as the key in the index.
        entry: Lightweight meta record stored under ``doc_id``.
    """
    index = self._read_meta()
    if not index:
        # Missing, unusable or empty index: derive it from the doc files.
        index = self._rebuild_meta()
    index[doc_id] = entry
    serialized = json.dumps(index, ensure_ascii=False, indent=2)
    (self.workspace / META_INDEX).write_text(serialized, encoding="utf-8")

def _load_workspace(self):
meta = self._read_meta()
if meta is None:
meta = self._rebuild_meta()
if meta:
print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
for doc_id, entry in meta.items():
doc = dict(entry, id=doc_id)
if doc.get('path') and not os.path.isabs(doc['path']):
doc['path'] = str((self.workspace / doc['path']).resolve())
self.documents[doc_id] = doc

def _ensure_doc_loaded(self, doc_id: str):
"""Load full document JSON on demand (structure, pages, etc.)."""
doc = self.documents.get(doc_id)
if not doc or doc.get('structure') is not None:
return
full = self._read_json(self.workspace / f"{doc_id}.json")
if not full:
return
doc['structure'] = full.get('structure', [])
if full.get('pages'):
doc['pages'] = full['pages']

def get_document(self, doc_id: str) -> str:
    """Return the metadata of document ``doc_id`` as a JSON string.

    Delegates to the module-level ``get_document`` helper, passing the
    client's in-memory document registry.
    """
    registry = self.documents
    return get_document(registry, doc_id)

def get_document_structure(self, doc_id: str) -> str:
    """Return the tree structure of document ``doc_id`` as JSON (without text fields).

    When the client has a workspace, the document's heavy fields are
    loaded from disk first; the result comes from the module-level
    ``get_document_structure`` helper.
    """
    workspace_backed = bool(self.workspace)
    if workspace_backed:
        self._ensure_doc_loaded(doc_id)
    registry = self.documents
    return get_document_structure(registry, doc_id)

def get_page_content(self, doc_id: str, pages: str) -> str:
    """Return the content of the requested pages of document ``doc_id``.

    Args:
        doc_id: Key of the document in ``self.documents``.
        pages: Page selection string, e.g. ``'5-7'``, ``'3,8'`` or ``'12'``.

    When the client has a workspace, the document's heavy fields are
    loaded from disk first; the result comes from the module-level
    ``get_page_content`` helper.
    """
    workspace_backed = bool(self.workspace)
    if workspace_backed:
        self._ensure_doc_loaded(doc_id)
    registry = self.documents
    return get_page_content(registry, doc_id, pages)
2 changes: 2 additions & 0 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1095,11 +1095,13 @@ async def page_index_builder():
# Create a clean structure without unnecessary fields for description generation
clean_structure = create_clean_structure_for_description(structure)
doc_description = generate_doc_description(clean_structure, model=opt.model)
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'doc_description': doc_description,
'structure': structure,
}
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'structure': structure,
Expand Down
15 changes: 9 additions & 6 deletions pageindex/page_index_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,8 @@ def clean_tree_for_output(tree_nodes):
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
with open(md_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()

line_count = markdown_content.count('\n') + 1

print(f"Extracting nodes from markdown...")
node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

Expand All @@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad

if if_add_node_summary == 'yes':
# Always include text for summary generation
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])

print(f"Generating summaries for each node...")
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)

if if_add_node_text == 'no':
# Remove text after summary generation if not requested
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])

if if_add_doc_description == 'yes':
print(f"Generating document description...")
Expand All @@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'doc_description': doc_description,
'line_count': line_count,
'structure': tree_structure,
}
else:
# No summaries needed, format based on text preference
if if_add_node_text == 'yes':
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
else:
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])

return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'line_count': line_count,
'structure': tree_structure,
}

Expand All @@ -303,7 +306,7 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad

# MD_NAME = 'Detect-Order-Construct'
MD_NAME = 'cognitive-load'
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')


MODEL="gpt-4.1"
Expand Down
32 changes: 15 additions & 17 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:


def _count_pages(doc_info: dict) -> int:
"""Return total page count for a document."""
if doc_info.get('type') == 'pdf':
return get_number_of_pages(doc_info['path'])
# For MD, find max line_num across all nodes
max_line = 0
def _traverse(nodes):
nonlocal max_line
for node in nodes:
ln = node.get('line_num', 0)
if ln and ln > max_line:
max_line = ln
if node.get('nodes'):
_traverse(node['nodes'])
_traverse(doc_info.get('structure', []))
return max_line
"""Return total page count for a PDF document."""
if doc_info.get('page_count'):
return doc_info['page_count']
if doc_info.get('pages'):
return len(doc_info['pages'])
return get_number_of_pages(doc_info['path'])


def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
"""Extract text for specific PDF pages (1-indexed), opening the PDF once."""
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
cached_pages = doc_info.get('pages')
if cached_pages:
page_map = {p['page']: p['content'] for p in cached_pages}
return [
{'page': p, 'content': page_map[p]}
for p in page_nums if p in page_map
]
path = doc_info['path']
with open(path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
Expand Down Expand Up @@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
if doc_info.get('type') == 'pdf':
result['page_count'] = _count_pages(doc_info)
else:
result['line_count'] = _count_pages(doc_info)
result['line_count'] = doc_info.get('line_count', 0)
return json.dumps(result)


Expand Down
Loading