Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str =
if self.workspace:
self._load_workspace()

def index(self, file_path: str, mode: str = "auto") -> str:
def index(self, file_path: str, mode: str = "auto", checkpoint_dir: str = None, resume: bool = False) -> str:
"""Index a document. Returns a document_id."""
# Persist a canonical absolute path so workspace reloads do not
# reinterpret caller-relative paths against the workspace directory.
Expand All @@ -74,7 +74,9 @@ def index(self, file_path: str, mode: str = "auto") -> str:
if_add_node_summary='yes',
if_add_node_text='yes',
if_add_node_id='yes',
if_add_doc_description='yes'
if_add_doc_description='yes',
checkpoint_dir=checkpoint_dir,
resume='yes' if resume else None,
)
# Extract per-page text so queries don't need the original PDF
pages = []
Expand Down Expand Up @@ -104,7 +106,9 @@ def index(self, file_path: str, mode: str = "auto") -> str:
model=self.model,
if_add_doc_description='yes',
if_add_node_text='yes',
if_add_node_id='yes'
if_add_node_id='yes',
checkpoint_dir=checkpoint_dir,
resume=resume,
)
try:
asyncio.get_running_loop()
Expand Down
4 changes: 3 additions & 1 deletion pageindex/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ max_token_num_each_node: 20000
if_add_node_id: "yes"
if_add_node_summary: "yes"
if_add_doc_description: "no"
if_add_node_text: "no"
if_add_node_text: "no"
checkpoint_dir: null
resume: "no"
46 changes: 40 additions & 6 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,17 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
return toc_tree


def _save_checkpoint(data, path):
    """Write ``data`` as JSON to ``path`` via a temp file and rename.

    The payload is first serialized to a sibling ``<path>.tmp`` file and is
    only moved over ``path`` (with ``os.replace``) once serialization has
    fully succeeded, so an existing checkpoint is never overwritten by a
    partially written one.

    Args:
        data: JSON-serializable object to persist.
        path: Destination file path. A falsy value (``None`` or ``''``)
            disables checkpointing and makes the call a no-op.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON.
        OSError: If the directory or file cannot be created or written.
    """
    if not path:
        return

    # os.path.dirname() is '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create the parent when there is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original error
        # is what the caller needs to see, so cleanup failures are ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    print(f'Checkpoint saved: {path}')


def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

Expand All @@ -1080,38 +1091,61 @@ def page_index_main(doc, opt=None):
logger.info({'total_token': sum([page[1] for page in page_list])})

async def page_index_builder():
structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
checkpoint_dir = getattr(opt, 'checkpoint_dir', None)
resume = getattr(opt, 'resume', 'no') == 'yes'
doc_name = get_pdf_name(doc)

tree_ckpt = os.path.join(checkpoint_dir, doc_name + '_tree.json') if checkpoint_dir else None
summary_ckpt = os.path.join(checkpoint_dir, doc_name + '_summary.json') if checkpoint_dir else None

if resume and checkpoint_dir and summary_ckpt and os.path.isfile(summary_ckpt):
print(f'Resuming from summary checkpoint: {summary_ckpt}')
with open(summary_ckpt, 'r', encoding='utf-8') as f:
structure = json.load(f)
elif resume and checkpoint_dir and tree_ckpt and os.path.isfile(tree_ckpt):
print(f'Resuming from tree checkpoint: {tree_ckpt}')
with open(tree_ckpt, 'r', encoding='utf-8') as f:
structure = json.load(f)
else:
if resume and checkpoint_dir:
raise FileNotFoundError(
f"No checkpoint found in {checkpoint_dir} for '{doc_name}'. "
f"Expected: {tree_ckpt}")
structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
_save_checkpoint(structure, tree_ckpt)

if opt.if_add_node_id == 'yes':
write_node_id(structure)
write_node_id(structure)
if opt.if_add_node_text == 'yes':
add_node_text(structure, page_list)
if opt.if_add_node_summary == 'yes':
if opt.if_add_node_text == 'no':
add_node_text(structure, page_list)
await generate_summaries_for_structure(structure, model=opt.model)
_save_checkpoint(structure, summary_ckpt)
if opt.if_add_node_text == 'no':
remove_structure_text(structure)
if opt.if_add_doc_description == 'yes':
# Create a clean structure without unnecessary fields for description generation
clean_structure = create_clean_structure_for_description(structure)
doc_description = generate_doc_description(clean_structure, model=opt.model)
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'doc_name': doc_name,
'doc_description': doc_description,
'structure': structure,
}
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'doc_name': doc_name,
'structure': structure,
}

return asyncio.run(page_index_builder())


def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None,
checkpoint_dir=None, resume=None):

user_opt = {
arg: value for arg, value in locals().items()
Expand Down
41 changes: 33 additions & 8 deletions pageindex/page_index_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,26 @@ def clean_tree_for_output(tree_nodes):
return cleaned_nodes


async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
def _save_checkpoint_md(data, path):
    """Write ``data`` as JSON to ``path`` via a temp file and rename.

    The payload is serialized to a sibling ``<path>.tmp`` file first and is
    moved over ``path`` (with ``os.replace``) only after serialization has
    completed, so an existing checkpoint is never replaced by a partially
    written one.

    Args:
        data: JSON-serializable object to persist.
        path: Destination file path. A falsy value (``None`` or ``''``)
            disables checkpointing and makes the call a no-op.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON.
        OSError: If the directory or file cannot be created or written.
    """
    if not path:
        return

    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError; skip directory creation in that case.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    except BaseException:
        # Remove the partial temp file, then surface the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    print(f'Checkpoint saved: {path}')


async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes', checkpoint_dir=None, resume=False):
doc_name = os.path.splitext(os.path.basename(md_path))[0]
summary_ckpt = os.path.join(checkpoint_dir, doc_name + '_md_summary.json') if checkpoint_dir else None

if resume and checkpoint_dir and summary_ckpt and os.path.isfile(summary_ckpt):
print(f'Resuming from summary checkpoint: {summary_ckpt}')
with open(summary_ckpt, 'r', encoding='utf-8') as f:
return json.load(f)

with open(md_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()
line_count = markdown_content.count('\n') + 1
Expand All @@ -265,36 +284,42 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
print(f"Formatting tree structure...")

if if_add_node_summary == 'yes':
# Always include text for summary generation
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])

print(f"Generating summaries for each node...")
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)

if if_add_node_text == 'no':
# Remove text after summary generation if not requested
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])

if if_add_doc_description == 'yes':
print(f"Generating document description...")
# Create a clean structure without unnecessary fields for description generation
clean_structure = create_clean_structure_for_description(tree_structure)
doc_description = generate_doc_description(clean_structure, model=model)
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
result = {
'doc_name': doc_name,
'doc_description': doc_description,
'line_count': line_count,
'structure': tree_structure,
}
_save_checkpoint_md(result, summary_ckpt)
return result

result = {
'doc_name': doc_name,
'line_count': line_count,
'structure': tree_structure,
}
_save_checkpoint_md(result, summary_ckpt)
return result
else:
# No summaries needed, format based on text preference
if if_add_node_text == 'yes':
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
else:
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])

return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'doc_name': doc_name,
'line_count': line_count,
'structure': tree_structure,
}
Expand Down
22 changes: 19 additions & 3 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default=None,
help='Whether to add text to the node')


parser.add_argument('--checkpoint-dir', type=str, default=None,
help='Directory to save/load tree structure checkpoints. '
'When set, intermediate results are saved after expensive LLM calls '
'so processing can be resumed later with --resume, '
'or the checkpoint can be manually edited for correction.')
parser.add_argument('--resume', action='store_true', default=False,
help='Resume from a previously saved checkpoint instead of re-running '
'expensive LLM calls (requires --checkpoint-dir)')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
Expand All @@ -44,13 +53,16 @@
if args.pdf_path and args.md_path:
raise ValueError("Only one of --pdf_path or --md_path can be specified")

if args.resume and not args.checkpoint_dir:
raise ValueError("--resume requires --checkpoint-dir to be set")

if args.pdf_path:
# Validate PDF file
if not args.pdf_path.lower().endswith('.pdf'):
raise ValueError("PDF file must have .pdf extension")
if not os.path.isfile(args.pdf_path):
raise ValueError(f"PDF file not found: {args.pdf_path}")

# Process PDF file
user_opt = {
'model': args.model,
Expand All @@ -61,6 +73,8 @@
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'checkpoint_dir': args.checkpoint_dir,
'resume': 'yes' if args.resume else None,
}
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

Expand Down Expand Up @@ -117,7 +131,9 @@
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
if_add_node_id=opt.if_add_node_id,
checkpoint_dir=args.checkpoint_dir,
resume=args.resume,
))

print('Parsing done, saving to file...')
Expand Down