Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions openkb/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
"""OpenKB CLI — command-line interface for the knowledge base workflow."""
from __future__ import annotations

# Silence import-time warnings (e.g. pydub's missing-ffmpeg warning, emitted
# when markitdown pulls it in). markitdown clobbers the warning filters during
# its own import, so we re-apply the filter after all of the imports below.
import warnings
warnings.filterwarnings("ignore")

import asyncio
import json
import logging
Expand Down Expand Up @@ -256,22 +262,23 @@ def init():
return

# Interactive prompts
click.echo("Pick an LLM in `provider/model` LiteLLM format:")
click.echo(" OpenAI: gpt-5.4-mini, gpt-5.4")
click.echo(" Anthropic: anthropic/claude-sonnet-4-6, anthropic/claude-opus-4-6")
click.echo(" Gemini: gemini/gemini-3.1-pro-preview, gemini/gemini-3-flash-preview")
click.echo(" Others: see https://docs.litellm.ai/docs/providers")
click.echo()
model = click.prompt(
f"Model (e.g. gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
f"Model (enter for default {DEFAULT_CONFIG['model']})",
default=DEFAULT_CONFIG["model"],
show_default=False,
)
language = click.prompt(
f"Language [default: {DEFAULT_CONFIG['language']}]",
default=DEFAULT_CONFIG["language"],
show_default=False,
)
pageindex_threshold = click.prompt(
f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]",
default=DEFAULT_CONFIG["pageindex_threshold"],
type=int,
api_key = click.prompt(
"LLM API Key (saved to .env, enter to skip)",
default="",
hide_input=True,
show_default=False,
)
).strip()
# Create directory structure
Path("raw").mkdir(exist_ok=True)
Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
Expand All @@ -290,12 +297,22 @@ def init():
openkb_dir.mkdir()
config = {
"model": model,
"language": language,
"pageindex_threshold": pageindex_threshold,
"language": DEFAULT_CONFIG["language"],
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
}
save_config(openkb_dir / "config.yaml", config)
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")

# Write the API key to a KB-local .env (mode 0600) if the user provided one;
# an existing .env is left untouched.
if api_key:
env_path = Path(".env")
if env_path.exists():
click.echo(".env already exists, skipping write. Add LLM_API_KEY manually if needed.")
else:
env_path.write_text(f"LLM_API_KEY={api_key}\n", encoding="utf-8")
os.chmod(env_path, 0o600)
click.echo("Saved LLM API key to .env.")

# Register this KB in the global config
register_kb(Path.cwd())

Expand Down
19 changes: 17 additions & 2 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
"structure": structure,
}

# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
# Write wiki/sources/ — per-page content
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = sources_dir / "images" / pdf_path.stem

from openkb.images import convert_pdf_to_pages
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)

all_pages: list = []
if pageindex_api_key:
# Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
# requires a page range, so pass "1-N", where N is the PDF's page count.
from openkb.converter import get_pdf_page_count
page_count = get_pdf_page_count(pdf_path)
try:
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
except Exception as exc:
logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)

if not all_pages:
if pageindex_api_key:
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)

(sources_dir / f"{pdf_path.stem}.json").write_text(
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ classifiers = [
]
keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
dependencies = [
"pageindex==0.3.0.dev0",
"pageindex==0.3.0.dev1",
"markitdown[all]",
"click>=8.0",
"watchdog>=3.0",
Expand Down