Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ update_job = client.jobs.create(
document = client.documents.get(document_id)
print(document.status)

chunks = client.documents.list_chunks(
document_id,
page=1,
page_size=50,
chunk_type="text",
)
print(chunks.pagination.total)
if chunks.chunks:
    chunk = client.documents.get_chunk(
        document_id,
        chunks.chunks[0].id,
        include_asset_urls=True,
    )
    print(chunk.chunk.content)

client.documents.archive(document_id)
```

Expand Down
16 changes: 16 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,22 @@ for document in document_list.documents:
document = client.documents.get("doc_123")
print(document.current_job_result_id)

chunks = client.documents.list_chunks(
"doc_123",
page=1,
page_size=50,
chunk_type="text",
)
for chunk in chunks.chunks:
    print(chunk.id, chunk.content)

image_chunk = client.documents.get_chunk(
"doc_123",
"dchk_123",
include_asset_urls=True,
)
print(image_chunk.chunk.asset_url)

archived = client.documents.archive("doc_123")
print(archived.status) # "archived"
```
Expand Down
15 changes: 14 additions & 1 deletion src/knowhere/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,15 @@
)
from knowhere._types import PollProgressCallback, UploadProgressCallback
from knowhere._version import __version__
from knowhere.types.document import Document, DocumentListResponse
from knowhere.types.document import (
Document,
DocumentChunk,
DocumentChunkListResponse,
DocumentChunkPagination,
DocumentChunkResponse,
DocumentChunkType,
DocumentListResponse,
)
from knowhere.types.job import Job, JobError, JobProgress, JobResult
from knowhere.types.params import ParsingParams, WebhookConfig
from knowhere.types.retrieval import (
Expand Down Expand Up @@ -98,6 +106,11 @@
"JobResult",
# Document types
"Document",
"DocumentChunk",
"DocumentChunkListResponse",
"DocumentChunkPagination",
"DocumentChunkResponse",
"DocumentChunkType",
"DocumentListResponse",
# Retrieval types
"RetrievalChannel",
Expand Down
119 changes: 118 additions & 1 deletion src/knowhere/resources/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
from typing import Any, Dict, Optional

from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
from knowhere.types.document import Document, DocumentListResponse
from knowhere.types.document import (
Document,
DocumentChunkListResponse,
DocumentChunkResponse,
DocumentChunkType,
DocumentListResponse,
)


class Documents(SyncAPIResource):
Expand All @@ -32,6 +38,49 @@ def get(self, document_id: str) -> Document:
cast_to=Document,
)

def list_chunks(
    self,
    document_id: str,
    *,
    page: int = 1,
    page_size: int = 50,
    chunk_type: Optional[DocumentChunkType] = None,
    include_asset_urls: bool = False,
) -> DocumentChunkListResponse:
    """Fetch one page of current-revision chunks for a canonical document.

    Sends ``GET v1/documents/{document_id}/chunks``. Query parameters are
    assembled by ``_build_chunk_list_params``, which leaves out any option
    still at its default; when nothing remains, ``params=None`` is passed
    to the request instead of an empty dict.

    Args:
        document_id: ID of the document whose chunks are listed.
        page: Page number to request.
        page_size: Number of chunks per page.
        chunk_type: Optional chunk-type filter.
        include_asset_urls: Forwarded as a query flag when true.

    Returns:
        The response parsed as ``DocumentChunkListResponse``.
    """
    query = _build_chunk_list_params(
        page=page,
        page_size=page_size,
        chunk_type=chunk_type,
        include_asset_urls=include_asset_urls,
    )
    endpoint = f"v1/documents/{document_id}/chunks"

    return self._request(
        "GET",
        endpoint,
        params=query if query else None,
        cast_to=DocumentChunkListResponse,
    )

def get_chunk(
    self,
    document_id: str,
    document_chunk_id: str,
    *,
    include_asset_urls: bool = False,
) -> DocumentChunkResponse:
    """Fetch a single current-revision chunk of a canonical document.

    Sends ``GET v1/documents/{document_id}/chunks/{document_chunk_id}``.
    ``include_asset_urls`` is sent as a query parameter only when true;
    otherwise the request carries no query parameters (``params=None``).

    Args:
        document_id: ID of the document that owns the chunk.
        document_chunk_id: ID of the chunk to fetch.
        include_asset_urls: Forwarded as a query flag when true.

    Returns:
        The response parsed as ``DocumentChunkResponse``.
    """
    query = _build_chunk_get_params(include_asset_urls=include_asset_urls)
    endpoint = f"v1/documents/{document_id}/chunks/{document_chunk_id}"

    return self._request(
        "GET",
        endpoint,
        params=query if query else None,
        cast_to=DocumentChunkResponse,
    )

def archive(self, document_id: str) -> Document:
"""Archive one canonical document by ID."""
return self._request(
Expand Down Expand Up @@ -65,10 +114,78 @@ async def get(self, document_id: str) -> Document:
cast_to=Document,
)

async def list_chunks(
    self,
    document_id: str,
    *,
    page: int = 1,
    page_size: int = 50,
    chunk_type: Optional[DocumentChunkType] = None,
    include_asset_urls: bool = False,
) -> DocumentChunkListResponse:
    """Asynchronously fetch one page of current-revision chunks for a document.

    Awaits ``GET v1/documents/{document_id}/chunks``. Query parameters are
    assembled by ``_build_chunk_list_params``, which leaves out any option
    still at its default; when nothing remains, ``params=None`` is passed
    to the request instead of an empty dict.

    Args:
        document_id: ID of the document whose chunks are listed.
        page: Page number to request.
        page_size: Number of chunks per page.
        chunk_type: Optional chunk-type filter.
        include_asset_urls: Forwarded as a query flag when true.

    Returns:
        The response parsed as ``DocumentChunkListResponse``.
    """
    query = _build_chunk_list_params(
        page=page,
        page_size=page_size,
        chunk_type=chunk_type,
        include_asset_urls=include_asset_urls,
    )
    endpoint = f"v1/documents/{document_id}/chunks"

    return await self._request(
        "GET",
        endpoint,
        params=query if query else None,
        cast_to=DocumentChunkListResponse,
    )

async def get_chunk(
    self,
    document_id: str,
    document_chunk_id: str,
    *,
    include_asset_urls: bool = False,
) -> DocumentChunkResponse:
    """Asynchronously fetch a single current-revision chunk of a document.

    Awaits ``GET v1/documents/{document_id}/chunks/{document_chunk_id}``.
    ``include_asset_urls`` is sent as a query parameter only when true;
    otherwise the request carries no query parameters (``params=None``).

    Args:
        document_id: ID of the document that owns the chunk.
        document_chunk_id: ID of the chunk to fetch.
        include_asset_urls: Forwarded as a query flag when true.

    Returns:
        The response parsed as ``DocumentChunkResponse``.
    """
    query = _build_chunk_get_params(include_asset_urls=include_asset_urls)
    endpoint = f"v1/documents/{document_id}/chunks/{document_chunk_id}"

    return await self._request(
        "GET",
        endpoint,
        params=query if query else None,
        cast_to=DocumentChunkResponse,
    )

async def archive(self, document_id: str) -> Document:
    """Archive the canonical document identified by ``document_id``.

    Awaits ``POST v1/documents/{document_id}/archive`` and returns the
    response parsed as ``Document``.
    """
    endpoint = f"v1/documents/{document_id}/archive"
    return await self._request("POST", endpoint, cast_to=Document)


def _build_chunk_list_params(
*,
page: int,
page_size: int,
chunk_type: Optional[DocumentChunkType],
include_asset_urls: bool,
) -> Dict[str, Any]:
params: Dict[str, Any] = {}
if page != 1:
params["page"] = page
if page_size != 50:
params["page_size"] = page_size
if chunk_type is not None:
params["chunk_type"] = chunk_type
if include_asset_urls:
params["include_asset_urls"] = True
return params


def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]:
if not include_asset_urls:
return {}
return {"include_asset_urls": True}
15 changes: 14 additions & 1 deletion src/knowhere/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@

from __future__ import annotations

from knowhere.types.document import Document, DocumentListResponse
from knowhere.types.document import (
Document,
DocumentChunk,
DocumentChunkListResponse,
DocumentChunkPagination,
DocumentChunkResponse,
DocumentChunkType,
DocumentListResponse,
)
from knowhere.types.job import Job, JobError, JobResult
from knowhere.types.params import ParsingParams, WebhookConfig
from knowhere.types.retrieval import (
Expand Down Expand Up @@ -39,6 +47,11 @@
"JobResult",
# document
"Document",
"DocumentChunk",
"DocumentChunkListResponse",
"DocumentChunkPagination",
"DocumentChunkResponse",
"DocumentChunkType",
"DocumentListResponse",
# retrieval
"RetrievalChannel",
Expand Down
52 changes: 51 additions & 1 deletion src/knowhere/types/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from datetime import datetime
from typing import Optional
from typing import Any, Dict, Literal, Optional

from pydantic import BaseModel

Expand All @@ -26,3 +26,53 @@ class DocumentListResponse(BaseModel):

namespace: str
documents: list[Document]


# Closed set of chunk kinds. Used both as the ``chunk_type`` filter accepted
# by ``list_chunks`` and as the type of ``DocumentChunk.chunk_type``.
DocumentChunkType = Literal["text", "image", "table"]


class DocumentChunkPagination(BaseModel):
    """Pagination metadata returned by document chunk list endpoints.

    Exposed as the ``pagination`` field of ``DocumentChunkListResponse``.
    All four fields are required.
    """

    # Page number of this response. NOTE(review): presumably 1-based, in line
    # with the client default ``page=1`` — confirm against the API.
    page: int
    # Page size reported for this response.
    page_size: int
    # NOTE(review): presumably the total number of matching chunks across all
    # pages, not just this one — confirm.
    total: int
    # NOTE(review): presumably the page count at the reported page_size — confirm.
    total_pages: int


class DocumentChunk(BaseModel):
    """One current-revision document chunk.

    ``id``, ``chunk_id``, ``chunk_type``, ``sort_order`` and ``metadata`` are
    required; every other field defaults to ``None`` when the payload omits it.
    """

    # NOTE(review): the README example passes this ``id`` to ``get_chunk`` as
    # the chunk identifier; how it differs from ``chunk_id`` is not visible
    # here — confirm against the API.
    id: str
    chunk_id: str
    # One of the ``DocumentChunkType`` literals.
    chunk_type: DocumentChunkType
    content: Optional[str] = None
    section_id: Optional[str] = None
    section_path: Optional[str] = None
    source_chunk_path: Optional[str] = None
    file_path: Optional[str] = None
    # NOTE(review): presumably the chunk's position within the document — confirm.
    sort_order: int
    # Free-form metadata mapping; required, with no default.
    metadata: Dict[str, Any]
    # NOTE(review): presumably only populated when the request sets
    # ``include_asset_urls=True`` — confirm.
    asset_url: Optional[str] = None
    created_at: Optional[datetime] = None


class DocumentChunkListResponse(BaseModel):
    """Response from ``GET /v1/documents/{document_id}/chunks``.

    Carries the returned chunks together with pagination metadata.
    ``job_result_id`` and ``job_id`` default to ``None`` when absent.
    """

    document_id: str
    namespace: str
    # NOTE(review): presumably the job result / job that produced the current
    # revision of the document — confirm.
    job_result_id: Optional[str] = None
    job_id: Optional[str] = None
    # Chunks returned for the requested page.
    chunks: list[DocumentChunk]
    pagination: DocumentChunkPagination


class DocumentChunkResponse(BaseModel):
    """Response from ``GET /v1/documents/{document_id}/chunks/{chunk_id}``.

    Wraps a single chunk with the identifiers of its owning document.
    ``job_result_id`` and ``job_id`` default to ``None`` when absent.
    """

    document_id: str
    namespace: str
    # NOTE(review): presumably the job result / job that produced the current
    # revision of the document — confirm.
    job_result_id: Optional[str] = None
    job_id: Optional[str] = None
    # The requested chunk.
    chunk: DocumentChunk
Loading
Loading