Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

- [docker](https://docs.docker.com/get-started/get-docker/) Docker
- [uv](https://docs.astral.sh/uv/) for Python package and environment management.
- **Poppler** – Install Poppler, required for PDF processing.

## Project Setup

Expand Down
7 changes: 5 additions & 2 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ ENV PYTHONUNBUFFERED=1
# Set working directory
WORKDIR /app/

# Install system dependencies
RUN apt-get update && apt-get install -y curl
# Install system dependencies (added poppler-utils)
RUN apt-get update && apt-get install -y \
curl \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*

# Install uv package manager
COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /uvx /bin/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""create doc transformation job table
Revision ID: 9f8a4af9d6fd
Revises: b5b9412d3d2a
Create Date: 2025-08-29 16:00:47.848950
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "9f8a4af9d6fd"
down_revision = "b5b9412d3d2a"
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``doc_transformation_job`` table.

    Tracks background document-format conversion jobs: each row links a
    source document to the transformed output document (filled in once the
    job completes) and records lifecycle status plus any failure message.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "doc_transformation_job",
        sa.Column("id", sa.Uuid(), nullable=False),
        # The uploaded document the job reads from; always required.
        sa.Column("source_document_id", sa.Uuid(), nullable=False),
        # The produced document; NULL until the job completes successfully.
        sa.Column("transformed_document_id", sa.Uuid(), nullable=True),
        sa.Column(
            "status",
            # NOTE(review): on PostgreSQL this implicitly creates a native
            # ENUM type named "transformationstatus" — downgrade() must drop
            # it separately or re-running upgrade() will fail.
            sa.Enum(
                "PENDING",
                "PROCESSING",
                "COMPLETED",
                "FAILED",
                name="transformationstatus",
            ),
            nullable=False,
        ),
        # Populated only when status is FAILED.
        sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        # Both foreign keys point at document.id: the source upload and the
        # (eventual) transformed output live in the same table.
        sa.ForeignKeyConstraint(
            ["source_document_id"],
            ["document.id"],
        ),
        sa.ForeignKeyConstraint(
            ["transformed_document_id"],
            ["document.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # ### end Alembic commands ###


def downgrade():
    """Drop the ``doc_transformation_job`` table and its status enum type."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("doc_transformation_job")
    # The "status" column implicitly created a native ENUM type named
    # "transformationstatus" on PostgreSQL; op.drop_table does not remove it,
    # so a downgrade followed by another upgrade would fail with
    # "type already exists". checkfirst makes this a no-op on backends
    # (e.g. SQLite) that have no native ENUM type.
    sa.Enum(name="transformationstatus").drop(op.get_bind(), checkfirst=True)
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""add source document id to document table
Revision ID: b5b9412d3d2a
Revises: 40307ab77e9f
Create Date: 2025-08-29 15:59:34.347031
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "b5b9412d3d2a"
down_revision = "40307ab77e9f"
branch_labels = None
depends_on = None


def upgrade():
    """Add ``document.source_document_id``, a self-referential FK.

    A transformed document points back at the source document it was
    generated from; the column is NULL for originals uploaded directly.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column("document", sa.Column("source_document_id", sa.Uuid(), nullable=True))
    # Name the constraint explicitly: creating it with name=None makes it
    # impossible for downgrade() to drop reliably — Alembic's drop_constraint
    # raises when given an unnamed constraint unless a naming convention is
    # configured on the metadata.
    op.create_foreign_key(
        "fk_document_source_document_id",
        "document",
        "document",
        ["source_document_id"],
        ["id"],
    )
    # ### end Alembic commands ###


def downgrade():
    """Remove the self-referential source-document FK and column."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint(
        "fk_document_source_document_id", "document", type_="foreignkey"
    )
    op.drop_column("document", "source_document_id")
    # ### end Alembic commands ###
19 changes: 17 additions & 2 deletions backend/app/api/docs/documents/upload.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,17 @@
Upload a document to the AI platform. The response will contain an ID,
which is the document ID required by other routes.
Upload a document to the AI platform.

- If only a file is provided, the document will be uploaded and stored, and its ID will be returned.
- If a target format is specified, a transformation job will also be created to transform the document into the target format in the background. The response will include both the uploaded document details and information about the transformation job.

### Supported Transformations

The following (source_format → target_format) transformations are supported:

- pdf → markdown
- zerox

### Transformers

Available transformer names and their implementations; the default transformer is `zerox`:

- `zerox`
2 changes: 2 additions & 0 deletions backend/app/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
assistants,
collections,
documents,
doc_transformation_job,
login,
organization,
openai_conversation,
Expand All @@ -26,6 +27,7 @@
api_router.include_router(collections.router)
api_router.include_router(credentials.router)
api_router.include_router(documents.router)
api_router.include_router(doc_transformation_job.router)
api_router.include_router(login.router)
api_router.include_router(onboarding.router)
api_router.include_router(openai_conversation.router)
Expand Down
45 changes: 45 additions & 0 deletions backend/app/api/routes/doc_transformation_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query, Path as FastPath

from app.api.deps import CurrentUserOrgProject, SessionDep
from app.crud.doc_transformation_job import DocTransformationJobCrud
from app.models import DocTransformationJob, DocTransformationJobs
from app.utils import APIResponse

router = APIRouter(prefix="/documents/transformations", tags=["doc_transformation_job"])


@router.get(
    "/{job_id}",
    description="Get the status and details of a document transformation job.",
    response_model=APIResponse[DocTransformationJob],
)
def get_transformation_job(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    job_id: UUID = FastPath(description="Transformation job ID"),
):
    """Return a single transformation job scoped to the caller's project."""
    job_crud = DocTransformationJobCrud(session, current_user.project_id)
    return APIResponse.success_response(job_crud.read_one(job_id))


@router.get(
    "/",
    description="Get the status and details of multiple document transformation jobs by IDs.",
    response_model=APIResponse[DocTransformationJobs],
)
def get_multiple_transformation_jobs(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    job_ids: list[UUID] = Query(
        description="List of transformation job IDs",
        # Fix: FastAPI's Query has no `min` parameter — the lower bound was
        # silently ignored. For sequence parameters the correct keyword is
        # min_length (paired with max_length, as already used below).
        min_length=1,
        max_length=100,
    ),
):
    """Fetch the requested jobs and report any IDs that were not found.

    Returns:
        APIResponse wrapping DocTransformationJobs: the jobs visible to the
        caller's project plus the subset of requested IDs with no match.
    """
    crud = DocTransformationJobCrud(session, project_id=current_user.project_id)
    # Deduplicate once and reuse for both the lookup and the not-found diff.
    requested_ids = set(job_ids)
    jobs = crud.read_each(requested_ids)
    jobs_not_found = requested_ids - {job.id for job in jobs}
    return APIResponse.success_response(
        DocTransformationJobs(jobs=jobs, jobs_not_found=list(jobs_not_found))
    )
115 changes: 103 additions & 12 deletions backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,38 @@
import logging
from uuid import UUID, uuid4
from typing import List
from pathlib import Path
from uuid import UUID, uuid4

from fastapi import APIRouter, File, UploadFile, Query, HTTPException
from fastapi import (
APIRouter,
BackgroundTasks,
File,
Form,
HTTPException,
Query,
UploadFile,
)
from fastapi import Path as FastPath

from app.crud import DocumentCrud, CollectionCrud
from app.models import Document, DocumentPublic, Message
from app.utils import APIResponse, load_description, get_openai_client
from app.api.deps import CurrentUser, SessionDep, CurrentUserOrgProject
from app.api.deps import CurrentUserOrgProject, SessionDep
from app.core.cloud import get_cloud_storage
from app.core.doctransform import service as transformation_service
from app.core.doctransform.registry import (
get_available_transformers,
get_file_format,
is_transformation_supported,
resolve_transformer,
)
from app.crud import CollectionCrud, DocumentCrud
from app.crud.rag import OpenAIAssistantCrud
from app.models import (
Document,
DocumentPublic,
DocumentUploadResponse,
Message,
TransformationJobInfo,
)
from app.utils import APIResponse, get_openai_client, load_description


logger = logging.getLogger(__name__)
router = APIRouter(prefix="/documents", tags=["documents"])
Expand All @@ -20,7 +41,7 @@
@router.get(
"/list",
description=load_description("documents/list.md"),
response_model=APIResponse[List[DocumentPublic]],
response_model=APIResponse[list[DocumentPublic]],
)
def list_docs(
session: SessionDep,
Expand All @@ -36,13 +57,53 @@ def list_docs(
@router.post(
"/upload",
description=load_description("documents/upload.md"),
response_model=APIResponse[DocumentPublic],
response_model=APIResponse[DocumentUploadResponse],
)
def upload_doc(
async def upload_doc(
session: SessionDep,
current_user: CurrentUserOrgProject,
background_tasks: BackgroundTasks,
src: UploadFile = File(...),
target_format: str
| None = Form(
None,
description="Desired output format for the uploaded document (e.g., pdf, docx, txt). ",
),
transformer: str
| None = Form(
None, description="Name of the transformer to apply when converting. "
),
):
# Determine source file format
try:
source_format = get_file_format(src.filename)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

# validate if transformation is possible or not
if target_format:
if not is_transformation_supported(source_format, target_format):
raise HTTPException(
status_code=400,
detail=f"Transformation from {source_format} to {target_format} is not supported",
)

# Resolve the transformer to use
if not transformer:
transformer = "default"
try:
actual_transformer = resolve_transformer(
source_format, target_format, transformer
)
except ValueError as e:
available_transformers = get_available_transformers(
source_format, target_format
)
raise HTTPException(
status_code=400,
detail=f"{str(e)}. Available transformers: {list(available_transformers.keys())}",
)

storage = get_cloud_storage(session=session, project_id=current_user.project_id)
document_id = uuid4()

Expand All @@ -54,8 +115,38 @@ def upload_doc(
fname=src.filename,
object_store_url=str(object_store_url),
)
data = crud.update(document)
return APIResponse.success_response(data)
source_document = crud.update(document)

job_info: TransformationJobInfo | None = None
if target_format and actual_transformer:
job_id = transformation_service.start_job(
db=session,
current_user=current_user,
source_document_id=source_document.id,
transformer_name=actual_transformer,
target_format=target_format,
background_tasks=background_tasks,
)
job_info = TransformationJobInfo(
message=f"Document accepted for transformation from {source_format} to {target_format}.",
job_id=str(job_id),
source_format=source_format,
target_format=target_format,
transformer=actual_transformer,
status_check_url=f"/documents/transformations/{job_id}",
)

document_schema = DocumentPublic.model_validate(
source_document, from_attributes=True
)
document_schema.signed_url = storage.get_signed_url(
source_document.object_store_url
)
response = DocumentUploadResponse(
**document_schema.model_dump(), transformation_job=job_info
)

return APIResponse.success_response(response)


@router.delete(
Expand Down
Loading