Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

- [docker](https://docs.docker.com/get-started/get-docker/) Docker
- [uv](https://docs.astral.sh/uv/) for Python package and environment management.
- **Poppler** – Install Poppler, required for PDF processing.

## Project Setup

Expand Down
7 changes: 5 additions & 2 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ ENV PYTHONUNBUFFERED=1
# Set working directory
WORKDIR /app/

# Install system dependencies
RUN apt-get update && apt-get install -y curl
# Install system dependencies (added poppler-utils)
RUN apt-get update && apt-get install -y \
curl \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*

# Install uv package manager
COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /uvx /bin/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""create doc transformation job table
Revision ID: 9f8a4af9d6fd
Revises: b5b9412d3d2a
Create Date: 2025-08-29 16:00:47.848950
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "9f8a4af9d6fd"
down_revision = "b5b9412d3d2a"
branch_labels = None
depends_on = None


def upgrade():
    """Create the ``doc_transformation_job`` table.

    Tracks background document-format conversion jobs: each row links a
    source document to the transformed output document (filled in once the
    job completes) and records lifecycle status plus any failure message.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "doc_transformation_job",
        sa.Column("id", sa.Uuid(), nullable=False),
        # The uploaded document the job reads from; always required.
        sa.Column("source_document_id", sa.Uuid(), nullable=False),
        # The produced document; NULL until the job completes successfully.
        sa.Column("transformed_document_id", sa.Uuid(), nullable=True),
        sa.Column(
            "status",
            # NOTE(review): on PostgreSQL this implicitly creates a native
            # ENUM type named "transformationstatus" — downgrade() must drop
            # it separately or re-running upgrade() will fail.
            sa.Enum(
                "PENDING",
                "PROCESSING",
                "COMPLETED",
                "FAILED",
                name="transformationstatus",
            ),
            nullable=False,
        ),
        # Populated only when status is FAILED.
        sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        # Both foreign keys point at document.id: the source upload and the
        # (eventual) transformed output live in the same table.
        sa.ForeignKeyConstraint(
            ["source_document_id"],
            ["document.id"],
        ),
        sa.ForeignKeyConstraint(
            ["transformed_document_id"],
            ["document.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # ### end Alembic commands ###


def downgrade():
    """Drop the ``doc_transformation_job`` table and its status enum type."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("doc_transformation_job")
    # The "status" column implicitly created a native ENUM type named
    # "transformationstatus" on PostgreSQL; op.drop_table does not remove it,
    # so a downgrade followed by another upgrade would fail with
    # "type already exists". checkfirst makes this a no-op on backends
    # (e.g. SQLite) that have no native ENUM type.
    sa.Enum(name="transformationstatus").drop(op.get_bind(), checkfirst=True)
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""add source document id to document table
Revision ID: b5b9412d3d2a
Revises: 40307ab77e9f
Create Date: 2025-08-29 15:59:34.347031
"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "b5b9412d3d2a"
down_revision = "40307ab77e9f"
branch_labels = None
depends_on = None


def upgrade():
    """Add ``document.source_document_id``, a self-referential FK.

    A transformed document points back at the source document it was
    generated from; the column is NULL for originals uploaded directly.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column("document", sa.Column("source_document_id", sa.Uuid(), nullable=True))
    # Name the constraint explicitly: creating it with name=None makes it
    # impossible for downgrade() to drop reliably — Alembic's drop_constraint
    # raises when given an unnamed constraint unless a naming convention is
    # configured on the metadata.
    op.create_foreign_key(
        "fk_document_source_document_id",
        "document",
        "document",
        ["source_document_id"],
        ["id"],
    )
    # ### end Alembic commands ###


def downgrade():
    """Remove the self-referential source-document FK and column."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint(
        "fk_document_source_document_id", "document", type_="foreignkey"
    )
    op.drop_column("document", "source_document_id")
    # ### end Alembic commands ###
19 changes: 17 additions & 2 deletions backend/app/api/docs/documents/upload.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,17 @@
Upload a document to the AI platform. The response will contain an ID,
which is the document ID required by other routes.
Upload a document to the AI platform.

- If only a file is provided, the document will be uploaded and stored, and its ID will be returned.
- If a target format is specified, a transformation job will also be created to transform the document into the target format in the background. The response will include both the uploaded document details and information about the transformation job.

### Supported Transformations

The following (source_format → target_format) transformations are supported:

- pdf → markdown
- zerox

### Transformers

Available transformer names and their implementations; the default transformer is `zerox`:

- `zerox`
2 changes: 2 additions & 0 deletions backend/app/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
assistants,
collections,
documents,
doc_transformation_job,
login,
organization,
openai_conversation,
Expand All @@ -26,6 +27,7 @@
api_router.include_router(collections.router)
api_router.include_router(credentials.router)
api_router.include_router(documents.router)
api_router.include_router(doc_transformation_job.router)
api_router.include_router(login.router)
api_router.include_router(onboarding.router)
api_router.include_router(openai_conversation.router)
Expand Down
45 changes: 45 additions & 0 deletions backend/app/api/routes/doc_transformation_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query, Path as FastPath

from app.api.deps import CurrentUserOrgProject, SessionDep
from app.crud.doc_transformation_job import DocTransformationJobCrud
from app.models import DocTransformationJob, DocTransformationJobs
from app.utils import APIResponse

router = APIRouter(prefix="/documents/transformations", tags=["doc_transformation_job"])


@router.get(
    "/{job_id}",
    description="Get the status and details of a document transformation job.",
    response_model=APIResponse[DocTransformationJob],
)
def get_transformation_job(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    job_id: UUID = FastPath(description="Transformation job ID"),
):
    """Return a single transformation job scoped to the caller's project."""
    job_crud = DocTransformationJobCrud(session, current_user.project_id)
    return APIResponse.success_response(job_crud.read_one(job_id))


@router.get(
    "/",
    description="Get the status and details of multiple document transformation jobs by IDs.",
    response_model=APIResponse[DocTransformationJobs],
)
def get_multiple_transformation_jobs(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    job_ids: list[UUID] = Query(
        description="List of transformation job IDs",
        # Fix: FastAPI's Query has no `min` parameter — the lower bound was
        # silently ignored. For sequence parameters the correct keyword is
        # min_length (paired with max_length, as already used below).
        min_length=1,
        max_length=100,
    ),
):
    """Fetch the requested jobs and report any IDs that were not found.

    Returns:
        APIResponse wrapping DocTransformationJobs: the jobs visible to the
        caller's project plus the subset of requested IDs with no match.
    """
    crud = DocTransformationJobCrud(session, project_id=current_user.project_id)
    # Deduplicate once and reuse for both the lookup and the not-found diff.
    requested_ids = set(job_ids)
    jobs = crud.read_each(requested_ids)
    jobs_not_found = requested_ids - {job.id for job in jobs}
    return APIResponse.success_response(
        DocTransformationJobs(jobs=jobs, jobs_not_found=list(jobs_not_found))
    )
115 changes: 103 additions & 12 deletions backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,38 @@
import logging
from uuid import UUID, uuid4
from typing import List
from pathlib import Path
from uuid import UUID, uuid4

from fastapi import APIRouter, File, UploadFile, Query, HTTPException
from fastapi import (
APIRouter,
BackgroundTasks,
File,
Form,
HTTPException,
Query,
UploadFile,
)
from fastapi import Path as FastPath

from app.crud import DocumentCrud, CollectionCrud
from app.models import Document, DocumentPublic, Message
from app.utils import APIResponse, load_description, get_openai_client
from app.api.deps import CurrentUser, SessionDep, CurrentUserOrgProject
from app.api.deps import CurrentUserOrgProject, SessionDep
from app.core.cloud import get_cloud_storage
from app.core.doctransform import service as transformation_service
from app.core.doctransform.registry import (
get_available_transformers,
get_file_format,
is_transformation_supported,
resolve_transformer,
)
from app.crud import CollectionCrud, DocumentCrud
from app.crud.rag import OpenAIAssistantCrud
from app.models import (
Document,
DocumentPublic,
DocumentUploadResponse,
Message,
TransformationJobInfo,
)
from app.utils import APIResponse, get_openai_client, load_description


logger = logging.getLogger(__name__)
router = APIRouter(prefix="/documents", tags=["documents"])
Expand All @@ -20,7 +41,7 @@
@router.get(
"/list",
description=load_description("documents/list.md"),
response_model=APIResponse[List[DocumentPublic]],
response_model=APIResponse[list[DocumentPublic]],
)
def list_docs(
session: SessionDep,
Expand All @@ -36,13 +57,53 @@ def list_docs(
@router.post(
"/upload",
description=load_description("documents/upload.md"),
response_model=APIResponse[DocumentPublic],
response_model=APIResponse[DocumentUploadResponse],
)
def upload_doc(
async def upload_doc(
session: SessionDep,
current_user: CurrentUserOrgProject,
background_tasks: BackgroundTasks,
src: UploadFile = File(...),
target_format: str
| None = Form(
None,
description="Desired output format for the uploaded document (e.g., pdf, docx, txt). ",
),
transformer: str
| None = Form(
None, description="Name of the transformer to apply when converting. "
),
):
# Determine source file format
try:
source_format = get_file_format(src.filename)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

# validate if transformation is possible or not
if target_format:
if not is_transformation_supported(source_format, target_format):
raise HTTPException(
status_code=400,
detail=f"Transformation from {source_format} to {target_format} is not supported",
)

# Resolve the transformer to use
if not transformer:
transformer = "default"
try:
actual_transformer = resolve_transformer(
source_format, target_format, transformer
)
except ValueError as e:
available_transformers = get_available_transformers(
source_format, target_format
)
raise HTTPException(
status_code=400,
detail=f"{str(e)}. Available transformers: {list(available_transformers.keys())}",
)

storage = get_cloud_storage(session=session, project_id=current_user.project_id)
document_id = uuid4()

Expand All @@ -54,8 +115,38 @@ def upload_doc(
fname=src.filename,
object_store_url=str(object_store_url),
)
data = crud.update(document)
return APIResponse.success_response(data)
source_document = crud.update(document)

job_info: TransformationJobInfo | None = None
if target_format and actual_transformer:
job_id = transformation_service.start_job(
db=session,
current_user=current_user,
source_document_id=source_document.id,
transformer_name=actual_transformer,
target_format=target_format,
background_tasks=background_tasks,
)
job_info = TransformationJobInfo(
message=f"Document accepted for transformation from {source_format} to {target_format}.",
job_id=str(job_id),
source_format=source_format,
target_format=target_format,
transformer=actual_transformer,
status_check_url=f"/documents/transformations/{job_id}",
)

document_schema = DocumentPublic.model_validate(
source_document, from_attributes=True
)
document_schema.signed_url = storage.get_signed_url(
source_document.object_store_url
)
response = DocumentUploadResponse(
**document_schema.model_dump(), transformation_job=job_info
)

return APIResponse.success_response(response)


@router.delete(
Expand Down
Loading