-
Couldn't load subscription status.
- Fork 5
Implement document transformation pipeline to improve RAG performance #363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b615d5a
b7c95ab
9fd7ade
af8ab26
39166bb
194ce2d
b341421
de4a89d
820efc9
f12cba0
6461d51
9f68c0f
76f8757
e06042f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| """create doc transformation job table | ||
| Revision ID: 9f8a4af9d6fd | ||
| Revises: b5b9412d3d2a | ||
| Create Date: 2025-08-29 16:00:47.848950 | ||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| import sqlmodel.sql.sqltypes | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "9f8a4af9d6fd" | ||
| down_revision = "b5b9412d3d2a" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
def upgrade():
    """Create the ``doc_transformation_job`` table.

    Each row has a UUID primary key, a required ``source_document_id`` and an
    optional ``transformed_document_id`` (both foreign keys to ``document.id``),
    a required ``status`` enum, an optional ``error_message`` and two required
    timestamps.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Enum type backing the "status" column.
    status_type = sa.Enum(
        "PENDING",
        "PROCESSING",
        "COMPLETED",
        "FAILED",
        name="transformationstatus",
    )

    columns = [
        sa.Column("id", sa.Uuid(), nullable=False),
        sa.Column("source_document_id", sa.Uuid(), nullable=False),
        sa.Column("transformed_document_id", sa.Uuid(), nullable=True),
        sa.Column("status", status_type, nullable=False),
        sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
    ]

    constraints = [
        sa.ForeignKeyConstraint(["source_document_id"], ["document.id"]),
        sa.ForeignKeyConstraint(["transformed_document_id"], ["document.id"]),
        sa.PrimaryKeyConstraint("id"),
    ]

    op.create_table("doc_transformation_job", *columns, *constraints)
    # ### end Alembic commands ###
|
|
||
|
|
||
def downgrade():
    """Drop the ``doc_transformation_job`` table and its status enum type.

    Reverses ``upgrade()``: the table is removed first, then the
    ``transformationstatus`` enum type that was created for its ``status``
    column.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("doc_transformation_job")
    # ### end Alembic commands ###

    # Dropping the table does not drop a backend-level enum type (e.g. on
    # PostgreSQL). If it is left behind, re-running upgrade() fails because the
    # type already exists. checkfirst=True makes this safe when it is absent.
    sa.Enum(name="transformationstatus").drop(op.get_bind(), checkfirst=True)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| """add source document id to document table | ||
| Revision ID: b5b9412d3d2a | ||
| Revises: 40307ab77e9f | ||
| Create Date: 2025-08-29 15:59:34.347031 | ||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| import sqlmodel.sql.sqltypes | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "b5b9412d3d2a" | ||
| down_revision = "40307ab77e9f" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
# Explicit name for the self-referential foreign key, shared by upgrade() and
# downgrade(). NOTE(review): this is the name PostgreSQL gives an unnamed FK on
# this column, chosen so databases already upgraded with the unnamed
# constraint can still be downgraded - confirm the target database.
SOURCE_DOCUMENT_FK = "document_source_document_id_fkey"


def upgrade():
    """Add a nullable, self-referential ``source_document_id`` to ``document``.

    For a document produced by a transformation, this column points at the
    originally uploaded document it was created from.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column("document", sa.Column("source_document_id", sa.Uuid(), nullable=True))
    # The constraint is named explicitly so downgrade() can refer to it.
    op.create_foreign_key(
        SOURCE_DOCUMENT_FK, "document", "document", ["source_document_id"], ["id"]
    )
    # ### end Alembic commands ###


def downgrade():
    """Remove the ``source_document_id`` foreign key and column from ``document``."""
    # ### commands auto generated by Alembic - please adjust! ###
    # drop_constraint needs a real constraint name; passing None cannot work.
    op.drop_constraint(SOURCE_DOCUMENT_FK, "document", type_="foreignkey")
    op.drop_column("document", "source_document_id")
    # ### end Alembic commands ###
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,17 @@ | ||
| Upload a document to the AI platform. The response will contain an ID, | ||
| which is the document ID required by other routes. | ||
| Upload a document to the AI platform. | ||
|
|
||
| - If only a file is provided, the document will be uploaded and stored, and its ID will be returned. | ||
| - If a target format is specified, a transformation job is also created to convert the document into the target format in the background. The response then includes both the uploaded document's details and information about the transformation job. | ||
|
|
||
| ### Supported Transformations | ||
|
|
||
| The following (source_format → target_format) transformations are supported: | ||
|
|
||
| - pdf → markdown | ||
| - zerox | ||
|
|
||
| ### Transformers | ||
|
|
||
| Available transformer names and their implementations (the default transformer is `zerox`): | ||
|
|
||
| - `zerox` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| from uuid import UUID | ||
|
|
||
| from fastapi import APIRouter, HTTPException, Query, Path as FastPath | ||
|
|
||
| from app.api.deps import CurrentUserOrgProject, SessionDep | ||
| from app.crud.doc_transformation_job import DocTransformationJobCrud | ||
| from app.models import DocTransformationJob, DocTransformationJobs | ||
| from app.utils import APIResponse | ||
|
|
||
| router = APIRouter(prefix="/documents/transformations", tags=["doc_transformation_job"]) | ||
|
|
||
|
|
||
@router.get(
    "/{job_id}",
    description="Get the status and details of a document transformation job.",
    response_model=APIResponse[DocTransformationJob],
)
def get_transformation_job(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    job_id: UUID = FastPath(description="Transformation job ID"),
):
    """Look up one transformation job by ID and wrap it in a success response.

    The CRUD helper is constructed with the caller's project ID.
    """
    job_crud = DocTransformationJobCrud(session, current_user.project_id)
    return APIResponse.success_response(job_crud.read_one(job_id))
|
|
||
|
|
||
@router.get(
    "/",
    description="Get the status and details of multiple document transformation jobs by IDs.",
    response_model=APIResponse[DocTransformationJobs],
)
def get_multiple_transformation_jobs(
    session: SessionDep,
    current_user: CurrentUserOrgProject,
    # min_length/max_length bound the number of IDs in the list; the previous
    # "min=1" keyword is not a validation constraint and enforced nothing.
    job_ids: list[UUID] = Query(
        description="List of transformation job IDs", min_length=1, max_length=100
    ),
):
    """Return the requested transformation jobs plus the IDs that were not found.

    Duplicate IDs in the query are collapsed before the lookup. Any requested
    ID with no matching job in the CRUD result is reported in
    ``jobs_not_found``.
    """
    # De-duplicate once and reuse for both the lookup and the difference.
    requested_ids = set(job_ids)
    crud = DocTransformationJobCrud(session, project_id=current_user.project_id)
    jobs = crud.read_each(requested_ids)
    jobs_not_found = requested_ids - {job.id for job in jobs}
    return APIResponse.success_response(
        DocTransformationJobs(jobs=jobs, jobs_not_found=list(jobs_not_found))
    )
Uh oh!
There was an error while loading. Please reload this page.