From 8c99ec1cd2c6ba018396f89d4a66ce09e944b731 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 13:01:50 +0530 Subject: [PATCH 1/8] Implement permanent delete in document endpoint --- .../api/docs/documents/permanent_delete.md | 2 ++ backend/app/api/routes/documents.py | 24 +++++++++++++++++++ backend/app/core/cloud/storage.py | 8 +++++++ backend/app/crud/document.py | 6 +++++ 4 files changed, 40 insertions(+) create mode 100644 backend/app/api/docs/documents/permanent_delete.md diff --git a/backend/app/api/docs/documents/permanent_delete.md b/backend/app/api/docs/documents/permanent_delete.md new file mode 100644 index 00000000..47260b06 --- /dev/null +++ b/backend/app/api/docs/documents/permanent_delete.md @@ -0,0 +1,2 @@ +Permanently deletes a document from the database and its associated file from cloud storage. This action is irreversible. +If the document is part of collections, those collections will be deleted, including any associated OpenAI Vector Stores and Assistants. \ No newline at end of file diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index f0f98158..9817190d 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -74,6 +74,30 @@ def remove_doc( return APIResponse.success_response(data) +@router.delete( + "remove/{doc_id}/permanent", + description=load_description("documents/permanent_delete.md"), + response_model=APIResponse[Document], +) +def permanent_delete_doc( + session: SessionDep, + current_user: CurrentUser, + doc_id: UUID = FastPath(description="Document to permanently delete"), +): + a_crud = OpenAIAssistantCrud() + d_crud = DocumentCrud(session, current_user.id) + c_crud = CollectionCrud(session, current_user.id) + storage = AmazonCloudStorage(current_user) + + document = d_crud.read_one(doc_id) + + c_crud.delete(document, a_crud) + storage.delete(document.object_store_url) + d_crud.hard_delete(doc_id) + + return APIResponse.success_response(document) + + @router.get( "/info/{doc_id}", description=load_description("documents/info.md"), diff --git a/backend/app/core/cloud/storage.py b/backend/app/core/cloud/storage.py index 341abad5..0e5c2065 100644 --- a/backend/app/core/cloud/storage.py +++ b/backend/app/core/cloud/storage.py @@ -124,3 +124,11 @@ def stream(self, url: str) -> StreamingBody: return self.aws.client.get_object(**kwargs).get("Body") except ClientError as err: raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + + def delete(self, url: str) -> None: + name = SimpleStorageName.from_url(url) + kwargs = asdict(name) + try: + self.aws.client.delete_object(**kwargs) + except ClientError as err: + raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err diff --git a/backend/app/crud/document.py b/backend/app/crud/document.py index ecb81890..57d56911 100644 --- a/backend/app/crud/document.py +++ b/backend/app/crud/document.py @@ -88,3 +88,9 @@ def delete(self, doc_id: UUID): document.updated_at = now() return self.update(document) + + def hard_delete(self, doc_id: UUID) -> Document: + document = self.read_one(doc_id) + self.session.delete(document) + self.session.commit() + return document From d55614e9f475840f51afe473465397a55f399f46 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 13:53:01 +0530 Subject: [PATCH 2/8] Modify delete operation --- backend/app/api/docs/documents/permanent_delete.md | 7 +++++-- backend/app/api/routes/documents.py | 2 +- backend/app/crud/document.py | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/app/api/docs/documents/permanent_delete.md b/backend/app/api/docs/documents/permanent_delete.md index 47260b06..33ff293a 100644 --- a/backend/app/api/docs/documents/permanent_delete.md +++ b/backend/app/api/docs/documents/permanent_delete.md @@ -1,2 +1,5 @@ -Permanently deletes a document from the database and its associated file from cloud storage. This action is irreversible. -If the document is part of collections, those collections will be deleted, including any associated OpenAI Vector Stores and Assistants. \ No newline at end of file +This operation performs a soft delete on the document — the document remains in the database but is marked as deleted. However, the associated file in cloud storage is permanently deleted. This file deletion is irreversible. +If the document is part of an active collection, those collections +will be deleted using the collections delete interface. Noteably, this +means all OpenAI Vector Store's and Assistant's to which this document +belongs will be deleted. diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 9817190d..7121dd47 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -93,7 +93,7 @@ def permanent_delete_doc( c_crud.delete(document, a_crud) storage.delete(document.object_store_url) - d_crud.hard_delete(doc_id) + d_crud.delete(doc_id) return APIResponse.success_response(document) diff --git a/backend/app/crud/document.py b/backend/app/crud/document.py index 57d56911..ecb81890 100644 --- a/backend/app/crud/document.py +++ b/backend/app/crud/document.py @@ -88,9 +88,3 @@ def delete(self, doc_id: UUID): document.updated_at = now() return self.update(document) - - def hard_delete(self, doc_id: UUID) -> Document: - document = self.read_one(doc_id) - self.session.delete(document) - self.session.commit() - return document From dbce14e7c1b164c655fc2d58a9ffcc819f1a6de9 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 14:00:39 +0530 Subject: [PATCH 3/8] typo fix --- backend/app/api/docs/documents/permanent_delete.md | 2 +- backend/app/api/routes/documents.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/api/docs/documents/permanent_delete.md b/backend/app/api/docs/documents/permanent_delete.md index 33ff293a..ca875a2e 100644 --- a/backend/app/api/docs/documents/permanent_delete.md +++ b/backend/app/api/docs/documents/permanent_delete.md @@ -1,4 +1,4 @@ -This operation performs a soft delete on the document — the document remains in the database but is marked as deleted. However, the associated file in cloud storage is permanently deleted. This file deletion is irreversible. +This operation soft deletes the document — meaning its metadata and reference are retained in the database, but it is marked as deleted. The actual file stored in cloud storage (e.g., S3) is permanently deleted, and this action is irreversible. If the document is part of an active collection, those collections will be deleted using the collections delete interface. Noteably, this means all OpenAI Vector Store's and Assistant's to which this document diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 7121dd47..9e6d1c2e 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -75,7 +75,7 @@ def remove_doc( @router.delete( - "remove/{doc_id}/permanent", + "/remove/{doc_id}/permanent", description=load_description("documents/permanent_delete.md"), response_model=APIResponse[Document], ) From e9de5d5e986f9d44d25bc9a9e5b23bd9a8be024f Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 16:51:26 +0530 Subject: [PATCH 4/8] Add testcases for permanent delete endpoint --- .../test_route_document_permanent_remove.py | 79 +++++++++++++++++++ backend/app/tests/utils/document.py | 21 +++-- 2 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py diff --git a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py new file mode 100644 index 00000000..251b3289 --- /dev/null +++ b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py @@ -0,0 +1,79 @@ +import os +from pathlib import Path +from urllib.parse import urlparse + +import pytest +from moto import mock_aws +from sqlmodel import Session, select + +import openai_responses + +from app.core.cloud import AmazonCloudStorageClient +from app.core.config import settings +from app.models import Document +from app.tests.utils.document import ( + DocumentStore, + DocumentMaker, + Route, + WebCrawler, +) + + +@pytest.fixture +def route(): + return Route("remove") + +@pytest.fixture(scope="class") +def aws_credentials(): + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + os.environ["AWS_DEFAULT_REGION"] = settings.AWS_DEFAULT_REGION + + +@pytest.mark.usefixtures("openai_credentials", "aws_credentials") +@mock_aws +class TestDocumentRoutePermanentRemove: + + @openai_responses.mock() + def test_permanently_deletes_document_from_db_and_s3( + self, + db: Session, + route: Route, + crawler: WebCrawler, + ): + # Setup AWS + aws = AmazonCloudStorageClient() + aws.create() + + # Setup document in DB and S3 + store = DocumentStore(db) + document = store.put() + s3_key = Path(urlparse(document.object_store_url).path).relative_to("/") + aws.client.put_object(Bucket=settings.AWS_S3_BUCKET, Key=str(s3_key), Body=b"test") + + # Delete document + response = crawler.delete(route.append(document, suffix="permanent")) + assert response.is_success + + db.refresh(document) + + stmt = select(Document).where(Document.id == document.id) + doc_in_db = db.exec(stmt).first() + assert doc_in_db is not None + assert doc_in_db.deleted_at is not None + + @openai_responses.mock() + def test_cannot_delete_nonexistent_document( + self, + db: Session, + route: Route, + crawler: WebCrawler, + ): + DocumentStore.clear(db) + + maker = DocumentMaker(db) + response = crawler.delete(route.append(next(maker), suffix="permanent")) + + assert response.is_error diff --git a/backend/app/tests/utils/document.py b/backend/app/tests/utils/document.py index 7669d6d6..a8923e7e 100644 --- a/backend/app/tests/utils/document.py +++ b/backend/app/tests/utils/document.py @@ -37,14 +37,14 @@ def __iter__(self): def __next__(self): doc_id = next(self.index) - args = str(doc_id).split("-") - fname = Path("/", *args).with_suffix(".xyz") + key = f"{self.owner_id}/{doc_id}.xyz" + object_store_url = f"s3://{settings.AWS_S3_BUCKET}/{key}" return Document( id=doc_id, owner_id=self.owner_id, - fname=fname.name, - object_store_url=fname.as_uri(), + fname=f"{doc_id}.xyz", + object_store_url=object_store_url, ) @@ -102,8 +102,11 @@ def to_url(self): return self._empty._replace(**kwargs) - def append(self, doc: Document): - endpoint = Path(self.endpoint, str(doc.id)) + def append(self, doc: Document, suffix: str = None): + segments = [self.endpoint, str(doc.id)] + if suffix: + segments.append(suffix) + endpoint = Path(*segments) return type(self)(endpoint, **self.qs_args) @@ -118,6 +121,12 @@ def get(self, route: Route): headers=self.superuser_token_headers, ) + def delete(self, route: Route): + return self.client.delete( + str(route), + headers=self.superuser_token_headers, + ) + class DocumentComparator: @ft.singledispatchmethod From e99159f437e31a86c24f2160d1115bf74591bce0 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 16:53:34 +0530 Subject: [PATCH 5/8] pre commit run --- .../documents/test_route_document_permanent_remove.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py index 251b3289..a2677b1f 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py +++ b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py @@ -23,6 +23,7 @@ def route(): return Route("remove") + @pytest.fixture(scope="class") def aws_credentials(): os.environ["AWS_ACCESS_KEY_ID"] = "testing" @@ -35,7 +36,6 @@ def aws_credentials(): @pytest.mark.usefixtures("openai_credentials", "aws_credentials") @mock_aws class TestDocumentRoutePermanentRemove: - @openai_responses.mock() def test_permanently_deletes_document_from_db_and_s3( self, @@ -51,7 +51,9 @@ def test_permanently_deletes_document_from_db_and_s3( store = DocumentStore(db) document = store.put() s3_key = Path(urlparse(document.object_store_url).path).relative_to("/") - aws.client.put_object(Bucket=settings.AWS_S3_BUCKET, Key=str(s3_key), Body=b"test") + aws.client.put_object( + Bucket=settings.AWS_S3_BUCKET, Key=str(s3_key), Body=b"test" + ) # Delete document response = crawler.delete(route.append(document, suffix="permanent")) From 55f1a7126b5ec3597f492569773c3e80ef8b9c99 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 17:01:54 +0530 Subject: [PATCH 6/8] improve test --- .../documents/test_route_document_permanent_remove.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py index a2677b1f..0c403395 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py +++ b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py @@ -3,6 +3,7 @@ from urllib.parse import urlparse import pytest +from botocore.exceptions import ClientError from moto import mock_aws from sqlmodel import Session, select @@ -16,7 +17,9 @@ DocumentMaker, Route, WebCrawler, + crawler, ) +from app.tests.utils.utils import openai_credentials @pytest.fixture @@ -66,6 +69,13 @@ def test_permanently_deletes_document_from_db_and_s3( assert doc_in_db is not None assert doc_in_db.deleted_at is not None + with pytest.raises(ClientError) as exc_info: + aws.client.head_object( + Bucket=settings.AWS_S3_BUCKET, + Key=str(s3_key), + ) + assert exc_info.value.response["Error"]["Code"] == "404" + @openai_responses.mock() def test_cannot_delete_nonexistent_document( self, From 34b209a00b3d9b145e420b4a939f5abbf087d448 Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Thu, 19 Jun 2025 17:02:55 +0530 Subject: [PATCH 7/8] rename function --- .../routes/documents/test_route_document_permanent_remove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py index 0c403395..8a6d353f 100644 --- a/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py +++ b/backend/app/tests/api/routes/documents/test_route_document_permanent_remove.py @@ -40,7 +40,7 @@ def aws_credentials(): @mock_aws class TestDocumentRoutePermanentRemove: @openai_responses.mock() - def test_permanently_deletes_document_from_db_and_s3( + def test_permanent_delete_document_from_s3( self, db: Session, route: Route, From cbb95bd6768f69cdb79a51879be306b2d13b860c Mon Sep 17 00:00:00 2001 From: Aviraj <100823015+avirajsingh7@users.noreply.github.com> Date: Fri, 20 Jun 2025 10:20:54 +0530 Subject: [PATCH 8/8] change extension to txt --- backend/app/tests/utils/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/tests/utils/document.py b/backend/app/tests/utils/document.py index a8923e7e..078ea1c0 100644 --- a/backend/app/tests/utils/document.py +++ b/backend/app/tests/utils/document.py @@ -37,7 +37,7 @@ def __iter__(self): def __next__(self): doc_id = next(self.index) - key = f"{self.owner_id}/{doc_id}.xyz" + key = f"{self.owner_id}/{doc_id}.txt" object_store_url = f"s3://{settings.AWS_S3_BUCKET}/{key}" return Document(