@router.delete(
    "/remove/{doc_id}/permanent",
    description=load_description("documents/permanent_delete.md"),
    response_model=APIResponse[Document],
)
def permanent_delete_doc(
    session: SessionDep,
    current_user: CurrentUser,
    doc_id: UUID = FastPath(description="Document to permanently delete"),
):
    # Permanently remove a document on behalf of the current user.
    #
    # Sequence performed below:
    #   1. look the document up through the document CRUD,
    #   2. hand it to the collection CRUD's delete (with the assistant CRUD),
    #   3. delete the object at the document's ``object_store_url``,
    #   4. call the document CRUD's delete for ``doc_id``.
    #
    # The document fetched in step 1 is what gets wrapped in the success
    # response. Any exception raised by a step propagates to the caller.
    assistants = OpenAIAssistantCrud()
    owner_id = current_user.id
    documents = DocumentCrud(session, owner_id)
    collections = CollectionCrud(session, owner_id)
    bucket = AmazonCloudStorage(current_user)

    target = documents.read_one(doc_id)

    collections.delete(target, assistants)
    bucket.delete(target.object_store_url)
    documents.delete(doc_id)

    return APIResponse.success_response(target)
def delete(self, url: str) -> None:
    """Delete the stored object that *url* refers to.

    The URL is parsed with ``SimpleStorageName.from_url`` and the resulting
    fields are passed as keyword arguments to the client's
    ``delete_object`` call.

    Raises:
        CloudStorageError: if the client raises ``ClientError``; the
            original error is chained as the cause.
    """
    target = asdict(SimpleStorageName.from_url(url))
    try:
        self.aws.client.delete_object(**target)
    except ClientError as err:
        raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err
def __next__(self):
    """Build the next ``Document`` from the maker's id sequence.

    The object-store URL has the form ``s3://<bucket>/<owner_id>/<id>.txt``
    using the configured ``AWS_S3_BUCKET``; the file name is ``<id>.xyz``.
    """
    doc_id = next(self.index)
    # The stored-object key is namespaced by owner and named after the id.
    object_key = f"{self.owner_id}/{doc_id}.txt"

    return Document(
        id=doc_id,
        owner_id=self.owner_id,
        fname=f"{doc_id}.xyz",
        object_store_url=f"s3://{settings.AWS_S3_BUCKET}/{object_key}",
    )
def delete(self, route: Route):
    """Issue a DELETE request for *route* using the superuser headers.

    Returns whatever the underlying client's ``delete`` call returns.
    """
    send = self.client.delete
    return send(str(route), headers=self.superuser_token_headers)