Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/app/api/docs/documents/permanent_delete.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This operation soft-deletes the document record: its metadata and reference are retained in the database, but the record is marked as deleted. The actual file stored in cloud storage (e.g., S3) is permanently deleted, and that part of the action is irreversible.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a very significant change but the line where it states that the file gets permanently deleted from S3 should come as the first line

If the document is part of any active collections, those collections
will be deleted using the collections delete interface. Notably, this
means all OpenAI Vector Stores and Assistants to which this document
belongs will be deleted.
24 changes: 24 additions & 0 deletions backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,30 @@ def remove_doc(
return APIResponse.success_response(data)


@router.delete(
"/remove/{doc_id}/permanent",
description=load_description("documents/permanent_delete.md"),
response_model=APIResponse[Document],
)
def permanent_delete_doc(
session: SessionDep,
current_user: CurrentUser,
doc_id: UUID = FastPath(description="Document to permanently delete"),
):
a_crud = OpenAIAssistantCrud()
d_crud = DocumentCrud(session, current_user.id)
c_crud = CollectionCrud(session, current_user.id)
Comment on lines +87 to +89
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a big fan of names like a_crud and d_crud. I raised the same point in Jerome's PR, but I don't know what the best approach is in OOP.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AkhileshNegi that's right.
But this naming is already used across the document module, so I followed it here to keep things consistent within the module.

storage = AmazonCloudStorage(current_user)

document = d_crud.read_one(doc_id)

c_crud.delete(document, a_crud)
storage.delete(document.object_store_url)
d_crud.delete(doc_id)

return APIResponse.success_response(document)


@router.get(
"/info/{doc_id}",
description=load_description("documents/info.md"),
Expand Down
8 changes: 8 additions & 0 deletions backend/app/core/cloud/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,11 @@
return self.aws.client.get_object(**kwargs).get("Body")
except ClientError as err:
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

def delete(self, url: str) -> None:
    """Delete the stored object that ``url`` refers to.

    ``url`` is parsed with ``SimpleStorageName.from_url`` and the fields of
    the resulting name are forwarded as keyword arguments to the client's
    ``delete_object`` call.

    Raises:
        CloudStorageError: if the AWS client raises ``ClientError``; the
            original error is chained as the cause.
    """
    target = asdict(SimpleStorageName.from_url(url))
    try:
        self.aws.client.delete_object(**target)
    except ClientError as err:
        raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

Check warning on line 134 in backend/app/core/cloud/storage.py

View check run for this annotation

Codecov / codecov/patch

backend/app/core/cloud/storage.py#L133-L134

Added lines #L133 - L134 were not covered by tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
from pathlib import Path
from urllib.parse import urlparse

import pytest
from botocore.exceptions import ClientError
from moto import mock_aws
from sqlmodel import Session, select

import openai_responses

from app.core.cloud import AmazonCloudStorageClient
from app.core.config import settings
from app.models import Document
from app.tests.utils.document import (
DocumentStore,
DocumentMaker,
Route,
WebCrawler,
crawler,
)
from app.tests.utils.utils import openai_credentials


@pytest.fixture
def route():
    """Provide a ``Route`` built from the ``"remove"`` endpoint name."""
    remove_route = Route("remove")
    return remove_route


@pytest.fixture(scope="class")
def aws_credentials():
    """Put placeholder AWS credentials and the configured region in ``os.environ``.

    The four credential variables are set to the literal ``"testing"``; the
    region is copied from ``settings.AWS_DEFAULT_REGION``. Nothing is restored
    afterwards.
    """
    for variable in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
    ):
        os.environ[variable] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = settings.AWS_DEFAULT_REGION
Comment on lines +31 to +36
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think at some point we should also create a .env.test file, since more test cases will likely need the same setup as we add them.
This block is duplicated from backend/app/tests/api/routes/documents/test_route_document_upload.py.

Copy link
Collaborator Author

@avirajsingh7 avirajsingh7 Jun 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed.
For the time being, we can go with this way.



@pytest.mark.usefixtures("openai_credentials", "aws_credentials")
@mock_aws
class TestDocumentRoutePermanentRemove:
    """Route tests for the ``permanent`` suffix of the document remove endpoint."""

    @openai_responses.mock()
    def test_permanent_delete_document_from_s3(
        self,
        db: Session,
        route: Route,
        crawler: WebCrawler,
    ):
        """The row stays in the DB with ``deleted_at`` set; the S3 object is gone."""
        # Storage client used both for setup and for the final existence check.
        storage_client = AmazonCloudStorageClient()
        storage_client.create()

        # Store a document row, then upload an object under the key that the
        # row's object_store_url points at (URL path without the leading "/").
        document = DocumentStore(db).put()
        url_path = urlparse(document.object_store_url).path
        object_key = str(Path(url_path).relative_to("/"))
        storage_client.client.put_object(
            Bucket=settings.AWS_S3_BUCKET, Key=object_key, Body=b"test"
        )

        # Call the permanent-delete route.
        endpoint = route.append(document, suffix="permanent")
        response = crawler.delete(endpoint)
        assert response.is_success

        db.refresh(document)

        # The record must still be selectable, but flagged as deleted.
        persisted = db.exec(
            select(Document).where(Document.id == document.id)
        ).first()
        assert persisted is not None
        assert persisted.deleted_at is not None

        # The object itself must no longer exist in the bucket.
        with pytest.raises(ClientError) as exc_info:
            storage_client.client.head_object(
                Bucket=settings.AWS_S3_BUCKET,
                Key=object_key,
            )
        # Checked after the ``with`` block so the assertion actually executes.
        assert exc_info.value.response["Error"]["Code"] == "404"

    @openai_responses.mock()
    def test_cannot_delete_nonexistent_document(
        self,
        db: Session,
        route: Route,
        crawler: WebCrawler,
    ):
        """Deleting a document that was never stored yields an error response."""
        DocumentStore.clear(db)

        # ``next(DocumentMaker)`` builds a Document without going through
        # DocumentStore.put, so the route is asked about an id that was
        # never stored after the clear above.
        unsaved_document = next(DocumentMaker(db))
        endpoint = route.append(unsaved_document, suffix="permanent")
        response = crawler.delete(endpoint)

        assert response.is_error
21 changes: 15 additions & 6 deletions backend/app/tests/utils/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ def __iter__(self):

def __next__(self):
    """Construct the next ``Document`` for this maker's owner.

    The id is taken from ``self.index``. The object store URL is an
    ``s3://`` URL inside ``settings.AWS_S3_BUCKET`` whose key is
    ``<owner_id>/<doc_id>.txt``; the file name is ``<doc_id>.xyz``.

    Returns:
        Document: a new instance populated with the values above.
    """
    doc_id = next(self.index)
    key = f"{self.owner_id}/{doc_id}.txt"
    object_store_url = f"s3://{settings.AWS_S3_BUCKET}/{key}"

    # Each keyword is passed exactly once: repeating ``fname`` or
    # ``object_store_url`` in the call would be a SyntaxError.
    return Document(
        id=doc_id,
        owner_id=self.owner_id,
        fname=f"{doc_id}.xyz",
        object_store_url=object_store_url,
    )


Expand Down Expand Up @@ -102,8 +102,11 @@ def to_url(self):

return self._empty._replace(**kwargs)

def append(self, doc: Document, suffix: str = None):
    """Return a new route that points at ``doc`` under this endpoint.

    Args:
        doc: document whose ``id`` becomes the next path segment.
        suffix: optional trailing path segment (for example
            ``"permanent"``); omitted when ``None`` or empty.

    Returns:
        A new instance of the same class, built from the extended endpoint
        and this route's existing ``qs_args``.
    """
    # Single definition: the optional ``suffix`` defaults to None, so callers
    # that pass only ``doc`` get the same ``<endpoint>/<doc.id>`` path as before.
    segments = [self.endpoint, str(doc.id)]
    if suffix:
        segments.append(suffix)
    endpoint = Path(*segments)
    return type(self)(endpoint, **self.qs_args)


Expand All @@ -118,6 +121,12 @@ def get(self, route: Route):
headers=self.superuser_token_headers,
)

def delete(self, route: Route):
    """Send a DELETE request for ``route`` using the superuser token headers.

    Returns whatever ``self.client.delete`` returns.
    """
    send = self.client.delete
    return send(str(route), headers=self.superuser_token_headers)


class DocumentComparator:
@ft.singledispatchmethod
Expand Down