From 47a9b8977bc44cb4fdb1554fa0dc1425ed3909f3 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 19 Jun 2025 18:12:59 +0530 Subject: [PATCH 01/12] routes and deps --- backend/app/api/deps.py | 24 +++++++ backend/app/api/routes/collections.py | 95 ++++++++++++++------------- backend/app/models/collection.py | 37 ++++++++--- backend/app/models/organization.py | 4 ++ backend/app/models/project.py | 3 + 5 files changed, 110 insertions(+), 53 deletions(-) diff --git a/backend/app/api/deps.py b/backend/app/api/deps.py index 50455d54..61abef4a 100644 --- a/backend/app/api/deps.py +++ b/backend/app/api/deps.py @@ -107,6 +107,30 @@ def get_current_user_org( CurrentUserOrg = Annotated[UserOrganization, Depends(get_current_user_org)] +def get_current_user_org_project( + current_user: CurrentUser, session: SessionDep, request: Request +) -> UserProjectOrg: + api_key = request.headers.get("X-API-KEY") + organization_id = None + project_id = None + + if api_key: + api_key_record = get_api_key_by_value(session, api_key) + if api_key_record: + validate_organization(session, api_key_record.organization_id) + organization_id = api_key_record.organization_id + project_id = api_key_record.project_id + + return UserProjectOrg( + **current_user.model_dump(), + organization_id=organization_id, + project_id=project_id, + ) + + +CurrentUserOrgproject = Annotated[UserProjectOrg, Depends(get_current_user_org_project)] + + def get_current_active_superuser(current_user: CurrentUser) -> User: if not current_user.is_superuser: raise HTTPException( diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 532bab72..e838d2be 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -11,7 +11,7 @@ from pydantic import BaseModel, Field, HttpUrl from sqlalchemy.exc import NoResultFound, MultipleResultsFound, SQLAlchemyError -from app.api.deps import CurrentUser, SessionDep +from app.api.deps import CurrentUser, 
SessionDep, CurrentUserOrgproject from app.core.cloud import AmazonCloudStorage from app.core.config import settings from app.core.util import now, raise_from_unknown, post_callback @@ -169,65 +169,59 @@ def _backout(crud: OpenAIAssistantCrud, assistant_id: str): def do_create_collection( session: SessionDep, - current_user: CurrentUser, + current_user: CurrentUserOrgproject, request: CreationRequest, payload: ResponsePayload, ): client = OpenAI(api_key=settings.OPENAI_API_KEY) - if request.callback_url is None: - callback = SilentCallback(payload) - else: - callback = WebHookCallback(request.callback_url, payload) - - # - # Create the assistant and vector store - # + callback = ( + SilentCallback(payload) + if request.callback_url is None + else WebHookCallback(request.callback_url, payload) + ) vector_store_crud = OpenAIVectorStoreCrud(client) - try: - vector_store = vector_store_crud.create() - except OpenAIError as err: - callback.fail(str(err)) - return - + assistant_crud = OpenAIAssistantCrud(client) storage = AmazonCloudStorage(current_user) document_crud = DocumentCrud(session, current_user.id) - assistant_crud = OpenAIAssistantCrud(client) + collection_crud = CollectionCrud(session, current_user.id) - docs = request(document_crud) - kwargs = dict(request.extract_super_type(AssistantOptions)) try: + vector_store = vector_store_crud.create() + + docs = request(document_crud) updates = vector_store_crud.update(vector_store.id, storage, docs) documents = list(updates) + + kwargs = dict(request.extract_super_type(AssistantOptions)) assistant = assistant_crud.create(vector_store.id, **kwargs) - except Exception as err: # blanket to handle SQL and OpenAI errors - logging.error(f"File Search setup error: {err} ({type(err).__name__})") - vector_store_crud.delete(vector_store.id) - callback.fail(str(err)) - return - # - # Store the results - # + # 3. 
Read and update collection with assistant info + collection = collection_crud.read_one(UUID(payload.key)) + collection.llm_service_id = assistant.id + collection.llm_service_name = request.model + collection.status = "successfull" + collection.updated_at = now() - collection_crud = CollectionCrud(session, current_user.id) - collection = Collection( - id=UUID(payload.key), - llm_service_id=assistant.id, - llm_service_name=request.model, - ) - try: - collection_crud.create(collection, documents) - except SQLAlchemyError as err: - _backout(assistant_crud, assistant.id) - callback.fail(str(err)) - return + dc_crud = DocumentCollectionCrud(session) + dc_crud.create(collection, documents) - # - # Send back successful response - # + collection_crud._update(collection) - callback.success(collection.model_dump(mode="json")) + callback.success({"id": payload.key}) + except Exception as err: + logging.error(f"[CollectionTask] Failed: {err} ({type(err).__name__})") + + # 4. On failure, update collection status only + try: + collection = collection_crud.read_one(UUID(payload.key)) + collection.status = "failed" + collection.updated_at = now() + collection_crud._update(collection) + except Exception as suberr: + logging.error(f"Failed to update collection status: {suberr}") + + callback.fail(str(err)) @router.post( @@ -236,7 +230,7 @@ def do_create_collection( ) def create_collection( session: SessionDep, - current_user: CurrentUser, + current_user: CurrentUserOrgproject, request: CreationRequest, background_tasks: BackgroundTasks, ): @@ -244,6 +238,19 @@ def create_collection( route = router.url_path_for(this.f_code.co_name) payload = ResponsePayload("processing", route) + # 1. 
Create initial collection record + collection = Collection( + id=UUID(payload.key), + owner_id=current_user.id, + organization_id=current_user.organization_id, + project_id=current_user.project_id, + status="processing", + ) + + collection_crud = CollectionCrud(session, current_user.id) + collection_crud.create(collection, documents=[]) + + # 2. Launch background task background_tasks.add_task( do_create_collection, session, diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 2d7ea2a5..7a574c0a 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -1,27 +1,46 @@ from uuid import UUID, uuid4 from datetime import datetime +from typing import Optional from sqlmodel import Field, Relationship, SQLModel from app.core.util import now from .user import User +from .organization import Organization +from .project import Project +from enum import Enum class Collection(SQLModel, table=True): - id: UUID = Field( - default_factory=uuid4, - primary_key=True, - ) + id: UUID = Field(default_factory=uuid4, primary_key=True) + owner_id: int = Field( foreign_key="user.id", nullable=False, ondelete="CASCADE", ) - llm_service_id: str - llm_service_name: str - created_at: datetime = Field( - default_factory=now, + + organization_id: int = Field( + foreign_key="organization.id", + nullable=False, + ondelete="CASCADE", + ) + + project_id: int = Field( + foreign_key="project.id", + nullable=True, + ondelete="SET NULL", ) - deleted_at: datetime | None + + llm_service_id: Optional[str] = Field(default=None, nullable=True) + llm_service_name: Optional[str] = Field(default=None, nullable=True) + + status: Optional[str] = None + + created_at: datetime = Field(default_factory=now) + updated_at: datetime = Field(default_factory=now) + deleted_at: Optional[datetime] = None owner: User = Relationship(back_populates="collections") + organization: Organization = Relationship(back_populates="collections") + project: Project = 
Relationship(back_populates="collections") diff --git a/backend/app/models/organization.py b/backend/app/models/organization.py index 7e3e2cd8..fc52bf83 100644 --- a/backend/app/models/organization.py +++ b/backend/app/models/organization.py @@ -10,6 +10,7 @@ from .project import Project from .api_key import APIKey from .assistants import Assistant + from .collection import Collection # Shared properties for an Organization @@ -48,6 +49,9 @@ class Organization(OrganizationBase, table=True): assistants: list["Assistant"] = Relationship( back_populates="organization", sa_relationship_kwargs={"cascade": "all, delete"} ) + collections: list["Collection"] = Relationship( + back_populates="organization", sa_relationship_kwargs={"cascade": "all, delete"} + ) # Properties to return via API diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 8a56ec81..df63f0d4 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -44,6 +44,9 @@ class Project(ProjectBase, table=True): back_populates="project", sa_relationship_kwargs={"cascade": "all, delete"} ) organization: Optional["Organization"] = Relationship(back_populates="project") + collections: list["Collection"] = Relationship( + back_populates="project", sa_relationship_kwargs={"cascade": "all, delete"} + ) # Properties to return via API From f3494ecc5da6a88d2d8f52fd728f536a92fe3f3d Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 19 Jun 2025 18:24:59 +0530 Subject: [PATCH 02/12] small fix --- backend/app/api/routes/collections.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index e838d2be..4230f55a 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -174,11 +174,10 @@ def do_create_collection( payload: ResponsePayload, ): client = OpenAI(api_key=settings.OPENAI_API_KEY) - callback = ( - SilentCallback(payload) - if 
request.callback_url is None - else WebHookCallback(request.callback_url, payload) - ) + if request.callback_url is None: + callback = SilentCallback(payload) + else: + callback = WebHookCallback(request.callback_url, payload) vector_store_crud = OpenAIVectorStoreCrud(client) assistant_crud = OpenAIAssistantCrud(client) From e30f5d8abd49c86cab185eefcc128d3044005a6c Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 19 Jun 2025 18:43:24 +0530 Subject: [PATCH 03/12] logging --- backend/app/api/routes/collections.py | 77 +++++++++++++++++++-------- backend/app/core/cloud/storage.py | 7 +++ 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 4230f55a..38e6029a 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -1,5 +1,6 @@ import inspect import logging +import time import warnings from uuid import UUID, uuid4 from typing import Any, List, Optional @@ -169,58 +170,90 @@ def _backout(crud: OpenAIAssistantCrud, assistant_id: str): def do_create_collection( session: SessionDep, - current_user: CurrentUserOrgproject, + current_user: CurrentUser, request: CreationRequest, payload: ResponsePayload, ): + start_time = time.time() client = OpenAI(api_key=settings.OPENAI_API_KEY) if request.callback_url is None: callback = SilentCallback(payload) else: callback = WebHookCallback(request.callback_url, payload) + # + # Create the assistant and vector store + # + vector_store_crud = OpenAIVectorStoreCrud(client) - assistant_crud = OpenAIAssistantCrud(client) + try: + vector_store = vector_store_crud.create() + except OpenAIError as err: + callback.fail(str(err)) + return + storage = AmazonCloudStorage(current_user) document_crud = DocumentCrud(session, current_user.id) - collection_crud = CollectionCrud(session, current_user.id) + assistant_crud = OpenAIAssistantCrud(client) - try: - vector_store = vector_store_crud.create() + docs = 
request(document_crud) + log_doc = list(docs) + doc_count = len(log_doc) + flat_docs = [doc for sublist in log_doc for doc in sublist] + file_exts = list( + {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} + ) - docs = request(document_crud) + file_sizes_kb = [] + for doc in flat_docs: + size_kb = storage.get_file_size_kb(doc.object_store_url) + file_sizes_kb.append(size_kb) + + kwargs = dict(request.extract_super_type(AssistantOptions)) + try: updates = vector_store_crud.update(vector_store.id, storage, docs) documents = list(updates) - - kwargs = dict(request.extract_super_type(AssistantOptions)) assistant = assistant_crud.create(vector_store.id, **kwargs) + except Exception as err: # blanket to handle SQL and OpenAI errors + logging.error(f"File Search setup error: {err} ({type(err).__name__})") + vector_store_crud.delete(vector_store.id) + callback.fail(str(err)) + return + + # + # Store the results + # - # 3. Read and update collection with assistant info + collection_crud = CollectionCrud(session, current_user.id) + try: + collection_crud = CollectionCrud(session, current_user.id) collection = collection_crud.read_one(UUID(payload.key)) collection.llm_service_id = assistant.id collection.llm_service_name = request.model - collection.status = "successfull" + collection.status = "Successful" collection.updated_at = now() dc_crud = DocumentCollectionCrud(session) dc_crud.create(collection, documents) collection_crud._update(collection) + except SQLAlchemyError as err: + _backout(assistant_crud, assistant.id) + logging.error(f"[Error during creating colletion - {err}") + callback.fail(str(err)) + return - callback.success({"id": payload.key}) - except Exception as err: - logging.error(f"[CollectionTask] Failed: {err} ({type(err).__name__})") + elapsed = time.time() - start_time + logging.info( + f"Collection created: {collection.id} | " + f"Time: {elapsed}s | Files: {doc_count} |Sizes:{file_sizes_kb} KB |Types: {file_exts}" + ) - # 4. 
On failure, update collection status only - try: - collection = collection_crud.read_one(UUID(payload.key)) - collection.status = "failed" - collection.updated_at = now() - collection_crud._update(collection) - except Exception as suberr: - logging.error(f"Failed to update collection status: {suberr}") + # + # Send back successful response + # - callback.fail(str(err)) + callback.success(collection.model_dump(mode="json")) @router.post( diff --git a/backend/app/core/cloud/storage.py b/backend/app/core/cloud/storage.py index 341abad5..78a44fed 100644 --- a/backend/app/core/cloud/storage.py +++ b/backend/app/core/cloud/storage.py @@ -124,3 +124,10 @@ def stream(self, url: str) -> StreamingBody: return self.aws.client.get_object(**kwargs).get("Body") except ClientError as err: raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err + + def get_file_size_kb(self, url: str) -> float: + name = SimpleStorageName.from_url(url) + kwargs = asdict(name) + response = self.aws.client.head_object(**kwargs) + size_bytes = response["ContentLength"] + return round(size_bytes / 1024, 2) From 60c1afdf27d303d7d2e22ba1d0a41ee070dfa463 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 19 Jun 2025 20:31:50 +0530 Subject: [PATCH 04/12] adding org id and project in crud tests --- backend/app/tests/utils/collection.py | 37 +++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/backend/app/tests/utils/collection.py b/backend/app/tests/utils/collection.py index c4b4804d..ded5c728 100644 --- a/backend/app/tests/utils/collection.py +++ b/backend/app/tests/utils/collection.py @@ -1,11 +1,13 @@ from uuid import UUID +from uuid import uuid4 from openai import OpenAI from sqlmodel import Session from app.core.config import settings -from app.models import Collection +from app.models import Collection, Organization, Project from app.tests.utils.utils import get_user_id_by_email +from app.crud import create_api_key class constants: @@ -21,27 +23,36 @@ def 
uuid_increment(value: UUID):
 def get_collection(db: Session, client=None):
     owner_id = get_user_id_by_email(db)
 
+    # Step 1: Create real organization and project entries
+    organization = Organization(name=f"Test Org {uuid4()}")
+    db.add(organization)
+    db.commit()
+    db.refresh(organization)
+
+    project = Project(name=f"Test Project {uuid4()}", organization_id=organization.id)
+    db.add(project)
+    db.commit()
+    db.refresh(project)
+
+    # Step 2: Create API key for user with valid foreign keys
+    create_api_key(
+        db, organization_id=organization.id, user_id=owner_id, project_id=project.id
+    )
+
     if client is None:
         client = OpenAI(api_key=settings.OPENAI_API_KEY)
+
     vector_store = client.vector_stores.create()
     assistant = client.beta.assistants.create(
         model=constants.openai_model,
-        tools=[
-            {
-                "type": "file_search",
-            },
-        ],
-        tool_resources={
-            "file_search": {
-                "vector_store_ids": [
-                    vector_store.id,
-                ],
-            },
-        },
+        tools=[{"type": "file_search"}],
+        tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
     )
 
     return Collection(
         owner_id=owner_id,
+        organization_id=organization.id,
+        project_id=project.id,
         llm_service_id=assistant.id,
         llm_service_name=constants.llm_service_name,
     )

From c0ae5a3a32e56126b9c36596e2bdff96f7ffe612 Mon Sep 17 00:00:00 2001
From: nishika26
Date: Thu, 19 Jun 2025 20:37:50 +0530
Subject: [PATCH 05/12] migration file

---
 ..._add_organization_id_project_id_status_.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py

diff --git a/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py b/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py
new file mode 100644
index 00000000..276bd2b8
--- /dev/null
+++ b/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py
@@ -0,0 +1,61 @@
+"""Add organization_id, project_id, status, and updated_at 
to Collection + +Revision ID: 75b5156a28fd +Revises: 8757b005d681 +Create Date: 2025-06-19 15:38:02.609786 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes + + +# revision identifiers, used by Alembic. +revision = "75b5156a28fd" +down_revision = "8757b005d681" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "collection", sa.Column("organization_id", sa.Integer(), nullable=False) + ) + op.add_column("collection", sa.Column("project_id", sa.Integer(), nullable=True)) + op.add_column( + "collection", + sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.add_column("collection", sa.Column("updated_at", sa.DateTime(), nullable=False)) + op.alter_column( + "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=True + ) + op.alter_column( + "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=True + ) + op.create_foreign_key( + None, "collection", "project", ["project_id"], ["id"], ondelete="SET NULL" + ) + op.create_foreign_key( + None, + "collection", + "organization", + ["organization_id"], + ["id"], + ondelete="CASCADE", + ) + + +def downgrade(): + op.drop_constraint(None, "collection", type_="foreignkey") + op.drop_constraint(None, "collection", type_="foreignkey") + op.alter_column( + "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=False + ) + op.alter_column( + "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=False + ) + op.drop_column("collection", "updated_at") + op.drop_column("collection", "status") + op.drop_column("collection", "project_id") + op.drop_column("collection", "organization_id") From 8a2b80f0617c09837f7713b86c902ea6b43cb670 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 14:52:58 +0530 Subject: [PATCH 06/12] test cases --- backend/app/api/routes/collections.py | 75 ++++++++++++--------------- backend/app/crud/collection.py | 18 +++++-- 
backend/app/tests/conftest.py | 7 ++- backend/app/tests/utils/utils.py | 46 +++++++++++++--- 4 files changed, 91 insertions(+), 55 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 38e6029a..23c0e1f7 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -176,83 +176,75 @@ def do_create_collection( ): start_time = time.time() client = OpenAI(api_key=settings.OPENAI_API_KEY) - if request.callback_url is None: - callback = SilentCallback(payload) - else: - callback = WebHookCallback(request.callback_url, payload) - - # - # Create the assistant and vector store - # + callback = ( + SilentCallback(payload) + if request.callback_url is None + else WebHookCallback(request.callback_url, payload) + ) vector_store_crud = OpenAIVectorStoreCrud(client) try: vector_store = vector_store_crud.create() except OpenAIError as err: - callback.fail(str(err)) + callback.fail(f"Vector store creation failed: {err}") return storage = AmazonCloudStorage(current_user) document_crud = DocumentCrud(session, current_user.id) assistant_crud = OpenAIAssistantCrud(client) - docs = request(document_crud) - log_doc = list(docs) - doc_count = len(log_doc) - flat_docs = [doc for sublist in log_doc for doc in sublist] - file_exts = list( - {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - ) + try: + docs = list(request(document_crud)) + flat_docs = [doc for sublist in docs for doc in sublist] + except Exception as err: + logging.error(f"[Document Fetch Error] {err}") + callback.fail(f"Document fetch failed: {err}") + return - file_sizes_kb = [] - for doc in flat_docs: - size_kb = storage.get_file_size_kb(doc.object_store_url) - file_sizes_kb.append(size_kb) + # Step 3: Collect file metadata + file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} + file_sizes_kb = [ + storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs + ] - kwargs = dict(request.extract_super_type(AssistantOptions)) try: - updates = vector_store_crud.update(vector_store.id, storage, docs) - documents = list(updates) - assistant = assistant_crud.create(vector_store.id, **kwargs) - except Exception as err: # blanket to handle SQL and OpenAI errors - logging.error(f"File Search setup error: {err} ({type(err).__name__})") + vector_store_crud.update(vector_store.id, storage, iter(docs)) + assistant = assistant_crud.create( + vector_store.id, **dict(request.extract_super_type(AssistantOptions)) + ) + except Exception as err: + logging.error(f"[Assistant/Vector Update Error] {err}") vector_store_crud.delete(vector_store.id) callback.fail(str(err)) return - # - # Store the results - # - collection_crud = CollectionCrud(session, current_user.id) try: - collection_crud = CollectionCrud(session, current_user.id) collection = collection_crud.read_one(UUID(payload.key)) collection.llm_service_id = assistant.id collection.llm_service_name = request.model collection.status = "Successful" collection.updated_at = now() - dc_crud = DocumentCollectionCrud(session) - dc_crud.create(collection, documents) + if flat_docs: + logging.info( + f"[DocumentCollection] Linking {len(flat_docs)} documents to collection {collection.id}" + ) + DocumentCollectionCrud(session).create(collection, flat_docs) collection_crud._update(collection) except SQLAlchemyError as err: _backout(assistant_crud, assistant.id) - logging.error(f"[Error during creating colletion - {err}") + logging.error(f"[Collection Save Error] {err}") callback.fail(str(err)) return elapsed = time.time() - start_time logging.info( - f"Collection created: {collection.id} | " - f"Time: {elapsed}s | Files: {doc_count} |Sizes:{file_sizes_kb} KB |Types: {file_exts}" + f"Collection created: {collection.id} | Time: {elapsed:.2f}s | " + f"Files: {len(flat_docs)} | Sizes: 
{file_sizes_kb} KB | Types: {list(file_exts)}" ) - # - # Send back successful response - # - callback.success(collection.model_dump(mode="json")) @@ -270,7 +262,6 @@ def create_collection( route = router.url_path_for(this.f_code.co_name) payload = ResponsePayload("processing", route) - # 1. Create initial collection record collection = Collection( id=UUID(payload.key), owner_id=current_user.id, @@ -280,7 +271,7 @@ def create_collection( ) collection_crud = CollectionCrud(session, current_user.id) - collection_crud.create(collection, documents=[]) + collection_crud.create(collection) # 2. Launch background task background_tasks.add_task( diff --git a/backend/app/crud/collection.py b/backend/app/crud/collection.py index 7b91f4fb..6f0ec2c9 100644 --- a/backend/app/crud/collection.py +++ b/backend/app/crud/collection.py @@ -1,7 +1,7 @@ import functools as ft from uuid import UUID from typing import Optional - +import logging from sqlmodel import Session, func, select, and_ from app.models import Document, Collection, DocumentCollection @@ -43,13 +43,23 @@ def _exists(self, collection: Collection): return bool(present) - def create(self, collection: Collection, documents: list[Document]): + def create( + self, collection: Collection, documents: Optional[list[Document]] = None + ): if self._exists(collection): raise FileExistsError("Collection already present") + # Update or create the collection first collection = self._update(collection) - dc_crud = DocumentCollectionCrud(self.session) - dc_crud.create(collection, documents) + + # Only link documents if present + if documents: + dc_crud = DocumentCollectionCrud(self.session) + dc_crud.create(collection, documents) + else: + logging.warning( + f"No documents provided for collection {collection.id}, skipping DocumentCollection creation." 
+ ) return collection diff --git a/backend/app/tests/conftest.py b/backend/app/tests/conftest.py index a68c3eca..6aeac176 100644 --- a/backend/app/tests/conftest.py +++ b/backend/app/tests/conftest.py @@ -18,7 +18,7 @@ Credential, ) from app.tests.utils.user import authentication_token_from_email -from app.tests.utils.utils import get_superuser_token_headers +from app.tests.utils.utils import get_superuser_token_headers, get_real_api_key_headers @pytest.fixture(scope="session", autouse=True) @@ -49,6 +49,11 @@ def superuser_token_headers(client: TestClient) -> dict[str, str]: return get_superuser_token_headers(client) +@pytest.fixture(scope="function") +def api_key_headers(db: Session) -> dict[str, str]: + return get_real_api_key_headers(db) + + @pytest.fixture(scope="module") def normal_user_token_headers(client: TestClient, db: Session) -> dict[str, str]: return authentication_token_from_email( diff --git a/backend/app/tests/utils/utils.py b/backend/app/tests/utils/utils.py index 2945e8b3..c251f904 100644 --- a/backend/app/tests/utils/utils.py +++ b/backend/app/tests/utils/utils.py @@ -9,6 +9,9 @@ from app.core.config import settings from app.crud.user import get_user_by_email +from app.models import Organization, Project, APIKey +from app.crud import create_api_key, get_api_key_by_value +from uuid import uuid4 T = TypeVar("T") @@ -39,16 +42,43 @@ def get_superuser_token_headers(client: TestClient) -> dict[str, str]: return headers -def get_user_id_by_email(db: Session): +def get_user_id_by_email(db: Session) -> int: user = get_user_by_email(session=db, email=settings.FIRST_SUPERUSER) return user.id +def get_real_api_key_headers(db: Session) -> dict[str, str]: + owner_id = get_user_id_by_email(db) + + # Step 1: Create real organization and project + organization = Organization(name=f"Test Org {uuid4()}") + db.add(organization) + db.commit() + db.refresh(organization) + + project = Project(name=f"Test Project {uuid4()}", organization_id=organization.id) + 
db.add(project) + db.commit() + db.refresh(project) + + # Step 2: Create API key + api_key = create_api_key( + db, + organization_id=organization.id, + user_id=owner_id, + project_id=project.id, + ) + + return {"X-API-Key": api_key.key} + + +def get_user_from_api_key(db: Session, api_key_headers: dict[str, str]) -> int: + key_value = api_key_headers["X-API-Key"] + api_key = get_api_key_by_value(db, api_key_value=key_value) + return api_key + + def get_non_existent_id(session: Session, model: Type[T]) -> int: - """ - Returns an ID that does not exist for the given model. - It fetches the current max ID and adds 1. - """ result = session.exec(select(model.id).order_by(model.id.desc())).first() return (result or 0) + 1 @@ -60,10 +90,10 @@ def __init__(self, start=0): def __iter__(self): return self - def __next__(self): - uu_id = self.peek() + def __next__(self) -> UUID: + uu_id = UUID(int=self.start) self.start += 1 return uu_id - def peek(self): + def peek(self) -> UUID: return UUID(int=self.start) From e15ed29f6a385f841d0af5855c4288d75a4ef49c Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 14:54:43 +0530 Subject: [PATCH 07/12] test cases --- .../collections/test_collection_info.py | 100 ++++++++++++++++++ .../collections/test_create_collections.py | 97 +++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 backend/app/tests/api/routes/collections/test_collection_info.py create mode 100644 backend/app/tests/api/routes/collections/test_create_collections.py diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py new file mode 100644 index 00000000..79b55fe6 --- /dev/null +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -0,0 +1,100 @@ +import pytest +from uuid import uuid4 +from datetime import datetime, timezone +from fastapi.testclient import TestClient +from sqlmodel import Session +from app.core.config import settings 
+from app.models import Collection
+from app.crud.collection import CollectionCrud
+from app.main import app
+from app.tests.utils.utils import get_user_from_api_key
+
+client = TestClient(app)
+
+
+def create_collection(
+    db,
+    user,
+    status: str = "processing",
+    with_llm: bool = False,
+):
+    now = datetime.now(timezone.utc)
+    collection = Collection(
+        id=uuid4(),
+        owner_id=user.user_id,
+        organization_id=user.organization_id,
+        project_id=user.project_id,
+        status=status,
+        updated_at=now,
+    )
+    if with_llm:
+        collection.llm_service_id = f"asst_{uuid4()}"
+        collection.llm_service_name = "gpt-4o"
+
+    db.add(collection)
+    db.commit()
+    db.refresh(collection)
+    return collection
+
+
+def test_collection_info_processing(
+    db: Session,
+    api_key_headers: dict[str, str],
+):
+    user = get_user_from_api_key(db, api_key_headers)
+    collection = create_collection(db, user, status="processing")
+
+    response = client.post(
+        f"{settings.API_V1_STR}/collections/info/{collection.id}",
+        headers=api_key_headers,
+    )
+
+    assert response.status_code == 200
+    data = response.json()["data"]
+
+    assert data["id"] == str(collection.id)
+    assert data["status"] == "processing"
+    assert data["llm_service_id"] is None
+    assert data["llm_service_name"] is None
+
+
+def test_collection_info_successful(
+    db: Session,
+    api_key_headers: dict[str, str],
+):
+    user = get_user_from_api_key(db, api_key_headers)
+    collection = create_collection(db, user, status="Successful", with_llm=True)
+
+    response = client.post(
+        f"{settings.API_V1_STR}/collections/info/{collection.id}",
+        headers=api_key_headers,
+    )
+
+    assert response.status_code == 200
+    data = response.json()["data"]
+
+    assert data["id"] == str(collection.id)
+    assert data["status"] == "Successful"
+    assert data["llm_service_id"] == collection.llm_service_id
+    assert data["llm_service_name"] == "gpt-4o"
+
+
+def test_collection_info_failed(
+    db: Session,
+    api_key_headers: dict[str, str],
+):
+    user = 
get_user_from_api_key(db, api_key_headers) + collection = create_collection(db, user, status="Failed") + + response = client.post( + f"{settings.API_V1_STR}/collections/info/{collection.id}", + headers=api_key_headers, + ) + + assert response.status_code == 200 + data = response.json()["data"] + + assert data["id"] == str(collection.id) + assert data["status"] == "Failed" + assert data["llm_service_id"] is None + assert data["llm_service_name"] is None diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py new file mode 100644 index 00000000..bd816605 --- /dev/null +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -0,0 +1,97 @@ +import pytest +from uuid import UUID +import io + +import openai_responses +from sqlmodel import Session, select +from fastapi.testclient import TestClient +from openai import OpenAIError + +from app.core.config import settings +from app.tests.utils.document import DocumentStore +from app.tests.utils.utils import openai_credentials, get_user_from_api_key +from app.main import app +from app.crud.collection import CollectionCrud +from app.api.routes.collections import CreationRequest, ResponsePayload + + +client = TestClient(app) + + +@pytest.fixture(autouse=True) +def mock_s3(monkeypatch): + class FakeStorage: + def __init__(self, *args, **kwargs): + pass + + def upload(self, file_obj, path: str, **kwargs): + return f"s3://fake-bucket/{path or 'mock-file.txt'}" + + def stream(self, file_obj): + fake_file = io.BytesIO(b"dummy content") + fake_file.name = "fake.txt" + return fake_file + + def get_file_size_kb(self, url: str) -> float: + return 1.0 # Simulate 1KB files + + class FakeS3Client: + def head_object(self, Bucket, Key): + return {"ContentLength": 1024} + + monkeypatch.setattr("app.api.routes.collections.AmazonCloudStorage", FakeStorage) + monkeypatch.setattr("boto3.client", lambda service: FakeS3Client()) + + 
+@pytest.mark.usefixtures("openai_credentials") +class TestCollectionRouteCreate: + _n_documents = 5 + + @openai_responses.mock() + def test_create_collection_success( + self, + client: TestClient, + db: Session, + api_key_headers: dict[str, str], + ): + store = DocumentStore(db) + documents = store.fill(self._n_documents) + doc_ids = [str(doc.id) for doc in documents] + + body = { + "documents": doc_ids, + "batch_size": 2, + "model": "gpt-4o", + "instructions": "Test collection assistant.", + "temperature": 0.1, + } + + response = client.post( + f"{settings.API_V1_STR}/collections/create", + json=body, + headers=api_key_headers, + ) + + assert response.status_code == 200 + json = response.json() + assert json["success"] is True + metadata = json.get("metadata", {}) + assert metadata["status"] == "processing" + assert UUID(metadata["key"]) + + collection_id = UUID(metadata["key"]) + + user = get_user_from_api_key(db, api_key_headers) + collection = CollectionCrud(db, user.user_id).read_one(collection_id) + + info_response = client.post( + f"{settings.API_V1_STR}/collections/info/{collection_id}", + headers=api_key_headers, + ) + assert info_response.status_code == 200 + info_data = info_response.json()["data"] + + assert collection.status == "Successful" + assert collection.owner_id == user.user_id + assert collection.llm_service_id is not None + assert collection.llm_service_name == "gpt-4o" From 56c51665ba4c87a70000c270d08f68b1ee32d9e4 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 15:01:05 +0530 Subject: [PATCH 08/12] formatting --- backend/app/core/cloud/storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/app/core/cloud/storage.py b/backend/app/core/cloud/storage.py index d8787170..0e8eaba4 100644 --- a/backend/app/core/cloud/storage.py +++ b/backend/app/core/cloud/storage.py @@ -1,6 +1,5 @@ import os -# import logging import functools as ft from pathlib import Path from dataclasses import dataclass, asdict @@ -139,4 +138,3 @@ 
def delete(self, url: str) -> None: self.aws.client.delete_object(**kwargs) except ClientError as err: raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err - From 230d6f1c7c3c02b4749582e9846297dc163af627 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 15:06:40 +0530 Subject: [PATCH 09/12] test case fix --- backend/app/crud/collection.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/backend/app/crud/collection.py b/backend/app/crud/collection.py index 6f0ec2c9..4e90ef97 100644 --- a/backend/app/crud/collection.py +++ b/backend/app/crud/collection.py @@ -46,9 +46,6 @@ def _exists(self, collection: Collection): def create( self, collection: Collection, documents: Optional[list[Document]] = None ): - if self._exists(collection): - raise FileExistsError("Collection already present") - # Update or create the collection first collection = self._update(collection) From c89ead50de3a8bfbfbf3fd84bc894493a30969e9 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 18:15:13 +0530 Subject: [PATCH 10/12] migration and pascalcase --- ..._add_organization_id_project_id_status_.py | 61 ------------------- backend/app/api/deps.py | 2 +- 2 files changed, 1 insertion(+), 62 deletions(-) delete mode 100644 backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py diff --git a/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py b/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py deleted file mode 100644 index 276bd2b8..00000000 --- a/backend/app/alembic/versions/75b5156a28fd_add_organization_id_project_id_status_.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Add organization_id, project_id, status, and updated_at to Collection - -Revision ID: 75b5156a28fd -Revises: 8757b005d681 -Create Date: 2025-06-19 15:38:02.609786 - -""" -from alembic import op -import sqlalchemy as sa -import sqlmodel.sql.sqltypes - - -# revision identifiers, used by Alembic. 
-revision = "75b5156a28fd" -down_revision = "8757b005d681" -branch_labels = None -depends_on = None - - -def upgrade(): - op.add_column( - "collection", sa.Column("organization_id", sa.Integer(), nullable=False) - ) - op.add_column("collection", sa.Column("project_id", sa.Integer(), nullable=True)) - op.add_column( - "collection", - sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) - op.add_column("collection", sa.Column("updated_at", sa.DateTime(), nullable=False)) - op.alter_column( - "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=True - ) - op.alter_column( - "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=True - ) - op.create_foreign_key( - None, "collection", "project", ["project_id"], ["id"], ondelete="SET NULL" - ) - op.create_foreign_key( - None, - "collection", - "organization", - ["organization_id"], - ["id"], - ondelete="CASCADE", - ) - - -def downgrade(): - op.drop_constraint(None, "collection", type_="foreignkey") - op.drop_constraint(None, "collection", type_="foreignkey") - op.alter_column( - "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=False - ) - op.alter_column( - "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=False - ) - op.drop_column("collection", "updated_at") - op.drop_column("collection", "status") - op.drop_column("collection", "project_id") - op.drop_column("collection", "organization_id") diff --git a/backend/app/api/deps.py b/backend/app/api/deps.py index 61abef4a..43810c38 100644 --- a/backend/app/api/deps.py +++ b/backend/app/api/deps.py @@ -128,7 +128,7 @@ def get_current_user_org_project( ) -CurrentUserOrgproject = Annotated[UserProjectOrg, Depends(get_current_user_org_project)] +CurrentUserOrgProject = Annotated[UserProjectOrg, Depends(get_current_user_org_project)] def get_current_active_superuser(current_user: CurrentUser) -> User: From f08dbfead5a3bbc4d5deb50899a4c0855fd918e2 Mon Sep 17 00:00:00 2001 From: 
nishika26 Date: Fri, 20 Jun 2025 20:10:51 +0530 Subject: [PATCH 11/12] pr review fixes --- ..._add_alter_columns_in_collections_table.py | 76 ++++++++++++++++ backend/app/api/routes/collections.py | 89 ++++++++++--------- backend/app/crud/collection.py | 21 +++-- backend/app/models/collection.py | 11 ++- .../collections/test_collection_info.py | 60 +++++++------ .../collections/test_create_collections.py | 24 +++-- backend/app/tests/conftest.py | 9 +- backend/app/tests/utils/utils.py | 33 ++----- 8 files changed, 205 insertions(+), 118 deletions(-) create mode 100644 backend/app/alembic/versions/3389c67fdcb4_add_alter_columns_in_collections_table.py diff --git a/backend/app/alembic/versions/3389c67fdcb4_add_alter_columns_in_collections_table.py b/backend/app/alembic/versions/3389c67fdcb4_add_alter_columns_in_collections_table.py new file mode 100644 index 00000000..9581de3e --- /dev/null +++ b/backend/app/alembic/versions/3389c67fdcb4_add_alter_columns_in_collections_table.py @@ -0,0 +1,76 @@ +"""add/alter columns in collections table + +Revision ID: 3389c67fdcb4 +Revises: 8757b005d681 +Create Date: 2025-06-20 18:08:16.585843 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. 
+revision = "3389c67fdcb4" +down_revision = "8757b005d681" +branch_labels = None +depends_on = None + +collection_status_enum = postgresql.ENUM( + "processing", + "successful", + "failed", + name="collectionstatus", + create_type=False, # we create manually to avoid duplicate issues +) + + +def upgrade(): + collection_status_enum.create(op.get_bind(), checkfirst=True) + op.add_column( + "collection", sa.Column("organization_id", sa.Integer(), nullable=False) + ) + op.add_column("collection", sa.Column("project_id", sa.Integer(), nullable=True)) + op.add_column( + "collection", + sa.Column( + "status", + collection_status_enum, + nullable=False, + server_default="processing", + ), + ) + op.add_column("collection", sa.Column("updated_at", sa.DateTime(), nullable=False)) + op.alter_column( + "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=True + ) + op.alter_column( + "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=True + ) + op.create_foreign_key( + None, + "collection", + "organization", + ["organization_id"], + ["id"], + ondelete="CASCADE", + ) + op.create_foreign_key( + None, "collection", "project", ["project_id"], ["id"], ondelete="CASCADE" + ) + + +def downgrade(): + op.drop_constraint(None, "collection", type_="foreignkey") + op.drop_constraint(None, "collection", type_="foreignkey") + op.alter_column( + "collection", "llm_service_name", existing_type=sa.VARCHAR(), nullable=False + ) + op.alter_column( + "collection", "llm_service_id", existing_type=sa.VARCHAR(), nullable=False + ) + op.drop_column("collection", "updated_at") + op.drop_column("collection", "status") + op.drop_column("collection", "project_id") + op.drop_column("collection", "organization_id") diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 23c0e1f7..1f1ea843 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -12,13 +12,14 @@ from pydantic import BaseModel, 
Field, HttpUrl from sqlalchemy.exc import NoResultFound, MultipleResultsFound, SQLAlchemyError -from app.api.deps import CurrentUser, SessionDep, CurrentUserOrgproject +from app.api.deps import CurrentUser, SessionDep, CurrentUserOrgProject from app.core.cloud import AmazonCloudStorage from app.core.config import settings from app.core.util import now, raise_from_unknown, post_callback from app.crud import DocumentCrud, CollectionCrud, DocumentCollectionCrud from app.crud.rag import OpenAIVectorStoreCrud, OpenAIAssistantCrud from app.models import Collection, Document +from app.models.collection import CollectionStatus from app.utils import APIResponse, load_description router = APIRouter(prefix="/collections", tags=["collections"]) @@ -182,48 +183,46 @@ def do_create_collection( else WebHookCallback(request.callback_url, payload) ) - vector_store_crud = OpenAIVectorStoreCrud(client) - try: - vector_store = vector_store_crud.create() - except OpenAIError as err: - callback.fail(f"Vector store creation failed: {err}") - return - storage = AmazonCloudStorage(current_user) document_crud = DocumentCrud(session, current_user.id) assistant_crud = OpenAIAssistantCrud(client) + vector_store_crud = OpenAIVectorStoreCrud(client) + collection_crud = CollectionCrud(session, current_user.id) try: + # Step 1: Create vector store + vector_store = vector_store_crud.create() + + # Step 2: Fetch documents docs = list(request(document_crud)) flat_docs = [doc for sublist in docs for doc in sublist] - except Exception as err: - logging.error(f"[Document Fetch Error] {err}") - callback.fail(f"Document fetch failed: {err}") - return - # Step 3: Collect file metadata - file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - file_sizes_kb = [ - storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs - ] + # Step 3: Collect file metadata + file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} + file_sizes_kb = [ + storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs + ] - try: + # Step 4: Upload documents to vector store + logging.info( + f"[VectorStore Update] Uploading {len(flat_docs)} documents to vector store {vector_store.id}" + ) vector_store_crud.update(vector_store.id, storage, iter(docs)) - assistant = assistant_crud.create( - vector_store.id, **dict(request.extract_super_type(AssistantOptions)) + logging.info(f"[VectorStore Upload] Upload completed") + + # Step 5: Create assistant + assistant_options = dict(request.extract_super_type(AssistantOptions)) + logging.info( + f"[Assistant Create] Creating assistant with options: {assistant_options}" ) - except Exception as err: - logging.error(f"[Assistant/Vector Update Error] {err}") - vector_store_crud.delete(vector_store.id) - callback.fail(str(err)) - return + assistant = assistant_crud.create(vector_store.id, **assistant_options) + logging.info(f"[Assistant Create] Assistant created: {assistant.id}") - collection_crud = CollectionCrud(session, current_user.id) - try: + # Step 6: Update collection collection = collection_crud.read_one(UUID(payload.key)) collection.llm_service_id = assistant.id collection.llm_service_name = request.model - collection.status = "Successful" + collection.status = CollectionStatus.successful collection.updated_at = now() if flat_docs: @@ -233,19 +232,27 @@ def do_create_collection( DocumentCollectionCrud(session).create(collection, flat_docs) collection_crud._update(collection) - except SQLAlchemyError as err: - _backout(assistant_crud, assistant.id) - logging.error(f"[Collection Save Error] {err}") - callback.fail(str(err)) - return - elapsed = time.time() - start_time - logging.info( - f"Collection created: {collection.id} | Time: {elapsed:.2f}s | " - f"Files: {len(flat_docs)} | Sizes: {file_sizes_kb} KB | Types: {list(file_exts)}" - ) + # Step 7: Final success callback + elapsed = time.time() - start_time + logging.info( + f"Collection 
created: {collection.id} | Time: {elapsed:.2f}s | " + f"Files: {len(flat_docs)} | Sizes: {file_sizes_kb} KB | Types: {list(file_exts)}" + ) + callback.success(collection.model_dump(mode="json")) - callback.success(collection.model_dump(mode="json")) + except Exception as err: + logging.error(f"[Collection Creation Failed] {err} ({type(err).__name__})") + if "assistant" in locals(): + _backout(assistant_crud, assistant.id) + try: + collection = collection_crud.read_one(UUID(payload.key)) + collection.status = CollectionStatus.failed + collection.updated_at = now() + collection_crud._update(collection) + except Exception as suberr: + logging.warning(f"[Collection Status Update Failed] {suberr}") + callback.fail(str(err)) @router.post( @@ -254,7 +261,7 @@ def do_create_collection( ) def create_collection( session: SessionDep, - current_user: CurrentUserOrgproject, + current_user: CurrentUserOrgProject, request: CreationRequest, background_tasks: BackgroundTasks, ): @@ -267,7 +274,7 @@ def create_collection( owner_id=current_user.id, organization_id=current_user.organization_id, project_id=current_user.project_id, - status="processing", + status=CollectionStatus.processing, ) collection_crud = CollectionCrud(session, current_user.id) diff --git a/backend/app/crud/collection.py b/backend/app/crud/collection.py index 4e90ef97..fe3e1b03 100644 --- a/backend/app/crud/collection.py +++ b/backend/app/crud/collection.py @@ -6,6 +6,7 @@ from app.models import Document, Collection, DocumentCollection from app.core.util import now +from app.models.collection import CollectionStatus from .document_collection import DocumentCollectionCrud @@ -44,19 +45,23 @@ def _exists(self, collection: Collection): return bool(present) def create( - self, collection: Collection, documents: Optional[list[Document]] = None + self, + collection: Collection, + documents: Optional[list[Document]] = None, ): - # Update or create the collection first - collection = self._update(collection) + try: + 
existing = self.read_one(collection.id) + if existing.status == CollectionStatus.failed: + self._update(collection) + else: + raise FileExistsError("Collection already present") + except: + self.session.add(collection) + self.session.commit() - # Only link documents if present if documents: dc_crud = DocumentCollectionCrud(self.session) dc_crud.create(collection, documents) - else: - logging.warning( - f"No documents provided for collection {collection.id}, skipping DocumentCollection creation." - ) return collection diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 7a574c0a..27bc66e4 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -8,9 +8,16 @@ from .user import User from .organization import Organization from .project import Project +import enum from enum import Enum +class CollectionStatus(str, enum.Enum): + processing = "processing" + successful = "successful" + failed = "failed" + + class Collection(SQLModel, table=True): id: UUID = Field(default_factory=uuid4, primary_key=True) @@ -29,13 +36,13 @@ class Collection(SQLModel, table=True): project_id: int = Field( foreign_key="project.id", nullable=True, - ondelete="SET NULL", + ondelete="CASCADE", ) llm_service_id: Optional[str] = Field(default=None, nullable=True) llm_service_name: Optional[str] = Field(default=None, nullable=True) - status: Optional[str] = None + status: CollectionStatus = Field(default=CollectionStatus.processing) created_at: datetime = Field(default_factory=now) updated_at: datetime = Field(default_factory=now) diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py index 79b55fe6..26e7ef91 100644 --- a/backend/app/tests/api/routes/collections/test_collection_info.py +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -8,14 +8,26 @@ from app.crud.collection import CollectionCrud from app.main import app 
from app.tests.utils.utils import get_user_from_api_key +from app.seed_data.seed_data import seed_database +from app.models.collection import CollectionStatus client = TestClient(app) +@pytest.fixture(scope="function", autouse=True) +def load_seed_data(db): + """Load seed data before each test.""" + seed_database(db) + yield + + +original_api_key = "ApiKey No3x47A5qoIGhm0kVKjQ77dhCqEdWRIQZlEPzzzh7i8" + + def create_collection( db, user, - status: str = "processing", + status: CollectionStatus = CollectionStatus.processing, with_llm: bool = False, ): now = datetime.now(timezone.utc) @@ -28,7 +40,7 @@ def create_collection( updated_at=now, ) if with_llm: - collection.llm_service_id = f"asst_str(uuid4())" + collection.llm_service_id = f"asst_{uuid4()}" collection.llm_service_name = "gpt-4o" db.add(collection) @@ -37,64 +49,60 @@ def create_collection( return collection -def test_collection_info_processing( - db: Session, - api_key_headers: dict[str, str], -): - user = get_user_from_api_key(db, api_key_headers) - collection = create_collection(db, user, status="processing") +def test_collection_info_processing(db: Session): + headers = {"X-API-KEY": original_api_key} + user = get_user_from_api_key(db, headers) + collection = create_collection(db, user, status=CollectionStatus.processing) response = client.post( f"{settings.API_V1_STR}/collections/info/{collection.id}", - headers=api_key_headers, + headers=headers, ) assert response.status_code == 200 data = response.json()["data"] assert data["id"] == str(collection.id) - assert data["status"] == "processing" + assert data["status"] == CollectionStatus.processing.value assert data["llm_service_id"] is None assert data["llm_service_name"] is None -def test_collection_info_successful( - db: Session, - api_key_headers: dict[str, str], -): - user = get_user_from_api_key(db, api_key_headers) - collection = create_collection(db, user, status="Successful", with_llm=True) +def test_collection_info_successful(db: Session): + 
headers = {"X-API-KEY": original_api_key} + user = get_user_from_api_key(db, headers) + collection = create_collection( + db, user, status=CollectionStatus.successful, with_llm=True + ) response = client.post( f"{settings.API_V1_STR}/collections/info/{collection.id}", - headers=api_key_headers, + headers=headers, ) assert response.status_code == 200 data = response.json()["data"] assert data["id"] == str(collection.id) - assert data["status"] == "Successful" + assert data["status"] == CollectionStatus.successful.value assert data["llm_service_id"] == collection.llm_service_id assert data["llm_service_name"] == "gpt-4o" -def test_collection_info_failed( - db: Session, - api_key_headers: dict[str, str], -): - user = get_user_from_api_key(db, api_key_headers) - collection = create_collection(db, user, status="Failed") +def test_collection_info_failed(db: Session): + headers = {"X-API-KEY": original_api_key} + user = get_user_from_api_key(db, headers) + collection = create_collection(db, user, status=CollectionStatus.failed) response = client.post( f"{settings.API_V1_STR}/collections/info/{collection.id}", - headers=api_key_headers, + headers=headers, ) assert response.status_code == 200 data = response.json()["data"] assert data["id"] == str(collection.id) - assert data["status"] == "Failed" + assert data["status"] == CollectionStatus.failed.value assert data["llm_service_id"] is None assert data["llm_service_name"] is None diff --git a/backend/app/tests/api/routes/collections/test_create_collections.py b/backend/app/tests/api/routes/collections/test_create_collections.py index bd816605..6fb0855c 100644 --- a/backend/app/tests/api/routes/collections/test_create_collections.py +++ b/backend/app/tests/api/routes/collections/test_create_collections.py @@ -13,11 +13,19 @@ from app.main import app from app.crud.collection import CollectionCrud from app.api.routes.collections import CreationRequest, ResponsePayload - +from app.seed_data.seed_data import seed_database +from 
app.models.collection import CollectionStatus client = TestClient(app) +@pytest.fixture(scope="function", autouse=True) +def load_seed_data(db): + """Load seed data before each test.""" + seed_database(db) + yield + + @pytest.fixture(autouse=True) def mock_s3(monkeypatch): class FakeStorage: @@ -52,7 +60,6 @@ def test_create_collection_success( self, client: TestClient, db: Session, - api_key_headers: dict[str, str], ): store = DocumentStore(db) documents = store.fill(self._n_documents) @@ -65,33 +72,36 @@ def test_create_collection_success( "instructions": "Test collection assistant.", "temperature": 0.1, } + original_api_key = "ApiKey No3x47A5qoIGhm0kVKjQ77dhCqEdWRIQZlEPzzzh7i8" + + headers = {"X-API-KEY": original_api_key} response = client.post( f"{settings.API_V1_STR}/collections/create", json=body, - headers=api_key_headers, + headers=headers, ) assert response.status_code == 200 json = response.json() assert json["success"] is True metadata = json.get("metadata", {}) - assert metadata["status"] == "processing" + assert metadata["status"] == CollectionStatus.processing.value assert UUID(metadata["key"]) collection_id = UUID(metadata["key"]) - user = get_user_from_api_key(db, api_key_headers) + user = get_user_from_api_key(db, headers) collection = CollectionCrud(db, user.user_id).read_one(collection_id) info_response = client.post( f"{settings.API_V1_STR}/collections/info/{collection_id}", - headers=api_key_headers, + headers=headers, ) assert info_response.status_code == 200 info_data = info_response.json()["data"] - assert collection.status == "Successful" + assert collection.status == CollectionStatus.successful.value assert collection.owner_id == user.user_id assert collection.llm_service_id is not None assert collection.llm_service_name == "gpt-4o" diff --git a/backend/app/tests/conftest.py b/backend/app/tests/conftest.py index 6aeac176..e2a6464d 100644 --- a/backend/app/tests/conftest.py +++ b/backend/app/tests/conftest.py @@ -16,9 +16,10 @@ User, 
OpenAI_Thread, Credential, + Collection, ) from app.tests.utils.user import authentication_token_from_email -from app.tests.utils.utils import get_superuser_token_headers, get_real_api_key_headers +from app.tests.utils.utils import get_superuser_token_headers @pytest.fixture(scope="session", autouse=True) @@ -35,6 +36,7 @@ def db() -> Generator[Session, None, None]: session.execute(delete(APIKey)) session.execute(delete(User)) session.execute(delete(OpenAI_Thread)) + session.execute(delete(Collection)) session.commit() @@ -49,11 +51,6 @@ def superuser_token_headers(client: TestClient) -> dict[str, str]: return get_superuser_token_headers(client) -@pytest.fixture(scope="function") -def api_key_headers(db: Session) -> dict[str, str]: - return get_real_api_key_headers(db) - - @pytest.fixture(scope="module") def normal_user_token_headers(client: TestClient, db: Session) -> dict[str, str]: return authentication_token_from_email( diff --git a/backend/app/tests/utils/utils.py b/backend/app/tests/utils/utils.py index c251f904..9cba7aae 100644 --- a/backend/app/tests/utils/utils.py +++ b/backend/app/tests/utils/utils.py @@ -9,7 +9,7 @@ from app.core.config import settings from app.crud.user import get_user_by_email -from app.models import Organization, Project, APIKey +from app.models import APIKeyPublic from app.crud import create_api_key, get_api_key_by_value from uuid import uuid4 @@ -47,34 +47,11 @@ def get_user_id_by_email(db: Session) -> int: return user.id -def get_real_api_key_headers(db: Session) -> dict[str, str]: - owner_id = get_user_id_by_email(db) - - # Step 1: Create real organization and project - organization = Organization(name=f"Test Org {uuid4()}") - db.add(organization) - db.commit() - db.refresh(organization) - - project = Project(name=f"Test Project {uuid4()}", organization_id=organization.id) - db.add(project) - db.commit() - db.refresh(project) - - # Step 2: Create API key - api_key = create_api_key( - db, - organization_id=organization.id, - 
user_id=owner_id, - project_id=project.id, - ) - - return {"X-API-Key": api_key.key} - - -def get_user_from_api_key(db: Session, api_key_headers: dict[str, str]) -> int: - key_value = api_key_headers["X-API-Key"] +def get_user_from_api_key(db: Session, api_key_headers: dict[str, str]) -> APIKeyPublic: + key_value = api_key_headers["X-API-KEY"] api_key = get_api_key_by_value(db, api_key_value=key_value) + if api_key is None: + raise ValueError("Invalid API Key") return api_key From 0a8f0e07e393aabae312c5072b9b4ab8bf773fce Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 20 Jun 2025 20:41:01 +0530 Subject: [PATCH 12/12] vector store fix --- backend/app/api/routes/collections.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index 1f1ea843..7c903930 100644 --- a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -190,27 +190,22 @@ def do_create_collection( collection_crud = CollectionCrud(session, current_user.id) try: - # Step 1: Create vector store vector_store = vector_store_crud.create() - # Step 2: Fetch documents docs = list(request(document_crud)) flat_docs = [doc for sublist in docs for doc in sublist] - # Step 3: Collect file metadata file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} file_sizes_kb = [ storage.get_file_size_kb(doc.object_store_url) for doc in flat_docs ] - # Step 4: Upload documents to vector store logging.info( f"[VectorStore Update] Uploading {len(flat_docs)} documents to vector store {vector_store.id}" ) - vector_store_crud.update(vector_store.id, storage, iter(docs)) + list(vector_store_crud.update(vector_store.id, storage, docs)) logging.info(f"[VectorStore Upload] Upload completed") - # Step 5: Create assistant assistant_options = dict(request.extract_super_type(AssistantOptions)) logging.info( f"[Assistant Create] Creating assistant with options: {assistant_options}" @@ -218,7 +213,6 @@ def do_create_collection( assistant = assistant_crud.create(vector_store.id, **assistant_options) logging.info(f"[Assistant Create] Assistant created: {assistant.id}") - # Step 6: Update collection collection = collection_crud.read_one(UUID(payload.key)) collection.llm_service_id = assistant.id collection.llm_service_name = request.model @@ -233,7 +227,6 @@ def do_create_collection( collection_crud._update(collection) - # Step 7: Final success callback elapsed = time.time() - start_time logging.info( f"Collection created: {collection.id} | Time: {elapsed:.2f}s | "