From 62d159ef2430871406c5ec9017b5c84c862376bf Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 09:53:20 +0530 Subject: [PATCH 01/64] first stab at setting up langfuse evaluation --- backend/app/api/main.py | 2 + backend/app/api/routes/evaluation.py | 127 +++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 backend/app/api/routes/evaluation.py diff --git a/backend/app/api/main.py b/backend/app/api/main.py index 2ba2079a1..36750865d 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -14,6 +14,7 @@ utils, onboarding, credentials, + evaluation, ) from app.core.config import settings @@ -22,6 +23,7 @@ api_router.include_router(collections.router) api_router.include_router(credentials.router) api_router.include_router(documents.router) +api_router.include_router(evaluation.router) api_router.include_router(login.router) api_router.include_router(onboarding.router) api_router.include_router(organization.router) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py new file mode 100644 index 000000000..596318b7d --- /dev/null +++ b/backend/app/api/routes/evaluation.py @@ -0,0 +1,127 @@ +from fastapi import APIRouter, Depends, BackgroundTasks +from sqlmodel import Session +from langfuse import Langfuse +from langfuse.decorators import langfuse_context + +from app.api.deps import get_current_user_org, get_db +from app.models import UserOrganization +from app.utils import APIResponse +from app.crud.credentials import get_provider_credential +from app.api.routes.threads import threads + +router = APIRouter(tags=["evaluation"]) + + +@router.post("/evaluate") +async def evaluate_threads( + experiment_name: str, + assistant_id: str, + dataset_name: str, + project_id: int, + background_tasks: BackgroundTasks, + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), +): + """Endpoint to run thread evaluations using Langfuse.""" + # 
Get OpenAI credentials + credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="openai", + project_id=project_id, + ) + if not credentials or "api_key" not in credentials: + return APIResponse.failure_response( + error="OpenAI API key not configured for this organization." + ) + + # Get Langfuse credentials + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + project_id=project_id, + ) + if not langfuse_credentials: + return APIResponse.failure_response( + error="LANGFUSE keys not configured for this organization." + ) + + # Configure Langfuse + langfuse = Langfuse( + public_key=langfuse_credentials["public_key"], + secret_key=langfuse_credentials["secret_key"], + host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), + ) + + langfuse_context.configure( + secret_key=langfuse_credentials["secret_key"], + public_key=langfuse_credentials["public_key"], + host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), + ) + + try: + # Get dataset + dataset = langfuse.get_dataset(dataset_name) + results = [] + + for item in dataset.items: + with item.observe(run_name=experiment_name) as trace_id: + # Prepare request + request = { + "question": item.input, + "assistant_id": assistant_id, + "remove_citation": True, + "project_id": project_id, + } + + # Process thread asynchronously + response = await threads( + request=request, + background_tasks=background_tasks, + _session=_session, + _current_user=_current_user, + ) + + # Extract message from the initial response + if isinstance(response, APIResponse) and response.success: + thread_id = response.data.get("thread_id") + # Note: The actual response will be sent to the callback URL + # We're just tracking that the thread was created + output = f"Thread created with ID: {thread_id}" + else: + output = "" + + # Evaluate based on thread creation success + is_match = 
bool(output) # Simplified evaluation for now + langfuse.score( + trace_id=trace_id, name="thread_creation_success", value=is_match + ) + + results.append( + { + "input": item.input, + "output": output, + "expected": item.expected_output, + "match": is_match, + "thread_id": thread_id if is_match else None, + } + ) + + # Flush Langfuse events + langfuse_context.flush() + langfuse.flush() + + return APIResponse.success_response( + data={ + "experiment_name": experiment_name, + "dataset_name": dataset_name, + "results": results, + "total_items": len(results), + "matches": sum(1 for r in results if r["match"]), + "note": "Threads are being processed in the background. Check the callback URLs for actual responses.", + } + ) + + except Exception as e: + return APIResponse.failure_response(error=str(e)) From 706c5dc54f3c0924828e4f27cf66ed10b033c2ba Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 13:20:07 +0530 Subject: [PATCH 02/64] bringing traces to sync --- backend/app/api/routes/threads.py | 75 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py index cc5ded319..2f296d7a5 100644 --- a/backend/app/api/routes/threads.py +++ b/backend/app/api/routes/threads.py @@ -59,7 +59,6 @@ def validate_thread(client: OpenAI, thread_id: str) -> tuple[bool, str]: return False, f"Invalid thread ID provided {thread_id}" -@observe(capture_input=False) def setup_thread(client: OpenAI, request: dict) -> tuple[bool, str]: """Set up thread and add message, either creating new or using existing.""" thread_id = request.get("thread_id") @@ -78,9 +77,6 @@ def setup_thread(client: OpenAI, request: dict) -> tuple[bool, str]: thread_id=thread.id, role="user", content=request["question"] ) request["thread_id"] = thread.id - langfuse_context.update_current_trace( - session_id=thread.id, name="New Thread ID created", output=thread.id - ) return True, None except 
openai.OpenAIError as e: return False, handle_openai_error(e) @@ -135,8 +131,8 @@ def extract_response_from_thread( @observe(as_type="generation") -def process_run(request: dict, client: OpenAI): - """Process a run and send callback with results.""" +def process_run_core(request: dict, client: OpenAI) -> tuple[dict, str]: + """Core function to process a run and return the response and message.""" try: run = client.beta.threads.runs.create_and_poll( thread_id=request["thread_id"], @@ -167,17 +163,21 @@ def process_run(request: dict, client: OpenAI): output=message, name="Thread Run Completed" ) - callback_response = create_success_response(request, message) + return create_success_response(request, message).model_dump(), None else: - callback_response = APIResponse.failure_response( - error=f"Run failed with status: {run.status}" - ) - - send_callback(request["callback_url"], callback_response.model_dump()) + error_msg = f"Run failed with status: {run.status}" + return APIResponse.failure_response(error=error_msg).model_dump(), error_msg except openai.OpenAIError as e: - callback_response = APIResponse.failure_response(error=handle_openai_error(e)) - send_callback(request["callback_url"], callback_response.model_dump()) + error_msg = handle_openai_error(e) + return APIResponse.failure_response(error=error_msg).model_dump(), error_msg + + +@observe(as_type="generation") +def process_run(request: dict, client: OpenAI): + """Process a run and send callback with results.""" + response, _ = process_run_core(request, client) + send_callback(request["callback_url"], response) def poll_run_and_prepare_response(request: dict, client: OpenAI, db: Session): @@ -276,6 +276,7 @@ async def threads( return initial_response +@observe() @router.post("/threads/sync") async def threads_sync( request: dict, @@ -288,7 +289,7 @@ async def threads_sync( session=_session, org_id=_current_user.organization_id, provider="openai", - project_id=_current_user.project_id, + 
project_id=request.get("project_id"), ) if not credentials or "api_key" not in credentials: return APIResponse.failure_response( @@ -297,6 +298,25 @@ async def threads_sync( client = OpenAI(api_key=credentials["api_key"]) + # Get Langfuse credentials + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + project_id=request.get("project_id"), + ) + if not langfuse_credentials: + return APIResponse.failure_response( + error="LANGFUSE keys not configured for this organization." + ) + + # Configure Langfuse + langfuse_context.configure( + secret_key=langfuse_credentials["secret_key"], + public_key=langfuse_credentials["public_key"], + host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), + ) + # Validate thread is_valid, error_message = validate_thread(client, request.get("thread_id")) if not is_valid: @@ -308,27 +328,10 @@ async def threads_sync( return APIResponse.failure_response(error=error_message) try: - # Process run - run = client.beta.threads.runs.create_and_poll( - thread_id=request["thread_id"], - assistant_id=request["assistant_id"], - ) - - if run.status == "completed": - messages = client.beta.threads.messages.list(thread_id=request["thread_id"]) - latest_message = messages.data[0] - message_content = latest_message.content[0].text.value - message = process_message_content( - message_content, request.get("remove_citation", False) - ) - return create_success_response(request, message) - else: - return APIResponse.failure_response( - error=f"Run failed with status: {run.status}" - ) - - except openai.OpenAIError as e: - return APIResponse.failure_response(error=handle_openai_error(e)) + response, error_message = process_run_core(request, client) + return response + finally: + langfuse_context.flush() @router.post("/threads/start") From d8821609f075727e4ccc7c20b1912739a556c6b8 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 13:39:38 +0530 
Subject: [PATCH 03/64] cleanup --- backend/app/api/routes/evaluation.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 596318b7d..f392cb42b 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Depends, BackgroundTasks +from fastapi import APIRouter, Depends from sqlmodel import Session from langfuse import Langfuse from langfuse.decorators import langfuse_context @@ -7,7 +7,7 @@ from app.models import UserOrganization from app.utils import APIResponse from app.crud.credentials import get_provider_credential -from app.api.routes.threads import threads +from app.api.routes.threads import threads_sync router = APIRouter(tags=["evaluation"]) @@ -18,7 +18,6 @@ async def evaluate_threads( assistant_id: str, dataset_name: str, project_id: int, - background_tasks: BackgroundTasks, _session: Session = Depends(get_db), _current_user: UserOrganization = Depends(get_current_user_org), ): @@ -75,29 +74,28 @@ async def evaluate_threads( "project_id": project_id, } - # Process thread asynchronously - response = await threads( + # Process thread synchronously + response = await threads_sync( request=request, - background_tasks=background_tasks, _session=_session, _current_user=_current_user, ) - # Extract message from the initial response + # Extract message from the response if isinstance(response, APIResponse) and response.success: + print(f"Response: {response.data}") + output = response.data.get("message", "") thread_id = response.data.get("thread_id") - # Note: The actual response will be sent to the callback URL - # We're just tracking that the thread was created - output = f"Thread created with ID: {thread_id}" else: output = "" + thread_id = None - # Evaluate based on thread creation success + # Evaluate based on response success is_match = bool(output) # 
Simplified evaluation for now langfuse.score( trace_id=trace_id, name="thread_creation_success", value=is_match ) - + print(f"Evaluating item: {item.input}, Match: {is_match}") results.append( { "input": item.input, @@ -119,7 +117,7 @@ async def evaluate_threads( "results": results, "total_items": len(results), "matches": sum(1 for r in results if r["match"]), - "note": "Threads are being processed in the background. Check the callback URLs for actual responses.", + "note": "All threads have been processed synchronously.", } ) From 168f52154f39b4a298b78d08519e36ee402f50c2 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 15:25:23 +0530 Subject: [PATCH 04/64] getting it up and running with sync --- backend/app/api/routes/threads.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py index 2f296d7a5..b6e197fcf 100644 --- a/backend/app/api/routes/threads.py +++ b/backend/app/api/routes/threads.py @@ -276,7 +276,6 @@ async def threads( return initial_response -@observe() @router.post("/threads/sync") async def threads_sync( request: dict, From fa509ad619e9d31d408dff080ff9a53a98221d38 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 20:12:11 +0530 Subject: [PATCH 05/64] using utils --- backend/app/api/routes/evaluation.py | 17 +++++-------- backend/app/api/routes/threads.py | 23 ++++++++++-------- backend/app/core/util.py | 36 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f392cb42b..32b4c010e 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -8,6 +8,7 @@ from app.utils import APIResponse from app.crud.credentials import get_provider_credential from app.api.routes.threads import threads_sync +from app.core.util import configure_langfuse router = APIRouter(tags=["evaluation"]) @@ -47,17 +48,11 @@ 
async def evaluate_threads( ) # Configure Langfuse - langfuse = Langfuse( - public_key=langfuse_credentials["public_key"], - secret_key=langfuse_credentials["secret_key"], - host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), - ) - - langfuse_context.configure( - secret_key=langfuse_credentials["secret_key"], - public_key=langfuse_credentials["public_key"], - host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), - ) + langfuse, success = configure_langfuse(langfuse_credentials) + if not success: + return APIResponse.failure_response( + error="Failed to configure Langfuse client." + ) try: # Get dataset diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py index b6e197fcf..7242e18c6 100644 --- a/backend/app/api/routes/threads.py +++ b/backend/app/api/routes/threads.py @@ -14,6 +14,7 @@ from app.utils import APIResponse from app.crud.credentials import get_provider_credential from app.core.security import decrypt_credentials +from app.core.util import configure_langfuse logger = logging.getLogger(__name__) router = APIRouter(tags=["threads"]) @@ -245,11 +246,13 @@ async def threads( error="LANGFUSE keys not configured for this organization." ) - langfuse_context.configure( - secret_key=langfuse_credentials["secret_key"], - public_key=langfuse_credentials["public_key"], - host=langfuse_credentials["host"], - ) + # Configure Langfuse + _, success = configure_langfuse(langfuse_credentials) + if not success: + return APIResponse.failure_response( + error="Failed to configure Langfuse client." 
+ ) + # Validate thread is_valid, error_message = validate_thread(client, request.get("thread_id")) if not is_valid: @@ -310,11 +313,11 @@ async def threads_sync( ) # Configure Langfuse - langfuse_context.configure( - secret_key=langfuse_credentials["secret_key"], - public_key=langfuse_credentials["public_key"], - host=langfuse_credentials.get("host", "https://cloud.langfuse.com"), - ) + _, success = configure_langfuse(langfuse_credentials) + if not success: + return APIResponse.failure_response( + error="Failed to configure Langfuse client." + ) # Validate thread is_valid, error_message = validate_thread(client, request.get("thread_id")) diff --git a/backend/app/core/util.py b/backend/app/core/util.py index 6f945b9db..08882f4fa 100644 --- a/backend/app/core/util.py +++ b/backend/app/core/util.py @@ -5,6 +5,8 @@ from fastapi import HTTPException from requests import Session, RequestException from pydantic import BaseModel, HttpUrl +from langfuse import Langfuse +from langfuse.decorators import langfuse_context def now(): @@ -32,3 +34,37 @@ def post_callback(url: HttpUrl, payload: BaseModel): errno += 1 return not errno + + +def configure_langfuse(credentials: dict) -> tuple[Langfuse, bool]: + """ + Configure Langfuse client and context with the provided credentials. 
+ + Args: + credentials: Dictionary containing Langfuse credentials (public_key, secret_key, host) + + Returns: + Tuple of (Langfuse client instance, success boolean) + """ + if not credentials: + return None, False + + try: + # Configure Langfuse client + langfuse = Langfuse( + public_key=credentials["public_key"], + secret_key=credentials["secret_key"], + host=credentials.get("host", "https://cloud.langfuse.com"), + ) + + # Configure Langfuse context + langfuse_context.configure( + secret_key=credentials["secret_key"], + public_key=credentials["public_key"], + host=credentials.get("host", "https://cloud.langfuse.com"), + ) + + return langfuse, True + except Exception as e: + warnings.warn(f"Failed to configure Langfuse: {str(e)}") + return None, False From 38ea1a96926dbb9df56d10103bc525ae64dba2e4 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Wed, 28 May 2025 20:28:14 +0530 Subject: [PATCH 06/64] cleanups --- backend/app/api/routes/evaluation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 32b4c010e..f0f9b172a 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -77,10 +77,9 @@ async def evaluate_threads( ) # Extract message from the response - if isinstance(response, APIResponse) and response.success: - print(f"Response: {response.data}") - output = response.data.get("message", "") - thread_id = response.data.get("thread_id") + if isinstance(response, dict) and response.get("success"): + output = response.get("data", {}).get("message", "") + thread_id = response.get("data", {}).get("thread_id") else: output = "" thread_id = None @@ -90,7 +89,6 @@ async def evaluate_threads( langfuse.score( trace_id=trace_id, name="thread_creation_success", value=is_match ) - print(f"Evaluating item: {item.input}, Match: {is_match}") results.append( { "input": item.input, From be299a335c3174d4d0ed7a2ccf6e1017f06827a3 Mon Sep 
17 00:00:00 2001 From: Akhilesh Negi Date: Thu, 29 May 2025 11:22:05 +0530 Subject: [PATCH 07/64] added logs --- backend/app/api/routes/evaluation.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f0f9b172a..bb20e36f9 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,3 +1,4 @@ +import logging from fastapi import APIRouter, Depends from sqlmodel import Session from langfuse import Langfuse @@ -10,6 +11,10 @@ from app.api.routes.threads import threads_sync from app.core.util import configure_langfuse +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + router = APIRouter(tags=["evaluation"]) @@ -23,6 +28,10 @@ async def evaluate_threads( _current_user: UserOrganization = Depends(get_current_user_org), ): """Endpoint to run thread evaluations using Langfuse.""" + logger.info( + f"Starting evaluation for experiment: {experiment_name}, dataset: {dataset_name}, assistant: {assistant_id}" + ) + # Get OpenAI credentials credentials = get_provider_credential( session=_session, @@ -56,10 +65,14 @@ async def evaluate_threads( try: # Get dataset + logger.info(f"Fetching dataset: {dataset_name}") dataset = langfuse.get_dataset(dataset_name) results = [] + total_items = len(dataset.items) + logger.info(f"Processing {total_items} items from {dataset_name} dataset") - for item in dataset.items: + for idx, item in enumerate(dataset.items, 1): + logger.info(f"Processing item {idx}/{total_items}: {item.input[:20]}...") with item.observe(run_name=experiment_name) as trace_id: # Prepare request request = { @@ -85,7 +98,7 @@ async def evaluate_threads( thread_id = None # Evaluate based on response success - is_match = bool(output) # Simplified evaluation for now + is_match = bool(output) langfuse.score( trace_id=trace_id, name="thread_creation_success", 
value=is_match ) @@ -98,21 +111,27 @@ async def evaluate_threads( "thread_id": thread_id if is_match else None, } ) + logger.info(f"Completed processing item {idx} (match: {is_match})") # Flush Langfuse events langfuse_context.flush() langfuse.flush() + matches = sum(1 for r in results if r["match"]) + logger.info( + f"Evaluation completed. Total items: {len(results)}, Matches: {matches}" + ) return APIResponse.success_response( data={ "experiment_name": experiment_name, "dataset_name": dataset_name, "results": results, "total_items": len(results), - "matches": sum(1 for r in results if r["match"]), + "matches": matches, "note": "All threads have been processed synchronously.", } ) except Exception as e: + logger.error(f"Error during evaluation: {str(e)}", exc_info=True) return APIResponse.failure_response(error=str(e)) From 46fd438c80413a301da01909abfd7130b7d3b46b Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Thu, 29 May 2025 11:26:28 +0530 Subject: [PATCH 08/64] code refactoring --- backend/app/api/routes/evaluation.py | 7 +++++-- backend/app/api/routes/threads.py | 30 ++++++++++++++++++++-------- backend/app/core/util.py | 23 +++++++++++++++++++++ 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index bb20e36f9..74e76afc3 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -9,7 +9,7 @@ from app.utils import APIResponse from app.crud.credentials import get_provider_credential from app.api.routes.threads import threads_sync -from app.core.util import configure_langfuse +from app.core.util import configure_langfuse, configure_openai # Configure logging logging.basicConfig(level=logging.INFO) @@ -39,7 +39,10 @@ async def evaluate_threads( provider="openai", project_id=project_id, ) - if not credentials or "api_key" not in credentials: + + # Configure OpenAI client + client, success = configure_openai(credentials) + if not 
success: return APIResponse.failure_response( error="OpenAI API key not configured for this organization." ) diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py index 7242e18c6..23b565576 100644 --- a/backend/app/api/routes/threads.py +++ b/backend/app/api/routes/threads.py @@ -14,7 +14,7 @@ from app.utils import APIResponse from app.crud.credentials import get_provider_credential from app.core.security import decrypt_credentials -from app.core.util import configure_langfuse +from app.core.util import configure_langfuse, configure_openai logger = logging.getLogger(__name__) router = APIRouter(tags=["threads"]) @@ -229,11 +229,13 @@ async def threads( provider="openai", project_id=request.get("project_id"), ) - if not credentials or "api_key" not in credentials: + + # Configure OpenAI client + client, success = configure_openai(credentials) + if not success: return APIResponse.failure_response( error="OpenAI API key not configured for this organization." ) - client = OpenAI(api_key=credentials["api_key"]) langfuse_credentials = get_provider_credential( session=_session, @@ -286,20 +288,20 @@ async def threads_sync( _current_user: UserOrganization = Depends(get_current_user_org), ): """Synchronous endpoint that processes requests immediately.""" - credentials = get_provider_credential( session=_session, org_id=_current_user.organization_id, provider="openai", project_id=request.get("project_id"), ) - if not credentials or "api_key" not in credentials: + + # Configure OpenAI client + client, success = configure_openai(credentials) + if not success: return APIResponse.failure_response( error="OpenAI API key not configured for this organization." ) - client = OpenAI(api_key=credentials["api_key"]) - # Get Langfuse credentials langfuse_credentials = get_provider_credential( session=_session, @@ -347,7 +349,19 @@ async def start_thread( Create a new OpenAI thread for the given question and start polling in the background. 
""" prompt = request["question"] - client = OpenAI(api_key=settings.OPENAI_API_KEY) + credentials = get_provider_credential( + session=db, + org_id=_current_user.organization_id, + provider="openai", + project_id=request.get("project_id"), + ) + + # Configure OpenAI client + client, success = configure_openai(credentials) + if not success: + return APIResponse.failure_response( + error="OpenAI API key not configured for this organization." + ) is_success, error = setup_thread(client, request) if not is_success: diff --git a/backend/app/core/util.py b/backend/app/core/util.py index 08882f4fa..c3cd4c934 100644 --- a/backend/app/core/util.py +++ b/backend/app/core/util.py @@ -7,6 +7,7 @@ from pydantic import BaseModel, HttpUrl from langfuse import Langfuse from langfuse.decorators import langfuse_context +from openai import OpenAI def now(): @@ -68,3 +69,25 @@ def configure_langfuse(credentials: dict) -> tuple[Langfuse, bool]: except Exception as e: warnings.warn(f"Failed to configure Langfuse: {str(e)}") return None, False + + +def configure_openai(credentials: dict) -> tuple[OpenAI, bool]: + """ + Configure OpenAI client with the provided credentials. 
+
+    Args:
+        credentials: Dictionary containing OpenAI credentials (api_key)
+
+    Returns:
+        Tuple of (OpenAI client instance, success boolean)
+    """
+    if not credentials or "api_key" not in credentials:
+        return None, False
+
+    try:
+        # Configure OpenAI client
+        client = OpenAI(api_key=credentials["api_key"])
+        return client, True
+    except Exception as e:
+        warnings.warn(f"Failed to configure OpenAI client: {str(e)}")
+        return None, False

From 6198542545853486e3ee6ecf3f940a05bf8148cf Mon Sep 17 00:00:00 2001
From: Akhilesh Negi
Date: Thu, 29 May 2025 12:13:16 +0530
Subject: [PATCH 09/64] cleanups

---
 backend/app/api/routes/evaluation.py | 6 ++++--
 backend/app/api/routes/threads.py | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py
index 74e76afc3..f384399b3 100644
--- a/backend/app/api/routes/evaluation.py
+++ b/backend/app/api/routes/evaluation.py
@@ -1,7 +1,6 @@
 import logging
 from fastapi import APIRouter, Depends
 from sqlmodel import Session
-from langfuse import Langfuse
 from langfuse.decorators import langfuse_context
 
 from app.api.deps import get_current_user_org, get_db
@@ -27,7 +26,10 @@ async def evaluate_threads(
     _session: Session = Depends(get_db),
     _current_user: UserOrganization = Depends(get_current_user_org),
 ):
-    """Endpoint to run thread evaluations using Langfuse."""
+    """
+    Endpoint to run Langfuse evaluations using LLM-as-a-judge. 
+ Read more here: https://langfuse.com/changelog/2024-11-19-llm-as-a-judge-for-datasets + """ logger.info( f"Starting evaluation for experiment: {experiment_name}, dataset: {dataset_name}, assistant: {assistant_id}" ) diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py index 23b565576..89fc85c03 100644 --- a/backend/app/api/routes/threads.py +++ b/backend/app/api/routes/threads.py @@ -13,7 +13,6 @@ from app.crud import upsert_thread_result, get_thread_result from app.utils import APIResponse from app.crud.credentials import get_provider_credential -from app.core.security import decrypt_credentials from app.core.util import configure_langfuse, configure_openai logger = logging.getLogger(__name__) From 74ab6d16bcf0831f14c8bfa1a1028775ef33fc83 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Thu, 29 May 2025 12:38:22 +0530 Subject: [PATCH 10/64] moving to separate files --- backend/app/api/routes/evaluation.py | 114 ++----------------- backend/app/crud/evaluation.py | 159 +++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 105 deletions(-) create mode 100644 backend/app/crud/evaluation.py diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f384399b3..81f5c4604 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,14 +1,11 @@ import logging from fastapi import APIRouter, Depends from sqlmodel import Session -from langfuse.decorators import langfuse_context from app.api.deps import get_current_user_org, get_db from app.models import UserOrganization from app.utils import APIResponse -from app.crud.credentials import get_provider_credential -from app.api.routes.threads import threads_sync -from app.core.util import configure_langfuse, configure_openai +from app.crud.evaluation import run_evaluation # Configure logging logging.basicConfig(level=logging.INFO) @@ -34,109 +31,16 @@ async def evaluate_threads( f"Starting evaluation for 
experiment: {experiment_name}, dataset: {dataset_name}, assistant: {assistant_id}" ) - # Get OpenAI credentials - credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="openai", + success, data, error = await run_evaluation( + experiment_name=experiment_name, + assistant_id=assistant_id, + dataset_name=dataset_name, project_id=project_id, + _session=_session, + _current_user=_current_user, ) - # Configure OpenAI client - client, success = configure_openai(credentials) if not success: - return APIResponse.failure_response( - error="OpenAI API key not configured for this organization." - ) + return APIResponse.failure_response(error=error) - # Get Langfuse credentials - langfuse_credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="langfuse", - project_id=project_id, - ) - if not langfuse_credentials: - return APIResponse.failure_response( - error="LANGFUSE keys not configured for this organization." - ) - - # Configure Langfuse - langfuse, success = configure_langfuse(langfuse_credentials) - if not success: - return APIResponse.failure_response( - error="Failed to configure Langfuse client." 
- ) - - try: - # Get dataset - logger.info(f"Fetching dataset: {dataset_name}") - dataset = langfuse.get_dataset(dataset_name) - results = [] - total_items = len(dataset.items) - logger.info(f"Processing {total_items} items from {dataset_name} dataset") - - for idx, item in enumerate(dataset.items, 1): - logger.info(f"Processing item {idx}/{total_items}: {item.input[:20]}...") - with item.observe(run_name=experiment_name) as trace_id: - # Prepare request - request = { - "question": item.input, - "assistant_id": assistant_id, - "remove_citation": True, - "project_id": project_id, - } - - # Process thread synchronously - response = await threads_sync( - request=request, - _session=_session, - _current_user=_current_user, - ) - - # Extract message from the response - if isinstance(response, dict) and response.get("success"): - output = response.get("data", {}).get("message", "") - thread_id = response.get("data", {}).get("thread_id") - else: - output = "" - thread_id = None - - # Evaluate based on response success - is_match = bool(output) - langfuse.score( - trace_id=trace_id, name="thread_creation_success", value=is_match - ) - results.append( - { - "input": item.input, - "output": output, - "expected": item.expected_output, - "match": is_match, - "thread_id": thread_id if is_match else None, - } - ) - logger.info(f"Completed processing item {idx} (match: {is_match})") - - # Flush Langfuse events - langfuse_context.flush() - langfuse.flush() - - matches = sum(1 for r in results if r["match"]) - logger.info( - f"Evaluation completed. 
Total items: {len(results)}, Matches: {matches}" - ) - return APIResponse.success_response( - data={ - "experiment_name": experiment_name, - "dataset_name": dataset_name, - "results": results, - "total_items": len(results), - "matches": matches, - "note": "All threads have been processed synchronously.", - } - ) - - except Exception as e: - logger.error(f"Error during evaluation: {str(e)}", exc_info=True) - return APIResponse.failure_response(error=str(e)) + return APIResponse.success_response(data=data) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py new file mode 100644 index 000000000..93ef52ff4 --- /dev/null +++ b/backend/app/crud/evaluation.py @@ -0,0 +1,159 @@ +import logging +from typing import Dict, List, Optional, Tuple +from sqlmodel import Session +from langfuse import Langfuse + +from app.models import UserOrganization +from app.crud.credentials import get_provider_credential +from app.api.routes.threads import threads_sync +from app.core.util import configure_langfuse, configure_openai + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def run_evaluation( + experiment_name: str, + assistant_id: str, + dataset_name: str, + project_id: int, + _session: Session, + _current_user: UserOrganization, +) -> Tuple[bool, Dict, Optional[str]]: + """ + Run Langfuse evaluations using LLM-as-a-judge. 
+ + Args: + experiment_name: Name of the experiment + assistant_id: ID of the assistant to evaluate + dataset_name: Name of the dataset to use + project_id: Project ID + _session: Database session + _current_user: Current user organization + + Returns: + Tuple of (success, response_data, error_message) + """ + # Get OpenAI credentials + credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="openai", + project_id=project_id, + ) + + # Configure OpenAI client + client, success = configure_openai(credentials) + if not success: + return False, {}, "OpenAI API key not configured for this organization." + + # Get Langfuse credentials + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + project_id=project_id, + ) + if not langfuse_credentials: + return False, {}, "LANGFUSE keys not configured for this organization." + + # Configure Langfuse + langfuse, success = configure_langfuse(langfuse_credentials) + if not success: + return False, {}, "Failed to configure Langfuse client." 
+ + try: + return await _process_evaluation( + langfuse=langfuse, + experiment_name=experiment_name, + assistant_id=assistant_id, + dataset_name=dataset_name, + project_id=project_id, + _session=_session, + _current_user=_current_user, + ) + except Exception as e: + logger.error(f"Error during evaluation: {str(e)}", exc_info=True) + return False, {}, str(e) + + +async def _process_evaluation( + langfuse: Langfuse, + experiment_name: str, + assistant_id: str, + dataset_name: str, + project_id: int, + _session: Session, + _current_user: UserOrganization, +) -> Tuple[bool, Dict, Optional[str]]: + """Internal function to process the evaluation.""" + # Get dataset + logger.info(f"Fetching dataset: {dataset_name}") + dataset = langfuse.get_dataset(dataset_name) + results: List[Dict] = [] + total_items = len(dataset.items) + logger.info(f"Processing {total_items} items from {dataset_name} dataset") + + for idx, item in enumerate(dataset.items, 1): + logger.info(f"Processing item {idx}/{total_items}: {item.input[:20]}...") + with item.observe(run_name=experiment_name) as trace_id: + # Prepare request + request = { + "question": item.input, + "assistant_id": assistant_id, + "remove_citation": True, + "project_id": project_id, + } + + # Process thread synchronously + response = await threads_sync( + request=request, + _session=_session, + _current_user=_current_user, + ) + + # Extract message from the response + if isinstance(response, dict) and response.get("success"): + output = response.get("data", {}).get("message", "") + thread_id = response.get("data", {}).get("thread_id") + else: + output = "" + thread_id = None + + # Evaluate based on response success + is_match = bool(output) + langfuse.score( + trace_id=trace_id, name="thread_creation_success", value=is_match + ) + results.append( + { + "input": item.input, + "output": output, + "expected": item.expected_output, + "match": is_match, + "thread_id": thread_id if is_match else None, + } + ) + logger.info(f"Completed 
processing item {idx} (match: {is_match})") + + # Flush Langfuse events + langfuse.flush() + + matches = sum(1 for r in results if r["match"]) + logger.info( + f"Evaluation completed. Total items: {len(results)}, Matches: {matches}" + ) + + return ( + True, + { + "experiment_name": experiment_name, + "dataset_name": dataset_name, + "results": results, + "total_items": len(results), + "matches": matches, + "note": "All threads have been processed synchronously.", + }, + None, + ) From cfc2dbd2cab346c347fc04db03441604a76768d3 Mon Sep 17 00:00:00 2001 From: Akhilesh Negi Date: Thu, 29 May 2025 12:57:52 +0530 Subject: [PATCH 11/64] using pydantic types --- backend/app/api/routes/evaluation.py | 11 ++++--- backend/app/crud/evaluation.py | 49 ++++++++++++++-------------- backend/app/models/evaluation.py | 29 ++++++++++++++++ 3 files changed, 60 insertions(+), 29 deletions(-) create mode 100644 backend/app/models/evaluation.py diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 81f5c4604..e420c94bd 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -6,6 +6,7 @@ from app.models import UserOrganization from app.utils import APIResponse from app.crud.evaluation import run_evaluation +from app.models.evaluation import Experiment # Configure logging logging.basicConfig(level=logging.INFO) @@ -14,7 +15,7 @@ router = APIRouter(tags=["evaluation"]) -@router.post("/evaluate") +@router.post("/evaluate", response_model=Experiment) async def evaluate_threads( experiment_name: str, assistant_id: str, @@ -22,7 +23,7 @@ async def evaluate_threads( project_id: int, _session: Session = Depends(get_db), _current_user: UserOrganization = Depends(get_current_user_org), -): +) -> Experiment: """ Endpoint to run Lanfuse evaluations using LLM-as-a-judge. 
Read more here: https://langfuse.com/changelog/2024-11-19-llm-as-a-judge-for-datasets @@ -40,7 +41,7 @@ async def evaluate_threads( _current_user=_current_user, ) - if not success: - return APIResponse.failure_response(error=error) + if not success or data is None: + raise ValueError(error or "Failed to run evaluation") - return APIResponse.success_response(data=data) + return data diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index 93ef52ff4..0b459c150 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -7,6 +7,7 @@ from app.crud.credentials import get_provider_credential from app.api.routes.threads import threads_sync from app.core.util import configure_langfuse, configure_openai +from app.models.evaluation import Experiment, EvaluationResult # Configure logging logging.basicConfig(level=logging.INFO) @@ -20,7 +21,7 @@ async def run_evaluation( project_id: int, _session: Session, _current_user: UserOrganization, -) -> Tuple[bool, Dict, Optional[str]]: +) -> Tuple[bool, Optional[Experiment], Optional[str]]: """ Run Langfuse evaluations using LLM-as-a-judge. @@ -33,7 +34,7 @@ async def run_evaluation( _current_user: Current user organization Returns: - Tuple of (success, response_data, error_message) + Tuple of (success, experiment_data, error_message) """ # Get OpenAI credentials credentials = get_provider_credential( @@ -46,7 +47,7 @@ async def run_evaluation( # Configure OpenAI client client, success = configure_openai(credentials) if not success: - return False, {}, "OpenAI API key not configured for this organization." + return False, None, "OpenAI API key not configured for this organization." # Get Langfuse credentials langfuse_credentials = get_provider_credential( @@ -56,12 +57,12 @@ async def run_evaluation( project_id=project_id, ) if not langfuse_credentials: - return False, {}, "LANGFUSE keys not configured for this organization." 
+ return False, None, "LANGFUSE keys not configured for this organization." # Configure Langfuse langfuse, success = configure_langfuse(langfuse_credentials) if not success: - return False, {}, "Failed to configure Langfuse client." + return False, None, "Failed to configure Langfuse client." try: return await _process_evaluation( @@ -75,7 +76,7 @@ async def run_evaluation( ) except Exception as e: logger.error(f"Error during evaluation: {str(e)}", exc_info=True) - return False, {}, str(e) + return False, None, str(e) async def _process_evaluation( @@ -86,12 +87,12 @@ async def _process_evaluation( project_id: int, _session: Session, _current_user: UserOrganization, -) -> Tuple[bool, Dict, Optional[str]]: +) -> Tuple[bool, Optional[Experiment], Optional[str]]: """Internal function to process the evaluation.""" # Get dataset logger.info(f"Fetching dataset: {dataset_name}") dataset = langfuse.get_dataset(dataset_name) - results: List[Dict] = [] + results: List[EvaluationResult] = [] total_items = len(dataset.items) logger.info(f"Processing {total_items} items from {dataset_name} dataset") @@ -127,33 +128,33 @@ async def _process_evaluation( trace_id=trace_id, name="thread_creation_success", value=is_match ) results.append( - { - "input": item.input, - "output": output, - "expected": item.expected_output, - "match": is_match, - "thread_id": thread_id if is_match else None, - } + EvaluationResult( + input=item.input, + output=output, + expected=item.expected_output, + match=is_match, + thread_id=thread_id if is_match else None, + ) ) logger.info(f"Completed processing item {idx} (match: {is_match})") # Flush Langfuse events langfuse.flush() - matches = sum(1 for r in results if r["match"]) + matches = sum(1 for r in results if r.match) logger.info( f"Evaluation completed. 
Total items: {len(results)}, Matches: {matches}" ) return ( True, - { - "experiment_name": experiment_name, - "dataset_name": dataset_name, - "results": results, - "total_items": len(results), - "matches": matches, - "note": "All threads have been processed synchronously.", - }, + Experiment( + experiment_name=experiment_name, + dataset_name=dataset_name, + results=results, + total_items=len(results), + matches=matches, + note="All threads have been processed synchronously.", + ), None, ) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py new file mode 100644 index 000000000..89ef0b9c5 --- /dev/null +++ b/backend/app/models/evaluation.py @@ -0,0 +1,29 @@ +from typing import List, Optional +from pydantic import BaseModel, Field + + +class EvaluationResult(BaseModel): + """Model for a single evaluation result.""" + + input: str = Field(..., description="The input question/prompt used for evaluation") + output: str = Field(..., description="The actual output from the assistant") + expected: str = Field(..., description="The expected output from the dataset") + match: bool = Field( + ..., description="Whether the output matches the expected result" + ) + thread_id: Optional[str] = Field(None, description="ID of the OpenAI") + + +class Experiment(BaseModel): + """Model for the complete experiment evaluation response.""" + + experiment_name: str = Field(..., description="Name of the experiment") + dataset_name: str = Field( + ..., description="Name of the dataset used for evaluation" + ) + results: List[EvaluationResult] = Field( + ..., description="List of evaluation results" + ) + total_items: int = Field(..., description="Total number of items evaluated") + matches: int = Field(..., description="Number of successful matches") + note: str = Field(..., description="Additional notes about the evaluation process") From 0633c542e9a1fcec701195b71aac87f01bf2b853 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 7 Oct 2025 09:50:15 +0530 
Subject: [PATCH 12/64] Remove project_id dependency from evaluation endpoints - Remove project_id parameter from /evaluate endpoint - Update get_provider_credential calls to not require project_id - Credentials now retrieved via API key authentication - Clean up logging configuration and imports - Fix linting errors and update type annotations --- backend/app/api/routes/evaluation.py | 10 +++------- backend/app/crud/evaluation.py | 25 ++++++++----------------- backend/app/models/evaluation.py | 5 ++--- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index e420c94bd..0cd9d22f2 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,15 +1,13 @@ import logging + from fastapi import APIRouter, Depends from sqlmodel import Session from app.api.deps import get_current_user_org, get_db -from app.models import UserOrganization -from app.utils import APIResponse from app.crud.evaluation import run_evaluation +from app.models import UserOrganization from app.models.evaluation import Experiment -# Configure logging -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) router = APIRouter(tags=["evaluation"]) @@ -20,12 +18,11 @@ async def evaluate_threads( experiment_name: str, assistant_id: str, dataset_name: str, - project_id: int, _session: Session = Depends(get_db), _current_user: UserOrganization = Depends(get_current_user_org), ) -> Experiment: """ - Endpoint to run Lanfuse evaluations using LLM-as-a-judge. + Endpoint to run Langfuse evaluations using LLM-as-a-judge. 
Read more here: https://langfuse.com/changelog/2024-11-19-llm-as-a-judge-for-datasets """ logger.info( @@ -36,7 +33,6 @@ async def evaluate_threads( experiment_name=experiment_name, assistant_id=assistant_id, dataset_name=dataset_name, - project_id=project_id, _session=_session, _current_user=_current_user, ) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index 0b459c150..c64d96f1e 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -1,16 +1,14 @@ import logging -from typing import Dict, List, Optional, Tuple -from sqlmodel import Session + from langfuse import Langfuse +from sqlmodel import Session -from app.models import UserOrganization -from app.crud.credentials import get_provider_credential from app.api.routes.threads import threads_sync from app.core.util import configure_langfuse, configure_openai -from app.models.evaluation import Experiment, EvaluationResult +from app.crud.credentials import get_provider_credential +from app.models import UserOrganization +from app.models.evaluation import EvaluationResult, Experiment -# Configure logging -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -18,10 +16,9 @@ async def run_evaluation( experiment_name: str, assistant_id: str, dataset_name: str, - project_id: int, _session: Session, _current_user: UserOrganization, -) -> Tuple[bool, Optional[Experiment], Optional[str]]: +) -> tuple[bool, Experiment | None, str | None]: """ Run Langfuse evaluations using LLM-as-a-judge. 
@@ -29,7 +26,6 @@ async def run_evaluation( experiment_name: Name of the experiment assistant_id: ID of the assistant to evaluate dataset_name: Name of the dataset to use - project_id: Project ID _session: Database session _current_user: Current user organization @@ -41,7 +37,6 @@ async def run_evaluation( session=_session, org_id=_current_user.organization_id, provider="openai", - project_id=project_id, ) # Configure OpenAI client @@ -54,7 +49,6 @@ async def run_evaluation( session=_session, org_id=_current_user.organization_id, provider="langfuse", - project_id=project_id, ) if not langfuse_credentials: return False, None, "LANGFUSE keys not configured for this organization." @@ -70,7 +64,6 @@ async def run_evaluation( experiment_name=experiment_name, assistant_id=assistant_id, dataset_name=dataset_name, - project_id=project_id, _session=_session, _current_user=_current_user, ) @@ -84,15 +77,14 @@ async def _process_evaluation( experiment_name: str, assistant_id: str, dataset_name: str, - project_id: int, _session: Session, _current_user: UserOrganization, -) -> Tuple[bool, Optional[Experiment], Optional[str]]: +) -> tuple[bool, Experiment | None, str | None]: """Internal function to process the evaluation.""" # Get dataset logger.info(f"Fetching dataset: {dataset_name}") dataset = langfuse.get_dataset(dataset_name) - results: List[EvaluationResult] = [] + results: list[EvaluationResult] = [] total_items = len(dataset.items) logger.info(f"Processing {total_items} items from {dataset_name} dataset") @@ -104,7 +96,6 @@ async def _process_evaluation( "question": item.input, "assistant_id": assistant_id, "remove_citation": True, - "project_id": project_id, } # Process thread synchronously diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 89ef0b9c5..62c64127f 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,4 +1,3 @@ -from typing import List, Optional from pydantic import BaseModel, Field 
@@ -11,7 +10,7 @@ class EvaluationResult(BaseModel): match: bool = Field( ..., description="Whether the output matches the expected result" ) - thread_id: Optional[str] = Field(None, description="ID of the OpenAI") + thread_id: str | None = Field(None, description="ID of the OpenAI") class Experiment(BaseModel): @@ -21,7 +20,7 @@ class Experiment(BaseModel): dataset_name: str = Field( ..., description="Name of the dataset used for evaluation" ) - results: List[EvaluationResult] = Field( + results: list[EvaluationResult] = Field( ..., description="List of evaluation results" ) total_items: int = Field(..., description="Total number of items evaluated") From c130b0262e0231ab5635f3b566471d56ba8314d5 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 8 Oct 2025 11:43:39 +0530 Subject: [PATCH 13/64] using hardcoded --- backend/app/crud/evaluation.py | 68 ++++++++++++++++------------------ 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index c64d96f1e..c85c99daf 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -3,7 +3,6 @@ from langfuse import Langfuse from sqlmodel import Session -from app.api.routes.threads import threads_sync from app.core.util import configure_langfuse, configure_openai from app.crud.credentials import get_provider_credential from app.models import UserOrganization @@ -80,54 +79,49 @@ async def _process_evaluation( _session: Session, _current_user: UserOrganization, ) -> tuple[bool, Experiment | None, str | None]: - """Internal function to process the evaluation.""" - # Get dataset + """Internal function to process the evaluation with hardcoded input/output pairs.""" + # Hardcoded test data - list of question/answer pairs + test_data = [ + {"question": "What is the capital of France?", "answer": "Paris"}, + {"question": "What is the capital of Germany?", "answer": "Berlin"}, + {"question": "What is the capital of Italy?", "answer": 
"Rome"}, + {"question": "What is the capital of Spain?", "answer": "Madrid"}, + ] + + # Get dataset from Langfuse (assume it exists) logger.info(f"Fetching dataset: {dataset_name}") dataset = langfuse.get_dataset(dataset_name) + results: list[EvaluationResult] = [] total_items = len(dataset.items) - logger.info(f"Processing {total_items} items from {dataset_name} dataset") + logger.info( + f"Processing {total_items} items from dataset with experiment: {experiment_name}" + ) for idx, item in enumerate(dataset.items, 1): - logger.info(f"Processing item {idx}/{total_items}: {item.input[:20]}...") + question = item.input + expected_answer = item.expected_output + logger.info(f"Processing item {idx}/{total_items}: {question}") + + # Use item.observe to create trace linked to dataset item with item.observe(run_name=experiment_name) as trace_id: - # Prepare request - request = { - "question": item.input, - "assistant_id": assistant_id, - "remove_citation": True, - } - - # Process thread synchronously - response = await threads_sync( - request=request, - _session=_session, - _current_user=_current_user, - ) + # For testing, use the expected answer as output + answer = expected_answer - # Extract message from the response - if isinstance(response, dict) and response.get("success"): - output = response.get("data", {}).get("message", "") - thread_id = response.get("data", {}).get("thread_id") - else: - output = "" - thread_id = None - - # Evaluate based on response success - is_match = bool(output) - langfuse.score( - trace_id=trace_id, name="thread_creation_success", value=is_match + # Update trace with input/output + langfuse.trace( + id=trace_id, input={"question": question}, output={"answer": answer} ) + results.append( EvaluationResult( - input=item.input, - output=output, - expected=item.expected_output, - match=is_match, - thread_id=thread_id if is_match else None, + input=question, + output=answer, + expected=expected_answer, + thread_id=None, ) ) - 
logger.info(f"Completed processing item {idx} (match: {is_match})") + logger.info(f"Completed processing item {idx}") # Flush Langfuse events langfuse.flush() @@ -145,7 +139,7 @@ async def _process_evaluation( results=results, total_items=len(results), matches=matches, - note="All threads have been processed synchronously.", + note="Hardcoded question/answer pairs linked to dataset run.", ), None, ) From 546622905ec7e3ec6cb5d1981425048804f56091 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 13 Oct 2025 20:38:02 +0530 Subject: [PATCH 14/64] adding endpoint for uploading dataset --- backend/app/api/routes/evaluation.py | 59 ++- backend/app/crud/evaluation.py | 129 ++++++- backend/app/models/evaluation.py | 25 ++ .../app/tests/api/routes/test_evaluation.py | 363 ++++++++++++++++++ 4 files changed, 572 insertions(+), 4 deletions(-) create mode 100644 backend/app/tests/api/routes/test_evaluation.py diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 0cd9d22f2..f440a2a29 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,18 +1,71 @@ import logging -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, UploadFile, File, Form from sqlmodel import Session from app.api.deps import get_current_user_org, get_db -from app.crud.evaluation import run_evaluation +from app.crud.evaluation import run_evaluation, upload_dataset_to_langfuse from app.models import UserOrganization -from app.models.evaluation import Experiment +from app.models.evaluation import Experiment, DatasetUploadResponse logger = logging.getLogger(__name__) router = APIRouter(tags=["evaluation"]) +@router.post("/dataset/upload", response_model=DatasetUploadResponse) +async def upload_dataset( + file: UploadFile = File( + ..., description="CSV file with 'question' and 'answer' columns" + ), + dataset_name: str = Form(..., description="Name for the dataset in Langfuse"), + duplication_factor: 
int = Form( + default=5, description="Number of times to duplicate each item" + ), + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), +) -> DatasetUploadResponse: + """ + Upload a CSV file containing Q&A pairs to Langfuse as a dataset. + Each question will be duplicated N times (default 5) to test LLM flakiness. + + CSV Format: + - Must contain 'question' and 'answer' columns + - Can have additional columns (will be ignored) + + Example CSV: + ``` + question,answer + "What is the capital of France?","Paris" + "What is 2+2?","4" + ``` + """ + logger.info( + f"Uploading dataset: {dataset_name} with duplication factor: {duplication_factor}" + ) + + # Read CSV content + content = await file.read() + + success, data, error = await upload_dataset_to_langfuse( + csv_content=content, + dataset_name=dataset_name, + duplication_factor=duplication_factor, + _session=_session, + _current_user=_current_user, + ) + + if not success or data is None: + raise ValueError(error or "Failed to upload dataset") + + logger.info( + f"Successfully uploaded dataset: {dataset_name} with {data.total_items} items " + f"({data.original_items} original items × {duplication_factor})" + ) + + return data + + @router.post("/evaluate", response_model=Experiment) async def evaluate_threads( experiment_name: str, diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index c85c99daf..ec7dc0b69 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -1,3 +1,5 @@ +import csv +import io import logging from langfuse import Langfuse @@ -6,11 +8,136 @@ from app.core.util import configure_langfuse, configure_openai from app.crud.credentials import get_provider_credential from app.models import UserOrganization -from app.models.evaluation import EvaluationResult, Experiment +from app.models.evaluation import ( + DatasetUploadResponse, + EvaluationResult, + Experiment, +) logger = 
logging.getLogger(__name__) +async def upload_dataset_to_langfuse( + csv_content: bytes, + dataset_name: str, + duplication_factor: int, + _session: Session, + _current_user: UserOrganization, +) -> tuple[bool, DatasetUploadResponse | None, str | None]: + """ + Upload a CSV dataset to Langfuse with duplication for flakiness testing. + + Args: + csv_content: Raw CSV file content as bytes + dataset_name: Name for the dataset in Langfuse + duplication_factor: Number of times to duplicate each item (default 5) + _session: Database session + _current_user: Current user organization + + Returns: + Tuple of (success, dataset_response, error_message) + """ + try: + # Get Langfuse credentials + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + ) + if not langfuse_credentials: + return False, None, "LANGFUSE keys not configured for this organization." + + # Configure Langfuse + langfuse, success = configure_langfuse(langfuse_credentials) + if not success: + return False, None, "Failed to configure Langfuse client." + + # Parse CSV content + csv_text = csv_content.decode("utf-8") + csv_reader = csv.DictReader(io.StringIO(csv_text)) + + # Validate CSV headers + if ( + "question" not in csv_reader.fieldnames + or "answer" not in csv_reader.fieldnames + ): + return ( + False, + None, + "CSV must contain 'question' and 'answer' columns. " + f"Found columns: {csv_reader.fieldnames}", + ) + + # Read all rows from CSV + original_items = [] + for row in csv_reader: + question = row.get("question", "").strip() + answer = row.get("answer", "").strip() + + if not question or not answer: + logger.warning(f"Skipping row with empty question or answer: {row}") + continue + + original_items.append({"question": question, "answer": answer}) + + if not original_items: + return False, None, "No valid items found in CSV file." + + logger.info( + f"Parsed {len(original_items)} items from CSV. 
" + f"Will duplicate {duplication_factor}x for a total of {len(original_items) * duplication_factor} items." + ) + + # Create or get dataset in Langfuse + dataset = langfuse.create_dataset(name=dataset_name) + + # Upload items with duplication + total_uploaded = 0 + for item in original_items: + # Duplicate each item N times + for duplicate_num in range(duplication_factor): + try: + langfuse.create_dataset_item( + dataset_name=dataset_name, + input={"question": item["question"]}, + expected_output={"answer": item["answer"]}, + metadata={ + "original_question": item["question"], + "duplicate_number": duplicate_num + 1, + "duplication_factor": duplication_factor, + }, + ) + total_uploaded += 1 + except Exception as e: + logger.error( + f"Failed to upload item (duplicate {duplicate_num + 1}): {item['question'][:50]}... Error: {e}" + ) + + # Flush to ensure all items are uploaded + langfuse.flush() + + logger.info( + f"Successfully uploaded {total_uploaded} items to dataset '{dataset_name}' " + f"({len(original_items)} original × {duplication_factor} duplicates)" + ) + + return ( + True, + DatasetUploadResponse( + dataset_name=dataset_name, + total_items=total_uploaded, + original_items=len(original_items), + duplication_factor=duplication_factor, + langfuse_dataset_id=dataset.id if hasattr(dataset, "id") else None, + ), + None, + ) + + except Exception as e: + logger.error(f"Error uploading dataset: {str(e)}", exc_info=True) + return False, None, f"Failed to upload dataset: {str(e)}" + + async def run_evaluation( experiment_name: str, assistant_id: str, diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 62c64127f..1f0e2ffb6 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,6 +1,31 @@ from pydantic import BaseModel, Field +class DatasetItem(BaseModel): + """Model for a single dataset item (Q&A pair).""" + + question: str = Field(..., description="The question/input") + answer: str = Field(..., 
description="The expected answer/output") + + +class DatasetUploadResponse(BaseModel): + """Response model for dataset upload.""" + + dataset_name: str = Field(..., description="Name of the created dataset") + total_items: int = Field( + ..., description="Total number of items uploaded (after duplication)" + ) + original_items: int = Field( + ..., description="Number of original items before duplication" + ) + duplication_factor: int = Field( + default=5, description="Number of times each item was duplicated" + ) + langfuse_dataset_id: str | None = Field( + None, description="Langfuse dataset ID if available" + ) + + class EvaluationResult(BaseModel): """Model for a single evaluation result.""" diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py new file mode 100644 index 000000000..672a13171 --- /dev/null +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -0,0 +1,363 @@ +import io +from unittest.mock import patch, MagicMock +import pytest + +from app.models.evaluation import DatasetUploadResponse + + +# Helper function to create CSV file-like object +def create_csv_file(content: str) -> tuple[str, io.BytesIO]: + """Create a CSV file-like object for testing.""" + file_obj = io.BytesIO(content.encode("utf-8")) + return ("test.csv", file_obj) + + +@pytest.fixture +def valid_csv_content(): + """Valid CSV content with question and answer columns.""" + return """question,answer +"Who is known as the strongest jujutsu sorcerer?","Satoru Gojo" +"What is the name of Gojo’s Domain Expansion?","Infinite Void" +"Who is known as the King of Curses?","Ryomen Sukuna" +""" + + +@pytest.fixture +def invalid_csv_missing_columns(): + """CSV content missing required columns.""" + return """query,response +"Who is known as the strongest jujutsu sorcerer?","Satoru Gojo" +""" + + +@pytest.fixture +def csv_with_empty_rows(): + """CSV content with some empty rows.""" + return """question,answer +"Who is known as the 
strongest jujutsu sorcerer?","Satoru Gojo" +"","4" +"Who wrote Romeo and Juliet?","" +"Valid question","Valid answer" +""" + + +@pytest.fixture +def setup_credentials(db): + """Setup mock credentials for Langfuse.""" + from app.crud.credentials import set_credentials + + credentials = { + "langfuse": { + "public_key": "test_public_key", + "secret_key": "test_secret_key", + "host": "https://cloud.langfuse.com", + } + } + + # Get organization_id from test user (from seed data) + from sqlmodel import select + from app.models import Organization + + org = db.exec(select(Organization)).first() + + set_credentials( + session=db, + org_id=org.id, + credentials=credentials, + ) + db.commit() + return org.id + + +class TestDatasetUploadValidation: + """Test CSV validation and parsing.""" + + def test_upload_dataset_valid_csv( + self, client, user_api_key_header, valid_csv_content, setup_credentials + ): + """Test uploading a valid CSV file.""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + # Mock Langfuse client + mock_client = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "test_dataset_id" + mock_client.create_dataset.return_value = mock_dataset + mock_client.create_dataset_item.return_value = None + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 3, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + assert data["dataset_name"] == "test_dataset" + assert data["original_items"] == 3 + assert data["total_items"] == 9 # 3 items * 3 duplication + assert data["duplication_factor"] == 3 + assert data["langfuse_dataset_id"] == "test_dataset_id" + + # Verify Langfuse was called correctly + 
mock_client.create_dataset.assert_called_once_with(name="test_dataset") + assert ( + mock_client.create_dataset_item.call_count == 9 + ) # 3 items * 3 duplicates + + def test_upload_dataset_missing_columns( + self, + client, + user_api_key_header, + invalid_csv_missing_columns, + setup_credentials, + ): + """Test uploading CSV with missing required columns.""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(invalid_csv_missing_columns) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 500 # ValueError is raised + assert ( + "question" in response.text.lower() or "answer" in response.text.lower() + ) + + def test_upload_dataset_empty_rows( + self, client, user_api_key_header, csv_with_empty_rows, setup_credentials + ): + """Test uploading CSV with empty rows (should skip them).""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "test_dataset_id" + mock_client.create_dataset.return_value = mock_dataset + mock_client.create_dataset_item.return_value = None + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(csv_with_empty_rows) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 2, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + # Should only have 2 valid items (first and last rows) + assert data["original_items"] == 2 + assert data["total_items"] == 4 # 2 items * 2 duplication + + def 
test_upload_dataset_no_langfuse_credentials( + self, client, user_api_key_header, valid_csv_content + ): + """Test uploading without Langfuse credentials configured.""" + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 500 + assert "LANGFUSE" in response.text or "not configured" in response.text.lower() + + +class TestDatasetUploadDuplication: + """Test duplication logic.""" + + def test_upload_with_default_duplication( + self, client, user_api_key_header, valid_csv_content, setup_credentials + ): + """Test uploading with default duplication factor (5).""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "test_dataset_id" + mock_client.create_dataset.return_value = mock_dataset + mock_client.create_dataset_item.return_value = None + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + # duplication_factor not provided, should default to 5 + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + assert data["duplication_factor"] == 5 + assert data["original_items"] == 3 + assert data["total_items"] == 15 # 3 items * 5 duplication + + def test_upload_with_custom_duplication( + self, client, user_api_key_header, valid_csv_content, setup_credentials + ): + """Test uploading with custom duplication factor.""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_dataset = MagicMock() + 
mock_dataset.id = "test_dataset_id" + mock_client.create_dataset.return_value = mock_dataset + mock_client.create_dataset_item.return_value = None + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 10, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + assert data["duplication_factor"] == 10 + assert data["original_items"] == 3 + assert data["total_items"] == 30 # 3 items * 10 duplication + + def test_upload_metadata_includes_duplicate_number( + self, client, user_api_key_header, valid_csv_content, setup_credentials + ): + """Test that metadata includes duplicate number for each item.""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "test_dataset_id" + mock_client.create_dataset.return_value = mock_dataset + mock_client.create_dataset_item.return_value = None + mock_langfuse.return_value = (mock_client, True) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 3, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + + # Verify metadata was passed correctly + calls = mock_client.create_dataset_item.call_args_list + + # Check that each duplicate has correct metadata + duplicate_numbers = set() + for call in calls: + metadata = call.kwargs.get("metadata", {}) + duplicate_numbers.add(metadata["duplicate_number"]) + assert metadata["duplication_factor"] == 3 + assert "original_question" in metadata + + # Should have duplicate numbers 1, 2, 3 + 
assert duplicate_numbers == {1, 2, 3} + + +class TestDatasetUploadErrors: + """Test error handling.""" + + def test_upload_langfuse_configuration_fails( + self, client, user_api_key_header, valid_csv_content, setup_credentials + ): + """Test when Langfuse client configuration fails.""" + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_langfuse.return_value = (None, False) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 500 + assert "Failed to configure" in response.text or "Langfuse" in response.text + + def test_upload_invalid_csv_format( + self, client, user_api_key_header, setup_credentials + ): + """Test uploading invalid CSV format.""" + invalid_csv = "not,a,valid\ncsv format here!!!" + filename, file_obj = create_csv_file(invalid_csv) + + with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: + mock_client = MagicMock() + mock_langfuse.return_value = (mock_client, True) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) + + # Should fail validation + assert response.status_code == 500 + + def test_upload_without_authentication(self, client, valid_csv_content): + """Test uploading without authentication.""" + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/dataset/upload", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + ) + + assert response.status_code == 401 # Unauthorized From 2ac8163122e280a84b86f7e6773b51748558d397 Mon Sep 17 00:00:00 2001 From: AkhileshNegi 
"""create_evaluation_run_table

Revision ID: d5747495bd7c
Revises: b30727137e65
Create Date: 2025-10-14 12:42:15.464302

"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "d5747495bd7c"
down_revision = "b30727137e65"
branch_labels = None
depends_on = None

# Shorthand for SQLModel's string type so the column list stays compact.
AutoString = sqlmodel.sql.sqltypes.AutoString


def upgrade():
    """Create the evaluation_run table and drop duplicate FKs on openai_conversation."""
    op.create_table(
        "evaluation_run",
        sa.Column("run_name", AutoString(), nullable=False),
        sa.Column("dataset_name", AutoString(), nullable=False),
        sa.Column("config", sa.JSON(), nullable=False),
        sa.Column("status", AutoString(), nullable=False),
        sa.Column("batch_status", AutoString(), nullable=True),
        sa.Column("batch_id", AutoString(), nullable=True),
        sa.Column("batch_file_id", AutoString(), nullable=True),
        sa.Column("batch_output_file_id", AutoString(), nullable=True),
        sa.Column("s3_url", AutoString(), nullable=True),
        sa.Column("total_items", sa.Integer(), nullable=False),
        sa.Column("score", sa.JSON(), nullable=True),
        sa.Column("error_message", sa.Text(), nullable=True),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_evaluation_run_run_name"), "evaluation_run", ["run_name"], unique=False
    )
    # Drop duplicate foreign keys (the "_fkey1" pair) on openai_conversation,
    # in the same order the autogenerated migration dropped them.
    for duplicate_fk in (
        "openai_conversation_project_id_fkey1",
        "openai_conversation_organization_id_fkey1",
    ):
        op.drop_constraint(duplicate_fk, "openai_conversation", type_="foreignkey")


def downgrade():
    """Recreate the dropped foreign keys and remove the evaluation_run table."""
    op.create_foreign_key(
        "openai_conversation_organization_id_fkey1",
        "openai_conversation",
        "organization",
        ["organization_id"],
        ["id"],
    )
    op.create_foreign_key(
        "openai_conversation_project_id_fkey1",
        "openai_conversation",
        "project",
        ["project_id"],
        ["id"],
    )
    op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run")
    op.drop_table("evaluation_run")
### + op.create_foreign_key( + "openai_conversation_organization_id_fkey1", + "openai_conversation", + "organization", + ["organization_id"], + ["id"], + ) + op.create_foreign_key( + "openai_conversation_project_id_fkey1", + "openai_conversation", + "project", + ["project_id"], + ["id"], + ) + op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run") + op.drop_table("evaluation_run") + # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f440a2a29..62bdbdb33 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -26,7 +26,7 @@ async def upload_dataset( _current_user: UserOrganization = Depends(get_current_user_org), ) -> DatasetUploadResponse: """ - Upload a CSV file containing Q&A pairs to Langfuse as a dataset. + Upload a CSV file containing Golden Q&A pairs to Langfuse as a dataset. Each question will be duplicated N times (default 5) to test LLM flakiness. CSV Format: diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py new file mode 100644 index 000000000..f7049d32a --- /dev/null +++ b/backend/app/crud/evaluation_batch.py @@ -0,0 +1,269 @@ +""" +OpenAI Batch API integration for LLM evaluations using Responses API. + +This module handles: +1. Fetching dataset items from Langfuse +2. Building JSONL for OpenAI Batch API (/v1/responses endpoint) +3. Uploading and creating batch jobs +""" + +import json +import logging +from typing import Any + +from langfuse import Langfuse +from openai import OpenAI +from sqlmodel import Session + +from app.models import EvaluationRun + +logger = logging.getLogger(__name__) + + +def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, Any]]: + """ + Fetch all items from a Langfuse dataset. 
+ + Args: + langfuse: Configured Langfuse client + dataset_name: Name of the dataset to fetch + + Returns: + List of dataset items with input and expected_output + + Raises: + ValueError: If dataset not found or empty + """ + logger.info(f"Fetching dataset: {dataset_name}") + + try: + dataset = langfuse.get_dataset(dataset_name) + except Exception as e: + logger.error(f"Failed to fetch dataset '{dataset_name}': {e}") + raise ValueError(f"Dataset '{dataset_name}' not found: {e}") + + if not dataset.items: + raise ValueError(f"Dataset '{dataset_name}' is empty") + + items = [] + for item in dataset.items: + items.append( + { + "id": item.id, + "input": item.input, + "expected_output": item.expected_output, + "metadata": item.metadata if hasattr(item, "metadata") else {}, + } + ) + + logger.info(f"Fetched {len(items)} items from dataset '{dataset_name}'") + return items + + +def build_batch_jsonl( + dataset_items: list[dict[str, Any]], config: dict[str, Any] +) -> list[str]: + """ + Build JSONL lines for OpenAI Batch API using Responses API. 
+ + Each line is a JSON object with: + - custom_id: Unique identifier for the request + - method: POST + - url: /v1/responses + - body: Response request with model, instructions, and input + + Args: + dataset_items: List of dataset items from Langfuse + config: Evaluation configuration dict with llm, instructions, vector_store_ids + + Returns: + List of JSONL strings (one per dataset item) + """ + # Extract config values + llm_config = config.get("llm", {}) + model = llm_config.get("model", "gpt-4o") + instructions = config.get("instructions", "You are a helpful assistant") + vector_store_ids = config.get("vector_store_ids", []) + + logger.info(f"Building JSONL for {len(dataset_items)} items with model {model}") + + batch_file = [] + + for item in dataset_items: + # Extract question from input + question = item["input"].get("question", "") + if not question: + logger.warning(f"Skipping item {item['id']} - no question found") + continue + + # Build the batch request object for Responses API + batch_request = { + "custom_id": item["id"], + "method": "POST", + "url": "/v1/responses", + "body": { + "model": model, + "instructions": instructions, + "input": question, + }, + } + + # Add vector store IDs if available (for file search) + if vector_store_ids: + batch_request["body"]["tools"] = [{"type": "file_search"}] + batch_request["body"]["tool_choice"] = "auto" + + batch_file.append(json.dumps(batch_request)) + + logger.info(f"Built {len(batch_file)} JSONL lines") + return batch_file + + +def upload_batch_file(client: OpenAI, batch_file: list[str]) -> str: + """ + Upload JSONL content to OpenAI Files API. 
+ + Args: + client: Configured OpenAI client + batch_file: List of JSONL strings + + Returns: + File ID from OpenAI + + Raises: + Exception: If upload fails + """ + logger.info(f"Uploading {len(batch_file)} lines to OpenAI Files API") + + # Join lines with newlines + jsonl_content = "\n".join(batch_file) + + try: + # Upload as a file object + file_response = client.files.create( + file=("batch_input.jsonl", jsonl_content.encode("utf-8")), + purpose="batch", + ) + + logger.info(f"Uploaded file: {file_response.id}") + return file_response.id + + except Exception as e: + logger.error(f"Failed to upload batch file: {e}") + raise + + +def create_batch_job( + client: OpenAI, + file_id: str, + description: str = "LLM evaluation batch", +) -> dict[str, Any]: + """ + Create a batch job in OpenAI using Responses API. + + Args: + client: Configured OpenAI client + file_id: File ID from upload_batch_file + description: Optional description for the batch + + Returns: + Dict with batch details (id, status, etc.) + + Raises: + Exception: If batch creation fails + """ + logger.info(f"Creating batch job with file: {file_id}") + + try: + batch = client.batches.create( + input_file_id=file_id, + endpoint="/v1/responses", + completion_window="24h", + metadata={"description": description}, + ) + + batch_info = { + "id": batch.id, + "status": batch.status, + "created_at": batch.created_at, + "endpoint": batch.endpoint, + "input_file_id": batch.input_file_id, + } + + logger.info(f"Created batch: {batch.id} (status={batch.status})") + return batch_info + + except Exception as e: + logger.error(f"Failed to create batch job: {e}") + raise + + +def start_evaluation_batch( + langfuse: Langfuse, + openai_client: OpenAI, + session: Session, + eval_run: EvaluationRun, + config: dict[str, Any], +) -> EvaluationRun: + """ + Fetch data, build JSONL, upload to OpenAI, create batch. 
+ + Args: + langfuse: Configured Langfuse client + openai_client: Configured OpenAI client + session: Database session + eval_run: EvaluationRun database object (with run_name, dataset_name, config) + config: Evaluation configuration dict with llm, instructions, vector_store_ids + + Returns: + Updated EvaluationRun with batch_id and batch_file_id populated + + Raises: + Exception: If any step fails + """ + try: + # Step 1: Fetch dataset items from Langfuse + dataset_items = fetch_dataset_items( + langfuse=langfuse, dataset_name=eval_run.dataset_name + ) + + # Step 2: Build JSONL using config + batch_file = build_batch_jsonl(dataset_items=dataset_items, config=config) + + # Step 3: Upload to OpenAI + file_id = upload_batch_file(client=openai_client, batch_file=batch_file) + + # Step 4: Create batch job + batch_info = create_batch_job( + client=openai_client, + file_id=file_id, + description=f"Evaluation: {eval_run.run_name}", + ) + + # Update eval_run with batch info + eval_run.batch_id = batch_info["id"] + eval_run.batch_file_id = file_id + eval_run.batch_status = batch_info[ + "status" + ] # OpenAI batch status (e.g., "validating") + eval_run.total_items = len(batch_file) + eval_run.status = "processing" # Overall evaluation status + + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + logger.info( + f"Successfully started evaluation batch: {batch_info['id']} " + f"for run '{eval_run.run_name}' with {len(batch_file)} items" + ) + + return eval_run + + except Exception as e: + logger.error(f"Failed to start evaluation batch: {e}", exc_info=True) + eval_run.status = "failed" + eval_run.error_message = str(e) + session.add(eval_run) + session.commit() + raise diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 537532d08..ca5897fb2 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -35,6 +35,13 @@ ) from .document_collection import DocumentCollection +from .evaluation import ( + 
from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field
from sqlalchemy import Column, Text, JSON
from sqlmodel import Field as SQLField, Relationship, SQLModel

from app.core.util import now


# Database Models


class EvaluationRunBase(SQLModel):
    """Base model for evaluation runs: user-supplied inputs plus system-managed status."""

    # Input fields (provided by user)
    run_name: str = SQLField(index=True, description="Name of the evaluation run")
    dataset_name: str = SQLField(description="Name of the Langfuse dataset")
    config: dict = SQLField(
        # BUG FIX: default_factory avoids one shared mutable {} across instances
        default_factory=dict,
        sa_column=Column(JSON, nullable=False),
        description="Evaluation configuration (LLM settings, instructions, vector stores)",
    )

    # Output/Status fields (updated by system during processing)
    status: str = SQLField(
        default="pending",
        description="Overall evaluation status: pending, processing, completed, failed",
    )
    batch_status: Optional[str] = SQLField(
        default=None,
        description="OpenAI Batch API status: validating, in_progress, finalizing, completed, failed, expired, cancelling, cancelled (for polling)",
    )
    batch_id: Optional[str] = SQLField(
        default=None, description="OpenAI Batch API batch ID (set during processing)"
    )
    batch_file_id: Optional[str] = SQLField(
        default=None,
        description="OpenAI file ID for batch input (set during processing)",
    )
    batch_output_file_id: Optional[str] = SQLField(
        default=None,
        description="OpenAI file ID for batch output (set after completion)",
    )
    s3_url: Optional[str] = SQLField(
        default=None, description="S3 URL of OpenAI output file for future reference"
    )
    total_items: int = SQLField(
        default=0, description="Total number of items evaluated (set during processing)"
    )
    score: Optional[dict] = SQLField(
        default=None,
        sa_column=Column(JSON, nullable=True),
        description="Evaluation scores (e.g., correctness, cosine_similarity, etc.) (set after completion)",
    )
    error_message: Optional[str] = SQLField(
        default=None,
        sa_column=Column(Text, nullable=True),
        description="Error message if failed",
    )
    # Owning tenant/project; rows are removed when the parent is deleted.
    organization_id: int = SQLField(
        foreign_key="organization.id", nullable=False, ondelete="CASCADE"
    )
    project_id: int = SQLField(
        foreign_key="project.id", nullable=False, ondelete="CASCADE"
    )


class EvaluationRun(EvaluationRunBase, table=True):
    """Database table for evaluation runs."""

    __tablename__ = "evaluation_run"

    id: int = SQLField(default=None, primary_key=True)
    inserted_at: datetime = SQLField(default_factory=now, nullable=False)
    updated_at: datetime = SQLField(default_factory=now, nullable=False)

    # Relationships (back_populates must match the list attributes on the parents)
    project: "Project" = Relationship(back_populates="evaluation_runs")
    organization: "Organization" = Relationship(back_populates="evaluation_runs")


class EvaluationRunCreate(SQLModel):
    """Request model for creating an evaluation run."""

    run_name: str = Field(description="Name of the evaluation run", min_length=3)
    dataset_name: str = Field(description="Name of the Langfuse dataset", min_length=1)
    config: dict = Field(
        description="Evaluation configuration (flexible dict with llm, instructions, vector_store_ids, etc.)"
    )


class EvaluationRunPublic(EvaluationRunBase):
    """Public (API response) model for evaluation runs."""

    id: int
    inserted_at: datetime
    updated_at: datetime
csv_with_empty_rows(): """ -@pytest.fixture -def setup_credentials(db): - """Setup mock credentials for Langfuse.""" - from app.crud.credentials import set_credentials - - credentials = { - "langfuse": { - "public_key": "test_public_key", - "secret_key": "test_secret_key", - "host": "https://cloud.langfuse.com", - } - } - - # Get organization_id from test user (from seed data) - from sqlmodel import select - from app.models import Organization - - org = db.exec(select(Organization)).first() - - set_credentials( - session=db, - org_id=org.id, - credentials=credentials, - ) - db.commit() - return org.id - - class TestDatasetUploadValidation: """Test CSV validation and parsing.""" def test_upload_dataset_valid_csv( - self, client, user_api_key_header, valid_csv_content, setup_credentials + self, client, user_api_key_header, valid_csv_content ): """Test uploading a valid CSV file.""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -117,7 +93,6 @@ def test_upload_dataset_missing_columns( client, user_api_key_header, invalid_csv_missing_columns, - setup_credentials, ): """Test uploading CSV with missing required columns.""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -142,7 +117,7 @@ def test_upload_dataset_missing_columns( ) def test_upload_dataset_empty_rows( - self, client, user_api_key_header, csv_with_empty_rows, setup_credentials + self, client, user_api_key_header, csv_with_empty_rows ): """Test uploading CSV with empty rows (should skip them).""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -172,31 +147,12 @@ def test_upload_dataset_empty_rows( assert data["original_items"] == 2 assert data["total_items"] == 4 # 2 items * 2 duplication - def test_upload_dataset_no_langfuse_credentials( - self, client, user_api_key_header, valid_csv_content - ): - """Test uploading without Langfuse credentials configured.""" - filename, file_obj = create_csv_file(valid_csv_content) - - response = 
client.post( - "/api/v1/dataset/upload", - files={"file": (filename, file_obj, "text/csv")}, - data={ - "dataset_name": "test_dataset", - "duplication_factor": 5, - }, - headers=user_api_key_header, - ) - - assert response.status_code == 500 - assert "LANGFUSE" in response.text or "not configured" in response.text.lower() - class TestDatasetUploadDuplication: """Test duplication logic.""" def test_upload_with_default_duplication( - self, client, user_api_key_header, valid_csv_content, setup_credentials + self, client, user_api_key_header, valid_csv_content ): """Test uploading with default duplication factor (5).""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -227,7 +183,7 @@ def test_upload_with_default_duplication( assert data["total_items"] == 15 # 3 items * 5 duplication def test_upload_with_custom_duplication( - self, client, user_api_key_header, valid_csv_content, setup_credentials + self, client, user_api_key_header, valid_csv_content ): """Test uploading with custom duplication factor.""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -258,7 +214,7 @@ def test_upload_with_custom_duplication( assert data["total_items"] == 30 # 3 items * 10 duplication def test_upload_metadata_includes_duplicate_number( - self, client, user_api_key_header, valid_csv_content, setup_credentials + self, client, user_api_key_header, valid_csv_content ): """Test that metadata includes duplicate number for each item.""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -302,7 +258,7 @@ class TestDatasetUploadErrors: """Test error handling.""" def test_upload_langfuse_configuration_fails( - self, client, user_api_key_header, valid_csv_content, setup_credentials + self, client, user_api_key_header, valid_csv_content ): """Test when Langfuse client configuration fails.""" with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: @@ -323,9 +279,7 @@ def 
test_upload_langfuse_configuration_fails( assert response.status_code == 500 assert "Failed to configure" in response.text or "Langfuse" in response.text - def test_upload_invalid_csv_format( - self, client, user_api_key_header, setup_credentials - ): + def test_upload_invalid_csv_format(self, client, user_api_key_header): """Test uploading invalid CSV format.""" invalid_csv = "not,a,valid\ncsv format here!!!" filename, file_obj = create_csv_file(invalid_csv) @@ -361,3 +315,494 @@ def test_upload_without_authentication(self, client, valid_csv_content): ) assert response.status_code == 401 # Unauthorized + + +class TestBatchEvaluation: + """Test batch evaluation endpoint using OpenAI Batch API.""" + + @pytest.fixture + def sample_evaluation_config(self): + """Sample evaluation configuration.""" + return { + "llm": {"model": "gpt-4o", "temperature": 0.2}, + "instructions": "You are a helpful assistant", + "vector_store_ids": [], + } + + @pytest.fixture + def sample_evaluation_config_with_vector_stores(self): + """Sample evaluation configuration with vector stores.""" + return { + "llm": {"model": "gpt-4o-mini", "temperature": 0.5}, + "instructions": "You are an expert assistant with access to documents", + "vector_store_ids": ["vs_abc123", "vs_def456"], + } + + def test_start_batch_evaluation_success( + self, + client, + user_api_key_header, + sample_evaluation_config, + ): + """Test successfully starting a batch evaluation.""" + with patch( + "app.crud.evaluation_batch.fetch_dataset_items" + ) as mock_fetch, patch( + "app.crud.evaluation_batch.upload_batch_file" + ) as mock_upload, patch( + "app.crud.evaluation_batch.create_batch_job" + ) as mock_create_batch, patch( + "app.api.routes.evaluation.configure_openai" + ) as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + # Mock dataset items from Langfuse + mock_fetch.return_value = [ + { + "id": "item1", + "input": {"question": "What is 2+2?"}, + "expected_output": 
{"answer": "4"}, + "metadata": {}, + }, + { + "id": "item2", + "input": {"question": "What is the capital of France?"}, + "expected_output": {"answer": "Paris"}, + "metadata": {}, + }, + ] + + # Mock OpenAI file upload + mock_upload.return_value = "file-abc123" + + # Mock batch job creation + mock_create_batch.return_value = { + "id": "batch_abc123", + "status": "validating", + "created_at": 1234567890, + "endpoint": "/v1/responses", + "input_file_id": "file-abc123", + } + + # Mock clients + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = (mock_langfuse_client, True) + + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_evaluation_run", + "dataset_name": "test_dataset", + "config": sample_evaluation_config, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + # Verify response structure + assert data["run_name"] == "test_evaluation_run" + assert data["dataset_name"] == "test_dataset" + assert data["config"] == sample_evaluation_config + assert data["status"] == "processing" + assert data["batch_status"] == "validating" + assert data["batch_id"] == "batch_abc123" + assert data["batch_file_id"] == "file-abc123" + assert data["total_items"] == 2 + + # Verify mocks were called + mock_fetch.assert_called_once() + mock_upload.assert_called_once() + mock_create_batch.assert_called_once() + + def test_start_batch_evaluation_with_vector_stores( + self, + client, + user_api_key_header, + sample_evaluation_config_with_vector_stores, + ): + """Test batch evaluation with vector stores configured.""" + with patch( + "app.crud.evaluation_batch.fetch_dataset_items" + ) as mock_fetch, patch( + "app.crud.evaluation_batch.upload_batch_file" + ) as mock_upload, patch( + "app.crud.evaluation_batch.create_batch_job" + ) as mock_create_batch, patch( + 
"app.api.routes.evaluation.configure_openai" + ) as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + mock_fetch.return_value = [ + { + "id": "item1", + "input": {"question": "Test question"}, + "expected_output": {"answer": "Test answer"}, + "metadata": {}, + } + ] + + mock_upload.return_value = "file-xyz789" + mock_create_batch.return_value = { + "id": "batch_xyz789", + "status": "validating", + "created_at": 1234567890, + "endpoint": "/v1/responses", + "input_file_id": "file-xyz789", + } + + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = (mock_langfuse_client, True) + + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_with_vector_stores", + "dataset_name": "test_dataset", + "config": sample_evaluation_config_with_vector_stores, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + assert data["config"]["vector_store_ids"] == ["vs_abc123", "vs_def456"] + assert data["batch_id"] == "batch_xyz789" + + def test_start_batch_evaluation_invalid_dataset( + self, client, user_api_key_header, sample_evaluation_config + ): + """Test batch evaluation fails with invalid dataset name.""" + with patch( + "app.crud.evaluation_batch.fetch_dataset_items" + ) as mock_fetch, patch( + "app.api.routes.evaluation.configure_openai" + ) as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + # Mock dataset fetch to raise error + mock_fetch.side_effect = ValueError("Dataset 'invalid_dataset' not found") + + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = (mock_langfuse_client, True) + + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_evaluation_run", + 
"dataset_name": "invalid_dataset", + "config": sample_evaluation_config, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 500 + assert ( + "not found" in response.text.lower() + or "failed" in response.text.lower() + ) + + def test_start_batch_evaluation_empty_dataset( + self, client, user_api_key_header, sample_evaluation_config + ): + """Test batch evaluation fails with empty dataset.""" + with patch( + "app.crud.evaluation_batch.fetch_dataset_items" + ) as mock_fetch, patch( + "app.api.routes.evaluation.configure_openai" + ) as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + # Mock empty dataset + mock_fetch.side_effect = ValueError("Dataset 'empty_dataset' is empty") + + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = (mock_langfuse_client, True) + + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_evaluation_run", + "dataset_name": "empty_dataset", + "config": sample_evaluation_config, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 500 + assert "empty" in response.text.lower() or "failed" in response.text.lower() + + def test_start_batch_evaluation_without_authentication( + self, client, sample_evaluation_config + ): + """Test batch evaluation requires authentication.""" + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_evaluation_run", + "dataset_name": "test_dataset", + "config": sample_evaluation_config, + }, + ) + + assert response.status_code == 401 # Unauthorized + + def test_start_batch_evaluation_invalid_config(self, client, user_api_key_header): + """Test batch evaluation with invalid config structure.""" + invalid_config = { + "llm": {"model": "gpt-4o"}, + # Missing instructions + "vector_store_ids": "should_be_list_not_string", + } + + with 
patch("app.api.routes.evaluation.configure_openai") as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = (mock_langfuse_client, True) + + # This should still work because config is flexible (dict) + # but build_batch_jsonl will use defaults for missing values + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "test_evaluation_run", + "dataset_name": "test_dataset", + "config": invalid_config, + }, + headers=user_api_key_header, + ) + + # Should succeed because config validation is flexible + # The function will use defaults where needed + assert response.status_code in [200, 500] # Depends on other mocks + + def test_start_batch_evaluation_creates_database_record( + self, client, user_api_key_header, sample_evaluation_config, db + ): + """Test that batch evaluation creates a proper database record.""" + with patch( + "app.crud.evaluation_batch.fetch_dataset_items" + ) as mock_fetch, patch( + "app.crud.evaluation_batch.upload_batch_file" + ) as mock_upload, patch( + "app.crud.evaluation_batch.create_batch_job" + ) as mock_create_batch, patch( + "app.api.routes.evaluation.configure_openai" + ) as mock_openai, patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_langfuse: + mock_fetch.return_value = [ + { + "id": "item1", + "input": {"question": "Test?"}, + "expected_output": {"answer": "Test"}, + "metadata": {}, + } + ] + + mock_upload.return_value = "file-test123" + mock_create_batch.return_value = { + "id": "batch_test123", + "status": "validating", + "created_at": 1234567890, + "endpoint": "/v1/responses", + "input_file_id": "file-test123", + } + + mock_openai_client = MagicMock() + mock_openai.return_value = (mock_openai_client, True) + + mock_langfuse_client = MagicMock() + mock_langfuse.return_value = 
(mock_langfuse_client, True) + + response = client.post( + "/api/v1/evaluate/batch", + json={ + "run_name": "database_test_run", + "dataset_name": "test_dataset", + "config": sample_evaluation_config, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + + # Verify database record was created + eval_run = db.exec( + select(EvaluationRun).where( + EvaluationRun.run_name == "database_test_run" + ) + ).first() + + assert eval_run is not None + assert eval_run.dataset_name == "test_dataset" + assert eval_run.config == sample_evaluation_config + assert eval_run.status == "processing" + assert eval_run.batch_status == "validating" + assert eval_run.batch_id == "batch_test123" + assert eval_run.batch_file_id == "file-test123" + assert eval_run.total_items == 1 + + +class TestBatchEvaluationJSONLBuilding: + """Test JSONL building logic for batch evaluation.""" + + def test_build_batch_jsonl_basic(self): + """Test basic JSONL building with minimal config.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "What is 2+2?"}, + "expected_output": {"answer": "4"}, + "metadata": {}, + } + ] + + config = { + "llm": {"model": "gpt-4o", "temperature": 0.2}, + "instructions": "You are a helpful assistant", + "vector_store_ids": [], + } + + batch_file = build_batch_jsonl(dataset_items, config) + + assert len(batch_file) == 1 + + request = json.loads(batch_file[0]) + + assert request["custom_id"] == "item1" + assert request["method"] == "POST" + assert request["url"] == "/v1/responses" + assert request["body"]["model"] == "gpt-4o" + assert request["body"]["instructions"] == "You are a helpful assistant" + assert request["body"]["input"] == "What is 2+2?" 
+ assert "tools" not in request["body"] + + def test_build_batch_jsonl_with_vector_stores(self): + """Test JSONL building with vector stores.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Search the docs"}, + "expected_output": {"answer": "Answer from docs"}, + "metadata": {}, + } + ] + + config = { + "llm": {"model": "gpt-4o-mini"}, + "instructions": "Search documents", + "vector_store_ids": ["vs_abc123"], + } + + batch_file = build_batch_jsonl(dataset_items, config) + + assert len(batch_file) == 1 + + request = json.loads(batch_file[0]) + + assert request["body"]["tools"] == [{"type": "file_search"}] + assert request["body"]["tool_choice"] == "auto" + + def test_build_batch_jsonl_uses_defaults(self): + """Test JSONL building with missing config values uses defaults.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Test question"}, + "expected_output": {"answer": "Test answer"}, + "metadata": {}, + } + ] + + config = {} # Empty config, should use defaults + + batch_file = build_batch_jsonl(dataset_items, config) + + assert len(batch_file) == 1 + + request = json.loads(batch_file[0]) + + # Check defaults + assert request["body"]["model"] == "gpt-4o" # Default model + assert ( + request["body"]["instructions"] == "You are a helpful assistant" + ) # Default instructions + + def test_build_batch_jsonl_skips_empty_questions(self): + """Test that items with empty questions are skipped.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Valid question"}, + "expected_output": {"answer": "Answer"}, + "metadata": {}, + }, + { + "id": "item2", + "input": {"question": ""}, # Empty question + "expected_output": {"answer": "Answer"}, + "metadata": {}, + }, + { + "id": "item3", + "input": {}, # Missing question key + "expected_output": {"answer": "Answer"}, + "metadata": {}, + }, + ] + + config = {"llm": {"model": "gpt-4o"}, "instructions": "Test"} + + batch_file = build_batch_jsonl(dataset_items, config) + + # Should 
only have 1 valid item + assert len(batch_file) == 1 + + request = json.loads(batch_file[0]) + assert request["custom_id"] == "item1" + + def test_build_batch_jsonl_multiple_items(self): + """Test JSONL building with multiple items.""" + dataset_items = [ + { + "id": f"item{i}", + "input": {"question": f"Question {i}"}, + "expected_output": {"answer": f"Answer {i}"}, + "metadata": {}, + } + for i in range(5) + ] + + config = { + "llm": {"model": "gpt-4o"}, + "instructions": "Answer questions", + "vector_store_ids": [], + } + + batch_file = build_batch_jsonl(dataset_items, config) + + assert len(batch_file) == 5 + + for i, line in enumerate(batch_file): + request = json.loads(line) + assert request["custom_id"] == f"item{i}" + assert request["body"]["input"] == f"Question {i}" From f99ae27126292a771f779cc1ba40f1128e4a6119 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 14 Oct 2025 21:32:13 +0530 Subject: [PATCH 17/64] using celery beat and evaluation batch --- ...5747495bd7c_create_evaluation_run_table.py | 24 - backend/app/api/routes/evaluation.py | 337 +++++++++++++- backend/app/celery/celery_app.py | 15 +- .../app/celery/tasks/evaluation_polling.py | 114 +++++ backend/app/crud/evaluation.py | 140 +----- backend/app/crud/evaluation_batch.py | 251 ++++++++++- backend/app/crud/evaluation_langfuse.py | 121 +++++ backend/app/crud/evaluation_processing.py | 414 ++++++++++++++++++ backend/app/models/evaluation.py | 4 - docker-compose.yml | 17 + 10 files changed, 1246 insertions(+), 191 deletions(-) create mode 100644 backend/app/celery/tasks/evaluation_polling.py create mode 100644 backend/app/crud/evaluation_langfuse.py create mode 100644 backend/app/crud/evaluation_processing.py diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 5d8d16865..0ade9284c 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ 
b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -49,35 +49,11 @@ def upgrade(): op.create_index( op.f("ix_evaluation_run_run_name"), "evaluation_run", ["run_name"], unique=False ) - op.drop_constraint( - "openai_conversation_project_id_fkey1", - "openai_conversation", - type_="foreignkey", - ) - op.drop_constraint( - "openai_conversation_organization_id_fkey1", - "openai_conversation", - type_="foreignkey", - ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.create_foreign_key( - "openai_conversation_organization_id_fkey1", - "openai_conversation", - "organization", - ["organization_id"], - ["id"], - ) - op.create_foreign_key( - "openai_conversation_project_id_fkey1", - "openai_conversation", - "project", - ["project_id"], - ["id"], - ) op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run") op.drop_table("evaluation_run") # ### end Alembic commands ### diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 62bdbdb33..3c93bf790 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,12 +1,20 @@ import logging from fastapi import APIRouter, Depends, UploadFile, File, Form -from sqlmodel import Session +from sqlmodel import Session, select from app.api.deps import get_current_user_org, get_db -from app.crud.evaluation import run_evaluation, upload_dataset_to_langfuse -from app.models import UserOrganization -from app.models.evaluation import Experiment, DatasetUploadResponse +from app.core.util import configure_langfuse, configure_openai, now +from app.crud.credentials import get_provider_credential +from app.crud.evaluation import upload_dataset_to_langfuse +from app.crud.evaluation_batch import start_evaluation_batch +from app.crud.evaluation_processing import poll_all_pending_evaluations +from app.models import UserOrganization, EvaluationRun +from 
app.models.evaluation import ( + DatasetUploadResponse, + EvaluationRunCreate, + EvaluationRunPublic, +) logger = logging.getLogger(__name__) @@ -66,31 +74,326 @@ async def upload_dataset( return data -@router.post("/evaluate", response_model=Experiment) +@router.post("/evaluate", response_model=EvaluationRunPublic) async def evaluate_threads( experiment_name: str, assistant_id: str, dataset_name: str, _session: Session = Depends(get_db), _current_user: UserOrganization = Depends(get_current_user_org), -) -> Experiment: +) -> EvaluationRunPublic: """ - Endpoint to run Langfuse evaluations using LLM-as-a-judge. - Read more here: https://langfuse.com/changelog/2024-11-19-llm-as-a-judge-for-datasets + Start an evaluation using OpenAI Batch API. + + This endpoint: + 1. Creates an EvaluationRun record in the database + 2. Fetches dataset items from Langfuse + 3. Builds JSONL for OpenAI Batch API (using assistant config) + 4. Uploads to OpenAI and creates batch job + 5. Returns the evaluation run details with batch_id + + The batch will be processed asynchronously by Celery Beat (every 60s). + Use GET /evaluate/batch/{run_id}/status to check progress. 
+ + Args: + experiment_name: Name for this evaluation run + assistant_id: ID of the assistant (used to get config) + dataset_name: Name of the Langfuse dataset + + Returns: + EvaluationRunPublic with batch details and status """ logger.info( - f"Starting evaluation for experiment: {experiment_name}, dataset: {dataset_name}, assistant: {assistant_id}" + f"Starting evaluation: experiment={experiment_name}, " + f"dataset={dataset_name}, assistant={assistant_id}, " + f"org_id={_current_user.organization_id}" + ) + + # Get credentials + openai_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="openai", ) + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + ) + + if not openai_credentials or not langfuse_credentials: + raise ValueError("OpenAI or Langfuse credentials not configured") + + # Configure clients + openai_client, openai_success = configure_openai(openai_credentials) + langfuse, langfuse_success = configure_langfuse(langfuse_credentials) - success, data, error = await run_evaluation( - experiment_name=experiment_name, - assistant_id=assistant_id, + if not openai_success or not langfuse_success: + raise ValueError("Failed to configure API clients") + + # Build config from assistant_id + # For now, use simple config - you can enhance this to fetch assistant settings + config = { + "assistant_id": assistant_id, + "llm": {"model": "gpt-4o", "temperature": 0.2}, + "instructions": "You are a helpful assistant", + "vector_store_ids": [], + } + + # Create EvaluationRun record + eval_run = EvaluationRun( + run_name=experiment_name, dataset_name=dataset_name, - _session=_session, - _current_user=_current_user, + config=config, + status="pending", + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + inserted_at=now(), + updated_at=now(), ) - if not success or data is None: - raise 
ValueError(error or "Failed to run evaluation") + _session.add(eval_run) + _session.commit() + _session.refresh(eval_run) - return data + logger.info(f"Created EvaluationRun record: id={eval_run.id}") + + # Start the batch evaluation + try: + eval_run = start_evaluation_batch( + langfuse=langfuse, + openai_client=openai_client, + session=_session, + eval_run=eval_run, + config=config, + ) + + logger.info( + f"Evaluation started successfully: " + f"batch_id={eval_run.batch_id}, total_items={eval_run.total_items}" + ) + + return eval_run + + except Exception as e: + logger.error( + f"Failed to start evaluation for run {eval_run.id}: {e}", + exc_info=True, + ) + # Error is already handled in start_evaluation_batch + _session.refresh(eval_run) + return eval_run + + +@router.post("/evaluate/batch", response_model=EvaluationRunPublic) +async def start_batch_evaluation( + eval_run_data: EvaluationRunCreate, + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), +) -> EvaluationRunPublic: + """ + Start a batch evaluation using OpenAI Batch API. + + This endpoint: + 1. Creates an EvaluationRun record in the database + 2. Fetches dataset items from Langfuse + 3. Builds JSONL for OpenAI Batch API + 4. Uploads to OpenAI and creates batch job + 5. Returns the evaluation run details with batch_id + + The batch will be processed asynchronously. 
Use: + - GET /evaluate/batch/{run_id}/status to check status + - POST /evaluate/batch/poll to manually trigger polling + + Args: + eval_run_data: EvaluationRunCreate with run_name, dataset_name, and config + + Returns: + EvaluationRunPublic with batch details + """ + logger.info( + f"Starting batch evaluation: run_name={eval_run_data.run_name}, " + f"dataset={eval_run_data.dataset_name}, " + f"org_id={_current_user.organization_id}" + ) + + # Get credentials + openai_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="openai", + ) + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + provider="langfuse", + ) + + if not openai_credentials or not langfuse_credentials: + raise ValueError("OpenAI or Langfuse credentials not configured") + + # Configure clients + openai_client, openai_success = configure_openai(openai_credentials) + langfuse, langfuse_success = configure_langfuse(langfuse_credentials) + + if not openai_success or not langfuse_success: + raise ValueError("Failed to configure API clients") + + # Create EvaluationRun record + eval_run = EvaluationRun( + run_name=eval_run_data.run_name, + dataset_name=eval_run_data.dataset_name, + config=eval_run_data.config, + status="pending", + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + inserted_at=now(), + updated_at=now(), + ) + + _session.add(eval_run) + _session.commit() + _session.refresh(eval_run) + + logger.info(f"Created EvaluationRun record: id={eval_run.id}") + + # Start the batch evaluation + try: + eval_run = start_evaluation_batch( + langfuse=langfuse, + openai_client=openai_client, + session=_session, + eval_run=eval_run, + config=eval_run_data.config, + ) + + logger.info( + f"Batch evaluation started successfully: " + f"batch_id={eval_run.batch_id}, total_items={eval_run.total_items}" + ) + + return eval_run + + except Exception as e: + 
logger.error( + f"Failed to start batch evaluation for run {eval_run.id}: {e}", + exc_info=True, + ) + # The error is already handled in start_evaluation_batch + # Just refresh and return the failed run + _session.refresh(eval_run) + return eval_run + + +@router.post("/evaluate/batch/poll") +async def poll_evaluation_batches( + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), +) -> dict: + """ + Manually trigger polling for all pending evaluations in the current organization. + + This endpoint is useful for: + - Testing the evaluation flow + - Immediately checking status instead of waiting for Celery beat + - Debugging evaluation issues + + Returns: + Summary of polling results including processed, failed, and still processing counts + """ + logger.info( + f"Manual polling triggered for org_id={_current_user.organization_id} " + f"by user_id={_current_user.user_id}" + ) + + summary = await poll_all_pending_evaluations( + session=_session, org_id=_current_user.organization_id + ) + + logger.info( + f"Manual polling completed for org_id={_current_user.organization_id}: " + f"{summary.get('total', 0)} evaluations checked, " + f"{summary.get('processed', 0)} processed, " + f"{summary.get('failed', 0)} failed" + ) + + return summary + + +@router.get("/evaluate/batch/{run_id}/status", response_model=EvaluationRunPublic) +async def get_evaluation_run_status( + run_id: int, + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), +) -> EvaluationRunPublic: + """ + Get the current status of a specific evaluation run. 
+ + Args: + run_id: ID of the evaluation run + + Returns: + EvaluationRunPublic with current status, batch_status, and results if completed + """ + logger.info( + f"Fetching status for evaluation run {run_id} " + f"(org_id={_current_user.organization_id})" + ) + + # Query the evaluation run + statement = ( + select(EvaluationRun) + .where(EvaluationRun.id == run_id) + .where(EvaluationRun.organization_id == _current_user.organization_id) + ) + + eval_run = _session.exec(statement).first() + + if not eval_run: + raise ValueError( + f"Evaluation run {run_id} not found or not accessible to this organization" + ) + + logger.info( + f"Found evaluation run {run_id}: status={eval_run.status}, " + f"batch_status={eval_run.batch_status}" + ) + + return eval_run + + +@router.get("/evaluate/batch/list", response_model=list[EvaluationRunPublic]) +async def list_evaluation_runs( + _session: Session = Depends(get_db), + _current_user: UserOrganization = Depends(get_current_user_org), + limit: int = 50, + offset: int = 0, +) -> list[EvaluationRunPublic]: + """ + List all evaluation runs for the current organization. 
+ + Args: + limit: Maximum number of runs to return (default 50) + offset: Number of runs to skip (for pagination) + + Returns: + List of EvaluationRunPublic objects, ordered by most recent first + """ + logger.info( + f"Listing evaluation runs for org_id={_current_user.organization_id} " + f"(limit={limit}, offset={offset})" + ) + + statement = ( + select(EvaluationRun) + .where(EvaluationRun.organization_id == _current_user.organization_id) + .order_by(EvaluationRun.inserted_at.desc()) + .limit(limit) + .offset(offset) + ) + + runs = _session.exec(statement).all() + + logger.info(f"Found {len(runs)} evaluation runs") + + return list(runs) diff --git a/backend/app/celery/celery_app.py b/backend/app/celery/celery_app.py index d67acdbcd..14d0dab12 100644 --- a/backend/app/celery/celery_app.py +++ b/backend/app/celery/celery_app.py @@ -7,7 +7,10 @@ "ai_platform", broker=settings.RABBITMQ_URL, backend=settings.REDIS_URL, - include=["app.celery.tasks.job_execution"], + include=[ + "app.celery.tasks.job_execution", + "app.celery.tasks.evaluation_polling", + ], ) # Define exchanges and queues with priority @@ -84,11 +87,11 @@ broker_pool_limit=settings.CELERY_BROKER_POOL_LIMIT, # Beat configuration (for future cron jobs) beat_schedule={ - # Example cron job (commented out) - # "example-cron": { - # "task": "app.celery.tasks.example_cron_task", - # "schedule": 60.0, # Every 60 seconds - # }, + # Poll evaluation batches every 60 seconds + "poll-evaluation-batches": { + "task": "poll_evaluation_batches", + "schedule": 60.0, # Every 60 seconds + }, }, ) diff --git a/backend/app/celery/tasks/evaluation_polling.py b/backend/app/celery/tasks/evaluation_polling.py new file mode 100644 index 000000000..d5ccad192 --- /dev/null +++ b/backend/app/celery/tasks/evaluation_polling.py @@ -0,0 +1,114 @@ +""" +Celery tasks for evaluation batch polling. + +This module contains periodic tasks that poll OpenAI batch status +and process completed evaluations. 
+""" + +import asyncio +import logging + +from celery import shared_task +from sqlmodel import Session, select + +from app.core.db import get_engine +from app.crud.evaluation_processing import poll_all_pending_evaluations +from app.models import Organization + +logger = logging.getLogger(__name__) + + +@shared_task(name="poll_evaluation_batches", bind=True) +def poll_evaluation_batches_task(self): + """ + Periodic task to poll all pending evaluation batches. + + This task: + 1. Gets all organizations + 2. For each org, polls their pending evaluations + 3. Processes completed batches automatically + + Runs every 60 seconds (configured in celery_app.py beat_schedule) + """ + logger.info("[poll_evaluation_batches] Starting evaluation batch polling") + + try: + # Get database session + engine = get_engine() + with Session(engine) as session: + # Get all organizations + orgs = session.exec(select(Organization)).all() + + if not orgs: + logger.info("[poll_evaluation_batches] No organizations found") + return { + "status": "success", + "organizations_processed": 0, + "message": "No organizations to process", + } + + logger.info( + f"[poll_evaluation_batches] Found {len(orgs)} organizations to process" + ) + + results = [] + total_processed = 0 + total_failed = 0 + total_still_processing = 0 + + # Process each organization + for org in orgs: + try: + logger.info( + f"[poll_evaluation_batches] Processing org_id={org.id} ({org.name})" + ) + + # Poll all pending evaluations for this org + # Use asyncio.run since poll_all_pending_evaluations is async + summary = asyncio.run( + poll_all_pending_evaluations(session=session, org_id=org.id) + ) + + results.append( + { + "org_id": org.id, + "org_name": org.name, + "summary": summary, + } + ) + + total_processed += summary.get("processed", 0) + total_failed += summary.get("failed", 0) + total_still_processing += summary.get("still_processing", 0) + + except Exception as e: + logger.error( + f"[poll_evaluation_batches] Error 
processing org_id={org.id}: {e}", + exc_info=True, + ) + results.append( + {"org_id": org.id, "org_name": org.name, "error": str(e)} + ) + + logger.info( + f"[poll_evaluation_batches] Completed: " + f"{total_processed} processed, {total_failed} failed, " + f"{total_still_processing} still processing" + ) + + return { + "status": "success", + "organizations_processed": len(orgs), + "total_processed": total_processed, + "total_failed": total_failed, + "total_still_processing": total_still_processing, + "results": results, + } + + except Exception as e: + logger.error( + f"[poll_evaluation_batches] Fatal error: {e}", + exc_info=True, + ) + # Retry the task after 5 minutes + raise self.retry(exc=e, countdown=300, max_retries=3) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index ec7dc0b69..63122db4e 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -8,11 +8,7 @@ from app.core.util import configure_langfuse, configure_openai from app.crud.credentials import get_provider_credential from app.models import UserOrganization -from app.models.evaluation import ( - DatasetUploadResponse, - EvaluationResult, - Experiment, -) +from app.models.evaluation import DatasetUploadResponse logger = logging.getLogger(__name__) @@ -136,137 +132,3 @@ async def upload_dataset_to_langfuse( except Exception as e: logger.error(f"Error uploading dataset: {str(e)}", exc_info=True) return False, None, f"Failed to upload dataset: {str(e)}" - - -async def run_evaluation( - experiment_name: str, - assistant_id: str, - dataset_name: str, - _session: Session, - _current_user: UserOrganization, -) -> tuple[bool, Experiment | None, str | None]: - """ - Run Langfuse evaluations using LLM-as-a-judge. 
- - Args: - experiment_name: Name of the experiment - assistant_id: ID of the assistant to evaluate - dataset_name: Name of the dataset to use - _session: Database session - _current_user: Current user organization - - Returns: - Tuple of (success, experiment_data, error_message) - """ - # Get OpenAI credentials - credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="openai", - ) - - # Configure OpenAI client - client, success = configure_openai(credentials) - if not success: - return False, None, "OpenAI API key not configured for this organization." - - # Get Langfuse credentials - langfuse_credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="langfuse", - ) - if not langfuse_credentials: - return False, None, "LANGFUSE keys not configured for this organization." - - # Configure Langfuse - langfuse, success = configure_langfuse(langfuse_credentials) - if not success: - return False, None, "Failed to configure Langfuse client." 
- - try: - return await _process_evaluation( - langfuse=langfuse, - experiment_name=experiment_name, - assistant_id=assistant_id, - dataset_name=dataset_name, - _session=_session, - _current_user=_current_user, - ) - except Exception as e: - logger.error(f"Error during evaluation: {str(e)}", exc_info=True) - return False, None, str(e) - - -async def _process_evaluation( - langfuse: Langfuse, - experiment_name: str, - assistant_id: str, - dataset_name: str, - _session: Session, - _current_user: UserOrganization, -) -> tuple[bool, Experiment | None, str | None]: - """Internal function to process the evaluation with hardcoded input/output pairs.""" - # Hardcoded test data - list of question/answer pairs - test_data = [ - {"question": "What is the capital of France?", "answer": "Paris"}, - {"question": "What is the capital of Germany?", "answer": "Berlin"}, - {"question": "What is the capital of Italy?", "answer": "Rome"}, - {"question": "What is the capital of Spain?", "answer": "Madrid"}, - ] - - # Get dataset from Langfuse (assume it exists) - logger.info(f"Fetching dataset: {dataset_name}") - dataset = langfuse.get_dataset(dataset_name) - - results: list[EvaluationResult] = [] - total_items = len(dataset.items) - logger.info( - f"Processing {total_items} items from dataset with experiment: {experiment_name}" - ) - - for idx, item in enumerate(dataset.items, 1): - question = item.input - expected_answer = item.expected_output - logger.info(f"Processing item {idx}/{total_items}: {question}") - - # Use item.observe to create trace linked to dataset item - with item.observe(run_name=experiment_name) as trace_id: - # For testing, use the expected answer as output - answer = expected_answer - - # Update trace with input/output - langfuse.trace( - id=trace_id, input={"question": question}, output={"answer": answer} - ) - - results.append( - EvaluationResult( - input=question, - output=answer, - expected=expected_answer, - thread_id=None, - ) - ) - logger.info(f"Completed 
processing item {idx}") - - # Flush Langfuse events - langfuse.flush() - - matches = sum(1 for r in results if r.match) - logger.info( - f"Evaluation completed. Total items: {len(results)}, Matches: {matches}" - ) - - return ( - True, - Experiment( - experiment_name=experiment_name, - dataset_name=dataset_name, - results=results, - total_items=len(results), - matches=matches, - note="Hardcoded question/answer pairs linked to dataset run.", - ), - None, - ) diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index f7049d32a..3c40a2385 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -5,6 +5,7 @@ 1. Fetching dataset items from Langfuse 2. Building JSONL for OpenAI Batch API (/v1/responses endpoint) 3. Uploading and creating batch jobs +4. Polling batch status and downloading results """ import json @@ -13,7 +14,7 @@ from langfuse import Langfuse from openai import OpenAI -from sqlmodel import Session +from sqlmodel import Session, select from app.models import EvaluationRun @@ -267,3 +268,251 @@ def start_evaluation_batch( session.add(eval_run) session.commit() raise + + +# ============================================================================ +# Batch Polling and Result Processing +# ============================================================================ + + +def get_pending_evaluations(session: Session) -> list[EvaluationRun]: + """ + Get all evaluations that are currently processing and need polling. + + Args: + session: Database session + + Returns: + List of EvaluationRun objects with status='processing' + """ + statement = select(EvaluationRun).where(EvaluationRun.status == "processing") + results = session.exec(statement).all() + logger.info(f"Found {len(results)} evaluations in 'processing' status") + return list(results) + + +def poll_batch_status(client: OpenAI, batch_id: str) -> dict[str, Any]: + """ + Poll OpenAI for current batch status. 
+ + Args: + client: Configured OpenAI client + batch_id: Batch ID to poll + + Returns: + Dict with batch status information: + { + "id": "batch_abc123", + "status": "completed" | "failed" | "in_progress" | "validating" | ..., + "output_file_id": "file-xyz" (if completed), + "error_file_id": "file-err" (if failed), + "failed_requests": 0, + "completed_requests": 10, + "total_requests": 10 + } + + Raises: + Exception: If polling fails + """ + logger.info(f"Polling batch status: {batch_id}") + + try: + batch = client.batches.retrieve(batch_id) + + batch_status = { + "id": batch.id, + "status": batch.status, + "output_file_id": batch.output_file_id, + "error_file_id": batch.error_file_id, + "request_counts": { + "total": batch.request_counts.total, + "completed": batch.request_counts.completed, + "failed": batch.request_counts.failed, + }, + } + + logger.info( + f"Batch {batch_id} status: {batch.status} " + f"({batch.request_counts.completed}/{batch.request_counts.total} completed)" + ) + + return batch_status + + except Exception as e: + logger.error(f"Failed to poll batch status for {batch_id}: {e}") + raise + + +def download_batch_output(client: OpenAI, output_file_id: str) -> str: + """ + Download batch output JSONL from OpenAI. 
+ + Args: + client: Configured OpenAI client + output_file_id: File ID of the batch output + + Returns: + JSONL content as string + + Raises: + Exception: If download fails + """ + logger.info(f"Downloading batch output file: {output_file_id}") + + try: + file_content = client.files.content(output_file_id) + jsonl_content = file_content.read().decode("utf-8") + + # Count lines for logging + line_count = len(jsonl_content.strip().split("\n")) + logger.info(f"Downloaded {line_count} lines from output file {output_file_id}") + + return jsonl_content + + except Exception as e: + logger.error(f"Failed to download batch output {output_file_id}: {e}") + raise + + +def parse_batch_output( + jsonl_content: str, dataset_items: list[dict[str, Any]] +) -> list[dict[str, Any]]: + """ + Parse batch output JSONL into structured results. + + Args: + jsonl_content: Raw JSONL string from OpenAI batch output + dataset_items: Original dataset items (for matching ground truth) + + Returns: + List of results in format: + [ + { + "item_id": "item_123", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4" + }, + ... 
+ ] + """ + logger.info("Parsing batch output JSONL") + + # Create lookup map for dataset items by ID + dataset_map = {item["id"]: item for item in dataset_items} + + results = [] + lines = jsonl_content.strip().split("\n") + + for line_num, line in enumerate(lines, 1): + try: + response = json.loads(line) + + # Extract custom_id (which is our dataset item ID) + item_id = response.get("custom_id") + if not item_id: + logger.warning(f"Line {line_num}: No custom_id found, skipping") + continue + + # Get original dataset item + dataset_item = dataset_map.get(item_id) + if not dataset_item: + logger.warning(f"Line {line_num}: No dataset item found for {item_id}") + continue + + # Extract the response body + response_body = response.get("response", {}).get("body", {}) + + # Handle errors in batch processing + if response.get("error"): + error_msg = response["error"].get("message", "Unknown error") + logger.error(f"Item {item_id} had error: {error_msg}") + generated_output = f"ERROR: {error_msg}" + else: + # Extract output text from response + # Response API returns: {"output": "the answer text"} + generated_output = response_body.get("output", "") + + # Extract question and ground truth from dataset item + question = dataset_item["input"].get("question", "") + ground_truth = dataset_item["expected_output"].get("answer", "") + + results.append( + { + "item_id": item_id, + "question": question, + "generated_output": generated_output, + "ground_truth": ground_truth, + } + ) + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Failed to parse JSON: {e}") + continue + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error: {e}") + continue + + logger.info(f"Parsed {len(results)} results from {len(lines)} output lines") + return results + + +def upload_results_to_s3( + jsonl_content: str, eval_run: EvaluationRun, project_id: int +) -> str: + """ + Upload evaluation results to S3. 
+ + Args: + jsonl_content: JSONL content to upload + eval_run: EvaluationRun database object + project_id: Project ID for storage path + + Returns: + S3 URL (e.g., s3://bucket/project-uuid/evaluations/run-123/results.jsonl) + + Raises: + Exception: If upload fails + """ + from io import BytesIO + from app.core.cloud.storage import ( + AmazonCloudStorageClient, + SimpleStorageName, + ) + + logger.info(f"Uploading results to S3 for evaluation run {eval_run.id}") + + try: + # Create S3 key path + # Format: project-storage-path/evaluations/run-{id}/results.jsonl + s3_key = f"evaluations/run-{eval_run.id}/results.jsonl" + + # Convert string content to bytes + content_bytes = jsonl_content.encode("utf-8") + file_like = BytesIO(content_bytes) + + # Upload to S3 + aws_client = AmazonCloudStorageClient() + aws_client.client.upload_fileobj( + file_like, + Bucket=aws_client.client._client_config.__dict__.get( + "bucket", "kaapi-storage" + ), + Key=s3_key, + ExtraArgs={"ContentType": "application/jsonl"}, + ) + + # Construct S3 URL + storage_name = SimpleStorageName(Key=s3_key) + s3_url = str(storage_name) + + logger.info( + f"Successfully uploaded results to S3: {s3_url} " + f"({len(content_bytes)} bytes)" + ) + + return s3_url + + except Exception as e: + logger.error(f"Failed to upload results to S3: {e}", exc_info=True) + raise diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py new file mode 100644 index 000000000..ed68192de --- /dev/null +++ b/backend/app/crud/evaluation_langfuse.py @@ -0,0 +1,121 @@ +""" +Langfuse integration for evaluation runs. + +This module handles: +1. Creating dataset runs in Langfuse +2. Creating traces for each evaluation item +3. 
Uploading results to Langfuse for visualization +""" + +import logging +from typing import Any + +from langfuse import Langfuse + +logger = logging.getLogger(__name__) + + +def create_langfuse_dataset_run( + langfuse: Langfuse, + dataset_name: str, + run_name: str, + results: list[dict[str, Any]], +) -> None: + """ + Create a dataset run in Langfuse with traces for each evaluation item. + + This function: + 1. Gets the dataset from Langfuse (which already exists) + 2. For each result, creates a trace linked to the dataset item + 3. Logs input (question), output (generated_output), and expected (ground_truth) + + Args: + langfuse: Configured Langfuse client + dataset_name: Name of the dataset in Langfuse + run_name: Name for this evaluation run + results: List of evaluation results from parse_batch_output() + Format: [ + { + "item_id": "item_123", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4" + }, + ... + ] + + Raises: + Exception: If Langfuse operations fail + """ + logger.info( + f"Creating Langfuse dataset run '{run_name}' for dataset '{dataset_name}' " + f"with {len(results)} items" + ) + + try: + # Get the dataset + dataset = langfuse.get_dataset(dataset_name) + logger.info(f"Found dataset '{dataset_name}' with {len(dataset.items)} items") + + # Create a map of item IDs for quick lookup + dataset_items_map = {item.id: item for item in dataset.items} + + created_traces = 0 + skipped_items = 0 + + # Create a trace for each result + for idx, result in enumerate(results, 1): + item_id = result["item_id"] + question = result["question"] + generated_output = result["generated_output"] + ground_truth = result["ground_truth"] + + # Get the dataset item + dataset_item = dataset_items_map.get(item_id) + if not dataset_item: + logger.warning( + f"Item {idx}/{len(results)}: Dataset item '{item_id}' not found, skipping" + ) + skipped_items += 1 + continue + + try: + # Use item.observe to create a trace linked to the dataset item + with 
dataset_item.observe(run_name=run_name) as trace_id: + # Update the trace with input and output + langfuse.trace( + id=trace_id, + input={"question": question}, + output={"answer": generated_output}, + metadata={ + "ground_truth": ground_truth, + "item_id": item_id, + }, + ) + created_traces += 1 + + if idx % 10 == 0: + logger.info( + f"Progress: Created {idx}/{len(results)} traces for run '{run_name}'" + ) + + except Exception as e: + logger.error( + f"Failed to create trace for item {item_id}: {e}", exc_info=True + ) + skipped_items += 1 + continue + + # Flush to ensure all traces are sent + langfuse.flush() + + logger.info( + f"Successfully created Langfuse dataset run '{run_name}': " + f"{created_traces} traces created, {skipped_items} items skipped" + ) + + except Exception as e: + logger.error( + f"Failed to create Langfuse dataset run '{run_name}': {e}", exc_info=True + ) + raise diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py new file mode 100644 index 000000000..f330f34ad --- /dev/null +++ b/backend/app/crud/evaluation_processing.py @@ -0,0 +1,414 @@ +""" +Evaluation batch processing orchestrator. + +This module coordinates the complete evaluation workflow: +1. Polling batch status from OpenAI +2. Downloading and parsing completed batch results +3. Uploading results to S3 +4. Creating Langfuse dataset runs with traces +5. 
Updating database with final status +""" + +import logging +from typing import Any + +from langfuse import Langfuse +from openai import OpenAI +from sqlmodel import Session + +from app.core.util import configure_langfuse, configure_openai, now +from app.crud.credentials import get_provider_credential +from app.crud.evaluation_batch import ( + download_batch_output, + fetch_dataset_items, + get_pending_evaluations, + parse_batch_output, + poll_batch_status, + upload_results_to_s3, +) +from app.crud.evaluation_langfuse import create_langfuse_dataset_run +from app.models import EvaluationRun + +logger = logging.getLogger(__name__) + + +async def process_completed_batch( + eval_run: EvaluationRun, + session: Session, + openai_client: OpenAI, + langfuse: Langfuse, + output_file_id: str, +) -> EvaluationRun: + """ + Process a completed batch evaluation. + + This function: + 1. Downloads batch output from OpenAI + 2. Parses results into question/output/ground_truth format + 3. Uploads results to S3 + 4. Creates Langfuse dataset run with traces + 5. 
Updates database with completion status + + Args: + eval_run: EvaluationRun database object + session: Database session + openai_client: Configured OpenAI client + langfuse: Configured Langfuse client + output_file_id: OpenAI file ID for batch output + + Returns: + Updated EvaluationRun object + + Raises: + Exception: If processing fails + """ + logger.info(f"Processing completed batch for evaluation run {eval_run.id}") + + try: + # Step 1: Download batch output from OpenAI + logger.info(f"Step 1: Downloading batch output file: {output_file_id}") + jsonl_content = download_batch_output( + client=openai_client, output_file_id=output_file_id + ) + + # Step 2: Fetch dataset items (needed for matching ground truth) + logger.info(f"Step 2: Fetching dataset items for '{eval_run.dataset_name}'") + dataset_items = fetch_dataset_items( + langfuse=langfuse, dataset_name=eval_run.dataset_name + ) + + # Step 3: Parse batch output into structured results + logger.info("Step 3: Parsing batch output") + results = parse_batch_output( + jsonl_content=jsonl_content, dataset_items=dataset_items + ) + + if not results: + raise ValueError("No valid results found in batch output") + + # Step 4: Upload results to S3 + logger.info("Step 4: Uploading results to S3") + s3_url = upload_results_to_s3( + jsonl_content=jsonl_content, + eval_run=eval_run, + project_id=eval_run.project_id, + ) + + # Step 5: Update DB with output file ID and S3 URL + logger.info("Step 5: Updating database with S3 URL") + eval_run.batch_output_file_id = output_file_id + eval_run.s3_url = s3_url + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + # Step 6: Create Langfuse dataset run with traces + logger.info("Step 6: Creating Langfuse dataset run with traces") + create_langfuse_dataset_run( + langfuse=langfuse, + dataset_name=eval_run.dataset_name, + run_name=eval_run.run_name, + results=results, + ) + + # Step 7: Mark as completed + logger.info("Step 7: Marking 
evaluation as completed") + eval_run.status = "completed" + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + logger.info( + f"Successfully completed processing for evaluation run {eval_run.id}: " + f"{len(results)} items processed, S3 URL: {s3_url}" + ) + + return eval_run + + except Exception as e: + logger.error( + f"Failed to process completed batch for run {eval_run.id}: {e}", + exc_info=True, + ) + # Mark as failed + eval_run.status = "failed" + eval_run.error_message = f"Processing failed: {str(e)}" + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + raise + + +async def check_and_process_batch( + eval_run: EvaluationRun, + session: Session, + openai_client: OpenAI, + langfuse: Langfuse, +) -> dict[str, Any]: + """ + Check batch status and process if completed. + + Args: + eval_run: EvaluationRun database object + session: Database session + openai_client: Configured OpenAI client + langfuse: Configured Langfuse client + + Returns: + Dict with status information: + { + "run_id": 123, + "run_name": "test_run", + "previous_status": "processing", + "current_status": "completed", + "batch_status": "completed", + "action": "processed" | "updated" | "failed" | "no_change" + } + """ + logger.info( + f"Checking batch status for evaluation run {eval_run.id} (batch_id={eval_run.batch_id})" + ) + + previous_status = eval_run.status + previous_batch_status = eval_run.batch_status + + try: + # Poll batch status from OpenAI + batch_status_info = poll_batch_status( + client=openai_client, batch_id=eval_run.batch_id + ) + + new_batch_status = batch_status_info["status"] + output_file_id = batch_status_info.get("output_file_id") + + # Update batch status in DB + if new_batch_status != previous_batch_status: + eval_run.batch_status = new_batch_status + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) + logger.info( + f"Updated batch_status for run 
{eval_run.id}: " + f"{previous_batch_status} -> {new_batch_status}" + ) + + # Handle different batch statuses + if new_batch_status == "completed": + if not output_file_id: + raise ValueError("Batch completed but no output_file_id found") + + logger.info(f"Batch {eval_run.batch_id} completed, processing results...") + + # Process the completed batch + await process_completed_batch( + eval_run=eval_run, + session=session, + openai_client=openai_client, + langfuse=langfuse, + output_file_id=output_file_id, + ) + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": eval_run.status, + "batch_status": new_batch_status, + "action": "processed", + } + + elif new_batch_status in ["failed", "expired", "cancelled"]: + # Mark as failed + error_msg = f"Batch {new_batch_status}" + if batch_status_info.get("error_file_id"): + error_msg += f" (error_file_id: {batch_status_info['error_file_id']})" + + eval_run.status = "failed" + eval_run.error_message = error_msg + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + logger.error(f"Batch {eval_run.batch_id} failed: {error_msg}") + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": "failed", + "batch_status": new_batch_status, + "action": "failed", + "error": error_msg, + } + + else: + # Still in progress (validating, in_progress, finalizing) + logger.info( + f"Batch {eval_run.batch_id} still processing (status={new_batch_status})" + ) + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": eval_run.status, + "batch_status": new_batch_status, + "action": "updated" + if new_batch_status != previous_batch_status + else "no_change", + } + + except Exception as e: + logger.error( + f"Error checking batch status for run {eval_run.id}: {e}", exc_info=True + ) + + # 
Mark as failed + eval_run.status = "failed" + eval_run.error_message = f"Polling failed: {str(e)}" + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": "failed", + "batch_status": eval_run.batch_status, + "action": "failed", + "error": str(e), + } + + +async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[str, Any]: + """ + Poll all pending evaluations for an organization. + + Args: + session: Database session + org_id: Organization ID + + Returns: + Summary dict: + { + "total": 5, + "processed": 2, + "failed": 1, + "still_processing": 2, + "details": [...] + } + """ + logger.info(f"Polling all pending evaluations for org_id={org_id}") + + # Get pending evaluations + pending_runs = get_pending_evaluations(session=session) + + # Filter by org_id + pending_runs = [run for run in pending_runs if run.organization_id == org_id] + + if not pending_runs: + logger.info(f"No pending evaluations found for org_id={org_id}") + return { + "total": 0, + "processed": 0, + "failed": 0, + "still_processing": 0, + "details": [], + } + + logger.info(f"Found {len(pending_runs)} pending evaluations for org_id={org_id}") + + # Get credentials + openai_credentials = get_provider_credential( + session=session, org_id=org_id, provider="openai" + ) + langfuse_credentials = get_provider_credential( + session=session, org_id=org_id, provider="langfuse" + ) + + if not openai_credentials or not langfuse_credentials: + logger.error( + f"Missing credentials for org_id={org_id}: " + f"openai={bool(openai_credentials)}, langfuse={bool(langfuse_credentials)}" + ) + return { + "total": len(pending_runs), + "processed": 0, + "failed": 0, + "still_processing": len(pending_runs), + "details": [], + "error": "Missing OpenAI or Langfuse credentials", + } + + # Configure clients + openai_client, openai_success = 
configure_openai(openai_credentials) + langfuse, langfuse_success = configure_langfuse(langfuse_credentials) + + if not openai_success or not langfuse_success: + logger.error(f"Failed to configure clients for org_id={org_id}") + return { + "total": len(pending_runs), + "processed": 0, + "failed": 0, + "still_processing": len(pending_runs), + "details": [], + "error": "Failed to configure API clients", + } + + # Process each evaluation + results = [] + processed_count = 0 + failed_count = 0 + still_processing_count = 0 + + for eval_run in pending_runs: + try: + result = await check_and_process_batch( + eval_run=eval_run, + session=session, + openai_client=openai_client, + langfuse=langfuse, + ) + results.append(result) + + if result["action"] == "processed": + processed_count += 1 + elif result["action"] == "failed": + failed_count += 1 + else: + still_processing_count += 1 + + except Exception as e: + logger.error( + f"Failed to check evaluation run {eval_run.id}: {e}", exc_info=True + ) + results.append( + { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "action": "failed", + "error": str(e), + } + ) + failed_count += 1 + + summary = { + "total": len(pending_runs), + "processed": processed_count, + "failed": failed_count, + "still_processing": still_processing_count, + "details": results, + } + + logger.info( + f"Polling summary for org_id={org_id}: " + f"{processed_count} processed, {failed_count} failed, " + f"{still_processing_count} still processing" + ) + + return summary diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 6f8ebac81..d92611468 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -39,9 +39,6 @@ class EvaluationResult(BaseModel): input: str = Field(..., description="The input question/prompt used for evaluation") output: str = Field(..., description="The actual output from the assistant") expected: str = Field(..., description="The expected output from the 
dataset") - match: bool = Field( - ..., description="Whether the output matches the expected result" - ) thread_id: str | None = Field(None, description="ID of the OpenAI") @@ -56,7 +53,6 @@ class Experiment(BaseModel): ..., description="List of evaluation results" ) total_items: int = Field(..., description="Total number of items evaluated") - matches: int = Field(..., description="Number of successful matches") note: str = Field(..., description="Additional notes about the evaluation process") diff --git a/docker-compose.yml b/docker-compose.yml index 10fc0d914..5e7ed313d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -152,6 +152,23 @@ services: RABBITMQ_HOST: rabbitmq command: ["uv", "run", "celery", "-A", "app.celery.celery_app", "worker", "--loglevel=info"] + celery_beat: + image: "${DOCKER_IMAGE_BACKEND?Variable not set}:${TAG:-latest}" + container_name: celery-beat + restart: always + build: + context: ./backend + depends_on: + backend: + condition: service_healthy + env_file: + - .env + environment: + POSTGRES_SERVER: db + REDIS_HOST: redis + RABBITMQ_HOST: rabbitmq + command: ["uv", "run", "celery", "-A", "app.celery.celery_app", "beat", "--loglevel=info"] + celery_flower: image: "${DOCKER_IMAGE_BACKEND?Variable not set}:${TAG:-latest}" container_name: celery-flower From 9bc96b234b14ac376e211fa4c9251babdcdca713 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 14 Oct 2025 22:47:24 +0530 Subject: [PATCH 18/64] first stab at running evaluation --- backend/app/api/routes/evaluation.py | 207 +++++++++------------- backend/app/crud/evaluation_batch.py | 13 +- backend/app/crud/evaluation_processing.py | 57 ++++-- 3 files changed, 135 insertions(+), 142 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 3c93bf790..41fe31a01 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,18 +1,18 @@ import logging -from fastapi import APIRouter, Depends, 
UploadFile, File, Form +from fastapi import APIRouter, Body, Depends, UploadFile, File, Form from sqlmodel import Session, select -from app.api.deps import get_current_user_org, get_db +from app.api.deps import get_current_user_org_project, get_db from app.core.util import configure_langfuse, configure_openai, now from app.crud.credentials import get_provider_credential from app.crud.evaluation import upload_dataset_to_langfuse from app.crud.evaluation_batch import start_evaluation_batch from app.crud.evaluation_processing import poll_all_pending_evaluations -from app.models import UserOrganization, EvaluationRun +from app.crud.assistants import get_assistant_by_id +from app.models import UserProjectOrg, EvaluationRun from app.models.evaluation import ( DatasetUploadResponse, - EvaluationRunCreate, EvaluationRunPublic, ) @@ -31,7 +31,7 @@ async def upload_dataset( default=5, description="Number of times to duplicate each item" ), _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> DatasetUploadResponse: """ Upload a CSV file containing Golden Q&A pairs to Langfuse as a dataset. @@ -76,11 +76,13 @@ async def upload_dataset( @router.post("/evaluate", response_model=EvaluationRunPublic) async def evaluate_threads( - experiment_name: str, - assistant_id: str, - dataset_name: str, + dataset_name: str = Body(..., description="Name of the Langfuse dataset"), + experiment_name: str = Body( + ..., description="Name for this evaluation experiment/run" + ), + config: dict = Body(..., description="Evaluation configuration"), _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> EvaluationRunPublic: """ Start an evaluation using OpenAI Batch API. @@ -88,7 +90,7 @@ async def evaluate_threads( This endpoint: 1. 
Creates an EvaluationRun record in the database 2. Fetches dataset items from Langfuse - 3. Builds JSONL for OpenAI Batch API (using assistant config) + 3. Builds JSONL for OpenAI Batch API (using provided config) 4. Uploads to OpenAI and creates batch job 5. Returns the evaluation run details with batch_id @@ -96,17 +98,30 @@ async def evaluate_threads( Use GET /evaluate/batch/{run_id}/status to check progress. Args: - experiment_name: Name for this evaluation run - assistant_id: ID of the assistant (used to get config) dataset_name: Name of the Langfuse dataset + experiment_name: Name for this evaluation experiment/run + config: Configuration dict with optional fields: + - assistant_id (optional): If provided, fetch config from openai_assistant table + - llm (optional): {"model": "gpt-4o", "temperature": 0.2} + - instructions (optional): System instructions + - vector_store_ids (optional): List of vector store IDs + + Example config: + { + "llm": {"model": "gpt-4o", "temperature": 0.2}, + "instructions": "You are a friendly assistant", + "vector_store_ids": ["vs_abc123"], + "assistant_id": "asst_xyz" # Optional - fetches from DB if provided + } Returns: EvaluationRunPublic with batch details and status """ logger.info( - f"Starting evaluation: experiment={experiment_name}, " - f"dataset={dataset_name}, assistant={assistant_id}, " - f"org_id={_current_user.organization_id}" + f"Starting evaluation: experiment_name={experiment_name}, " + f"dataset={dataset_name}, " + f"org_id={_current_user.organization_id}, " + f"config_keys={list(config.keys())}" ) # Get credentials @@ -131,14 +146,54 @@ async def evaluate_threads( if not openai_success or not langfuse_success: raise ValueError("Failed to configure API clients") - # Build config from assistant_id - # For now, use simple config - you can enhance this to fetch assistant settings - config = { - "assistant_id": assistant_id, - "llm": {"model": "gpt-4o", "temperature": 0.2}, - "instructions": "You are a helpful 
assistant", - "vector_store_ids": [], - } + # Check if assistant_id is provided in config + assistant_id = config.get("assistant_id") + if assistant_id: + # Fetch assistant details from database + assistant = get_assistant_by_id( + session=_session, + assistant_id=assistant_id, + project_id=_current_user.project_id, + ) + + if assistant: + logger.info( + f"Found assistant in DB: id={assistant.id}, " + f"model={assistant.model}, instructions={assistant.instructions[:50]}..." + ) + + # Merge DB config with provided config (provided config takes precedence) + db_config = { + "assistant_id": assistant_id, + "llm": { + "model": assistant.model, + "temperature": assistant.temperature, + }, + "instructions": assistant.instructions, + "vector_store_ids": assistant.vector_store_ids or [], + } + + # Override with provided config values + for key in ["llm", "instructions", "vector_store_ids"]: + if key in config: + db_config[key] = config[key] + + config = db_config + logger.info(f"Using merged config from DB and provided values") + else: + logger.warning( + f"Assistant {assistant_id} not found in DB, using provided config" + ) + else: + logger.info("No assistant_id provided, using provided config directly") + + # Ensure config has required fields with defaults + if "llm" not in config: + config["llm"] = {"model": "gpt-4o", "temperature": 0.2} + if "instructions" not in config: + config["instructions"] = "You are a helpful assistant" + if "vector_store_ids" not in config: + config["vector_store_ids"] = [] # Create EvaluationRun record eval_run = EvaluationRun( @@ -185,110 +240,10 @@ async def evaluate_threads( return eval_run -@router.post("/evaluate/batch", response_model=EvaluationRunPublic) -async def start_batch_evaluation( - eval_run_data: EvaluationRunCreate, - _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), -) -> EvaluationRunPublic: - """ - Start a batch evaluation using OpenAI Batch API. - - This endpoint: - 1. 
Creates an EvaluationRun record in the database - 2. Fetches dataset items from Langfuse - 3. Builds JSONL for OpenAI Batch API - 4. Uploads to OpenAI and creates batch job - 5. Returns the evaluation run details with batch_id - - The batch will be processed asynchronously. Use: - - GET /evaluate/batch/{run_id}/status to check status - - POST /evaluate/batch/poll to manually trigger polling - - Args: - eval_run_data: EvaluationRunCreate with run_name, dataset_name, and config - - Returns: - EvaluationRunPublic with batch details - """ - logger.info( - f"Starting batch evaluation: run_name={eval_run_data.run_name}, " - f"dataset={eval_run_data.dataset_name}, " - f"org_id={_current_user.organization_id}" - ) - - # Get credentials - openai_credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="openai", - ) - langfuse_credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - provider="langfuse", - ) - - if not openai_credentials or not langfuse_credentials: - raise ValueError("OpenAI or Langfuse credentials not configured") - - # Configure clients - openai_client, openai_success = configure_openai(openai_credentials) - langfuse, langfuse_success = configure_langfuse(langfuse_credentials) - - if not openai_success or not langfuse_success: - raise ValueError("Failed to configure API clients") - - # Create EvaluationRun record - eval_run = EvaluationRun( - run_name=eval_run_data.run_name, - dataset_name=eval_run_data.dataset_name, - config=eval_run_data.config, - status="pending", - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, - inserted_at=now(), - updated_at=now(), - ) - - _session.add(eval_run) - _session.commit() - _session.refresh(eval_run) - - logger.info(f"Created EvaluationRun record: id={eval_run.id}") - - # Start the batch evaluation - try: - eval_run = start_evaluation_batch( - langfuse=langfuse, - 
openai_client=openai_client, - session=_session, - eval_run=eval_run, - config=eval_run_data.config, - ) - - logger.info( - f"Batch evaluation started successfully: " - f"batch_id={eval_run.batch_id}, total_items={eval_run.total_items}" - ) - - return eval_run - - except Exception as e: - logger.error( - f"Failed to start batch evaluation for run {eval_run.id}: {e}", - exc_info=True, - ) - # The error is already handled in start_evaluation_batch - # Just refresh and return the failed run - _session.refresh(eval_run) - return eval_run - - @router.post("/evaluate/batch/poll") async def poll_evaluation_batches( _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> dict: """ Manually trigger polling for all pending evaluations in the current organization. @@ -324,7 +279,7 @@ async def poll_evaluation_batches( async def get_evaluation_run_status( run_id: int, _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> EvaluationRunPublic: """ Get the current status of a specific evaluation run. 
@@ -365,7 +320,7 @@ async def get_evaluation_run_status( @router.get("/evaluate/batch/list", response_model=list[EvaluationRunPublic]) async def list_evaluation_runs( _session: Session = Depends(get_db), - _current_user: UserOrganization = Depends(get_current_user_org), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), limit: int = 50, offset: int = 0, ) -> list[EvaluationRunPublic]: diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index 3c40a2385..1cb3d560e 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -110,8 +110,14 @@ def build_batch_jsonl( } # Add vector store IDs if available (for file search) - if vector_store_ids: - batch_request["body"]["tools"] = [{"type": "file_search"}] + # Only add tools if vector_store_ids is a non-empty list + if vector_store_ids and len(vector_store_ids) > 0: + batch_request["body"]["tools"] = [ + { + "type": "file_search", + "vector_store_ids": vector_store_ids, + } + ] batch_request["body"]["tool_choice"] = "auto" batch_file.append(json.dumps(batch_request)) @@ -333,7 +339,8 @@ def poll_batch_status(client: OpenAI, batch_id: str) -> dict[str, Any]: logger.info( f"Batch {batch_id} status: {batch.status} " - f"({batch.request_counts.completed}/{batch.request_counts.total} completed)" + f"({batch.request_counts.completed}/{batch.request_counts.total} completed), " + f"output_file_id={batch.output_file_id}, error_file_id={batch.error_file_id}" ) return batch_status diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index f330f34ad..9f02c43d9 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -86,18 +86,28 @@ async def process_completed_batch( if not results: raise ValueError("No valid results found in batch output") - # Step 4: Upload results to S3 - logger.info("Step 4: Uploading results to S3") - s3_url = 
upload_results_to_s3( - jsonl_content=jsonl_content, - eval_run=eval_run, - project_id=eval_run.project_id, - ) + # Step 4: Upload results to S3 (optional - skip if AWS credentials not configured) + s3_url = None + try: + logger.info("Step 4: Uploading results to S3") + s3_url = upload_results_to_s3( + jsonl_content=jsonl_content, + eval_run=eval_run, + project_id=eval_run.project_id, + ) + logger.info(f"Successfully uploaded to S3: {s3_url}") + except Exception as s3_error: + # S3 upload is optional - log warning but continue processing + logger.warning( + f"S3 upload failed (AWS credentials may not be configured): {s3_error}. " + f"Continuing without S3 storage. Results will be available in Langfuse.", + exc_info=True, + ) - # Step 5: Update DB with output file ID and S3 URL - logger.info("Step 5: Updating database with S3 URL") + # Step 5: Update DB with output file ID and S3 URL (if available) + logger.info("Step 5: Updating database with output file ID and S3 URL") eval_run.batch_output_file_id = output_file_id - eval_run.s3_url = s3_url + eval_run.s3_url = s3_url # Will be None if S3 upload failed eval_run.updated_at = now() session.add(eval_run) session.commit() @@ -120,14 +130,18 @@ async def process_completed_batch( session.commit() session.refresh(eval_run) + s3_info = ( + f"S3 URL: {s3_url}" if s3_url else "S3 upload skipped (AWS not configured)" + ) logger.info( f"Successfully completed processing for evaluation run {eval_run.id}: " - f"{len(results)} items processed, S3 URL: {s3_url}" + f"{len(results)} items processed, {s3_info}" ) return eval_run except Exception as e: + # This catches any errors from steps 1-3 or 5-7 (but NOT S3 upload which is caught above) logger.error( f"Failed to process completed batch for run {eval_run.id}: {e}", exc_info=True, @@ -138,7 +152,8 @@ async def process_completed_batch( eval_run.updated_at = now() session.add(eval_run) session.commit() - raise + session.refresh(eval_run) + return eval_run async def 
check_and_process_batch( @@ -198,7 +213,23 @@ async def check_and_process_batch( # Handle different batch statuses if new_batch_status == "completed": if not output_file_id: - raise ValueError("Batch completed but no output_file_id found") + # Sometimes OpenAI returns None for output_file_id even when batch is completed + # This is a timing issue. Skip processing for now and let the next poll cycle handle it. + logger.warning( + f"Batch {eval_run.batch_id} is completed but output_file_id is None. " + f"This is likely a timing issue. Skipping for now - will retry in next poll cycle. " + f"Request counts: {batch_status_info.get('request_counts')}" + ) + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": eval_run.status, + "batch_status": new_batch_status, + "action": "no_change", + "note": "Batch completed but output_file_id not yet available, will retry next poll", + } logger.info(f"Batch {eval_run.batch_id} completed, processing results...") From 2200c27a04146ff6eb918b0b8d02291c8f20c280 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 14 Oct 2025 23:37:10 +0530 Subject: [PATCH 19/64] cleaning up traces in langfuse --- backend/app/crud/evaluation_batch.py | 80 ++++- .../app/tests/crud/test_evaluation_batch.py | 276 ++++++++++++++++++ 2 files changed, 354 insertions(+), 2 deletions(-) create mode 100644 backend/app/tests/crud/test_evaluation_batch.py diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index 1cb3d560e..c5033faf2 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -381,6 +381,44 @@ def download_batch_output(client: OpenAI, output_file_id: str) -> str: raise +def extract_output_text(output: list[dict[str, Any]]) -> str: + """ + Extract clean text from Response API output array. + + This mimics the logic from OpenAI SDK's Response.output_text property. 
+ The output array contains items with different types (message, file_search_call, etc.). + We extract text from message items that contain output_text content blocks. + + Args: + output: The output array from the Response API + Format: [ + {"type": "file_search_call", ...}, + {"type": "message", "content": [{"type": "output_text", "text": "..."}]} + ] + + Returns: + Extracted text string, or empty string if no text found + """ + texts = [] + + for output_item in output: + # Look for message type items (similar to SDK logic) + if isinstance(output_item, dict) and output_item.get("type") == "message": + content = output_item.get("content", []) + + if isinstance(content, list): + for content_item in content: + if ( + isinstance(content_item, dict) + and content_item.get("type") == "output_text" + ): + text = content_item.get("text", "") + if text: + texts.append(text) + + return "".join(texts) + + def parse_batch_output( jsonl_content: str, dataset_items: list[dict[str, Any]] ) -> list[dict[str, Any]]: @@ -437,8 +475,46 @@ def parse_batch_output( generated_output = f"ERROR: {error_msg}" else: # Extract output text from response - # Response API returns: {"output": "the answer text"} - generated_output = response_body.get("output", "") + # Response API can return simple string or complex array structure + output = response_body.get("output", []) + + # If output is a string, check if it's a stringified list/dict + if isinstance(output, str): + # Try to parse it as JSON first (in case it's a JSON string) + try: + # Try JSON parsing (for properly escaped strings) + parsed_output = json.loads(output) + if isinstance(parsed_output, list): + generated_output = extract_output_text(parsed_output) + else: + generated_output = output + except (json.JSONDecodeError, ValueError): + # If JSON parsing fails, try literal_eval for Python string representation + try: + import ast + + parsed_output = ast.literal_eval(output) + if isinstance(parsed_output, list): + generated_output = 
extract_output_text(parsed_output) + else: + generated_output = output + except (ValueError, SyntaxError): + # If both fail, use the string as-is + generated_output = output + # If output is a list (complex structure), extract text from message items + elif isinstance(output, list): + generated_output = extract_output_text(output) + else: + generated_output = "" + logger.warning( + f"Item {item_id}: Unexpected output type: {type(output)}" + ) + + # Log the extracted output for debugging + logger.debug( + f"Item {item_id}: Extracted clean text output " + f"(length={len(generated_output)}, preview={generated_output[:100]}...)" + ) # Extract question and ground truth from dataset item question = dataset_item["input"].get("question", "") diff --git a/backend/app/tests/crud/test_evaluation_batch.py b/backend/app/tests/crud/test_evaluation_batch.py new file mode 100644 index 000000000..239c83fd0 --- /dev/null +++ b/backend/app/tests/crud/test_evaluation_batch.py @@ -0,0 +1,276 @@ +"""Tests for evaluation batch output parsing.""" + +import json +from app.crud.evaluation_batch import extract_output_text, parse_batch_output + + +def test_extract_output_text_complex_structure(): + """Test extracting text from complex Response API output structure.""" + # Complex structure with file_search_call and message + output = [ + { + "id": "fs_0bc4a7ca503259fd0068ee84e9de60819b9178fd9e40b69146", + "type": "file_search_call", + "status": "completed", + "queries": ["सीएलएफ में उपसमिति के कार्य की समीक्षा कौन करता है?"], + "results": None, + }, + { + "id": "msg_0bc4a7ca503259fd0068ee84ed5540819b98161efd65fc2834", + "type": "message", + "status": "completed", + "content": [ + { + "type": "output_text", + "annotations": [], + "logprobs": [], + "text": "मुझे मौजूदा दस्तावेज़ से सीएलएफ में उपसमिति के कार्य की समीक्षा किसके द्वारा की जाती है के बारे में जानकारी नहीं मिल पाई है।", + } + ], + "role": "assistant", + }, + ] + + result = extract_output_text(output) + assert ( + result + == "मुझे 
मौजूदा दस्तावेज़ से सीएलएफ में उपसमिति के कार्य की समीक्षा किसके द्वारा की जाती है के बारे में जानकारी नहीं मिल पाई है।" + ) + + +def test_extract_output_text_simple_message(): + """Test extracting text from simple message structure.""" + output = [ + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "This is a simple answer.", + } + ], + } + ] + + result = extract_output_text(output) + assert result == "This is a simple answer." + + +def test_extract_output_text_multiple_messages(): + """Test extracting and joining text from multiple message items.""" + output = [ + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "First part. ", + } + ], + }, + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "Second part.", + } + ], + }, + ] + + result = extract_output_text(output) + assert result == "First part. Second part." + + +def test_extract_output_text_empty_output(): + """Test extracting text from empty output.""" + output = [] + result = extract_output_text(output) + assert result == "" + + +def test_extract_output_text_no_message_items(): + """Test extracting text when there are no message items.""" + output = [ + { + "type": "file_search_call", + "status": "completed", + } + ] + + result = extract_output_text(output) + assert result == "" + + +def test_parse_batch_output_complex_structure(): + """Test parsing batch output with complex answer structure.""" + # Batch output JSONL with complex structure + jsonl_content = json.dumps( + { + "custom_id": "item_123", + "response": { + "status_code": 200, + "body": { + "id": "resp_abc", + "output": [ + { + "type": "file_search_call", + "status": "completed", + }, + { + "type": "message", + "content": [ + { + "type": "output_text", + "text": "This is the extracted answer.", + } + ], + }, + ], + }, + }, + } + ) + + # Dataset items + dataset_items = [ + { + "id": "item_123", + "input": {"question": "What is the answer?"}, + "expected_output": 
{"answer": "Expected answer"}, + } + ] + + results = parse_batch_output(jsonl_content, dataset_items) + + assert len(results) == 1 + assert results[0]["item_id"] == "item_123" + assert results[0]["question"] == "What is the answer?" + assert results[0]["generated_output"] == "This is the extracted answer." + assert results[0]["ground_truth"] == "Expected answer" + + +def test_parse_batch_output_simple_string(): + """Test parsing batch output with simple string output.""" + # Batch output JSONL with simple string + jsonl_content = json.dumps( + { + "custom_id": "item_456", + "response": { + "status_code": 200, + "body": { + "id": "resp_def", + "output": "Simple string answer", + }, + }, + } + ) + + # Dataset items + dataset_items = [ + { + "id": "item_456", + "input": {"question": "Simple question?"}, + "expected_output": {"answer": "Simple expected"}, + } + ] + + results = parse_batch_output(jsonl_content, dataset_items) + + assert len(results) == 1 + assert results[0]["item_id"] == "item_456" + assert results[0]["generated_output"] == "Simple string answer" + + +def test_parse_batch_output_error_handling(): + """Test parsing batch output with error response.""" + # Batch output JSONL with error + jsonl_content = json.dumps( + { + "custom_id": "item_789", + "error": { + "message": "Rate limit exceeded", + "type": "rate_limit_error", + }, + } + ) + + # Dataset items + dataset_items = [ + { + "id": "item_789", + "input": {"question": "Error question?"}, + "expected_output": {"answer": "Error expected"}, + } + ] + + results = parse_batch_output(jsonl_content, dataset_items) + + assert len(results) == 1 + assert results[0]["item_id"] == "item_789" + assert "ERROR: Rate limit exceeded" in results[0]["generated_output"] + + +def test_parse_batch_output_stringified_list(): + """Test parsing batch output with stringified Python list (single quotes).""" + # This is the exact format you showed - Python string representation of a list + stringified_output = str( + [ + { + 
"id": "fs_0a09867e650850280068ee8d506cd081959c3e4891a733e591", + "type": "file_search_call", + "status": "completed", + "queries": [ + "सीएलएफ की आरजीबी बैठक में आय और व्यय का विवरण प्रस्तुत करने के लिए कौन जिम्मेदार है?" + ], + "results": None, + }, + { + "id": "msg_0a09867e650850280068ee8d515d5881959de222d6218b4804", + "type": "message", + "status": "completed", + "content": [ + { + "type": "output_text", + "annotations": [], + "logprobs": [], + "text": "I'm sorry, I couldn't find any relevant information regarding who is responsible for presenting the income and expenditure details at the RGB meeting of CLF in the provided file. If there is more data or another file, I can check that for you.", + } + ], + "role": "assistant", + }, + ] + ) + + # Batch output JSONL with stringified list + jsonl_content = json.dumps( + { + "custom_id": "item_stringified", + "response": { + "status_code": 200, + "body": { + "id": "resp_str", + "output": stringified_output, + }, + }, + } + ) + + # Dataset items + dataset_items = [ + { + "id": "item_stringified", + "input": {"question": "Stringified question?"}, + "expected_output": {"answer": "Stringified expected"}, + } + ] + + results = parse_batch_output(jsonl_content, dataset_items) + + assert len(results) == 1 + assert results[0]["item_id"] == "item_stringified" + assert ( + results[0]["generated_output"] + == "I'm sorry, I couldn't find any relevant information regarding who is responsible for presenting the income and expenditure details at the RGB meeting of CLF in the provided file. If there is more data or another file, I can check that for you." 
+ ) From ae3c77938804cc3b2e81129ce40c2c6de91fa631 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 14 Oct 2025 23:49:31 +0530 Subject: [PATCH 20/64] cleanup unnecessary code --- backend/app/crud/evaluation_batch.py | 93 +++++----------- .../app/tests/crud/test_evaluation_batch.py | 101 +----------------- 2 files changed, 26 insertions(+), 168 deletions(-) diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index c5033faf2..c30d9c46d 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -8,6 +8,7 @@ 4. Polling batch status and downloading results """ +import ast import json import logging from typing import Any @@ -381,44 +382,6 @@ def download_batch_output(client: OpenAI, output_file_id: str) -> str: raise -def extract_output_text(output: list[dict[str, Any]]) -> str: - """ - Extract clean text from Response API output array. - - This mimics the logic from OpenAI SDK's Response.output_text property. - The output array contains items with different types (message, file_search_call, etc.). - We extract text from message items that contain output_text content blocks. 
- - Args: - output: The output array from the Response API - Format: [ - {"type": "file_search_call", ...}, - {"type": "message", "content": [{"type": "output_text", "text": "..."}]} - ] - - Returns: - Extracted text string, or empty string if no text found - """ - texts = [] - - for output_item in output: - # Look for message type items (similar to SDK logic) - if isinstance(output_item, dict) and output_item.get("type") == "message": - content = output_item.get("content", []) - - if isinstance(content, list): - for content_item in content: - if ( - isinstance(content_item, dict) - and content_item.get("type") == "output_text" - ): - text = content_item.get("text", "") - if text: - texts.append(text) - - return "".join(texts) - - def parse_batch_output( jsonl_content: str, dataset_items: list[dict[str, Any]] ) -> list[dict[str, Any]]: @@ -474,48 +437,42 @@ def parse_batch_output( logger.error(f"Item {item_id} had error: {error_msg}") generated_output = f"ERROR: {error_msg}" else: - # Extract output text from response - # Response API can return simple string or complex array structure - output = response_body.get("output", []) + # Extract text from output (can be string, list, or complex structure) + output = response_body.get("output", "") - # If output is a string, check if it's a stringified list/dict + # If string, try to parse it (may be JSON or Python repr of list) if isinstance(output, str): - # Try to parse it as JSON first (in case it's a JSON string) try: - # Try JSON parsing (for properly escaped strings) - parsed_output = json.loads(output) - if isinstance(parsed_output, list): - generated_output = extract_output_text(parsed_output) - else: - generated_output = output + output = json.loads(output) except (json.JSONDecodeError, ValueError): - # If JSON parsing fails, try literal_eval for Python string representation try: - import ast - - parsed_output = ast.literal_eval(output) - if isinstance(parsed_output, list): - generated_output = 
extract_output_text(parsed_output) - else: - generated_output = output + output = ast.literal_eval(output) except (ValueError, SyntaxError): - # If both fail, use the string as-is + # Keep as string if parsing fails generated_output = output - # If output is a list (complex structure), extract text from message items - elif isinstance(output, list): - generated_output = extract_output_text(output) - else: + output = None + + # If we have a list structure, extract text from message items + if isinstance(output, list): + generated_output = "" + for item in output: + if isinstance(item, dict) and item.get("type") == "message": + for content in item.get("content", []): + if ( + isinstance(content, dict) + and content.get("type") == "output_text" + ): + generated_output = content.get("text", "") + break + if generated_output: + break + elif output is not None: + # output was not a string and not a list generated_output = "" logger.warning( f"Item {item_id}: Unexpected output type: {type(output)}" ) - # Log the extracted output for debugging - logger.debug( - f"Item {item_id}: Extracted clean text output " - f"(length={len(generated_output)}, preview={generated_output[:100]}...)" - ) - # Extract question and ground truth from dataset item question = dataset_item["input"].get("question", "") ground_truth = dataset_item["expected_output"].get("answer", "") diff --git a/backend/app/tests/crud/test_evaluation_batch.py b/backend/app/tests/crud/test_evaluation_batch.py index 239c83fd0..6431c9ead 100644 --- a/backend/app/tests/crud/test_evaluation_batch.py +++ b/backend/app/tests/crud/test_evaluation_batch.py @@ -1,106 +1,7 @@ """Tests for evaluation batch output parsing.""" import json -from app.crud.evaluation_batch import extract_output_text, parse_batch_output - - -def test_extract_output_text_complex_structure(): - """Test extracting text from complex Response API output structure.""" - # Complex structure with file_search_call and message - output = [ - { - "id": 
"fs_0bc4a7ca503259fd0068ee84e9de60819b9178fd9e40b69146", - "type": "file_search_call", - "status": "completed", - "queries": ["सीएलएफ में उपसमिति के कार्य की समीक्षा कौन करता है?"], - "results": None, - }, - { - "id": "msg_0bc4a7ca503259fd0068ee84ed5540819b98161efd65fc2834", - "type": "message", - "status": "completed", - "content": [ - { - "type": "output_text", - "annotations": [], - "logprobs": [], - "text": "मुझे मौजूदा दस्तावेज़ से सीएलएफ में उपसमिति के कार्य की समीक्षा किसके द्वारा की जाती है के बारे में जानकारी नहीं मिल पाई है।", - } - ], - "role": "assistant", - }, - ] - - result = extract_output_text(output) - assert ( - result - == "मुझे मौजूदा दस्तावेज़ से सीएलएफ में उपसमिति के कार्य की समीक्षा किसके द्वारा की जाती है के बारे में जानकारी नहीं मिल पाई है।" - ) - - -def test_extract_output_text_simple_message(): - """Test extracting text from simple message structure.""" - output = [ - { - "type": "message", - "content": [ - { - "type": "output_text", - "text": "This is a simple answer.", - } - ], - } - ] - - result = extract_output_text(output) - assert result == "This is a simple answer." - - -def test_extract_output_text_multiple_messages(): - """Test extracting and joining text from multiple message items.""" - output = [ - { - "type": "message", - "content": [ - { - "type": "output_text", - "text": "First part. ", - } - ], - }, - { - "type": "message", - "content": [ - { - "type": "output_text", - "text": "Second part.", - } - ], - }, - ] - - result = extract_output_text(output) - assert result == "First part. Second part." 
- - -def test_extract_output_text_empty_output(): - """Test extracting text from empty output.""" - output = [] - result = extract_output_text(output) - assert result == "" - - -def test_extract_output_text_no_message_items(): - """Test extracting text when there are no message items.""" - output = [ - { - "type": "file_search_call", - "status": "completed", - } - ] - - result = extract_output_text(output) - assert result == "" +from app.crud.evaluation_batch import parse_batch_output def test_parse_batch_output_complex_structure(): From 34082d634786ec9ddf350e046c3bc79d5835ab94 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 16 Oct 2025 14:05:21 +0530 Subject: [PATCH 21/64] syncing with master changes --- backend/app/api/routes/evaluation.py | 2 + backend/app/crud/evaluation.py | 5 +- backend/app/crud/evaluation_processing.py | 183 ++++++++++++++-------- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 41fe31a01..44f0fb4dd 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -128,11 +128,13 @@ async def evaluate_threads( openai_credentials = get_provider_credential( session=_session, org_id=_current_user.organization_id, + project_id=_current_user.project_id, provider="openai", ) langfuse_credentials = get_provider_credential( session=_session, org_id=_current_user.organization_id, + project_id=_current_user.project_id, provider="langfuse", ) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index 63122db4e..49f3d1e5f 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -7,7 +7,7 @@ from app.core.util import configure_langfuse, configure_openai from app.crud.credentials import get_provider_credential -from app.models import UserOrganization +from app.models import UserProjectOrg from app.models.evaluation import DatasetUploadResponse logger = 
logging.getLogger(__name__) @@ -18,7 +18,7 @@ async def upload_dataset_to_langfuse( dataset_name: str, duplication_factor: int, _session: Session, - _current_user: UserOrganization, + _current_user: UserProjectOrg, ) -> tuple[bool, DatasetUploadResponse | None, str | None]: """ Upload a CSV dataset to Langfuse with duplication for flakiness testing. @@ -38,6 +38,7 @@ async def upload_dataset_to_langfuse( langfuse_credentials = get_provider_credential( session=_session, org_id=_current_user.organization_id, + project_id=_current_user.project_id, provider="langfuse", ) if not langfuse_credentials: diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index 9f02c43d9..b5633fd13 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -354,92 +354,137 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st logger.info(f"Found {len(pending_runs)} pending evaluations for org_id={org_id}") - # Get credentials - openai_credentials = get_provider_credential( - session=session, org_id=org_id, provider="openai" - ) - langfuse_credentials = get_provider_credential( - session=session, org_id=org_id, provider="langfuse" - ) + # Group evaluations by project_id since credentials are per project + from collections import defaultdict - if not openai_credentials or not langfuse_credentials: - logger.error( - f"Missing credentials for org_id={org_id}: " - f"openai={bool(openai_credentials)}, langfuse={bool(langfuse_credentials)}" - ) - return { - "total": len(pending_runs), - "processed": 0, - "failed": 0, - "still_processing": len(pending_runs), - "details": [], - "error": "Missing OpenAI or Langfuse credentials", - } - - # Configure clients - openai_client, openai_success = configure_openai(openai_credentials) - langfuse, langfuse_success = configure_langfuse(langfuse_credentials) + evaluations_by_project = defaultdict(list) + for run in pending_runs: + 
evaluations_by_project[run.project_id].append(run) - if not openai_success or not langfuse_success: - logger.error(f"Failed to configure clients for org_id={org_id}") - return { - "total": len(pending_runs), - "processed": 0, - "failed": 0, - "still_processing": len(pending_runs), - "details": [], - "error": "Failed to configure API clients", - } + # Process each project separately + all_results = [] + total_processed_count = 0 + total_failed_count = 0 + total_still_processing_count = 0 - # Process each evaluation - results = [] - processed_count = 0 - failed_count = 0 - still_processing_count = 0 + for project_id, project_runs in evaluations_by_project.items(): + logger.info( + f"Processing {len(project_runs)} evaluations for project_id={project_id}" + ) - for eval_run in pending_runs: try: - result = await check_and_process_batch( - eval_run=eval_run, + # Get credentials for this project + openai_credentials = get_provider_credential( session=session, - openai_client=openai_client, - langfuse=langfuse, + org_id=org_id, + project_id=project_id, + provider="openai", + ) + langfuse_credentials = get_provider_credential( + session=session, + org_id=org_id, + project_id=project_id, + provider="langfuse", ) - results.append(result) - if result["action"] == "processed": - processed_count += 1 - elif result["action"] == "failed": - failed_count += 1 - else: - still_processing_count += 1 + if not openai_credentials or not langfuse_credentials: + logger.error( + f"Missing credentials for org_id={org_id}, project_id={project_id}: " + f"openai={bool(openai_credentials)}, langfuse={bool(langfuse_credentials)}" + ) + # Mark all runs in this project as failed due to missing credentials + for eval_run in project_runs: + all_results.append( + { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "action": "failed", + "error": "Missing OpenAI or Langfuse credentials", + } + ) + total_failed_count += 1 + continue + + # Configure clients + openai_client, openai_success = 
configure_openai(openai_credentials) + langfuse, langfuse_success = configure_langfuse(langfuse_credentials) + + if not openai_success or not langfuse_success: + logger.error( + f"Failed to configure clients for org_id={org_id}, project_id={project_id}" + ) + # Mark all runs in this project as failed due to client configuration + for eval_run in project_runs: + all_results.append( + { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "action": "failed", + "error": "Failed to configure API clients", + } + ) + total_failed_count += 1 + continue + + # Process each evaluation in this project + for eval_run in project_runs: + try: + result = await check_and_process_batch( + eval_run=eval_run, + session=session, + openai_client=openai_client, + langfuse=langfuse, + ) + all_results.append(result) + + if result["action"] == "processed": + total_processed_count += 1 + elif result["action"] == "failed": + total_failed_count += 1 + else: + total_still_processing_count += 1 + + except Exception as e: + logger.error( + f"Failed to check evaluation run {eval_run.id}: {e}", + exc_info=True, + ) + all_results.append( + { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "action": "failed", + "error": str(e), + } + ) + total_failed_count += 1 except Exception as e: - logger.error( - f"Failed to check evaluation run {eval_run.id}: {e}", exc_info=True - ) - results.append( - { - "run_id": eval_run.id, - "run_name": eval_run.run_name, - "action": "failed", - "error": str(e), - } - ) - failed_count += 1 + logger.error(f"Failed to process project {project_id}: {e}", exc_info=True) + # Mark all runs in this project as failed + for eval_run in project_runs: + all_results.append( + { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "action": "failed", + "error": f"Project processing failed: {str(e)}", + } + ) + total_failed_count += 1 summary = { "total": len(pending_runs), - "processed": processed_count, - "failed": failed_count, - "still_processing": 
still_processing_count, - "details": results, + "processed": total_processed_count, + "failed": total_failed_count, + "still_processing": total_still_processing_count, + "details": all_results, } logger.info( f"Polling summary for org_id={org_id}: " - f"{processed_count} processed, {failed_count} failed, " - f"{still_processing_count} still processing" + f"{total_processed_count} processed, {total_failed_count} failed, " + f"{total_still_processing_count} still processing" ) return summary From 2a915faa46b7d56bc1426456a97d96aadaf82725 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 21 Oct 2025 15:22:10 +0530 Subject: [PATCH 22/64] moving to batch table --- .../alembic/versions/add_batch_job_table.py | 152 ++++++ backend/app/api/routes/evaluation.py | 20 +- backend/app/celery/celery_app.py | 16 +- .../app/celery/tasks/evaluation_score_sync.py | 118 +++++ backend/app/core/batch/__init__.py | 5 + backend/app/core/batch/openai_provider.py | 267 ++++++++++ backend/app/core/batch/provider_interface.py | 121 +++++ backend/app/crud/batch_job.py | 208 ++++++++ backend/app/crud/batch_operations.py | 331 +++++++++++++ backend/app/crud/evaluation_batch.py | 455 +++--------------- backend/app/crud/evaluation_processing.py | 356 ++++++++------ backend/app/models/__init__.py | 8 +- backend/app/models/batch_job.py | 122 +++++ backend/app/models/evaluation.py | 82 ++-- backend/app/models/organization.py | 5 + backend/app/models/project.py | 3 + 16 files changed, 1673 insertions(+), 596 deletions(-) create mode 100644 backend/app/alembic/versions/add_batch_job_table.py create mode 100644 backend/app/celery/tasks/evaluation_score_sync.py create mode 100644 backend/app/core/batch/__init__.py create mode 100644 backend/app/core/batch/openai_provider.py create mode 100644 backend/app/core/batch/provider_interface.py create mode 100644 backend/app/crud/batch_job.py create mode 100644 backend/app/crud/batch_operations.py create mode 100644 backend/app/models/batch_job.py diff --git 
"""Add batch_job table and refactor evaluation_run

Revision ID: add_batch_job
Revises: 93d484f5798e, d5747495bd7c, 27c271ab6dd0
Create Date: 2025-10-21 00:00:00.000000

"""
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "add_batch_job"
# Merge revision: three parent heads are joined by this migration.
down_revision = ("93d484f5798e", "d5747495bd7c", "27c271ab6dd0")
branch_labels = None
depends_on = None


def upgrade():
    """Create batch_job, link evaluation_run to it, drop old batch columns."""
    op.create_table(
        "batch_job",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column(
            "provider",
            sa.String(),
            nullable=False,
            comment="LLM provider name (e.g., 'openai', 'anthropic')",
        ),
        sa.Column(
            "job_type",
            sa.String(),
            nullable=False,
            comment="Type of batch job (e.g., 'evaluation', 'classification', 'embedding')",
        ),
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
            comment="Complete batch configuration",
        ),
        sa.Column(
            "provider_batch_id",
            sa.String(),
            nullable=True,
            comment="Provider's batch job ID",
        ),
        sa.Column(
            "provider_file_id",
            sa.String(),
            nullable=True,
            comment="Provider's input file ID",
        ),
        sa.Column(
            "provider_output_file_id",
            sa.String(),
            nullable=True,
            comment="Provider's output file ID",
        ),
        sa.Column(
            "provider_status",
            sa.String(),
            nullable=True,
            comment="Provider-specific status (e.g., OpenAI: validating, in_progress, completed, failed)",
        ),
        sa.Column(
            "raw_output_url",
            sa.String(),
            nullable=True,
            comment="S3 URL of raw batch output file",
        ),
        sa.Column(
            "total_items",
            sa.Integer(),
            nullable=False,
            # server_default (not the Python-side `default=`, which
            # op.create_table does not emit as DDL) so rows inserted outside
            # the ORM also get a value.
            server_default=sa.text("0"),
            comment="Total number of items in the batch",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error message if batch failed",
        ),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_batch_job_job_type"), "batch_job", ["job_type"], unique=False
    )
    op.create_index(
        op.f("ix_batch_job_organization_id"),
        "batch_job",
        ["organization_id"],
        unique=False,
    )
    op.create_index(
        op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False
    )

    # Link evaluation_run to its batch job.
    op.add_column(
        "evaluation_run", sa.Column("batch_job_id", sa.Integer(), nullable=True)
    )
    op.create_foreign_key(
        "fk_evaluation_run_batch_job_id",
        "evaluation_run",
        "batch_job",
        ["batch_job_id"],
        ["id"],
    )

    # Batch bookkeeping now lives on batch_job, so drop the old columns.
    op.drop_column("evaluation_run", "batch_status")
    op.drop_column("evaluation_run", "batch_id")
    op.drop_column("evaluation_run", "batch_file_id")
    op.drop_column("evaluation_run", "batch_output_file_id")


def downgrade():
    """Restore the old evaluation_run batch columns and drop batch_job."""
    op.add_column(
        "evaluation_run",
        sa.Column("batch_output_file_id", sa.String(), nullable=True),
    )
    op.add_column(
        "evaluation_run", sa.Column("batch_file_id", sa.String(), nullable=True)
    )
    op.add_column("evaluation_run", sa.Column("batch_id", sa.String(), nullable=True))
    op.add_column(
        "evaluation_run", sa.Column("batch_status", sa.String(), nullable=True)
    )

    # Drop the FK before the column that carries it.
    op.drop_constraint(
        "fk_evaluation_run_batch_job_id", "evaluation_run", type_="foreignkey"
    )
    op.drop_column("evaluation_run", "batch_job_id")

    op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job")
    op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job")
    op.drop_index(op.f("ix_batch_job_job_type"), table_name="batch_job")
    op.drop_table("batch_job")
Use GET /evaluate/batch/{run_id}/status to check progress. @@ -181,7 +181,7 @@ async def evaluate_threads( db_config[key] = config[key] config = db_config - logger.info(f"Using merged config from DB and provided values") + logger.info("Using merged config from DB and provided values") else: logger.warning( f"Assistant {assistant_id} not found in DB, using provided config" @@ -227,7 +227,7 @@ async def evaluate_threads( logger.info( f"Evaluation started successfully: " - f"batch_id={eval_run.batch_id}, total_items={eval_run.total_items}" + f"batch_job_id={eval_run.batch_job_id}, total_items={eval_run.total_items}" ) return eval_run @@ -290,7 +290,7 @@ async def get_evaluation_run_status( run_id: ID of the evaluation run Returns: - EvaluationRunPublic with current status, batch_status, and results if completed + EvaluationRunPublic with current status and results if completed """ logger.info( f"Fetching status for evaluation run {run_id} " @@ -313,7 +313,7 @@ async def get_evaluation_run_status( logger.info( f"Found evaluation run {run_id}: status={eval_run.status}, " - f"batch_status={eval_run.batch_status}" + f"batch_job_id={eval_run.batch_job_id}" ) return eval_run diff --git a/backend/app/celery/celery_app.py b/backend/app/celery/celery_app.py index 14d0dab12..e223f804c 100644 --- a/backend/app/celery/celery_app.py +++ b/backend/app/celery/celery_app.py @@ -1,5 +1,6 @@ from celery import Celery -from kombu import Queue, Exchange +from kombu import Exchange, Queue + from app.core.config import settings # Create Celery instance @@ -9,7 +10,7 @@ backend=settings.REDIS_URL, include=[ "app.celery.tasks.job_execution", - "app.celery.tasks.evaluation_polling", + "app.celery.tasks.evaluation_score_sync", ], ) @@ -85,13 +86,16 @@ # Connection settings from environment broker_connection_retry_on_startup=True, broker_pool_limit=settings.CELERY_BROKER_POOL_LIMIT, - # Beat configuration (for future cron jobs) + # Beat configuration beat_schedule={ - # Poll evaluation batches 
"""
Celery tasks for evaluation-specific processing.

This module contains periodic tasks that process completed evaluation batches,
parse results, create Langfuse traces, and calculate scores.
"""

import asyncio
import logging

from celery import shared_task
from sqlmodel import Session, select

from app.core.db import get_engine
from app.crud.evaluation_processing import poll_all_pending_evaluations
from app.models import Organization

logger = logging.getLogger(__name__)


@shared_task(name="process_evaluation_batches", bind=True)
def process_evaluation_batches_task(self):
    """
    Periodic task that finalizes completed evaluation batches.

    For every organization it polls pending evaluations, processes finished
    batches (parsing results and creating Langfuse traces) and updates the
    evaluation_run records. Scheduled every 60 seconds via the Celery Beat
    configuration in celery_app.py.

    Returns:
        Summary dict with per-organization results and aggregate counters.

    Note: Generic batch_job status polling happens elsewhere; this task is
    only concerned with evaluation-specific result processing.
    """
    logger.info("[process_evaluation_batches] Starting evaluation processing")

    try:
        with Session(get_engine()) as session:
            orgs = session.exec(select(Organization)).all()

            # Nothing to do — report success with zero work done.
            if not orgs:
                logger.info("[process_evaluation_batches] No organizations found")
                return {
                    "status": "success",
                    "organizations_processed": 0,
                    "message": "No organizations to process",
                }

            logger.info(
                f"[process_evaluation_batches] Found {len(orgs)} organizations to process"
            )

            results = []
            total_processed = total_failed = total_still_processing = 0

            # A failure in one organization must not abort the others, so
            # each org gets its own try/except and its own summary entry.
            for org in orgs:
                try:
                    logger.info(
                        f"[process_evaluation_batches] Processing org_id={org.id} ({org.name})"
                    )

                    # poll_all_pending_evaluations is async; drive it to
                    # completion synchronously inside the worker.
                    summary = asyncio.run(
                        poll_all_pending_evaluations(session=session, org_id=org.id)
                    )

                    results.append(
                        {"org_id": org.id, "org_name": org.name, "summary": summary}
                    )
                    total_processed += summary.get("processed", 0)
                    total_failed += summary.get("failed", 0)
                    total_still_processing += summary.get("still_processing", 0)
                except Exception as e:
                    logger.error(
                        f"[process_evaluation_batches] Error processing org_id={org.id}: {e}",
                        exc_info=True,
                    )
                    results.append(
                        {"org_id": org.id, "org_name": org.name, "error": str(e)}
                    )

            logger.info(
                f"[process_evaluation_batches] Completed: "
                f"{total_processed} processed, {total_failed} failed, "
                f"{total_still_processing} still processing"
            )

            return {
                "status": "success",
                "organizations_processed": len(orgs),
                "total_processed": total_processed,
                "total_failed": total_failed,
                "total_still_processing": total_still_processing,
                "results": results,
            }

    except Exception as e:
        logger.error(
            f"[process_evaluation_batches] Fatal error: {e}",
            exc_info=True,
        )
        # Back off for 5 minutes before retrying; give up after 3 attempts.
        raise self.retry(exc=e, countdown=300, max_retries=3)
"""OpenAI batch provider implementation."""

import json
import logging
from typing import Any

from openai import OpenAI

from .provider_interface import BatchProvider

logger = logging.getLogger(__name__)


class OpenAIBatchProvider(BatchProvider):
    """BatchProvider backed by the OpenAI Files + Batches APIs."""

    def __init__(self, client: OpenAI):
        """Keep a pre-configured OpenAI client for all subsequent API calls."""
        self.client = client

    def create_batch(
        self, jsonl_data: list[dict[str, Any]], config: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Serialize *jsonl_data*, upload it, and start an OpenAI batch job.

        Args:
            jsonl_data: One dict per JSONL line to submit.
            config: May contain ``endpoint`` (default "/v1/responses"),
                ``description`` and ``completion_window`` (default "24h").

        Returns:
            Dict with provider_batch_id, provider_file_id, provider_status
            and total_items.

        Raises:
            Exception: If the upload or batch creation fails.
        """
        endpoint = config.get("endpoint", "/v1/responses")
        description = config.get("description", "LLM batch job")
        completion_window = config.get("completion_window", "24h")

        logger.info(
            f"Creating OpenAI batch with {len(jsonl_data)} items for endpoint {endpoint}"
        )

        try:
            # One JSON object per line, joined into a single upload payload.
            payload = "\n".join(map(json.dumps, jsonl_data))
            file_id = self.upload_file(content=payload, purpose="batch")

            batch = self.client.batches.create(
                input_file_id=file_id,
                endpoint=endpoint,
                completion_window=completion_window,
                metadata={"description": description},
            )

            logger.info(
                f"Created OpenAI batch: {batch.id} (status={batch.status}, {len(jsonl_data)} items)"
            )

            return {
                "provider_batch_id": batch.id,
                "provider_file_id": file_id,
                "provider_status": batch.status,
                "total_items": len(jsonl_data),
            }

        except Exception as e:
            logger.error(f"Failed to create OpenAI batch: {e}")
            raise

    def get_batch_status(self, batch_id: str) -> dict[str, Any]:
        """
        Fetch the current status of an OpenAI batch job.

        Args:
            batch_id: OpenAI batch ID.

        Returns:
            Dict with provider_status, provider_output_file_id,
            error_file_id, request_counts and, for terminal failure states,
            an error_message.

        Raises:
            Exception: If the status lookup fails.
        """
        logger.info(f"Polling OpenAI batch status: {batch_id}")

        try:
            batch = self.client.batches.retrieve(batch_id)
            counts = batch.request_counts

            status_info: dict[str, Any] = {
                "provider_status": batch.status,
                "provider_output_file_id": batch.output_file_id,
                "error_file_id": batch.error_file_id,
                "request_counts": {
                    "total": counts.total,
                    "completed": counts.completed,
                    "failed": counts.failed,
                },
            }

            # Surface a human-readable error for terminal failure states.
            if batch.status in ("failed", "expired", "cancelled"):
                error_msg = f"Batch {batch.status}"
                if batch.error_file_id:
                    error_msg += f" (error_file_id: {batch.error_file_id})"
                status_info["error_message"] = error_msg

            logger.info(
                f"OpenAI batch {batch_id} status: {batch.status} "
                f"({counts.completed}/{counts.total} completed)"
            )

            return status_info

        except Exception as e:
            logger.error(f"Failed to poll OpenAI batch status for {batch_id}: {e}")
            raise

    def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]:
        """
        Download the batch output file and parse it line by line.

        Args:
            output_file_id: OpenAI output file ID.

        Returns:
            List of parsed result dicts; unparseable lines are logged and
            skipped rather than aborting the whole download.

        Raises:
            Exception: If the download itself fails.
        """
        logger.info(f"Downloading OpenAI batch results: {output_file_id}")

        try:
            raw = self.download_file(output_file_id)

            parsed: list[dict[str, Any]] = []
            for line_num, line in enumerate(raw.strip().split("\n"), 1):
                try:
                    parsed.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.error(f"Line {line_num}: Failed to parse JSON: {e}")

            logger.info(
                f"Downloaded and parsed {len(parsed)} results from OpenAI batch output"
            )

            return parsed

        except Exception as e:
            logger.error(f"Failed to download OpenAI batch results: {e}")
            raise

    def cancel_batch(self, batch_id: str) -> bool:
        """
        Cancel a running OpenAI batch job.

        Args:
            batch_id: OpenAI batch ID.

        Returns:
            True if the cancellation request succeeded.

        Raises:
            Exception: If cancellation fails.
        """
        logger.info(f"Cancelling OpenAI batch: {batch_id}")

        try:
            batch = self.client.batches.cancel(batch_id)
            logger.info(f"OpenAI batch {batch_id} cancelled (status={batch.status})")
            return True

        except Exception as e:
            logger.error(f"Failed to cancel OpenAI batch {batch_id}: {e}")
            raise

    def upload_file(self, content: str, purpose: str = "batch") -> str:
        """
        Upload *content* to OpenAI file storage.

        Args:
            content: File content (typically a JSONL string).
            purpose: OpenAI file purpose (e.g., "batch").

        Returns:
            The new OpenAI file ID.

        Raises:
            Exception: If the upload fails.
        """
        logger.info(f"Uploading file to OpenAI ({len(content)} bytes)")

        try:
            file_response = self.client.files.create(
                file=("batch_input.jsonl", content.encode("utf-8")),
                purpose=purpose,
            )
            logger.info(f"Uploaded file to OpenAI: {file_response.id}")
            return file_response.id

        except Exception as e:
            logger.error(f"Failed to upload file to OpenAI: {e}")
            raise

    def download_file(self, file_id: str) -> str:
        """
        Download a file from OpenAI file storage as UTF-8 text.

        Args:
            file_id: OpenAI file ID.

        Returns:
            The file content as a string.

        Raises:
            Exception: If the download fails.
        """
        logger.info(f"Downloading file from OpenAI: {file_id}")

        try:
            content = self.client.files.content(file_id).read().decode("utf-8")
            logger.info(
                f"Downloaded file from OpenAI: {file_id} ({len(content)} bytes)"
            )
            return content

        except Exception as e:
            logger.error(f"Failed to download file from OpenAI {file_id}: {e}")
            raise
"""Abstract interface for LLM batch providers."""

from abc import ABC, abstractmethod
from typing import Any


class BatchProvider(ABC):
    """Contract every LLM batch backend (OpenAI, Anthropic, ...) must satisfy.

    Concrete providers translate between this generic interface and the
    provider's own file-upload, batch-creation and polling APIs.
    """

    @abstractmethod
    def create_batch(
        self, jsonl_data: list[dict[str, Any]], config: dict[str, Any]
    ) -> dict[str, Any]:
        """Upload *jsonl_data* and start a batch job with the provider.

        Args:
            jsonl_data: One dict per JSONL line to submit.
            config: Provider-specific configuration (model, temperature, ...).

        Returns:
            Dict with at least ``provider_batch_id``, ``provider_file_id``,
            ``provider_status`` and ``total_items``, plus any
            provider-specific metadata.

        Raises:
            Exception: If batch creation fails.
        """

    @abstractmethod
    def get_batch_status(self, batch_id: str) -> dict[str, Any]:
        """Poll the provider for the batch job's current status.

        Args:
            batch_id: Provider's batch job ID.

        Returns:
            Dict with ``provider_status``, ``provider_output_file_id`` when
            completed, ``error_message`` when failed, and any other
            provider-specific status info.

        Raises:
            Exception: If the status check fails.
        """

    @abstractmethod
    def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]:
        """Download and parse batch results from the provider.

        Args:
            output_file_id: Provider's output file ID.

        Returns:
            List of per-item result dicts, each carrying ``custom_id``,
            ``response`` and, for failed items, ``error``.

        Raises:
            Exception: If the download or parsing fails.
        """

    @abstractmethod
    def cancel_batch(self, batch_id: str) -> bool:
        """Cancel a running batch job.

        Args:
            batch_id: Provider's batch job ID.

        Returns:
            True if cancellation succeeded or the batch was already terminal.

        Raises:
            Exception: If cancellation fails.
        """

    @abstractmethod
    def upload_file(self, content: str, purpose: str = "batch") -> str:
        """Upload *content* to the provider's file storage.

        Args:
            content: File content (typically a JSONL string).
            purpose: Purpose of the file (e.g., "batch").

        Returns:
            The provider's file ID.

        Raises:
            Exception: If the upload fails.
        """

    @abstractmethod
    def download_file(self, file_id: str) -> str:
        """Download a file from the provider's file storage.

        Args:
            file_id: Provider's file ID.

        Returns:
            File content as a string.

        Raises:
            Exception: If the download fails.
        """
"""CRUD operations for batch_job table."""

import logging

from sqlmodel import Session, select

from app.core.util import now
from app.models.batch_job import BatchJob, BatchJobCreate, BatchJobUpdate

logger = logging.getLogger(__name__)


def create_batch_job(
    session: Session,
    batch_job_create: BatchJobCreate,
) -> BatchJob:
    """
    Create a new batch job record.

    Args:
        session: Database session
        batch_job_create: BatchJobCreate schema with all required fields

    Returns:
        Created BatchJob object

    Raises:
        Exception: If creation fails (the session is rolled back first)
    """
    logger.info(
        f"Creating batch job: provider={batch_job_create.provider}, "
        f"job_type={batch_job_create.job_type}, "
        f"org_id={batch_job_create.organization_id}, "
        f"project_id={batch_job_create.project_id}"
    )

    try:
        batch_job = BatchJob.model_validate(batch_job_create)
        # Timestamps are owned by the application layer, not the DB.
        batch_job.inserted_at = now()
        batch_job.updated_at = now()

        session.add(batch_job)
        session.commit()
        session.refresh(batch_job)

        logger.info(f"Created batch job: id={batch_job.id}")

        return batch_job

    except Exception as e:
        logger.error(f"Failed to create batch job: {e}", exc_info=True)
        session.rollback()
        raise


def get_batch_job(session: Session, batch_job_id: int) -> BatchJob | None:
    """
    Get a batch job by ID.

    Args:
        session: Database session
        batch_job_id: Batch job ID

    Returns:
        BatchJob object if found, None otherwise
    """
    statement = select(BatchJob).where(BatchJob.id == batch_job_id)
    return session.exec(statement).first()


def update_batch_job(
    session: Session,
    batch_job: BatchJob,
    batch_job_update: BatchJobUpdate,
) -> BatchJob:
    """
    Update a batch job record.

    Only fields explicitly set on *batch_job_update* are written; unset
    fields are left untouched.

    Args:
        session: Database session
        batch_job: BatchJob object to update
        batch_job_update: BatchJobUpdate schema with fields to update

    Returns:
        Updated BatchJob object

    Raises:
        Exception: If the update fails (the session is rolled back first)
    """
    logger.info(f"Updating batch job: id={batch_job.id}")

    try:
        # exclude_unset so a partial update does not clobber other columns.
        update_data = batch_job_update.model_dump(exclude_unset=True)

        for key, value in update_data.items():
            setattr(batch_job, key, value)

        batch_job.updated_at = now()

        session.add(batch_job)
        session.commit()
        session.refresh(batch_job)

        logger.info(f"Updated batch job: id={batch_job.id}")

        return batch_job

    except Exception as e:
        logger.error(f"Failed to update batch job {batch_job.id}: {e}", exc_info=True)
        session.rollback()
        raise


def get_batch_jobs_by_ids(
    session: Session,
    batch_job_ids: list[int],
) -> list[BatchJob]:
    """
    Get batch jobs by their IDs.

    This is used by parent tables to get their associated batch jobs for
    polling.

    Args:
        session: Database session
        batch_job_ids: List of batch job IDs

    Returns:
        List of BatchJob objects (order not guaranteed to match the input)
    """
    if not batch_job_ids:
        return []

    statement = select(BatchJob).where(BatchJob.id.in_(batch_job_ids))
    results = session.exec(statement).all()

    logger.info(f"Found {len(results)} batch jobs for {len(batch_job_ids)} IDs")

    return list(results)


def get_batches_by_type(
    session: Session,
    job_type: str,
    organization_id: int | None = None,
    project_id: int | None = None,
    provider_status: str | None = None,
) -> list[BatchJob]:
    """
    Get batch jobs by type with optional filters.

    Args:
        session: Database session
        job_type: Job type (e.g., "evaluation", "classification")
        organization_id: Optional filter by organization ID
        project_id: Optional filter by project ID
        provider_status: Optional filter by provider status

    Returns:
        List of BatchJob objects matching all supplied filters
    """
    statement = select(BatchJob).where(BatchJob.job_type == job_type)

    # Explicit None checks: a falsy-but-valid value (e.g. id 0, empty status
    # string) must still be applied as a filter, not silently skipped.
    if organization_id is not None:
        statement = statement.where(BatchJob.organization_id == organization_id)

    if project_id is not None:
        statement = statement.where(BatchJob.project_id == project_id)

    if provider_status is not None:
        statement = statement.where(BatchJob.provider_status == provider_status)

    results = session.exec(statement).all()

    logger.info(
        f"Found {len(results)} batch jobs "
        f"(job_type={job_type}, org_id={organization_id}, "
        f"project_id={project_id}, provider_status={provider_status})"
    )

    return list(results)


def delete_batch_job(session: Session, batch_job: BatchJob) -> None:
    """
    Delete a batch job record.

    Args:
        session: Database session
        batch_job: BatchJob object to delete

    Raises:
        Exception: If deletion fails (the session is rolled back first)
    """
    logger.info(f"Deleting batch job: id={batch_job.id}")

    try:
        session.delete(batch_job)
        session.commit()

        logger.info(f"Deleted batch job: id={batch_job.id}")

    except Exception as e:
        logger.error(f"Failed to delete batch job {batch_job.id}: {e}", exc_info=True)
        session.rollback()
        raise
"""Generic batch operations orchestrator."""

import json
import logging
from io import BytesIO
from typing import Any

from sqlmodel import Session

from app.core.batch.provider_interface import BatchProvider
from app.core.cloud.storage import AmazonCloudStorageClient, SimpleStorageName
from app.crud.batch_job import (
    create_batch_job,
    update_batch_job,
)
from app.models.batch_job import BatchJob, BatchJobCreate, BatchJobUpdate

logger = logging.getLogger(__name__)


def start_batch_job(
    session: Session,
    provider: BatchProvider,
    provider_name: str,
    job_type: str,
    organization_id: int,
    project_id: int,
    jsonl_data: list[dict[str, Any]],
    config: dict[str, Any],
) -> BatchJob:
    """
    Create and start a batch job with the specified provider.

    This orchestrates the complete batch creation workflow:
    1. Create batch_job record in DB
    2. Call provider to upload data and create batch
    3. Update batch_job with provider IDs and status

    Args:
        session: Database session
        provider: BatchProvider instance (e.g., OpenAIBatchProvider)
        provider_name: Provider name (e.g., "openai", "anthropic")
        job_type: Job type (e.g., "evaluation", "classification")
        organization_id: Organization ID
        project_id: Project ID
        jsonl_data: List of dictionaries representing JSONL lines
        config: Complete batch configuration including provider-specific params

    Returns:
        BatchJob object with provider IDs populated

    Raises:
        Exception: If batch creation fails (the error message is persisted on
            the batch_job row before re-raising)
    """
    logger.info(
        f"Starting {provider_name} batch job: job_type={job_type}, "
        f"org_id={organization_id}, project_id={project_id}, "
        f"items={len(jsonl_data)}"
    )

    # Step 1: Create batch_job record first so a provider failure still
    # leaves an auditable row with the error message.
    batch_job_create = BatchJobCreate(
        provider=provider_name,
        job_type=job_type,
        organization_id=organization_id,
        project_id=project_id,
        config=config,
        total_items=len(jsonl_data),
    )

    batch_job = create_batch_job(session=session, batch_job_create=batch_job_create)

    try:
        # Step 2: Call provider to create batch
        logger.info(f"Creating batch with {provider_name} provider...")
        batch_result = provider.create_batch(jsonl_data=jsonl_data, config=config)

        # Step 3: Update batch_job with provider IDs
        batch_job_update = BatchJobUpdate(
            provider_batch_id=batch_result["provider_batch_id"],
            provider_file_id=batch_result["provider_file_id"],
            provider_status=batch_result["provider_status"],
            total_items=batch_result.get("total_items", len(jsonl_data)),
        )

        batch_job = update_batch_job(
            session=session, batch_job=batch_job, batch_job_update=batch_job_update
        )

        logger.info(
            f"Successfully started batch job: id={batch_job.id}, "
            f"provider_batch_id={batch_job.provider_batch_id}"
        )

        return batch_job

    except Exception as e:
        logger.error(f"Failed to start batch job: {e}", exc_info=True)

        # Store error in batch_job (parent table will handle status)
        batch_job_update = BatchJobUpdate(
            error_message=f"Batch creation failed: {str(e)}"
        )
        update_batch_job(
            session=session, batch_job=batch_job, batch_job_update=batch_job_update
        )

        raise


def poll_batch_status(
    session: Session, provider: BatchProvider, batch_job: BatchJob
) -> dict[str, Any]:
    """
    Poll provider for batch status and update database.

    Args:
        session: Database session
        provider: BatchProvider instance
        batch_job: BatchJob object

    Returns:
        Dictionary with status information from provider

    Raises:
        Exception: If polling fails
    """
    logger.info(
        f"Polling batch status: id={batch_job.id}, "
        f"provider_batch_id={batch_job.provider_batch_id}"
    )

    try:
        # Poll provider for status
        status_result = provider.get_batch_status(batch_job.provider_batch_id)

        # Update batch_job only when the status actually changed.
        provider_status = status_result["provider_status"]
        if provider_status != batch_job.provider_status:
            # Capture the pre-update status now: after update_batch_job the
            # row already carries the new status, so reading it later would
            # log "new -> new" instead of the real transition.
            previous_status = batch_job.provider_status

            update_data = {"provider_status": provider_status}

            # Update output file ID if available
            if status_result.get("provider_output_file_id"):
                update_data["provider_output_file_id"] = status_result[
                    "provider_output_file_id"
                ]

            # Update error message if failed
            if status_result.get("error_message"):
                update_data["error_message"] = status_result["error_message"]

            batch_job_update = BatchJobUpdate(**update_data)
            batch_job = update_batch_job(
                session=session, batch_job=batch_job, batch_job_update=batch_job_update
            )

            logger.info(
                f"Updated batch_job {batch_job.id} status: "
                f"{previous_status} -> {provider_status}"
            )

        return status_result

    except Exception as e:
        logger.error(f"Failed to poll batch status: {e}", exc_info=True)
        raise


def download_batch_results(
    provider: BatchProvider, batch_job: BatchJob
) -> list[dict[str, Any]]:
    """
    Download raw batch results from provider.

    Args:
        provider: BatchProvider instance
        batch_job: BatchJob object (must have provider_output_file_id)

    Returns:
        List of result dictionaries from provider

    Raises:
        ValueError: If output_file_id not available
        Exception: If download fails
    """
    if not batch_job.provider_output_file_id:
        raise ValueError(
            f"Batch job {batch_job.id} does not have provider_output_file_id"
        )

    logger.info(
        f"Downloading batch results: id={batch_job.id}, "
        f"output_file_id={batch_job.provider_output_file_id}"
    )

    try:
        results = provider.download_batch_results(batch_job.provider_output_file_id)

        logger.info(f"Downloaded {len(results)} results for batch job {batch_job.id}")

        return results

    except Exception as e:
        logger.error(f"Failed to download batch results: {e}", exc_info=True)
        raise


def process_completed_batch(
    session: Session,
    provider: BatchProvider,
    batch_job: BatchJob,
    upload_to_s3: bool = True,
) -> tuple[list[dict[str, Any]], str | None]:
    """
    Process a completed batch: download results and optionally upload to S3.

    S3 upload is best-effort: a failure there is logged and swallowed so the
    results are still returned to the caller.

    Args:
        session: Database session
        provider: BatchProvider instance
        batch_job: BatchJob object
        upload_to_s3: Whether to upload raw results to S3

    Returns:
        Tuple of (results, s3_url)
        - results: List of result dictionaries
        - s3_url: S3 URL if uploaded, None otherwise

    Raises:
        Exception: If downloading the results fails
    """
    logger.info(f"Processing completed batch: id={batch_job.id}")

    try:
        # Download results
        results = download_batch_results(provider=provider, batch_job=batch_job)

        # Upload to S3 if requested
        s3_url = None
        if upload_to_s3:
            try:
                s3_url = upload_batch_results_to_s3(
                    batch_job=batch_job, results=results
                )
                logger.info(f"Uploaded batch results to S3: {s3_url}")
            except Exception as s3_error:
                logger.warning(
                    f"S3 upload failed (AWS credentials may not be configured): {s3_error}. "
                    f"Continuing without S3 storage.",
                    exc_info=True,
                )

        # Persist the S3 location on the batch_job row.
        if s3_url:
            batch_job_update = BatchJobUpdate(raw_output_url=s3_url)
            update_batch_job(
                session=session, batch_job=batch_job, batch_job_update=batch_job_update
            )

        return results, s3_url

    except Exception as e:
        logger.error(f"Failed to process completed batch: {e}", exc_info=True)
        raise


def upload_batch_results_to_s3(
    batch_job: BatchJob, results: list[dict[str, Any]]
) -> str:
    """
    Upload batch results to S3 as a JSONL object.

    Args:
        batch_job: BatchJob object
        results: List of result dictionaries

    Returns:
        S3 URL of the uploaded object

    Raises:
        Exception: If upload fails
    """
    logger.info(f"Uploading batch results to S3 for batch_job {batch_job.id}")

    try:
        # Create S3 key path
        # Format: {job_type}/batch-{id}/results.jsonl
        s3_key = f"{batch_job.job_type}/batch-{batch_job.id}/results.jsonl"

        # Convert results to JSONL
        jsonl_content = "\n".join([json.dumps(result) for result in results])
        content_bytes = jsonl_content.encode("utf-8")
        file_like = BytesIO(content_bytes)

        # Upload to S3
        # NOTE(review): digging the bucket out of the boto3 client's private
        # `_client_config.__dict__` is fragile and almost certainly always
        # falls back to "kaapi-storage" — confirm the bucket should come from
        # AmazonCloudStorageClient / settings instead.
        aws_client = AmazonCloudStorageClient()
        aws_client.client.upload_fileobj(
            file_like,
            Bucket=aws_client.client._client_config.__dict__.get(
                "bucket", "kaapi-storage"
            ),
            Key=s3_key,
            ExtraArgs={"ContentType": "application/jsonl"},
        )

        # Construct S3 URL
        storage_name = SimpleStorageName(Key=s3_key)
        s3_url = str(storage_name)

        logger.info(
            f"Successfully uploaded batch results to S3: {s3_url} ({len(content_bytes)} bytes)"
        )

        return s3_url

    except Exception as e:
        logger.error(f"Failed to upload batch results to S3: {e}", exc_info=True)
        raise


# NOTE: Batch-level polling has been removed from this module.
# Polling should be done at the parent table level (e.g., evaluation_run)
# because only the parent knows when its business logic is complete.
+# +# For example: +# - poll_all_pending_evaluations() in evaluation_processing.py +# - poll_all_pending_classifications() in classification_processing.py (future) +# +# Each parent-specific polling function should: +# 1. Query parent table for status="processing" +# 2. Poll batch_job.provider_status via poll_batch_status() +# 3. Update parent table status based on business logic diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index c30d9c46d..8f6e267ab 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -1,22 +1,21 @@ """ -OpenAI Batch API integration for LLM evaluations using Responses API. +Evaluation-specific batch preparation and orchestration. This module handles: 1. Fetching dataset items from Langfuse -2. Building JSONL for OpenAI Batch API (/v1/responses endpoint) -3. Uploading and creating batch jobs -4. Polling batch status and downloading results +2. Building evaluation-specific JSONL for batch processing +3. Starting evaluation batches using generic batch infrastructure """ -import ast -import json import logging from typing import Any from langfuse import Langfuse from openai import OpenAI -from sqlmodel import Session, select +from sqlmodel import Session +from app.core.batch.openai_provider import OpenAIBatchProvider +from app.crud.batch_operations import start_batch_job from app.models import EvaluationRun logger = logging.getLogger(__name__) @@ -62,14 +61,14 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, return items -def build_batch_jsonl( +def build_evaluation_jsonl( dataset_items: list[dict[str, Any]], config: dict[str, Any] -) -> list[str]: +) -> list[dict[str, Any]]: """ - Build JSONL lines for OpenAI Batch API using Responses API. + Build JSONL data for evaluation batch using OpenAI Responses API. 
- Each line is a JSON object with: - - custom_id: Unique identifier for the request + Each line is a dict with: + - custom_id: Unique identifier for the request (dataset item ID) - method: POST - url: /v1/responses - body: Response request with model, instructions, and input @@ -79,7 +78,7 @@ def build_batch_jsonl( config: Evaluation configuration dict with llm, instructions, vector_store_ids Returns: - List of JSONL strings (one per dataset item) + List of dictionaries (JSONL data) """ # Extract config values llm_config = config.get("llm", {}) @@ -89,7 +88,7 @@ def build_batch_jsonl( logger.info(f"Building JSONL for {len(dataset_items)} items with model {model}") - batch_file = [] + jsonl_data = [] for item in dataset_items: # Extract question from input @@ -111,7 +110,6 @@ def build_batch_jsonl( } # Add vector store IDs if available (for file search) - # Only add tools if vector_store_ids is a non-empty list if vector_store_ids and len(vector_store_ids) > 0: batch_request["body"]["tools"] = [ { @@ -121,89 +119,10 @@ def build_batch_jsonl( ] batch_request["body"]["tool_choice"] = "auto" - batch_file.append(json.dumps(batch_request)) - - logger.info(f"Built {len(batch_file)} JSONL lines") - return batch_file + jsonl_data.append(batch_request) - -def upload_batch_file(client: OpenAI, batch_file: list[str]) -> str: - """ - Upload JSONL content to OpenAI Files API. 
- - Args: - client: Configured OpenAI client - batch_file: List of JSONL strings - - Returns: - File ID from OpenAI - - Raises: - Exception: If upload fails - """ - logger.info(f"Uploading {len(batch_file)} lines to OpenAI Files API") - - # Join lines with newlines - jsonl_content = "\n".join(batch_file) - - try: - # Upload as a file object - file_response = client.files.create( - file=("batch_input.jsonl", jsonl_content.encode("utf-8")), - purpose="batch", - ) - - logger.info(f"Uploaded file: {file_response.id}") - return file_response.id - - except Exception as e: - logger.error(f"Failed to upload batch file: {e}") - raise - - -def create_batch_job( - client: OpenAI, - file_id: str, - description: str = "LLM evaluation batch", -) -> dict[str, Any]: - """ - Create a batch job in OpenAI using Responses API. - - Args: - client: Configured OpenAI client - file_id: File ID from upload_batch_file - description: Optional description for the batch - - Returns: - Dict with batch details (id, status, etc.) - - Raises: - Exception: If batch creation fails - """ - logger.info(f"Creating batch job with file: {file_id}") - - try: - batch = client.batches.create( - input_file_id=file_id, - endpoint="/v1/responses", - completion_window="24h", - metadata={"description": description}, - ) - - batch_info = { - "id": batch.id, - "status": batch.status, - "created_at": batch.created_at, - "endpoint": batch.endpoint, - "input_file_id": batch.input_file_id, - } - - logger.info(f"Created batch: {batch.id} (status={batch.status})") - return batch_info - - except Exception as e: - logger.error(f"Failed to create batch job: {e}") - raise + logger.info(f"Built {len(jsonl_data)} JSONL lines") + return jsonl_data def start_evaluation_batch( @@ -214,7 +133,10 @@ def start_evaluation_batch( config: dict[str, Any], ) -> EvaluationRun: """ - Fetch data, build JSONL, upload to OpenAI, create batch. + Fetch data, build JSONL, and start evaluation batch. 
+ + This function orchestrates the evaluation-specific logic and delegates + to the generic batch infrastructure for actual batch creation. Args: langfuse: Configured Langfuse client @@ -224,46 +146,60 @@ def start_evaluation_batch( config: Evaluation configuration dict with llm, instructions, vector_store_ids Returns: - Updated EvaluationRun with batch_id and batch_file_id populated + Updated EvaluationRun with batch_job_id populated Raises: Exception: If any step fails """ try: # Step 1: Fetch dataset items from Langfuse + logger.info(f"Starting evaluation batch for run '{eval_run.run_name}'") dataset_items = fetch_dataset_items( langfuse=langfuse, dataset_name=eval_run.dataset_name ) - # Step 2: Build JSONL using config - batch_file = build_batch_jsonl(dataset_items=dataset_items, config=config) - - # Step 3: Upload to OpenAI - file_id = upload_batch_file(client=openai_client, batch_file=batch_file) + # Step 2: Build evaluation-specific JSONL + jsonl_data = build_evaluation_jsonl(dataset_items=dataset_items, config=config) + + # Step 3: Create batch provider + provider = OpenAIBatchProvider(client=openai_client) + + # Step 4: Prepare batch configuration + batch_config = { + "endpoint": "/v1/responses", + "description": f"Evaluation: {eval_run.run_name}", + "completion_window": "24h", + # Store complete config including LLM settings for reference + "llm": config.get("llm", {}), + "instructions": config.get("instructions"), + "vector_store_ids": config.get("vector_store_ids", []), + } - # Step 4: Create batch job - batch_info = create_batch_job( - client=openai_client, - file_id=file_id, - description=f"Evaluation: {eval_run.run_name}", + # Step 5: Start batch job using generic infrastructure + batch_job = start_batch_job( + session=session, + provider=provider, + provider_name="openai", + job_type="evaluation", + organization_id=eval_run.organization_id, + project_id=eval_run.project_id, + jsonl_data=jsonl_data, + config=batch_config, ) - # Update eval_run with 
batch info - eval_run.batch_id = batch_info["id"] - eval_run.batch_file_id = file_id - eval_run.batch_status = batch_info[ - "status" - ] # OpenAI batch status (e.g., "validating") - eval_run.total_items = len(batch_file) - eval_run.status = "processing" # Overall evaluation status + # Step 6: Link batch_job to evaluation_run + eval_run.batch_job_id = batch_job.id + eval_run.status = "processing" + eval_run.total_items = batch_job.total_items session.add(eval_run) session.commit() session.refresh(eval_run) logger.info( - f"Successfully started evaluation batch: {batch_info['id']} " - f"for run '{eval_run.run_name}' with {len(batch_file)} items" + f"Successfully started evaluation batch: batch_job_id={batch_job.id}, " + f"provider_batch_id={batch_job.provider_batch_id} " + f"for run '{eval_run.run_name}' with {batch_job.total_items} items" ) return eval_run @@ -275,284 +211,3 @@ def start_evaluation_batch( session.add(eval_run) session.commit() raise - - -# ============================================================================ -# Batch Polling and Result Processing -# ============================================================================ - - -def get_pending_evaluations(session: Session) -> list[EvaluationRun]: - """ - Get all evaluations that are currently processing and need polling. - - Args: - session: Database session - - Returns: - List of EvaluationRun objects with status='processing' - """ - statement = select(EvaluationRun).where(EvaluationRun.status == "processing") - results = session.exec(statement).all() - logger.info(f"Found {len(results)} evaluations in 'processing' status") - return list(results) - - -def poll_batch_status(client: OpenAI, batch_id: str) -> dict[str, Any]: - """ - Poll OpenAI for current batch status. 
- - Args: - client: Configured OpenAI client - batch_id: Batch ID to poll - - Returns: - Dict with batch status information: - { - "id": "batch_abc123", - "status": "completed" | "failed" | "in_progress" | "validating" | ..., - "output_file_id": "file-xyz" (if completed), - "error_file_id": "file-err" (if failed), - "failed_requests": 0, - "completed_requests": 10, - "total_requests": 10 - } - - Raises: - Exception: If polling fails - """ - logger.info(f"Polling batch status: {batch_id}") - - try: - batch = client.batches.retrieve(batch_id) - - batch_status = { - "id": batch.id, - "status": batch.status, - "output_file_id": batch.output_file_id, - "error_file_id": batch.error_file_id, - "request_counts": { - "total": batch.request_counts.total, - "completed": batch.request_counts.completed, - "failed": batch.request_counts.failed, - }, - } - - logger.info( - f"Batch {batch_id} status: {batch.status} " - f"({batch.request_counts.completed}/{batch.request_counts.total} completed), " - f"output_file_id={batch.output_file_id}, error_file_id={batch.error_file_id}" - ) - - return batch_status - - except Exception as e: - logger.error(f"Failed to poll batch status for {batch_id}: {e}") - raise - - -def download_batch_output(client: OpenAI, output_file_id: str) -> str: - """ - Download batch output JSONL from OpenAI. 
- - Args: - client: Configured OpenAI client - output_file_id: File ID of the batch output - - Returns: - JSONL content as string - - Raises: - Exception: If download fails - """ - logger.info(f"Downloading batch output file: {output_file_id}") - - try: - file_content = client.files.content(output_file_id) - jsonl_content = file_content.read().decode("utf-8") - - # Count lines for logging - line_count = len(jsonl_content.strip().split("\n")) - logger.info(f"Downloaded {line_count} lines from output file {output_file_id}") - - return jsonl_content - - except Exception as e: - logger.error(f"Failed to download batch output {output_file_id}: {e}") - raise - - -def parse_batch_output( - jsonl_content: str, dataset_items: list[dict[str, Any]] -) -> list[dict[str, Any]]: - """ - Parse batch output JSONL into structured results. - - Args: - jsonl_content: Raw JSONL string from OpenAI batch output - dataset_items: Original dataset items (for matching ground truth) - - Returns: - List of results in format: - [ - { - "item_id": "item_123", - "question": "What is 2+2?", - "generated_output": "4", - "ground_truth": "4" - }, - ... 
- ] - """ - logger.info("Parsing batch output JSONL") - - # Create lookup map for dataset items by ID - dataset_map = {item["id"]: item for item in dataset_items} - - results = [] - lines = jsonl_content.strip().split("\n") - - for line_num, line in enumerate(lines, 1): - try: - response = json.loads(line) - - # Extract custom_id (which is our dataset item ID) - item_id = response.get("custom_id") - if not item_id: - logger.warning(f"Line {line_num}: No custom_id found, skipping") - continue - - # Get original dataset item - dataset_item = dataset_map.get(item_id) - if not dataset_item: - logger.warning(f"Line {line_num}: No dataset item found for {item_id}") - continue - - # Extract the response body - response_body = response.get("response", {}).get("body", {}) - - # Handle errors in batch processing - if response.get("error"): - error_msg = response["error"].get("message", "Unknown error") - logger.error(f"Item {item_id} had error: {error_msg}") - generated_output = f"ERROR: {error_msg}" - else: - # Extract text from output (can be string, list, or complex structure) - output = response_body.get("output", "") - - # If string, try to parse it (may be JSON or Python repr of list) - if isinstance(output, str): - try: - output = json.loads(output) - except (json.JSONDecodeError, ValueError): - try: - output = ast.literal_eval(output) - except (ValueError, SyntaxError): - # Keep as string if parsing fails - generated_output = output - output = None - - # If we have a list structure, extract text from message items - if isinstance(output, list): - generated_output = "" - for item in output: - if isinstance(item, dict) and item.get("type") == "message": - for content in item.get("content", []): - if ( - isinstance(content, dict) - and content.get("type") == "output_text" - ): - generated_output = content.get("text", "") - break - if generated_output: - break - elif output is not None: - # output was not a string and not a list - generated_output = "" - logger.warning( 
- f"Item {item_id}: Unexpected output type: {type(output)}" - ) - - # Extract question and ground truth from dataset item - question = dataset_item["input"].get("question", "") - ground_truth = dataset_item["expected_output"].get("answer", "") - - results.append( - { - "item_id": item_id, - "question": question, - "generated_output": generated_output, - "ground_truth": ground_truth, - } - ) - - except json.JSONDecodeError as e: - logger.error(f"Line {line_num}: Failed to parse JSON: {e}") - continue - except Exception as e: - logger.error(f"Line {line_num}: Unexpected error: {e}") - continue - - logger.info(f"Parsed {len(results)} results from {len(lines)} output lines") - return results - - -def upload_results_to_s3( - jsonl_content: str, eval_run: EvaluationRun, project_id: int -) -> str: - """ - Upload evaluation results to S3. - - Args: - jsonl_content: JSONL content to upload - eval_run: EvaluationRun database object - project_id: Project ID for storage path - - Returns: - S3 URL (e.g., s3://bucket/project-uuid/evaluations/run-123/results.jsonl) - - Raises: - Exception: If upload fails - """ - from io import BytesIO - from app.core.cloud.storage import ( - AmazonCloudStorageClient, - SimpleStorageName, - ) - - logger.info(f"Uploading results to S3 for evaluation run {eval_run.id}") - - try: - # Create S3 key path - # Format: project-storage-path/evaluations/run-{id}/results.jsonl - s3_key = f"evaluations/run-{eval_run.id}/results.jsonl" - - # Convert string content to bytes - content_bytes = jsonl_content.encode("utf-8") - file_like = BytesIO(content_bytes) - - # Upload to S3 - aws_client = AmazonCloudStorageClient() - aws_client.client.upload_fileobj( - file_like, - Bucket=aws_client.client._client_config.__dict__.get( - "bucket", "kaapi-storage" - ), - Key=s3_key, - ExtraArgs={"ContentType": "application/jsonl"}, - ) - - # Construct S3 URL - storage_name = SimpleStorageName(Key=s3_key) - s3_url = str(storage_name) - - logger.info( - f"Successfully uploaded 
results to S3: {s3_url} " - f"({len(content_bytes)} bytes)" - ) - - return s3_url - - except Exception as e: - logger.error(f"Failed to upload results to S3: {e}", exc_info=True) - raise diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index b5633fd13..c90dcc6fa 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -1,60 +1,169 @@ """ Evaluation batch processing orchestrator. -This module coordinates the complete evaluation workflow: -1. Polling batch status from OpenAI -2. Downloading and parsing completed batch results -3. Uploading results to S3 -4. Creating Langfuse dataset runs with traces -5. Updating database with final status +This module coordinates the evaluation-specific workflow: +1. Monitoring batch_job status for evaluations +2. Parsing evaluation results from batch output +3. Creating Langfuse dataset runs with traces +4. Updating evaluation_run with final status and scores """ +import ast +import json import logging +from collections import defaultdict from typing import Any from langfuse import Langfuse from openai import OpenAI -from sqlmodel import Session +from sqlmodel import Session, select +from app.core.batch.openai_provider import OpenAIBatchProvider from app.core.util import configure_langfuse, configure_openai, now +from app.crud.batch_job import get_batch_job +from app.crud.batch_operations import download_batch_results from app.crud.credentials import get_provider_credential -from app.crud.evaluation_batch import ( - download_batch_output, - fetch_dataset_items, - get_pending_evaluations, - parse_batch_output, - poll_batch_status, - upload_results_to_s3, -) +from app.crud.evaluation_batch import fetch_dataset_items from app.crud.evaluation_langfuse import create_langfuse_dataset_run from app.models import EvaluationRun logger = logging.getLogger(__name__) -async def process_completed_batch( +def parse_evaluation_output( + raw_results: 
list[dict[str, Any]], dataset_items: list[dict[str, Any]] +) -> list[dict[str, Any]]: + """ + Parse batch output into evaluation results. + + This function extracts the generated output from the batch results + and matches it with the ground truth from the dataset. + + Args: + raw_results: Raw results from batch provider (list of JSONL lines) + dataset_items: Original dataset items (for matching ground truth) + + Returns: + List of results in format: + [ + { + "item_id": "item_123", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4" + }, + ... + ] + """ + logger.info("Parsing evaluation results") + + # Create lookup map for dataset items by ID + dataset_map = {item["id"]: item for item in dataset_items} + + results = [] + + for line_num, response in enumerate(raw_results, 1): + try: + # Extract custom_id (which is our dataset item ID) + item_id = response.get("custom_id") + if not item_id: + logger.warning(f"Line {line_num}: No custom_id found, skipping") + continue + + # Get original dataset item + dataset_item = dataset_map.get(item_id) + if not dataset_item: + logger.warning(f"Line {line_num}: No dataset item found for {item_id}") + continue + + # Extract the response body + response_body = response.get("response", {}).get("body", {}) + + # Handle errors in batch processing + if response.get("error"): + error_msg = response["error"].get("message", "Unknown error") + logger.error(f"Item {item_id} had error: {error_msg}") + generated_output = f"ERROR: {error_msg}" + else: + # Extract text from output (can be string, list, or complex structure) + output = response_body.get("output", "") + + # If string, try to parse it (may be JSON or Python repr of list) + if isinstance(output, str): + try: + output = json.loads(output) + except (json.JSONDecodeError, ValueError): + try: + output = ast.literal_eval(output) + except (ValueError, SyntaxError): + # Keep as string if parsing fails + generated_output = output + output = None + + # If we have 
a list structure, extract text from message items + if isinstance(output, list): + generated_output = "" + for item in output: + if isinstance(item, dict) and item.get("type") == "message": + for content in item.get("content", []): + if ( + isinstance(content, dict) + and content.get("type") == "output_text" + ): + generated_output = content.get("text", "") + break + if generated_output: + break + elif output is not None: + # output was not a string and not a list + generated_output = "" + logger.warning( + f"Item {item_id}: Unexpected output type: {type(output)}" + ) + + # Extract question and ground truth from dataset item + question = dataset_item["input"].get("question", "") + ground_truth = dataset_item["expected_output"].get("answer", "") + + results.append( + { + "item_id": item_id, + "question": question, + "generated_output": generated_output, + "ground_truth": ground_truth, + } + ) + + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error: {e}") + continue + + logger.info( + f"Parsed {len(results)} evaluation results from {len(raw_results)} output lines" + ) + return results + + +async def process_completed_evaluation( eval_run: EvaluationRun, session: Session, openai_client: OpenAI, langfuse: Langfuse, - output_file_id: str, ) -> EvaluationRun: """ - Process a completed batch evaluation. + Process a completed evaluation batch. This function: - 1. Downloads batch output from OpenAI + 1. Downloads batch output from provider 2. Parses results into question/output/ground_truth format - 3. Uploads results to S3 - 4. Creates Langfuse dataset run with traces - 5. Updates database with completion status + 3. Creates Langfuse dataset run with traces + 4. 
Updates evaluation_run with completion status Args: eval_run: EvaluationRun database object session: Database session openai_client: Configured OpenAI client langfuse: Configured Langfuse client - output_file_id: OpenAI file ID for batch output Returns: Updated EvaluationRun object @@ -62,59 +171,41 @@ async def process_completed_batch( Raises: Exception: If processing fails """ - logger.info(f"Processing completed batch for evaluation run {eval_run.id}") + logger.info(f"Processing completed evaluation for run {eval_run.id}") try: - # Step 1: Download batch output from OpenAI - logger.info(f"Step 1: Downloading batch output file: {output_file_id}") - jsonl_content = download_batch_output( - client=openai_client, output_file_id=output_file_id - ) + # Step 1: Get batch_job + if not eval_run.batch_job_id: + raise ValueError(f"EvaluationRun {eval_run.id} has no batch_job_id") + + batch_job = get_batch_job(session=session, batch_job_id=eval_run.batch_job_id) + if not batch_job: + raise ValueError( + f"BatchJob {eval_run.batch_job_id} not found for evaluation {eval_run.id}" + ) + + # Step 2: Create provider and download results + logger.info(f"Step 1: Downloading batch results for batch_job {batch_job.id}") + provider = OpenAIBatchProvider(client=openai_client) + raw_results = download_batch_results(provider=provider, batch_job=batch_job) - # Step 2: Fetch dataset items (needed for matching ground truth) + # Step 3: Fetch dataset items (needed for matching ground truth) logger.info(f"Step 2: Fetching dataset items for '{eval_run.dataset_name}'") dataset_items = fetch_dataset_items( langfuse=langfuse, dataset_name=eval_run.dataset_name ) - # Step 3: Parse batch output into structured results - logger.info("Step 3: Parsing batch output") - results = parse_batch_output( - jsonl_content=jsonl_content, dataset_items=dataset_items + # Step 4: Parse evaluation results + logger.info("Step 3: Parsing evaluation results") + results = parse_evaluation_output( + 
raw_results=raw_results, dataset_items=dataset_items ) if not results: raise ValueError("No valid results found in batch output") - # Step 4: Upload results to S3 (optional - skip if AWS credentials not configured) - s3_url = None - try: - logger.info("Step 4: Uploading results to S3") - s3_url = upload_results_to_s3( - jsonl_content=jsonl_content, - eval_run=eval_run, - project_id=eval_run.project_id, - ) - logger.info(f"Successfully uploaded to S3: {s3_url}") - except Exception as s3_error: - # S3 upload is optional - log warning but continue processing - logger.warning( - f"S3 upload failed (AWS credentials may not be configured): {s3_error}. " - f"Continuing without S3 storage. Results will be available in Langfuse.", - exc_info=True, - ) - - # Step 5: Update DB with output file ID and S3 URL (if available) - logger.info("Step 5: Updating database with output file ID and S3 URL") - eval_run.batch_output_file_id = output_file_id - eval_run.s3_url = s3_url # Will be None if S3 upload failed - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) - - # Step 6: Create Langfuse dataset run with traces - logger.info("Step 6: Creating Langfuse dataset run with traces") + # Step 5: Create Langfuse dataset run with traces + logger.info("Step 4: Creating Langfuse dataset run with traces") create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, @@ -122,28 +213,29 @@ async def process_completed_batch( results=results, ) - # Step 7: Mark as completed - logger.info("Step 7: Marking evaluation as completed") + # Step 6: Mark evaluation as completed + logger.info("Step 5: Marking evaluation as completed") eval_run.status = "completed" eval_run.updated_at = now() + + # Copy S3 URL from batch_job if available + if batch_job.raw_output_url: + eval_run.s3_url = batch_job.raw_output_url + session.add(eval_run) session.commit() session.refresh(eval_run) - s3_info = ( - f"S3 URL: {s3_url}" if s3_url else "S3 
upload skipped (AWS not configured)" - ) logger.info( f"Successfully completed processing for evaluation run {eval_run.id}: " - f"{len(results)} items processed, {s3_info}" + f"{len(results)} items processed" ) return eval_run except Exception as e: - # This catches any errors from steps 1-3 or 5-7 (but NOT S3 upload which is caught above) logger.error( - f"Failed to process completed batch for run {eval_run.id}: {e}", + f"Failed to process completed evaluation for run {eval_run.id}: {e}", exc_info=True, ) # Mark as failed @@ -156,14 +248,17 @@ async def process_completed_batch( return eval_run -async def check_and_process_batch( +async def check_and_process_evaluation( eval_run: EvaluationRun, session: Session, openai_client: OpenAI, langfuse: Langfuse, ) -> dict[str, Any]: """ - Check batch status and process if completed. + Check evaluation batch status and process if completed. + + This function checks the batch_job status and triggers evaluation-specific + processing when the batch is completed. 
Args: eval_run: EvaluationRun database object @@ -182,64 +277,44 @@ async def check_and_process_batch( "action": "processed" | "updated" | "failed" | "no_change" } """ - logger.info( - f"Checking batch status for evaluation run {eval_run.id} (batch_id={eval_run.batch_id})" - ) + logger.info(f"Checking evaluation run {eval_run.id}") previous_status = eval_run.status - previous_batch_status = eval_run.batch_status try: - # Poll batch status from OpenAI - batch_status_info = poll_batch_status( - client=openai_client, batch_id=eval_run.batch_id - ) - - new_batch_status = batch_status_info["status"] - output_file_id = batch_status_info.get("output_file_id") - - # Update batch status in DB - if new_batch_status != previous_batch_status: - eval_run.batch_status = new_batch_status - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) - logger.info( - f"Updated batch_status for run {eval_run.id}: " - f"{previous_batch_status} -> {new_batch_status}" + # Get batch_job + if not eval_run.batch_job_id: + raise ValueError(f"EvaluationRun {eval_run.id} has no batch_job_id") + + batch_job = get_batch_job(session=session, batch_job_id=eval_run.batch_job_id) + if not batch_job: + raise ValueError( + f"BatchJob {eval_run.batch_job_id} not found for evaluation {eval_run.id}" ) - # Handle different batch statuses - if new_batch_status == "completed": - if not output_file_id: - # Sometimes OpenAI returns None for output_file_id even when batch is completed - # This is a timing issue. Skip processing for now and let the next poll cycle handle it. - logger.warning( - f"Batch {eval_run.batch_id} is completed but output_file_id is None. " - f"This is likely a timing issue. Skipping for now - will retry in next poll cycle. 
" - f"Request counts: {batch_status_info.get('request_counts')}" - ) + # IMPORTANT: Poll OpenAI to get the latest status before checking + logger.info(f"Polling OpenAI for batch status: {batch_job.provider_batch_id}") + provider = OpenAIBatchProvider(client=openai_client) + from app.crud.batch_operations import poll_batch_status - return { - "run_id": eval_run.id, - "run_name": eval_run.run_name, - "previous_status": previous_status, - "current_status": eval_run.status, - "batch_status": new_batch_status, - "action": "no_change", - "note": "Batch completed but output_file_id not yet available, will retry next poll", - } + poll_batch_status(session=session, provider=provider, batch_job=batch_job) - logger.info(f"Batch {eval_run.batch_id} completed, processing results...") + # Refresh batch_job to get the updated provider_status + session.refresh(batch_job) + provider_status = batch_job.provider_status - # Process the completed batch - await process_completed_batch( + # Handle different provider statuses + if provider_status == "completed": + # Process the completed evaluation + logger.info( + f"Batch {batch_job.provider_batch_id} completed, processing evaluation results..." 
+ ) + + await process_completed_evaluation( eval_run=eval_run, session=session, openai_client=openai_client, langfuse=langfuse, - output_file_id=output_file_id, ) return { @@ -247,15 +322,13 @@ async def check_and_process_batch( "run_name": eval_run.run_name, "previous_status": previous_status, "current_status": eval_run.status, - "batch_status": new_batch_status, + "provider_status": provider_status, "action": "processed", } - elif new_batch_status in ["failed", "expired", "cancelled"]: - # Mark as failed - error_msg = f"Batch {new_batch_status}" - if batch_status_info.get("error_file_id"): - error_msg += f" (error_file_id: {batch_status_info['error_file_id']})" + elif provider_status in ["failed", "expired", "cancelled"]: + # Mark evaluation as failed based on provider status + error_msg = batch_job.error_message or f"Provider batch {provider_status}" eval_run.status = "failed" eval_run.error_message = error_msg @@ -264,14 +337,14 @@ async def check_and_process_batch( session.commit() session.refresh(eval_run) - logger.error(f"Batch {eval_run.batch_id} failed: {error_msg}") + logger.error(f"Batch {batch_job.provider_batch_id} failed: {error_msg}") return { "run_id": eval_run.id, "run_name": eval_run.run_name, "previous_status": previous_status, "current_status": "failed", - "batch_status": new_batch_status, + "provider_status": provider_status, "action": "failed", "error": error_msg, } @@ -279,7 +352,7 @@ async def check_and_process_batch( else: # Still in progress (validating, in_progress, finalizing) logger.info( - f"Batch {eval_run.batch_id} still processing (status={new_batch_status})" + f"Batch {batch_job.provider_batch_id} still processing (provider_status={provider_status})" ) return { @@ -287,20 +360,16 @@ async def check_and_process_batch( "run_name": eval_run.run_name, "previous_status": previous_status, "current_status": eval_run.status, - "batch_status": new_batch_status, - "action": "updated" - if new_batch_status != previous_batch_status - else 
"no_change", + "provider_status": provider_status, + "action": "no_change", } except Exception as e: - logger.error( - f"Error checking batch status for run {eval_run.id}: {e}", exc_info=True - ) + logger.error(f"Error checking evaluation run {eval_run.id}: {e}", exc_info=True) # Mark as failed eval_run.status = "failed" - eval_run.error_message = f"Polling failed: {str(e)}" + eval_run.error_message = f"Checking failed: {str(e)}" eval_run.updated_at = now() session.add(eval_run) session.commit() @@ -310,7 +379,7 @@ async def check_and_process_batch( "run_name": eval_run.run_name, "previous_status": previous_status, "current_status": "failed", - "batch_status": eval_run.batch_status, + "provider_status": "unknown", "action": "failed", "error": str(e), } @@ -336,11 +405,12 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st """ logger.info(f"Polling all pending evaluations for org_id={org_id}") - # Get pending evaluations - pending_runs = get_pending_evaluations(session=session) - - # Filter by org_id - pending_runs = [run for run in pending_runs if run.organization_id == org_id] + # Get pending evaluations (status = "processing") + statement = select(EvaluationRun).where( + EvaluationRun.status == "processing", + EvaluationRun.organization_id == org_id, + ) + pending_runs = session.exec(statement).all() if not pending_runs: logger.info(f"No pending evaluations found for org_id={org_id}") @@ -355,8 +425,6 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st logger.info(f"Found {len(pending_runs)} pending evaluations for org_id={org_id}") # Group evaluations by project_id since credentials are per project - from collections import defaultdict - evaluations_by_project = defaultdict(list) for run in pending_runs: evaluations_by_project[run.project_id].append(run) @@ -429,7 +497,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st # Process each evaluation in this project for 
eval_run in project_runs: try: - result = await check_and_process_batch( + result = await check_and_process_evaluation( eval_run=eval_run, session=session, openai_client=openai_client, diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 4df3f4a88..b603fa313 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -35,9 +35,15 @@ ) from .document_collection import DocumentCollection +from .batch_job import ( + BatchJob, + BatchJobCreate, + BatchJobPublic, + BatchJobUpdate, +) + from .evaluation import ( EvaluationRun, - EvaluationRunBase, EvaluationRunCreate, EvaluationRunPublic, ) diff --git a/backend/app/models/batch_job.py b/backend/app/models/batch_job.py new file mode 100644 index 000000000..6d44c81d8 --- /dev/null +++ b/backend/app/models/batch_job.py @@ -0,0 +1,122 @@ +from datetime import datetime +from typing import Any, Optional + +from sqlalchemy import Column +from sqlalchemy.dialects.postgresql import JSONB +from sqlmodel import Field, Relationship, SQLModel + + +class BatchJob(SQLModel, table=True): + """Batch job table for tracking async LLM batch operations.""" + + __tablename__ = "batch_job" + + id: int | None = Field(default=None, primary_key=True) + + # Provider and job type + provider: str = Field(description="LLM provider name (e.g., 'openai', 'anthropic')") + job_type: str = Field( + description="Type of batch job (e.g., 'evaluation', 'classification', 'embedding')" + ) + + # Batch configuration - stores all provider-specific config + config: dict[str, Any] = Field( + default_factory=dict, + sa_column=Column(JSONB()), + description="Complete batch configuration including model, temperature, instructions, tools, etc.", + ) + + # Provider-specific batch tracking + provider_batch_id: str | None = Field( + default=None, description="Provider's batch job ID (e.g., OpenAI batch_id)" + ) + provider_file_id: str | None = Field( + default=None, description="Provider's input file ID" + ) + 
provider_output_file_id: str | None = Field( + default=None, description="Provider's output file ID" + ) + + # Provider status tracking + provider_status: str | None = Field( + default=None, + description="Provider-specific status (e.g., OpenAI: validating, in_progress, finalizing, completed, failed, expired, cancelling, cancelled)", + ) + + # Raw results (before parent-specific processing) + raw_output_url: str | None = Field( + default=None, description="S3 URL of raw batch output file" + ) + total_items: int = Field( + default=0, description="Total number of items in the batch" + ) + + # Error handling + error_message: str | None = Field( + default=None, description="Error message if batch failed" + ) + + # Foreign keys + organization_id: int = Field(foreign_key="organization.id") + project_id: int = Field(foreign_key="project.id") + + # Timestamps + inserted_at: datetime = Field(default_factory=datetime.utcnow) + updated_at: datetime = Field(default_factory=datetime.utcnow) + + # Relationships + organization: Optional["Organization"] = Relationship( # noqa: F821 + back_populates="batch_jobs" + ) + project: Optional["Project"] = Relationship( + back_populates="batch_jobs" + ) # noqa: F821 + + +class BatchJobCreate(SQLModel): + """Schema for creating a new batch job.""" + + provider: str + job_type: str + config: dict[str, Any] = Field(default_factory=dict) + provider_batch_id: str | None = None + provider_file_id: str | None = None + provider_output_file_id: str | None = None + provider_status: str | None = None + raw_output_url: str | None = None + total_items: int = 0 + error_message: str | None = None + organization_id: int + project_id: int + + +class BatchJobUpdate(SQLModel): + """Schema for updating a batch job.""" + + provider_batch_id: str | None = None + provider_file_id: str | None = None + provider_output_file_id: str | None = None + provider_status: str | None = None + raw_output_url: str | None = None + total_items: int | None = None + 
error_message: str | None = None + + +class BatchJobPublic(SQLModel): + """Public schema for batch job responses.""" + + id: int + provider: str + job_type: str + config: dict[str, Any] + provider_batch_id: str | None + provider_file_id: str | None + provider_output_file_id: str | None + provider_status: str | None + raw_output_url: str | None + total_items: int + error_message: str | None + organization_id: int + project_id: int + inserted_at: datetime + updated_at: datetime diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index d92611468..e15268574 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Optional +from typing import Optional, Any from pydantic import BaseModel, Field from sqlalchemy import Column, Text, JSON @@ -59,16 +59,29 @@ class Experiment(BaseModel): # Database Models -class EvaluationRunBase(SQLModel): - """Base model for evaluation runs.""" +class EvaluationRun(SQLModel, table=True): + """Database table for evaluation runs.""" + + __tablename__ = "evaluation_run" + + id: int = SQLField(default=None, primary_key=True) # Input fields (provided by user) run_name: str = SQLField(index=True, description="Name of the evaluation run") dataset_name: str = SQLField(description="Name of the Langfuse dataset") - config: dict = SQLField( - default={}, - sa_column=Column(JSON, nullable=False), - description="Evaluation configuration (LLM settings, instructions, vector stores)", + + # Config field - dict requires sa_column + config: dict[str, Any] = SQLField( + default_factory=dict, + sa_column=Column(JSON), + description="Evaluation configuration", + ) + + # Batch job reference + batch_job_id: Optional[int] = SQLField( + default=None, + foreign_key="batch_job.id", + description="Reference to the batch_job that processes this evaluation", ) # Output/Status fields (updated by system during processing) @@ -76,37 +89,29 @@ class 
EvaluationRunBase(SQLModel): default="pending", description="Overall evaluation status: pending, processing, completed, failed", ) - batch_status: Optional[str] = SQLField( - default=None, - description="OpenAI Batch API status: validating, in_progress, finalizing, completed, failed, expired, cancelling, cancelled (for polling)", - ) - batch_id: Optional[str] = SQLField( - default=None, description="OpenAI Batch API batch ID (set during processing)" - ) - batch_file_id: Optional[str] = SQLField( - default=None, - description="OpenAI file ID for batch input (set during processing)", - ) - batch_output_file_id: Optional[str] = SQLField( - default=None, - description="OpenAI file ID for batch output (set after completion)", - ) s3_url: Optional[str] = SQLField( - default=None, description="S3 URL of OpenAI output file for future reference" + default=None, + description="S3 URL of processed evaluation results for future reference", ) total_items: int = SQLField( default=0, description="Total number of items evaluated (set during processing)" ) - score: Optional[dict] = SQLField( + + # Score field - dict requires sa_column + score: Optional[dict[str, Any]] = SQLField( default=None, sa_column=Column(JSON, nullable=True), - description="Evaluation scores (e.g., correctness, cosine_similarity, etc.) 
(set after completion)", + description="Evaluation scores (e.g., correctness, cosine_similarity, etc.)", ) + + # Error message field error_message: Optional[str] = SQLField( default=None, sa_column=Column(Text, nullable=True), description="Error message if failed", ) + + # Foreign keys organization_id: int = SQLField( foreign_key="organization.id", nullable=False, ondelete="CASCADE" ) @@ -114,19 +119,14 @@ class EvaluationRunBase(SQLModel): foreign_key="project.id", nullable=False, ondelete="CASCADE" ) - -class EvaluationRun(EvaluationRunBase, table=True): - """Database table for evaluation runs.""" - - __tablename__ = "evaluation_run" - - id: int = SQLField(default=None, primary_key=True) + # Timestamps inserted_at: datetime = SQLField(default_factory=now, nullable=False) updated_at: datetime = SQLField(default_factory=now, nullable=False) # Relationships project: "Project" = Relationship(back_populates="evaluation_runs") organization: "Organization" = Relationship(back_populates="evaluation_runs") + batch_job: Optional["BatchJob"] = Relationship() # noqa: F821 class EvaluationRunCreate(SQLModel): @@ -134,14 +134,26 @@ class EvaluationRunCreate(SQLModel): run_name: str = Field(description="Name of the evaluation run", min_length=3) dataset_name: str = Field(description="Name of the Langfuse dataset", min_length=1) - config: dict = Field( - description="Evaluation configuration (flexible dict with llm, instructions, vector_store_ids, etc.)" + config: dict[str, Any] = Field( + default_factory=dict, + description="Evaluation configuration (flexible dict with llm, instructions, vector_store_ids, etc.)", ) -class EvaluationRunPublic(EvaluationRunBase): +class EvaluationRunPublic(SQLModel): """Public model for evaluation runs.""" id: int + run_name: str + dataset_name: str + config: dict[str, Any] + batch_job_id: Optional[int] + status: str + s3_url: Optional[str] + total_items: int + score: Optional[dict[str, Any]] + error_message: Optional[str] + organization_id: int 
+ project_id: int inserted_at: datetime updated_at: datetime diff --git a/backend/app/models/organization.py b/backend/app/models/organization.py index 32f146397..b658762df 100644 --- a/backend/app/models/organization.py +++ b/backend/app/models/organization.py @@ -11,6 +11,8 @@ from .assistants import Assistant from .collection import Collection from .openai_conversation import OpenAIConversation + from .batch_job import BatchJob + from .evaluation import EvaluationRun # Shared properties for an Organization @@ -58,6 +60,9 @@ class Organization(OrganizationBase, table=True): evaluation_runs: list["EvaluationRun"] = Relationship( back_populates="organization", cascade_delete=True ) + batch_jobs: list["BatchJob"] = Relationship( + back_populates="organization", cascade_delete=True + ) # Properties to return via API diff --git a/backend/app/models/project.py b/backend/app/models/project.py index ff44548f9..30dda9a0e 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -61,6 +61,9 @@ class Project(ProjectBase, table=True): evaluation_runs: list["EvaluationRun"] = Relationship( back_populates="project", cascade_delete=True ) + batch_jobs: list["BatchJob"] = Relationship( + back_populates="project", cascade_delete=True + ) # Properties to return via API From 1e278fc07590ee736c4a2fa098d09d9774de56e2 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 21 Oct 2025 15:43:42 +0530 Subject: [PATCH 23/64] checking out AWS --- .../alembic/versions/add_batch_job_table.py | 2 +- backend/app/crud/evaluation_processing.py | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/backend/app/alembic/versions/add_batch_job_table.py b/backend/app/alembic/versions/add_batch_job_table.py index 857d38a45..39c8fd7f5 100644 --- a/backend/app/alembic/versions/add_batch_job_table.py +++ b/backend/app/alembic/versions/add_batch_job_table.py @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. 
revision = "add_batch_job" -down_revision = ("93d484f5798e", "d5747495bd7c", "27c271ab6dd0") +down_revision = "27c271ab6dd0" branch_labels = None depends_on = None diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index c90dcc6fa..17ee6b2f3 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -11,17 +11,17 @@ import ast import json import logging -from collections import defaultdict -from typing import Any +from collections import defaultdict from langfuse import Langfuse from openai import OpenAI from sqlmodel import Session, select +from typing import Any from app.core.batch.openai_provider import OpenAIBatchProvider from app.core.util import configure_langfuse, configure_openai, now from app.crud.batch_job import get_batch_job -from app.crud.batch_operations import download_batch_results +from app.crud.batch_operations import upload_batch_results_to_s3, download_batch_results from app.crud.credentials import get_provider_credential from app.crud.evaluation_batch import fetch_dataset_items from app.crud.evaluation_langfuse import create_langfuse_dataset_run @@ -189,6 +189,20 @@ async def process_completed_evaluation( provider = OpenAIBatchProvider(client=openai_client) raw_results = download_batch_results(provider=provider, batch_job=batch_job) + # Step 2a: Upload raw results to S3 for evaluation_run + s3_url = None + try: + s3_url = upload_batch_results_to_s3( + batch_job=batch_job, results=raw_results + ) + logger.info(f"Uploaded evaluation results to S3: {s3_url}") + except Exception as s3_error: + logger.warning( + f"S3 upload failed (AWS credentials may not be configured): {s3_error}. 
" + f"Continuing without S3 storage.", + exc_info=True, + ) + # Step 3: Fetch dataset items (needed for matching ground truth) logger.info(f"Step 2: Fetching dataset items for '{eval_run.dataset_name}'") dataset_items = fetch_dataset_items( @@ -218,9 +232,9 @@ async def process_completed_evaluation( eval_run.status = "completed" eval_run.updated_at = now() - # Copy S3 URL from batch_job if available - if batch_job.raw_output_url: - eval_run.s3_url = batch_job.raw_output_url + # Set S3 URL if upload was successful + if s3_url: + eval_run.s3_url = s3_url session.add(eval_run) session.commit() From 4cb5d56012741c47590ba779a7d69c68a3b5aef1 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 21 Oct 2025 19:00:48 +0530 Subject: [PATCH 24/64] cleanup migration --- .../alembic/versions/add_batch_job_table.py | 152 ------------------ ...5747495bd7c_create_evaluation_run_table.py | 116 +++++++++++-- 2 files changed, 103 insertions(+), 165 deletions(-) delete mode 100644 backend/app/alembic/versions/add_batch_job_table.py diff --git a/backend/app/alembic/versions/add_batch_job_table.py b/backend/app/alembic/versions/add_batch_job_table.py deleted file mode 100644 index 39c8fd7f5..000000000 --- a/backend/app/alembic/versions/add_batch_job_table.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Add batch_job table and refactor evaluation_run - -Revision ID: add_batch_job -Revises: 93d484f5798e -Create Date: 2025-10-21 00:00:00.000000 - -""" -import sqlalchemy as sa -from alembic import op -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. 
-revision = "add_batch_job" -down_revision = "27c271ab6dd0" -branch_labels = None -depends_on = None - - -def upgrade(): - # Create batch_job table - op.create_table( - "batch_job", - sa.Column("id", sa.Integer(), nullable=False), - sa.Column( - "provider", - sa.String(), - nullable=False, - comment="LLM provider name (e.g., 'openai', 'anthropic')", - ), - sa.Column( - "job_type", - sa.String(), - nullable=False, - comment="Type of batch job (e.g., 'evaluation', 'classification', 'embedding')", - ), - sa.Column( - "config", - postgresql.JSONB(astext_type=sa.Text()), - nullable=False, - server_default=sa.text("'{}'::jsonb"), - comment="Complete batch configuration", - ), - sa.Column( - "provider_batch_id", - sa.String(), - nullable=True, - comment="Provider's batch job ID", - ), - sa.Column( - "provider_file_id", - sa.String(), - nullable=True, - comment="Provider's input file ID", - ), - sa.Column( - "provider_output_file_id", - sa.String(), - nullable=True, - comment="Provider's output file ID", - ), - sa.Column( - "provider_status", - sa.String(), - nullable=True, - comment="Provider-specific status (e.g., OpenAI: validating, in_progress, completed, failed)", - ), - sa.Column( - "raw_output_url", - sa.String(), - nullable=True, - comment="S3 URL of raw batch output file", - ), - sa.Column( - "total_items", - sa.Integer(), - nullable=False, - default=0, - comment="Total number of items in the batch", - ), - sa.Column( - "error_message", - sa.Text(), - nullable=True, - comment="Error message if batch failed", - ), - sa.Column("organization_id", sa.Integer(), nullable=False), - sa.Column("project_id", sa.Integer(), nullable=False), - sa.Column("inserted_at", sa.DateTime(), nullable=False), - sa.Column("updated_at", sa.DateTime(), nullable=False), - sa.ForeignKeyConstraint( - ["organization_id"], ["organization.id"], ondelete="CASCADE" - ), - sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), - sa.PrimaryKeyConstraint("id"), - ) - 
op.create_index( - op.f("ix_batch_job_job_type"), "batch_job", ["job_type"], unique=False - ) - op.create_index( - op.f("ix_batch_job_organization_id"), - "batch_job", - ["organization_id"], - unique=False, - ) - op.create_index( - op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False - ) - - # Add batch_job_id to evaluation_run - op.add_column( - "evaluation_run", sa.Column("batch_job_id", sa.Integer(), nullable=True) - ) - op.create_foreign_key( - "fk_evaluation_run_batch_job_id", - "evaluation_run", - "batch_job", - ["batch_job_id"], - ["id"], - ) - - # Drop batch-related columns from evaluation_run - op.drop_column("evaluation_run", "batch_status") - op.drop_column("evaluation_run", "batch_id") - op.drop_column("evaluation_run", "batch_file_id") - op.drop_column("evaluation_run", "batch_output_file_id") - - -def downgrade(): - # Add back batch-related columns to evaluation_run - op.add_column( - "evaluation_run", - sa.Column("batch_output_file_id", sa.String(), nullable=True), - ) - op.add_column( - "evaluation_run", sa.Column("batch_file_id", sa.String(), nullable=True) - ) - op.add_column("evaluation_run", sa.Column("batch_id", sa.String(), nullable=True)) - op.add_column( - "evaluation_run", sa.Column("batch_status", sa.String(), nullable=True) - ) - - # Drop batch_job_id from evaluation_run - op.drop_constraint( - "fk_evaluation_run_batch_job_id", "evaluation_run", type_="foreignkey" - ) - op.drop_column("evaluation_run", "batch_job_id") - - # Drop batch_job table - op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job") - op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job") - op.drop_index(op.f("ix_batch_job_job_type"), table_name="batch_job") - op.drop_table("batch_job") diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 0ade9284c..79954049d 100644 --- 
a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -1,36 +1,121 @@ -"""create_evaluation_run_table +"""create_evaluation_run_table and batch_job_table Revision ID: d5747495bd7c -Revises: b30727137e65 +Revises: e7c68e43ce6f Create Date: 2025-10-14 12:42:15.464302 """ from alembic import op import sqlalchemy as sa import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. revision = "d5747495bd7c" -down_revision = "b30727137e65" +down_revision = "e7c68e43ce6f" branch_labels = None depends_on = None def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### + # Create batch_job table first (as evaluation_run will reference it) + op.create_table( + "batch_job", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column( + "provider", + sa.String(), + nullable=False, + comment="LLM provider name (e.g., 'openai', 'anthropic')", + ), + sa.Column( + "job_type", + sa.String(), + nullable=False, + comment="Type of batch job (e.g., 'evaluation', 'classification', 'embedding')", + ), + sa.Column( + "config", + postgresql.JSONB(astext_type=sa.Text()), + nullable=False, + server_default=sa.text("'{}'::jsonb"), + comment="Complete batch configuration", + ), + sa.Column( + "provider_batch_id", + sa.String(), + nullable=True, + comment="Provider's batch job ID", + ), + sa.Column( + "provider_file_id", + sa.String(), + nullable=True, + comment="Provider's input file ID", + ), + sa.Column( + "provider_output_file_id", + sa.String(), + nullable=True, + comment="Provider's output file ID", + ), + sa.Column( + "provider_status", + sa.String(), + nullable=True, + comment="Provider-specific status (e.g., OpenAI: validating, in_progress, completed, failed)", + ), + sa.Column( + "raw_output_url", + sa.String(), + nullable=True, + comment="S3 URL of raw batch output file", + ), + sa.Column( + "total_items", + 
sa.Integer(), + nullable=False, + server_default=sa.text("0"), + comment="Total number of items in the batch", + ), + sa.Column( + "error_message", + sa.Text(), + nullable=True, + comment="Error message if batch failed", + ), + sa.Column("organization_id", sa.Integer(), nullable=False), + sa.Column("project_id", sa.Integer(), nullable=False), + sa.Column("inserted_at", sa.DateTime(), nullable=False), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_batch_job_job_type"), "batch_job", ["job_type"], unique=False + ) + op.create_index( + op.f("ix_batch_job_organization_id"), + "batch_job", + ["organization_id"], + unique=False, + ) + op.create_index( + op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False + ) + + # Create evaluation_run table with batch_job_id reference (no old batch columns) op.create_table( "evaluation_run", sa.Column("run_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("dataset_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("config", sa.JSON(), nullable=False), + sa.Column("batch_job_id", sa.Integer(), nullable=True), sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False), - sa.Column("batch_status", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - sa.Column("batch_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - sa.Column("batch_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - sa.Column( - "batch_output_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True - ), sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("total_items", sa.Integer(), nullable=False), sa.Column("score", sa.JSON(), nullable=True), @@ -40,6 +125,7 @@ def upgrade(): sa.Column("id", 
sa.Integer(), nullable=False), sa.Column("inserted_at", sa.DateTime(), nullable=False), sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint(["batch_job_id"], ["batch_job.id"]), sa.ForeignKeyConstraint( ["organization_id"], ["organization.id"], ondelete="CASCADE" ), @@ -49,11 +135,15 @@ def upgrade(): op.create_index( op.f("ix_evaluation_run_run_name"), "evaluation_run", ["run_name"], unique=False ) - # ### end Alembic commands ### def downgrade(): - # ### commands auto generated by Alembic - please adjust! ### + # Drop evaluation_run table first op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run") op.drop_table("evaluation_run") - # ### end Alembic commands ### + + # Drop batch_job table + op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job") + op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job") + op.drop_index(op.f("ix_batch_job_job_type"), table_name="batch_job") + op.drop_table("batch_job") From 877ba0400659895df1ba71ff29e65ca1678b9742 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 23 Oct 2025 15:15:47 +0530 Subject: [PATCH 25/64] added support for cosine similarity score --- ...mbedding_batch_job_id_to_evaluation_run.py | 50 ++ backend/app/crud/evaluation_embeddings.py | 433 ++++++++++++++++++ backend/app/crud/evaluation_processing.py | 264 ++++++++++- backend/app/models/evaluation.py | 42 +- .../tests/crud/test_evaluation_embeddings.py | 373 +++++++++++++++ backend/pyproject.toml | 1 + backend/uv.lock | 2 + 7 files changed, 1132 insertions(+), 33 deletions(-) create mode 100644 backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py create mode 100644 backend/app/crud/evaluation_embeddings.py create mode 100644 backend/app/tests/crud/test_evaluation_embeddings.py diff --git a/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py 
b/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py new file mode 100644 index 000000000..195c8a81c --- /dev/null +++ b/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py @@ -0,0 +1,50 @@ +"""add_embedding_batch_job_id_to_evaluation_run + +Revision ID: a1b2c3d4e5f6 +Revises: d5747495bd7c +Create Date: 2025-10-22 00:00:00.000000 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "a1b2c3d4e5f6" +down_revision = "d5747495bd7c" +branch_labels = None +depends_on = None + + +def upgrade(): + # Add embedding_batch_job_id column to evaluation_run table + op.add_column( + "evaluation_run", + sa.Column( + "embedding_batch_job_id", + sa.Integer(), + nullable=True, + comment="Reference to the batch_job for embedding-based similarity scoring", + ), + ) + + # Add foreign key constraint to batch_job table + op.create_foreign_key( + "fk_evaluation_run_embedding_batch_job_id", + "evaluation_run", + "batch_job", + ["embedding_batch_job_id"], + ["id"], + ) + + +def downgrade(): + # Drop foreign key constraint + op.drop_constraint( + "fk_evaluation_run_embedding_batch_job_id", + "evaluation_run", + type_="foreignkey", + ) + + # Drop embedding_batch_job_id column + op.drop_column("evaluation_run", "embedding_batch_job_id") diff --git a/backend/app/crud/evaluation_embeddings.py b/backend/app/crud/evaluation_embeddings.py new file mode 100644 index 000000000..cc304cade --- /dev/null +++ b/backend/app/crud/evaluation_embeddings.py @@ -0,0 +1,433 @@ +""" +Embedding-based similarity scoring for evaluation runs. + +This module handles: +1. Building JSONL for embedding batch requests +2. Parsing embedding results from batch API +3. Calculating cosine similarity between embeddings +4. 
logger = logging.getLogger(__name__)

# Supported embedding models, mapped to the dimensionality of the vectors
# they produce (the dimension is informational; only the keys are validated).
VALID_EMBEDDING_MODELS = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
}


def validate_embedding_model(model: str) -> None:
    """
    Validate that the embedding model is supported.

    Args:
        model: The embedding model name.

    Raises:
        ValueError: If the model is not in VALID_EMBEDDING_MODELS.
    """
    if model not in VALID_EMBEDDING_MODELS:
        valid_models = ", ".join(VALID_EMBEDDING_MODELS.keys())
        raise ValueError(
            f"Invalid embedding model '{model}'. Supported models: {valid_models}"
        )


def build_embedding_jsonl(
    results: list[dict[str, Any]],
    embedding_model: str = "text-embedding-3-large",
) -> list[dict[str, Any]]:
    """
    Build JSONL request lines for an OpenAI Embeddings batch.

    Each line embeds both the generated output (input index 0) and the
    ground truth (input index 1) in a single request so the two vectors
    can later be compared with cosine similarity.

    Args:
        results: Parsed evaluation results; each item needs "item_id",
            "generated_output" and "ground_truth" keys.
        embedding_model: OpenAI embedding model to use
            (default: text-embedding-3-large).

    Returns:
        List of batch-request dictionaries (one per usable result).

    Raises:
        ValueError: If embedding_model is not supported.
    """
    validate_embedding_model(embedding_model)

    logger.info(
        f"Building embedding JSONL for {len(results)} items with model {embedding_model}"
    )

    jsonl_data = []

    for result in results:
        item_id = result.get("item_id")
        generated_output = result.get("generated_output", "")
        ground_truth = result.get("ground_truth", "")

        if not item_id:
            logger.warning("Skipping result with no item_id")
            continue

        # Both texts are required; an empty side would make similarity meaningless.
        if not generated_output or not ground_truth:
            logger.warning(f"Skipping item {item_id} - empty output or ground_truth")
            continue

        # One Embeddings request per item; the input array yields two
        # embeddings in a single response, matched back up by index.
        jsonl_data.append(
            {
                # custom_id must be a string per the Batch API contract.
                "custom_id": str(item_id),
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "model": embedding_model,
                    "input": [
                        generated_output,  # Index 0
                        ground_truth,  # Index 1
                    ],
                    "encoding_format": "float",
                },
            }
        )

    logger.info(f"Built {len(jsonl_data)} embedding JSONL lines")
    return jsonl_data


def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Parse embedding batch output into structured embedding pairs.

    Args:
        raw_results: Raw results from the batch provider (list of JSONL lines).

    Returns:
        List of dicts with "item_id", "output_embedding" (index 0 of the
        request input) and "ground_truth_embedding" (index 1).  Lines with
        errors or missing embeddings are logged and skipped.
    """
    logger.info(f"Parsing embedding results from {len(raw_results)} lines")

    embedding_pairs = []

    for line_num, response in enumerate(raw_results, 1):
        try:
            # custom_id carries the dataset item ID set in build_embedding_jsonl.
            item_id = response.get("custom_id")
            if not item_id:
                logger.warning(f"Line {line_num}: No custom_id found, skipping")
                continue

            # Provider-reported per-item failure: skip, don't abort the batch.
            if response.get("error"):
                error_msg = response["error"].get("message", "Unknown error")
                logger.error(f"Item {item_id} had error: {error_msg}")
                continue

            response_body = response.get("response", {}).get("body", {})
            embedding_data = response_body.get("data", [])

            if len(embedding_data) < 2:
                logger.warning(
                    f"Item {item_id}: Expected 2 embeddings, got {len(embedding_data)}"
                )
                continue

            # Match embeddings back to their role by request-input index:
            # index 0 = generated output, index 1 = ground truth.
            output_embedding = None
            ground_truth_embedding = None

            for emb_obj in embedding_data:
                index = emb_obj.get("index")
                embedding = emb_obj.get("embedding")

                if embedding is None:
                    continue

                if index == 0:
                    output_embedding = embedding
                elif index == 1:
                    ground_truth_embedding = embedding

            if output_embedding is None or ground_truth_embedding is None:
                logger.warning(
                    f"Item {item_id}: Missing embeddings (output={output_embedding is not None}, "
                    f"ground_truth={ground_truth_embedding is not None})"
                )
                continue

            embedding_pairs.append(
                {
                    "item_id": item_id,
                    "output_embedding": output_embedding,
                    "ground_truth_embedding": ground_truth_embedding,
                }
            )

        except Exception as e:
            logger.error(f"Line {line_num}: Unexpected error: {e}", exc_info=True)
            continue

    logger.info(
        f"Parsed {len(embedding_pairs)} embedding pairs from {len(raw_results)} lines"
    )
    return embedding_pairs


def calculate_cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """
    Calculate cosine similarity between two vectors using numpy.

    Formula: similarity = dot(vec1, vec2) / (||vec1|| * ||vec2||)

    Args:
        vec1: First embedding vector.
        vec2: Second embedding vector.

    Returns:
        Cosine similarity (range -1 to 1; typically 0 to 1 for embeddings).
        Returns 0.0 if either vector has zero norm.
    """
    v1 = np.array(vec1)
    v2 = np.array(vec2)

    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)

    # A zero vector has no direction; define its similarity as 0.0.
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0

    return float(dot_product / (norm_v1 * norm_v2))


def _empty_similarity_stats() -> dict[str, Any]:
    """Zeroed statistics payload returned when no similarities can be computed."""
    return {
        "cosine_similarity_avg": 0.0,
        "cosine_similarity_min": 0.0,
        "cosine_similarity_max": 0.0,
        "cosine_similarity_std": 0.0,
        "total_pairs": 0,
        "per_item_scores": [],
    }


def calculate_average_similarity(
    embedding_pairs: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Calculate cosine-similarity statistics over all embedding pairs.

    Args:
        embedding_pairs: Output of parse_embedding_results().

    Returns:
        Dict with cosine_similarity_avg/min/max/std, total_pairs, and
        per_item_scores (one {"item_id", "cosine_similarity"} per pair).
        All-zero statistics are returned when no pair yields a score.
    """
    logger.info(f"Calculating similarity for {len(embedding_pairs)} pairs")

    if not embedding_pairs:
        return _empty_similarity_stats()

    similarities = []
    per_item_scores = []

    for pair in embedding_pairs:
        try:
            similarity = calculate_cosine_similarity(
                pair["output_embedding"], pair["ground_truth_embedding"]
            )
            similarities.append(similarity)
            per_item_scores.append(
                {
                    "item_id": pair["item_id"],
                    "cosine_similarity": similarity,
                }
            )
        except Exception as e:
            # A malformed pair should not sink the whole batch's statistics.
            logger.error(
                f"Error calculating similarity for item {pair.get('item_id')}: {e}"
            )
            continue

    if not similarities:
        logger.warning("No valid similarities calculated")
        return _empty_similarity_stats()

    similarities_array = np.array(similarities)

    stats = {
        "cosine_similarity_avg": float(np.mean(similarities_array)),
        "cosine_similarity_min": float(np.min(similarities_array)),
        "cosine_similarity_max": float(np.max(similarities_array)),
        "cosine_similarity_std": float(np.std(similarities_array)),
        "total_pairs": len(similarities),
        "per_item_scores": per_item_scores,
    }

    logger.info(
        f"Calculated similarity stats: avg={stats['cosine_similarity_avg']:.3f}, "
        f"min={stats['cosine_similarity_min']:.3f}, "
        f"max={stats['cosine_similarity_max']:.3f}, "
        f"std={stats['cosine_similarity_std']:.3f}"
    )

    return stats


def start_embedding_batch(
    session: "Session",
    openai_client: "OpenAI",
    eval_run: "EvaluationRun",
    results: list[dict[str, Any]],
) -> "EvaluationRun":
    """
    Start an embedding batch for similarity scoring of an evaluation run.

    Orchestration:
        1. Builds embedding JSONL from the evaluation results
        2. Creates the batch via the generic infrastructure (job_type="embedding")
        3. Links embedding_batch_job_id onto eval_run
        4. Leaves eval_run.status as "processing" until embeddings complete

    Args:
        session: Database session.
        openai_client: Configured OpenAI client.
        eval_run: EvaluationRun database object.
        results: Parsed evaluation results (output + ground_truth pairs).

    Returns:
        Updated EvaluationRun with embedding_batch_job_id populated.

    Raises:
        Exception: If any step fails (caller decides how to update eval_run).
    """
    try:
        logger.info(f"Starting embedding batch for evaluation run {eval_run.id}")

        # Model comes from the run config; fall back to the default on
        # an invalid value rather than failing the whole evaluation.
        embedding_model = eval_run.config.get(
            "embedding_model", "text-embedding-3-large"
        )
        try:
            validate_embedding_model(embedding_model)
        except ValueError as e:
            logger.warning(
                f"Invalid embedding model '{embedding_model}' in config: {e}. "
                f"Falling back to text-embedding-3-large"
            )
            embedding_model = "text-embedding-3-large"

        # Step 1: Build embedding JSONL
        jsonl_data = build_embedding_jsonl(
            results=results,
            embedding_model=embedding_model,
        )

        if not jsonl_data:
            raise ValueError("No valid items to create embeddings for")

        # Step 2: Create batch provider
        provider = OpenAIBatchProvider(client=openai_client)

        # Step 3: Prepare batch configuration
        batch_config = {
            "endpoint": "/v1/embeddings",
            "description": f"Embeddings for evaluation: {eval_run.run_name}",
            "completion_window": "24h",
            "embedding_model": embedding_model,
        }

        # Step 4: Start batch job using the generic infrastructure
        batch_job = start_batch_job(
            session=session,
            provider=provider,
            provider_name="openai",
            job_type="embedding",
            organization_id=eval_run.organization_id,
            project_id=eval_run.project_id,
            jsonl_data=jsonl_data,
            config=batch_config,
        )

        # Step 5: Link embedding_batch_job to the evaluation run.
        # Status stays "processing" until the embedding batch completes.
        eval_run.embedding_batch_job_id = batch_job.id
        eval_run.updated_at = now()

        session.add(eval_run)
        session.commit()
        session.refresh(eval_run)

        logger.info(
            f"Successfully started embedding batch: batch_job_id={batch_job.id}, "
            f"provider_batch_id={batch_job.provider_batch_id} "
            f"for evaluation run {eval_run.id} with {batch_job.total_items} items"
        )

        return eval_run

    except Exception as e:
        logger.error(f"Failed to start embedding batch: {e}", exc_info=True)
        # Don't update eval_run status here - let caller decide
        raise
langfuse import Langfuse from openai import OpenAI from sqlmodel import Session, select -from typing import Any from app.core.batch.openai_provider import OpenAIBatchProvider from app.core.util import configure_langfuse, configure_openai, now from app.crud.batch_job import get_batch_job -from app.crud.batch_operations import upload_batch_results_to_s3, download_batch_results +from app.crud.batch_operations import download_batch_results, upload_batch_results_to_s3 from app.crud.credentials import get_provider_credential from app.crud.evaluation_batch import fetch_dataset_items +from app.crud.evaluation_embeddings import ( + calculate_average_similarity, + parse_embedding_results, + start_embedding_batch, +) from app.crud.evaluation_langfuse import create_langfuse_dataset_run from app.models import EvaluationRun @@ -157,7 +162,7 @@ async def process_completed_evaluation( 1. Downloads batch output from provider 2. Parses results into question/output/ground_truth format 3. Creates Langfuse dataset run with traces - 4. Updates evaluation_run with completion status + 4. 
Starts embedding batch for similarity scoring (keeps status as "processing") Args: eval_run: EvaluationRun database object @@ -166,7 +171,7 @@ async def process_completed_evaluation( langfuse: Configured Langfuse client Returns: - Updated EvaluationRun object + Updated EvaluationRun object (with embedding_batch_job_id set) Raises: Exception: If processing fails @@ -227,22 +232,43 @@ async def process_completed_evaluation( results=results, ) - # Step 6: Mark evaluation as completed - logger.info("Step 5: Marking evaluation as completed") - eval_run.status = "completed" - eval_run.updated_at = now() - # Set S3 URL if upload was successful if s3_url: eval_run.s3_url = s3_url + session.add(eval_run) + session.commit() - session.add(eval_run) - session.commit() - session.refresh(eval_run) + # Step 6: Start embedding batch for similarity scoring + logger.info("Step 6: Starting embedding batch for similarity scoring") + try: + eval_run = start_embedding_batch( + session=session, + openai_client=openai_client, + eval_run=eval_run, + results=results, + ) + logger.info( + f"Successfully started embedding batch for evaluation run {eval_run.id}: " + f"embedding_batch_job_id={eval_run.embedding_batch_job_id}" + ) + # Note: Status remains "processing" until embeddings complete + + except Exception as e: + logger.error( + f"Failed to start embedding batch for run {eval_run.id}: {e}", + exc_info=True, + ) + # Don't fail the entire evaluation, just mark as completed without embeddings + eval_run.status = "completed" + eval_run.error_message = f"Embeddings failed: {str(e)}" + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) logger.info( - f"Successfully completed processing for evaluation run {eval_run.id}: " - f"{len(results)} items processed" + f"Successfully processed evaluation run {eval_run.id}: " + f"{len(results)} items processed, embedding batch started" ) return eval_run @@ -262,6 +288,121 @@ async def 
async def process_completed_embedding_batch(
    eval_run: "EvaluationRun",
    session: "Session",
    openai_client: "OpenAI",
) -> "EvaluationRun":
    """
    Process a completed embedding batch and calculate similarity scores.

    Steps:
        1. Load the embedding batch job linked to the run
        2. Download embedding batch results from the provider
        3. Parse embeddings into (output, ground_truth) pairs
        4. Calculate cosine-similarity statistics
        5. Store the scores on eval_run and mark the evaluation completed

    Args:
        eval_run: EvaluationRun database object
        session: Database session
        openai_client: Configured OpenAI client

    Returns:
        Updated EvaluationRun with similarity scores.  On failure the run is
        still marked "completed", with error_message recording the failure
        (best-effort: embeddings are supplementary to the evaluation itself).
    """
    logger.info(f"Processing completed embedding batch for run {eval_run.id}")

    try:
        # Step 1: Load the embedding batch job
        if not eval_run.embedding_batch_job_id:
            raise ValueError(
                f"EvaluationRun {eval_run.id} has no embedding_batch_job_id"
            )

        embedding_batch_job = get_batch_job(
            session=session, batch_job_id=eval_run.embedding_batch_job_id
        )
        if not embedding_batch_job:
            raise ValueError(
                f"Embedding BatchJob {eval_run.embedding_batch_job_id} not found for evaluation {eval_run.id}"
            )

        # Step 2: Create provider and download results
        logger.info(
            f"Step 2: Downloading embedding batch results for batch_job {embedding_batch_job.id}"
        )
        provider = OpenAIBatchProvider(client=openai_client)
        raw_results = download_batch_results(
            provider=provider, batch_job=embedding_batch_job
        )

        # Step 3: Parse embedding results
        logger.info("Step 3: Parsing embedding results")
        embedding_pairs = parse_embedding_results(raw_results=raw_results)

        if not embedding_pairs:
            raise ValueError("No valid embedding pairs found in batch output")

        # Step 4: Calculate similarity scores
        logger.info("Step 4: Calculating cosine similarity scores")
        similarity_stats = calculate_average_similarity(
            embedding_pairs=embedding_pairs
        )

        # Step 5: Update evaluation_run with scores
        logger.info("Step 5: Updating evaluation run with similarity scores")

        cosine_scores: dict = {
            "avg": similarity_stats["cosine_similarity_avg"],
            "min": similarity_stats["cosine_similarity_min"],
            "max": similarity_stats["cosine_similarity_max"],
            "std": similarity_stats["cosine_similarity_std"],
            "total_pairs": similarity_stats["total_pairs"],
        }

        # Only store per-item scores inline when the list is small enough.
        per_item_scores = similarity_stats.get("per_item_scores", [])
        if len(per_item_scores) <= 100:
            cosine_scores["per_item_scores"] = per_item_scores

        # Reassign (rather than mutate) the JSON column: in-place mutation of
        # a plain JSON Column is not tracked by SQLAlchemy, so the update
        # would silently be lost on commit.
        eval_run.score = {
            **(eval_run.score or {}),
            "cosine_similarity": cosine_scores,
        }

        # Mark the evaluation as completed
        eval_run.status = "completed"
        eval_run.updated_at = now()

        session.add(eval_run)
        session.commit()
        session.refresh(eval_run)

        logger.info(
            f"Successfully completed embedding processing for evaluation run {eval_run.id}: "
            f"avg_similarity={similarity_stats['cosine_similarity_avg']:.3f}"
        )

        return eval_run

    except Exception as e:
        logger.error(
            f"Failed to process completed embedding batch for run {eval_run.id}: {e}",
            exc_info=True,
        )
        # Best-effort: mark completed with an error message instead of failing.
        eval_run.status = "completed"
        eval_run.error_message = f"Embedding processing failed: {str(e)}"
        eval_run.updated_at = now()
        session.add(eval_run)
        session.commit()
        session.refresh(eval_run)
        return eval_run
If embedding_batch_job_id exists, checks and processes embedding batch first + 2. Otherwise, checks and processes the main response batch + 3. Triggers appropriate processing based on batch completion status Args: eval_run: EvaluationRun database object @@ -288,7 +431,7 @@ async def check_and_process_evaluation( "previous_status": "processing", "current_status": "completed", "batch_status": "completed", - "action": "processed" | "updated" | "failed" | "no_change" + "action": "processed" | "embeddings_completed" | "embeddings_failed" | "failed" | "no_change" } """ logger.info(f"Checking evaluation run {eval_run.id}") @@ -296,7 +439,90 @@ async def check_and_process_evaluation( previous_status = eval_run.status try: - # Get batch_job + # Check if we need to process embedding batch first + if eval_run.embedding_batch_job_id and eval_run.status == "processing": + logger.info( + f"Checking embedding batch for evaluation run {eval_run.id}: " + f"embedding_batch_job_id={eval_run.embedding_batch_job_id}" + ) + + embedding_batch_job = get_batch_job( + session=session, batch_job_id=eval_run.embedding_batch_job_id + ) + + if embedding_batch_job: + # Poll embedding batch status + provider = OpenAIBatchProvider(client=openai_client) + from app.crud.batch_operations import poll_batch_status + + poll_batch_status( + session=session, provider=provider, batch_job=embedding_batch_job + ) + session.refresh(embedding_batch_job) + + embedding_status = embedding_batch_job.provider_status + + if embedding_status == "completed": + logger.info( + f"Embedding batch {embedding_batch_job.provider_batch_id} completed, " + f"processing similarity scores..." 
+ ) + + await process_completed_embedding_batch( + eval_run=eval_run, + session=session, + openai_client=openai_client, + ) + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": eval_run.status, + "provider_status": embedding_status, + "action": "embeddings_completed", + } + + elif embedding_status in ["failed", "expired", "cancelled"]: + logger.error( + f"Embedding batch {embedding_batch_job.provider_batch_id} failed: " + f"{embedding_batch_job.error_message}" + ) + # Mark as completed without embeddings + eval_run.status = "completed" + eval_run.error_message = ( + f"Embedding batch failed: {embedding_batch_job.error_message}" + ) + eval_run.updated_at = now() + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": "completed", + "provider_status": embedding_status, + "action": "embeddings_failed", + } + + else: + # Embedding batch still processing + logger.info( + f"Embedding batch {embedding_batch_job.provider_batch_id} still processing " + f"(status={embedding_status})" + ) + return { + "run_id": eval_run.id, + "run_name": eval_run.run_name, + "previous_status": previous_status, + "current_status": eval_run.status, + "provider_status": embedding_status, + "action": "no_change", + } + + # Get batch_job (main response batch) if not eval_run.batch_job_id: raise ValueError(f"EvaluationRun {eval_run.id} has no batch_job_id") diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index e15268574..ab4597d1d 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,9 +1,10 @@ from datetime import datetime -from typing import Optional, Any +from typing import Any, Optional from pydantic import BaseModel, Field -from sqlalchemy import Column, Text, JSON -from sqlmodel import Field as SQLField, 
Relationship, SQLModel +from sqlalchemy import JSON, Column, Text +from sqlmodel import Field as SQLField +from sqlmodel import Relationship, SQLModel from app.core.util import now @@ -77,11 +78,16 @@ class EvaluationRun(SQLModel, table=True): description="Evaluation configuration", ) - # Batch job reference - batch_job_id: Optional[int] = SQLField( + # Batch job references + batch_job_id: int | None = SQLField( default=None, foreign_key="batch_job.id", - description="Reference to the batch_job that processes this evaluation", + description="Reference to the batch_job that processes this evaluation (responses)", + ) + embedding_batch_job_id: int | None = SQLField( + default=None, + foreign_key="batch_job.id", + description="Reference to the batch_job for embedding-based similarity scoring", ) # Output/Status fields (updated by system during processing) @@ -89,7 +95,7 @@ class EvaluationRun(SQLModel, table=True): default="pending", description="Overall evaluation status: pending, processing, completed, failed", ) - s3_url: Optional[str] = SQLField( + s3_url: str | None = SQLField( default=None, description="S3 URL of processed evaluation results for future reference", ) @@ -98,14 +104,14 @@ class EvaluationRun(SQLModel, table=True): ) # Score field - dict requires sa_column - score: Optional[dict[str, Any]] = SQLField( + score: dict[str, Any] | None = SQLField( default=None, sa_column=Column(JSON, nullable=True), description="Evaluation scores (e.g., correctness, cosine_similarity, etc.)", ) # Error message field - error_message: Optional[str] = SQLField( + error_message: str | None = SQLField( default=None, sa_column=Column(Text, nullable=True), description="Error message if failed", @@ -126,7 +132,14 @@ class EvaluationRun(SQLModel, table=True): # Relationships project: "Project" = Relationship(back_populates="evaluation_runs") organization: "Organization" = Relationship(back_populates="evaluation_runs") - batch_job: Optional["BatchJob"] = Relationship() # noqa: 
"""Tests for evaluation embeddings functionality."""

import numpy as np
import pytest

from app.crud.evaluation_embeddings import (
    build_embedding_jsonl,
    calculate_average_similarity,
    calculate_cosine_similarity,
    parse_embedding_results,
)


def _result(item_id=None, output="Output", truth="Truth", question="Test?"):
    """Build an evaluation-result row; item_id is omitted when None."""
    row = {"question": question, "generated_output": output, "ground_truth": truth}
    if item_id is not None:
        row["item_id"] = item_id
    return row


def _response(item_id, vectors):
    """Build a raw batch response line carrying indexed embedding vectors."""
    data = [{"index": i, "embedding": v} for i, v in enumerate(vectors)]
    return {"custom_id": item_id, "response": {"body": {"data": data}}}


def _pair(item_id, out_vec, truth_vec):
    """Build a parsed embedding pair."""
    return {
        "item_id": item_id,
        "output_embedding": out_vec,
        "ground_truth_embedding": truth_vec,
    }


# ---------------------------------------------------------------------------
# build_embedding_jsonl
# ---------------------------------------------------------------------------


def test_build_embedding_jsonl_basic():
    """Two well-formed results produce two correctly-shaped batch lines."""
    rows = [
        _result("item_1", "The answer is 4", "4", "What is 2+2?"),
        _result("item_2", "Paris", "Paris", "What is the capital of France?"),
    ]

    lines = build_embedding_jsonl(rows)

    assert len(lines) == 2
    first = lines[0]
    assert first["custom_id"] == "item_1"
    assert first["method"] == "POST"
    assert first["url"] == "/v1/embeddings"
    assert first["body"]["model"] == "text-embedding-3-large"
    assert first["body"]["input"] == ["The answer is 4", "4"]
    assert first["body"]["encoding_format"] == "float"


def test_build_embedding_jsonl_custom_model():
    """The embedding_model parameter overrides the default model."""
    lines = build_embedding_jsonl(
        [_result("item_1")], embedding_model="text-embedding-3-small"
    )

    assert len(lines) == 1
    assert lines[0]["body"]["model"] == "text-embedding-3-small"


def test_build_embedding_jsonl_skips_empty():
    """Rows with an empty output or ground truth are dropped."""
    rows = [
        _result("item_1", output=""),
        _result("item_2", truth=""),
        _result("item_3"),
    ]

    lines = build_embedding_jsonl(rows)

    assert len(lines) == 1
    assert lines[0]["custom_id"] == "item_3"


def test_build_embedding_jsonl_missing_item_id():
    """Rows without an item_id are dropped."""
    lines = build_embedding_jsonl([_result(None), _result("item_2")])

    assert len(lines) == 1
    assert lines[0]["custom_id"] == "item_2"


# ---------------------------------------------------------------------------
# parse_embedding_results
# ---------------------------------------------------------------------------


def test_parse_embedding_results_basic():
    """Both embeddings are extracted and mapped by input index."""
    raw = [
        _response("item_1", [[0.1, 0.2, 0.3], [0.15, 0.22, 0.32]]),
        _response("item_2", [[0.5, 0.6, 0.7], [0.55, 0.65, 0.75]]),
    ]

    pairs = parse_embedding_results(raw)

    assert len(pairs) == 2
    assert pairs[0]["item_id"] == "item_1"
    assert pairs[0]["output_embedding"] == [0.1, 0.2, 0.3]
    assert pairs[0]["ground_truth_embedding"] == [0.15, 0.22, 0.32]
    assert pairs[1]["item_id"] == "item_2"
    assert pairs[1]["output_embedding"] == [0.5, 0.6, 0.7]
    assert pairs[1]["ground_truth_embedding"] == [0.55, 0.65, 0.75]


def test_parse_embedding_results_with_error():
    """Lines with a provider error are skipped."""
    raw = [
        {"custom_id": "item_1", "error": {"message": "Rate limit exceeded"}},
        _response("item_2", [[0.1, 0.2], [0.15, 0.22]]),
    ]

    pairs = parse_embedding_results(raw)

    assert len(pairs) == 1
    assert pairs[0]["item_id"] == "item_2"


def test_parse_embedding_results_missing_embedding():
    """Lines missing one of the two embeddings are skipped."""
    raw = [
        _response("item_1", [[0.1, 0.2]]),  # index 1 missing
        _response("item_2", [[0.1, 0.2], [0.15, 0.22]]),
    ]

    pairs = parse_embedding_results(raw)

    assert len(pairs) == 1
    assert pairs[0]["item_id"] == "item_2"


# ---------------------------------------------------------------------------
# calculate_cosine_similarity
# ---------------------------------------------------------------------------


def test_calculate_cosine_similarity_identical():
    """Identical vectors score 1.0."""
    assert calculate_cosine_similarity(
        [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]
    ) == pytest.approx(1.0)


def test_calculate_cosine_similarity_orthogonal():
    """Orthogonal vectors score 0.0."""
    assert calculate_cosine_similarity(
        [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]
    ) == pytest.approx(0.0)


def test_calculate_cosine_similarity_opposite():
    """Opposite vectors score -1.0."""
    assert calculate_cosine_similarity(
        [1.0, 0.0, 0.0], [-1.0, 0.0, 0.0]
    ) == pytest.approx(-1.0)


def test_calculate_cosine_similarity_partial():
    """Vectors at 45 degrees score about cos(45°) ≈ 0.707."""
    assert calculate_cosine_similarity(
        [1.0, 1.0, 0.0], [1.0, 0.0, 0.0]
    ) == pytest.approx(0.707, abs=0.01)


def test_calculate_cosine_similarity_zero_vector():
    """A zero vector yields 0.0 (no division by zero)."""
    assert calculate_cosine_similarity([0.0, 0.0, 0.0], [1.0, 0.0, 0.0]) == 0.0


# ---------------------------------------------------------------------------
# calculate_average_similarity
# ---------------------------------------------------------------------------


def test_calculate_average_similarity_basic():
    """Aggregate statistics over a mixed set of similarities."""
    pairs = [
        _pair("item_1", [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]),  # sim = 1.0
        _pair("item_2", [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]),  # sim = 0.0
        _pair("item_3", [1.0, 1.0, 0.0], [1.0, 0.0, 0.0]),  # sim ≈ 0.707
    ]

    stats = calculate_average_similarity(pairs)

    assert stats["total_pairs"] == 3
    assert stats["cosine_similarity_min"] == pytest.approx(0.0)
    assert stats["cosine_similarity_max"] == pytest.approx(1.0)
    # Mean of [1.0, 0.0, 0.707] ≈ 0.569
    assert stats["cosine_similarity_avg"] == pytest.approx(0.569, abs=0.01)
    assert "cosine_similarity_std" in stats
    assert len(stats["per_item_scores"]) == 3


def test_calculate_average_similarity_empty():
    """An empty input yields all-zero statistics."""
    stats = calculate_average_similarity([])

    assert stats["total_pairs"] == 0
    assert stats["cosine_similarity_avg"] == 0.0
    assert stats["cosine_similarity_min"] == 0.0
    assert stats["cosine_similarity_max"] == 0.0
    assert stats["per_item_scores"] == []


def test_calculate_average_similarity_per_item_scores():
    """Each pair contributes one per-item score entry."""
    pairs = [
        _pair("item_1", [1.0, 0.0], [1.0, 0.0]),
        _pair("item_2", [0.0, 1.0], [0.0, 1.0]),
    ]

    stats = calculate_average_similarity(pairs)
    scores = stats["per_item_scores"]

    assert len(scores) == 2
    assert scores[0]["item_id"] == "item_1"
    assert scores[0]["cosine_similarity"] == pytest.approx(1.0)
    assert scores[1]["item_id"] == "item_2"
    assert scores[1]["cosine_similarity"] == pytest.approx(1.0)


def test_calculate_average_similarity_statistics():
    """avg/min/max/std are all computed from the similarity values."""
    pairs = [
        _pair("item_1", [1.0, 0.0], [1.0, 0.0]),  # sim = 1.0
        _pair("item_2", [1.0, 0.0], [0.0, 1.0]),  # sim = 0.0
        _pair("item_3", [1.0, 0.0], [1.0, 0.0]),  # sim = 1.0
        _pair("item_4", [1.0, 0.0], [0.0, 1.0]),  # sim = 0.0
    ]

    stats = calculate_average_similarity(pairs)

    # Similarities = [1.0, 0.0, 1.0, 0.0]
    assert stats["cosine_similarity_avg"] == pytest.approx(0.5)
    assert stats["cosine_similarity_min"] == pytest.approx(0.0)
    assert stats["cosine_similarity_max"] == pytest.approx(1.0)
    # Population std of [1, 0, 1, 0] is 0.5
    assert stats["cosine_similarity_std"] == pytest.approx(0.5)
    assert stats["total_pairs"] == 4
">=2.3.2" }, From 6979e3354bfe6fd11da411b3d613da9e949037dc Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 23 Oct 2025 22:53:53 +0530 Subject: [PATCH 26/64] first stab at pushing cosine to langfuse --- ...dd_langfuse_trace_ids_to_evaluation_run.py | 35 ++++++ backend/app/crud/evaluation_langfuse.py | 102 +++++++++++++++++- backend/app/crud/evaluation_processing.py | 50 +++++++-- backend/app/models/evaluation.py | 8 ++ 4 files changed, 186 insertions(+), 9 deletions(-) create mode 100644 backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py diff --git a/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py b/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py new file mode 100644 index 000000000..1ce058db4 --- /dev/null +++ b/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py @@ -0,0 +1,35 @@ +"""add_langfuse_trace_ids_to_evaluation_run + +Revision ID: 2cc3c67356a8 +Revises: a1b2c3d4e5f6 +Create Date: 2025-10-23 00:00:00.000000 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. 
+revision = "2cc3c67356a8" +down_revision = "a1b2c3d4e5f6" +branch_labels = None +depends_on = None + + +def upgrade(): + # Add langfuse_trace_ids column to evaluation_run table + op.add_column( + "evaluation_run", + sa.Column( + "langfuse_trace_ids", + postgresql.JSON(astext_type=sa.Text()), + nullable=True, + comment="Mapping of item_id to Langfuse trace_id for updating traces with scores", + ), + ) + + +def downgrade(): + # Drop langfuse_trace_ids column + op.drop_column("evaluation_run", "langfuse_trace_ids") diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py index ed68192de..3d1148e27 100644 --- a/backend/app/crud/evaluation_langfuse.py +++ b/backend/app/crud/evaluation_langfuse.py @@ -20,7 +20,7 @@ def create_langfuse_dataset_run( dataset_name: str, run_name: str, results: list[dict[str, Any]], -) -> None: +) -> dict[str, str]: """ Create a dataset run in Langfuse with traces for each evaluation item. @@ -28,6 +28,7 @@ def create_langfuse_dataset_run( 1. Gets the dataset from Langfuse (which already exists) 2. For each result, creates a trace linked to the dataset item 3. Logs input (question), output (generated_output), and expected (ground_truth) + 4. Returns a mapping of item_id -> trace_id for later score updates Args: langfuse: Configured Langfuse client @@ -44,6 +45,9 @@ def create_langfuse_dataset_run( ... 
] + Returns: + dict[str, str]: Mapping of item_id to Langfuse trace_id + Raises: Exception: If Langfuse operations fail """ @@ -62,6 +66,7 @@ def create_langfuse_dataset_run( created_traces = 0 skipped_items = 0 + trace_id_mapping = {} # Store item_id -> trace_id mapping # Create a trace for each result for idx, result in enumerate(results, 1): @@ -93,6 +98,8 @@ def create_langfuse_dataset_run( }, ) created_traces += 1 + # Store the trace_id for later score updates + trace_id_mapping[item_id] = trace_id if idx % 10 == 0: logger.info( @@ -111,11 +118,102 @@ def create_langfuse_dataset_run( logger.info( f"Successfully created Langfuse dataset run '{run_name}': " - f"{created_traces} traces created, {skipped_items} items skipped" + f"{created_traces} traces created, {skipped_items} items skipped, " + f"{len(trace_id_mapping)} trace IDs captured" ) + return trace_id_mapping + except Exception as e: logger.error( f"Failed to create Langfuse dataset run '{run_name}': {e}", exc_info=True ) raise + + +def update_traces_with_cosine_scores( + langfuse: Langfuse, + trace_id_mapping: dict[str, str], + per_item_scores: list[dict[str, Any]], +) -> None: + """ + Update Langfuse traces with cosine similarity scores. + + This function adds custom "cosine_similarity" scores to traces at the trace level, + allowing them to be visualized in the Langfuse UI. + + Args: + langfuse: Configured Langfuse client + trace_id_mapping: Mapping of item_id to Langfuse trace_id + per_item_scores: List of per-item score dictionaries from calculate_average_similarity() + Format: [ + { + "item_id": "item_123", + "cosine_similarity": 0.95 + }, + ... + ] + + Note: + This function logs errors but does not raise exceptions to avoid blocking + evaluation completion if Langfuse updates fail. 
+ """ + logger.info(f"Updating {len(per_item_scores)} traces with cosine similarity scores") + + updated_count = 0 + error_count = 0 + + for score_item in per_item_scores: + item_id = score_item.get("item_id") + cosine_score = score_item.get("cosine_similarity") + + if not item_id or cosine_score is None: + logger.warning( + f"Skipping score update: missing item_id or cosine_similarity in {score_item}" + ) + error_count += 1 + continue + + trace_id = trace_id_mapping.get(item_id) + if not trace_id: + logger.warning( + f"Trace ID not found for item_id '{item_id}', skipping score update" + ) + error_count += 1 + continue + + try: + # Add score to the trace using Langfuse score API + langfuse.score( + trace_id=trace_id, + name="cosine_similarity", + value=cosine_score, + comment="Cosine similarity between generated output and ground truth embeddings", + ) + updated_count += 1 + + if updated_count % 10 == 0: + logger.info( + f"Progress: Updated {updated_count}/{len(per_item_scores)} traces with scores" + ) + + except Exception as e: + logger.error( + f"Failed to add score for trace {trace_id} (item {item_id}): {e}", + exc_info=True, + ) + error_count += 1 + continue + + # Flush to ensure all scores are sent + try: + langfuse.flush() + logger.info( + f"Successfully updated traces with scores: " + f"{updated_count} traces updated, {error_count} errors" + ) + except Exception as e: + logger.error( + f"Failed to flush Langfuse scores: {e}", + exc_info=True, + ) diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index 745b0b3b0..537d0e1c7 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -29,7 +29,10 @@ parse_embedding_results, start_embedding_batch, ) -from app.crud.evaluation_langfuse import create_langfuse_dataset_run +from app.crud.evaluation_langfuse import ( + create_langfuse_dataset_run, + update_traces_with_cosine_scores, +) from app.models import EvaluationRun 
logger = logging.getLogger(__name__) @@ -225,18 +228,22 @@ async def process_completed_evaluation( # Step 5: Create Langfuse dataset run with traces logger.info("Step 4: Creating Langfuse dataset run with traces") - create_langfuse_dataset_run( + trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, run_name=eval_run.run_name, results=results, ) - # Set S3 URL if upload was successful + # Store trace IDs and S3 URL in database + eval_run.langfuse_trace_ids = trace_id_mapping if s3_url: eval_run.s3_url = s3_url - session.add(eval_run) - session.commit() + session.add(eval_run) + session.commit() + logger.info( + f"Stored {len(trace_id_mapping)} trace IDs in evaluation run {eval_run.id}" + ) # Step 6: Start embedding batch for similarity scoring logger.info("Step 6: Starting embedding batch for similarity scoring") @@ -292,6 +299,7 @@ async def process_completed_embedding_batch( eval_run: EvaluationRun, session: Session, openai_client: OpenAI, + langfuse: Langfuse, ) -> EvaluationRun: """ Process a completed embedding batch and calculate similarity scores. @@ -302,12 +310,14 @@ async def process_completed_embedding_batch( 3. Calculates cosine similarity for each pair 4. Calculates average and statistics 5. Updates eval_run.score with results - 6. Marks evaluation as completed + 6. Updates Langfuse traces with per-item cosine similarity scores + 7. 
Marks evaluation as completed Args: eval_run: EvaluationRun database object session: Database session openai_client: Configured OpenAI client + langfuse: Configured Langfuse client Returns: Updated EvaluationRun object with similarity scores @@ -373,7 +383,32 @@ async def process_completed_embedding_batch( "per_item_scores" ] - # Step 6: Mark evaluation as completed + # Step 6: Update Langfuse traces with cosine similarity scores + logger.info("Step 5: Updating Langfuse traces with cosine similarity scores") + per_item_scores = similarity_stats.get("per_item_scores", []) + if per_item_scores and eval_run.langfuse_trace_ids: + try: + update_traces_with_cosine_scores( + langfuse=langfuse, + trace_id_mapping=eval_run.langfuse_trace_ids, + per_item_scores=per_item_scores, + ) + logger.info( + f"Successfully updated {len(per_item_scores)} Langfuse traces with scores" + ) + except Exception as e: + # Log error but don't fail the evaluation + logger.error( + f"Failed to update Langfuse traces with scores: {e}", + exc_info=True, + ) + else: + if not per_item_scores: + logger.warning("No per-item scores available to update Langfuse traces") + if not eval_run.langfuse_trace_ids: + logger.warning("No trace IDs available to update Langfuse traces") + + # Step 7: Mark evaluation as completed eval_run.status = "completed" eval_run.updated_at = now() @@ -472,6 +507,7 @@ async def check_and_process_evaluation( eval_run=eval_run, session=session, openai_client=openai_client, + langfuse=langfuse, ) return { diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index ab4597d1d..e8de3b827 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -110,6 +110,13 @@ class EvaluationRun(SQLModel, table=True): description="Evaluation scores (e.g., correctness, cosine_similarity, etc.)", ) + # Langfuse trace IDs mapping (item_id -> trace_id) + langfuse_trace_ids: dict[str, str] | None = SQLField( + default=None, + 
sa_column=Column(JSON, nullable=True), + description="Mapping of item_id to Langfuse trace_id for updating traces with scores", + ) + # Error message field error_message: str | None = SQLField( default=None, @@ -166,6 +173,7 @@ class EvaluationRunPublic(SQLModel): s3_url: str | None total_items: int score: dict[str, Any] | None + langfuse_trace_ids: dict[str, str] | None error_message: str | None organization_id: int project_id: int From 26ee6f0bffba1eecafdd4c442b64c9dfbc32963e Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 23 Oct 2025 23:49:32 +0530 Subject: [PATCH 27/64] cleanup logs --- backend/app/crud/evaluation_langfuse.py | 72 ++------------ backend/app/crud/evaluation_processing.py | 114 ++++++---------------- backend/app/models/evaluation.py | 8 +- 3 files changed, 45 insertions(+), 149 deletions(-) diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py index 3d1148e27..a8558d2fe 100644 --- a/backend/app/crud/evaluation_langfuse.py +++ b/backend/app/crud/evaluation_langfuse.py @@ -59,35 +59,24 @@ def create_langfuse_dataset_run( try: # Get the dataset dataset = langfuse.get_dataset(dataset_name) - logger.info(f"Found dataset '{dataset_name}' with {len(dataset.items)} items") - - # Create a map of item IDs for quick lookup dataset_items_map = {item.id: item for item in dataset.items} - created_traces = 0 - skipped_items = 0 - trace_id_mapping = {} # Store item_id -> trace_id mapping + trace_id_mapping = {} # Create a trace for each result - for idx, result in enumerate(results, 1): + for result in results: item_id = result["item_id"] question = result["question"] generated_output = result["generated_output"] ground_truth = result["ground_truth"] - # Get the dataset item dataset_item = dataset_items_map.get(item_id) if not dataset_item: - logger.warning( - f"Item {idx}/{len(results)}: Dataset item '{item_id}' not found, skipping" - ) - skipped_items += 1 + logger.warning(f"Dataset item '{item_id}' not found, 
skipping") continue try: - # Use item.observe to create a trace linked to the dataset item with dataset_item.observe(run_name=run_name) as trace_id: - # Update the trace with input and output langfuse.trace( id=trace_id, input={"question": question}, @@ -97,29 +86,17 @@ def create_langfuse_dataset_run( "item_id": item_id, }, ) - created_traces += 1 - # Store the trace_id for later score updates trace_id_mapping[item_id] = trace_id - if idx % 10 == 0: - logger.info( - f"Progress: Created {idx}/{len(results)} traces for run '{run_name}'" - ) - except Exception as e: logger.error( f"Failed to create trace for item {item_id}: {e}", exc_info=True ) - skipped_items += 1 continue - # Flush to ensure all traces are sent langfuse.flush() - logger.info( - f"Successfully created Langfuse dataset run '{run_name}': " - f"{created_traces} traces created, {skipped_items} items skipped, " - f"{len(trace_id_mapping)} trace IDs captured" + f"Created Langfuse dataset run '{run_name}' with {len(trace_id_mapping)} traces" ) return trace_id_mapping @@ -158,62 +135,25 @@ def update_traces_with_cosine_scores( This function logs errors but does not raise exceptions to avoid blocking evaluation completion if Langfuse updates fail. 
""" - logger.info(f"Updating {len(per_item_scores)} traces with cosine similarity scores") - - updated_count = 0 - error_count = 0 - for score_item in per_item_scores: item_id = score_item.get("item_id") cosine_score = score_item.get("cosine_similarity") - - if not item_id or cosine_score is None: - logger.warning( - f"Skipping score update: missing item_id or cosine_similarity in {score_item}" - ) - error_count += 1 - continue - trace_id = trace_id_mapping.get(item_id) + if not trace_id: - logger.warning( - f"Trace ID not found for item_id '{item_id}', skipping score update" - ) - error_count += 1 continue try: - # Add score to the trace using Langfuse score API langfuse.score( trace_id=trace_id, name="cosine_similarity", value=cosine_score, comment="Cosine similarity between generated output and ground truth embeddings", ) - updated_count += 1 - - if updated_count % 10 == 0: - logger.info( - f"Progress: Updated {updated_count}/{len(per_item_scores)} traces with scores" - ) - except Exception as e: logger.error( f"Failed to add score for trace {trace_id} (item {item_id}): {e}", exc_info=True, ) - error_count += 1 - continue - # Flush to ensure all scores are sent - try: - langfuse.flush() - logger.info( - f"Successfully updated traces with scores: " - f"{updated_count} traces updated, {error_count} errors" - ) - except Exception as e: - logger.error( - f"Failed to flush Langfuse scores: {e}", - exc_info=True, - ) + langfuse.flush() diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index 537d0e1c7..bc6f174df 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -179,7 +179,8 @@ async def process_completed_evaluation( Raises: Exception: If processing fails """ - logger.info(f"Processing completed evaluation for run {eval_run.id}") + log_prefix = f"[org={eval_run.organization_id}][project={eval_run.project_id}][eval={eval_run.id}]" + logger.info(f"{log_prefix} 
Processing completed evaluation") try: # Step 1: Get batch_job @@ -193,7 +194,9 @@ async def process_completed_evaluation( ) # Step 2: Create provider and download results - logger.info(f"Step 1: Downloading batch results for batch_job {batch_job.id}") + logger.info( + f"{log_prefix} Downloading batch results for batch_job {batch_job.id}" + ) provider = OpenAIBatchProvider(client=openai_client) raw_results = download_batch_results(provider=provider, batch_job=batch_job) @@ -203,22 +206,19 @@ async def process_completed_evaluation( s3_url = upload_batch_results_to_s3( batch_job=batch_job, results=raw_results ) - logger.info(f"Uploaded evaluation results to S3: {s3_url}") except Exception as s3_error: - logger.warning( - f"S3 upload failed (AWS credentials may not be configured): {s3_error}. " - f"Continuing without S3 storage.", - exc_info=True, - ) + logger.warning(f"{log_prefix} S3 upload failed: {s3_error}") # Step 3: Fetch dataset items (needed for matching ground truth) - logger.info(f"Step 2: Fetching dataset items for '{eval_run.dataset_name}'") + logger.info( + f"{log_prefix} Fetching dataset items for '{eval_run.dataset_name}'" + ) dataset_items = fetch_dataset_items( langfuse=langfuse, dataset_name=eval_run.dataset_name ) # Step 4: Parse evaluation results - logger.info("Step 3: Parsing evaluation results") + logger.info(f"{log_prefix} Parsing evaluation results") results = parse_evaluation_output( raw_results=raw_results, dataset_items=dataset_items ) @@ -227,7 +227,7 @@ async def process_completed_evaluation( raise ValueError("No valid results found in batch output") # Step 5: Create Langfuse dataset run with traces - logger.info("Step 4: Creating Langfuse dataset run with traces") + logger.info(f"{log_prefix} Creating Langfuse dataset run with traces") trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, @@ -241,12 +241,9 @@ async def process_completed_evaluation( eval_run.s3_url = s3_url 
session.add(eval_run) session.commit() - logger.info( - f"Stored {len(trace_id_mapping)} trace IDs in evaluation run {eval_run.id}" - ) - # Step 6: Start embedding batch for similarity scoring - logger.info("Step 6: Starting embedding batch for similarity scoring") + # Step 5: Start embedding batch for similarity scoring + logger.info(f"{log_prefix} Starting embedding batch for similarity scoring") try: eval_run = start_embedding_batch( session=session, @@ -254,15 +251,11 @@ async def process_completed_evaluation( eval_run=eval_run, results=results, ) - logger.info( - f"Successfully started embedding batch for evaluation run {eval_run.id}: " - f"embedding_batch_job_id={eval_run.embedding_batch_job_id}" - ) # Note: Status remains "processing" until embeddings complete except Exception as e: logger.error( - f"Failed to start embedding batch for run {eval_run.id}: {e}", + f"{log_prefix} Failed to start embedding batch: {e}", exc_info=True, ) # Don't fail the entire evaluation, just mark as completed without embeddings @@ -273,16 +266,13 @@ async def process_completed_evaluation( session.commit() session.refresh(eval_run) - logger.info( - f"Successfully processed evaluation run {eval_run.id}: " - f"{len(results)} items processed, embedding batch started" - ) + logger.info(f"{log_prefix} Processed evaluation: {len(results)} items") return eval_run except Exception as e: logger.error( - f"Failed to process completed evaluation for run {eval_run.id}: {e}", + f"{log_prefix} Failed to process completed evaluation: {e}", exc_info=True, ) # Mark as failed @@ -325,7 +315,8 @@ async def process_completed_embedding_batch( Raises: Exception: If processing fails """ - logger.info(f"Processing completed embedding batch for run {eval_run.id}") + log_prefix = f"[org={eval_run.organization_id}][project={eval_run.project_id}][eval={eval_run.id}]" + logger.info(f"{log_prefix} Processing completed embedding batch") try: # Step 1: Get embedding_batch_job @@ -343,29 +334,22 @@ async def 
process_completed_embedding_batch( ) # Step 2: Create provider and download results - logger.info( - f"Step 1: Downloading embedding batch results for batch_job {embedding_batch_job.id}" - ) provider = OpenAIBatchProvider(client=openai_client) raw_results = download_batch_results( provider=provider, batch_job=embedding_batch_job ) # Step 3: Parse embedding results - logger.info("Step 3: Parsing embedding results") embedding_pairs = parse_embedding_results(raw_results=raw_results) if not embedding_pairs: raise ValueError("No valid embedding pairs found in batch output") # Step 4: Calculate similarity scores - logger.info("Step 3: Calculating cosine similarity scores") + logger.info(f"{log_prefix} Calculating cosine similarity scores") similarity_stats = calculate_average_similarity(embedding_pairs=embedding_pairs) # Step 5: Update evaluation_run with scores - logger.info("Step 4: Updating evaluation run with similarity scores") - - # Merge with existing score if any if eval_run.score is None: eval_run.score = {} @@ -384,7 +368,9 @@ async def process_completed_embedding_batch( ] # Step 6: Update Langfuse traces with cosine similarity scores - logger.info("Step 5: Updating Langfuse traces with cosine similarity scores") + logger.info( + f"{log_prefix} Updating Langfuse traces with cosine similarity scores" + ) per_item_scores = similarity_stats.get("per_item_scores", []) if per_item_scores and eval_run.langfuse_trace_ids: try: @@ -393,20 +379,12 @@ async def process_completed_embedding_batch( trace_id_mapping=eval_run.langfuse_trace_ids, per_item_scores=per_item_scores, ) - logger.info( - f"Successfully updated {len(per_item_scores)} Langfuse traces with scores" - ) except Exception as e: # Log error but don't fail the evaluation logger.error( - f"Failed to update Langfuse traces with scores: {e}", + f"{log_prefix} Failed to update Langfuse traces with scores: {e}", exc_info=True, ) - else: - if not per_item_scores: - logger.warning("No per-item scores available to 
update Langfuse traces") - if not eval_run.langfuse_trace_ids: - logger.warning("No trace IDs available to update Langfuse traces") # Step 7: Mark evaluation as completed eval_run.status = "completed" @@ -417,7 +395,7 @@ async def process_completed_embedding_batch( session.refresh(eval_run) logger.info( - f"Successfully completed embedding processing for evaluation run {eval_run.id}: " + f"{log_prefix} Completed evaluation: " f"avg_similarity={similarity_stats['cosine_similarity_avg']:.3f}" ) @@ -425,7 +403,7 @@ async def process_completed_embedding_batch( except Exception as e: logger.error( - f"Failed to process completed embedding batch for run {eval_run.id}: {e}", + f"{log_prefix} Failed to process completed embedding batch: {e}", exc_info=True, ) # Mark as completed anyway, but with error message @@ -469,18 +447,12 @@ async def check_and_process_evaluation( "action": "processed" | "embeddings_completed" | "embeddings_failed" | "failed" | "no_change" } """ - logger.info(f"Checking evaluation run {eval_run.id}") - + log_prefix = f"[org={eval_run.organization_id}][project={eval_run.project_id}][eval={eval_run.id}]" previous_status = eval_run.status try: # Check if we need to process embedding batch first if eval_run.embedding_batch_job_id and eval_run.status == "processing": - logger.info( - f"Checking embedding batch for evaluation run {eval_run.id}: " - f"embedding_batch_job_id={eval_run.embedding_batch_job_id}" - ) - embedding_batch_job = get_batch_job( session=session, batch_job_id=eval_run.embedding_batch_job_id ) @@ -499,8 +471,7 @@ async def check_and_process_evaluation( if embedding_status == "completed": logger.info( - f"Embedding batch {embedding_batch_job.provider_batch_id} completed, " - f"processing similarity scores..." 
+ f"{log_prefix} Processing embedding batch {embedding_batch_job.provider_batch_id}" ) await process_completed_embedding_batch( @@ -521,7 +492,7 @@ async def check_and_process_evaluation( elif embedding_status in ["failed", "expired", "cancelled"]: logger.error( - f"Embedding batch {embedding_batch_job.provider_batch_id} failed: " + f"{log_prefix} Embedding batch {embedding_batch_job.provider_batch_id} failed: " f"{embedding_batch_job.error_message}" ) # Mark as completed without embeddings @@ -545,10 +516,6 @@ async def check_and_process_evaluation( else: # Embedding batch still processing - logger.info( - f"Embedding batch {embedding_batch_job.provider_batch_id} still processing " - f"(status={embedding_status})" - ) return { "run_id": eval_run.id, "run_name": eval_run.run_name, @@ -569,7 +536,6 @@ async def check_and_process_evaluation( ) # IMPORTANT: Poll OpenAI to get the latest status before checking - logger.info(f"Polling OpenAI for batch status: {batch_job.provider_batch_id}") provider = OpenAIBatchProvider(client=openai_client) from app.crud.batch_operations import poll_batch_status @@ -582,10 +548,6 @@ async def check_and_process_evaluation( # Handle different provider statuses if provider_status == "completed": # Process the completed evaluation - logger.info( - f"Batch {batch_job.provider_batch_id} completed, processing evaluation results..." 
- ) - await process_completed_evaluation( eval_run=eval_run, session=session, @@ -613,7 +575,9 @@ async def check_and_process_evaluation( session.commit() session.refresh(eval_run) - logger.error(f"Batch {batch_job.provider_batch_id} failed: {error_msg}") + logger.error( + f"{log_prefix} Batch {batch_job.provider_batch_id} failed: {error_msg}" + ) return { "run_id": eval_run.id, @@ -627,10 +591,6 @@ async def check_and_process_evaluation( else: # Still in progress (validating, in_progress, finalizing) - logger.info( - f"Batch {batch_job.provider_batch_id} still processing (provider_status={provider_status})" - ) - return { "run_id": eval_run.id, "run_name": eval_run.run_name, @@ -641,7 +601,7 @@ async def check_and_process_evaluation( } except Exception as e: - logger.error(f"Error checking evaluation run {eval_run.id}: {e}", exc_info=True) + logger.error(f"{log_prefix} Error checking evaluation: {e}", exc_info=True) # Mark as failed eval_run.status = "failed" @@ -679,8 +639,6 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st "details": [...] 
} """ - logger.info(f"Polling all pending evaluations for org_id={org_id}") - # Get pending evaluations (status = "processing") statement = select(EvaluationRun).where( EvaluationRun.status == "processing", @@ -689,7 +647,6 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st pending_runs = session.exec(statement).all() if not pending_runs: - logger.info(f"No pending evaluations found for org_id={org_id}") return { "total": 0, "processed": 0, @@ -697,9 +654,6 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st "still_processing": 0, "details": [], } - - logger.info(f"Found {len(pending_runs)} pending evaluations for org_id={org_id}") - # Group evaluations by project_id since credentials are per project evaluations_by_project = defaultdict(list) for run in pending_runs: @@ -712,10 +666,6 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st total_still_processing_count = 0 for project_id, project_runs in evaluations_by_project.items(): - logger.info( - f"Processing {len(project_runs)} evaluations for project_id={project_id}" - ) - try: # Get credentials for this project openai_credentials = get_provider_credential( diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index e8de3b827..7b9384c51 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -163,6 +163,8 @@ class EvaluationRunCreate(SQLModel): class EvaluationRunPublic(SQLModel): """Public model for evaluation runs.""" + model_config = {"json_schema_extra": {"exclude": {"langfuse_trace_ids"}}} + id: int run_name: str dataset_name: str @@ -173,7 +175,11 @@ class EvaluationRunPublic(SQLModel): s3_url: str | None total_items: int score: dict[str, Any] | None - langfuse_trace_ids: dict[str, str] | None + langfuse_trace_ids: dict[str, str] | None = Field( + default=None, + exclude=True, + description="Internal: Trace ID mapping (excluded from API)", + ) 
error_message: str | None organization_id: int project_id: int From c76ef501f5d1a1611a6a14520b87f92171808ac8 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Sat, 25 Oct 2025 15:54:01 +0530 Subject: [PATCH 28/64] optimizing similarity --- ...dd_langfuse_trace_ids_to_evaluation_run.py | 35 --------------- backend/app/crud/evaluation_embeddings.py | 43 ++++++++++++------- backend/app/crud/evaluation_langfuse.py | 10 ++--- backend/app/crud/evaluation_processing.py | 14 +++--- backend/app/models/evaluation.py | 14 ------ 5 files changed, 38 insertions(+), 78 deletions(-) delete mode 100644 backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py diff --git a/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py b/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py deleted file mode 100644 index 1ce058db4..000000000 --- a/backend/app/alembic/versions/2cc3c67356a8_add_langfuse_trace_ids_to_evaluation_run.py +++ /dev/null @@ -1,35 +0,0 @@ -"""add_langfuse_trace_ids_to_evaluation_run - -Revision ID: 2cc3c67356a8 -Revises: a1b2c3d4e5f6 -Create Date: 2025-10-23 00:00:00.000000 - -""" -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - - -# revision identifiers, used by Alembic. 
-revision = "2cc3c67356a8" -down_revision = "a1b2c3d4e5f6" -branch_labels = None -depends_on = None - - -def upgrade(): - # Add langfuse_trace_ids column to evaluation_run table - op.add_column( - "evaluation_run", - sa.Column( - "langfuse_trace_ids", - postgresql.JSON(astext_type=sa.Text()), - nullable=True, - comment="Mapping of item_id to Langfuse trace_id for updating traces with scores", - ), - ) - - -def downgrade(): - # Drop langfuse_trace_ids column - op.drop_column("evaluation_run", "langfuse_trace_ids") diff --git a/backend/app/crud/evaluation_embeddings.py b/backend/app/crud/evaluation_embeddings.py index cc304cade..7f423c85e 100644 --- a/backend/app/crud/evaluation_embeddings.py +++ b/backend/app/crud/evaluation_embeddings.py @@ -49,13 +49,14 @@ def validate_embedding_model(model: str) -> None: def build_embedding_jsonl( results: list[dict[str, Any]], + trace_id_mapping: dict[str, str], embedding_model: str = "text-embedding-3-large", ) -> list[dict[str, Any]]: """ Build JSONL data for embedding batch using OpenAI Embeddings API. Each line is a dict with: - - custom_id: Unique identifier (dataset item ID) + - custom_id: Langfuse trace_id (for direct score updates) - method: POST - url: /v1/embeddings - body: Embedding request with input array [output, ground_truth] @@ -71,6 +72,7 @@ def build_embedding_jsonl( }, ... 
] + trace_id_mapping: Mapping of item_id to Langfuse trace_id embedding_model: OpenAI embedding model to use (default: text-embedding-3-large) Returns: @@ -94,15 +96,21 @@ def build_embedding_jsonl( logger.warning("Skipping result with no item_id") continue + # Get trace_id from mapping + trace_id = trace_id_mapping.get(item_id) + if not trace_id: + logger.warning(f"Skipping item {item_id} - no trace_id found") + continue + # Skip if either output or ground_truth is empty if not generated_output or not ground_truth: logger.warning(f"Skipping item {item_id} - empty output or ground_truth") continue # Build the batch request object for Embeddings API - # Use input array to get both embeddings in one request + # Use trace_id as custom_id for direct score updates batch_request = { - "custom_id": item_id, + "custom_id": trace_id, "method": "POST", "url": "/v1/embeddings", "body": { @@ -132,7 +140,7 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, List of embedding pairs in format: [ { - "item_id": "item_123", + "trace_id": "trace-uuid-123", "output_embedding": [0.1, 0.2, ...], "ground_truth_embedding": [0.15, 0.22, ...] 
}, @@ -145,16 +153,16 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, for line_num, response in enumerate(raw_results, 1): try: - # Extract custom_id (dataset item ID) - item_id = response.get("custom_id") - if not item_id: + # Extract custom_id (which is now the Langfuse trace_id) + trace_id = response.get("custom_id") + if not trace_id: logger.warning(f"Line {line_num}: No custom_id found, skipping") continue # Handle errors in batch processing if response.get("error"): error_msg = response["error"].get("message", "Unknown error") - logger.error(f"Item {item_id} had error: {error_msg}") + logger.error(f"Trace {trace_id} had error: {error_msg}") continue # Extract the response body @@ -163,7 +171,7 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, if len(embedding_data) < 2: logger.warning( - f"Item {item_id}: Expected 2 embeddings, got {len(embedding_data)}" + f"Trace {trace_id}: Expected 2 embeddings, got {len(embedding_data)}" ) continue @@ -187,14 +195,14 @@ def parse_embedding_results(raw_results: list[dict[str, Any]]) -> list[dict[str, if output_embedding is None or ground_truth_embedding is None: logger.warning( - f"Item {item_id}: Missing embeddings (output={output_embedding is not None}, " + f"Trace {trace_id}: Missing embeddings (output={output_embedding is not None}, " f"ground_truth={ground_truth_embedding is not None})" ) continue embedding_pairs.append( { - "item_id": item_id, + "trace_id": trace_id, "output_embedding": output_embedding, "ground_truth_embedding": ground_truth_embedding, } @@ -261,7 +269,7 @@ def calculate_average_similarity( "cosine_similarity_max": 0.98, "cosine_similarity_std": 0.12, "total_pairs": 50, - "per_item_scores": [...] # Optional: individual scores + "per_item_scores": [...] 
# Individual scores with trace_ids } """ logger.info(f"Calculating similarity for {len(embedding_pairs)} pairs") @@ -289,14 +297,14 @@ def calculate_average_similarity( per_item_scores.append( { - "item_id": pair["item_id"], + "trace_id": pair["trace_id"], "cosine_similarity": similarity, } ) except Exception as e: logger.error( - f"Error calculating similarity for item {pair.get('item_id')}: {e}" + f"Error calculating similarity for trace {pair.get('trace_id')}: {e}" ) continue @@ -338,12 +346,13 @@ def start_embedding_batch( openai_client: OpenAI, eval_run: EvaluationRun, results: list[dict[str, Any]], + trace_id_mapping: dict[str, str], ) -> EvaluationRun: """ Start embedding batch for similarity scoring. This function orchestrates the embedding batch creation: - 1. Builds embedding JSONL from evaluation results + 1. Builds embedding JSONL from evaluation results with trace_ids 2. Creates batch via generic infrastructure (job_type="embedding") 3. Links embedding_batch_job_id to eval_run 4. 
Keeps status as "processing" @@ -353,6 +362,7 @@ def start_embedding_batch( openai_client: Configured OpenAI client eval_run: EvaluationRun database object results: Parsed evaluation results (output + ground_truth pairs) + trace_id_mapping: Mapping of item_id to Langfuse trace_id Returns: Updated EvaluationRun with embedding_batch_job_id populated @@ -378,9 +388,10 @@ def start_embedding_batch( ) embedding_model = "text-embedding-3-large" - # Step 1: Build embedding JSONL + # Step 1: Build embedding JSONL with trace_ids jsonl_data = build_embedding_jsonl( results=results, + trace_id_mapping=trace_id_mapping, embedding_model=embedding_model, ) diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py index a8558d2fe..964c296fc 100644 --- a/backend/app/crud/evaluation_langfuse.py +++ b/backend/app/crud/evaluation_langfuse.py @@ -110,7 +110,6 @@ def create_langfuse_dataset_run( def update_traces_with_cosine_scores( langfuse: Langfuse, - trace_id_mapping: dict[str, str], per_item_scores: list[dict[str, Any]], ) -> None: """ @@ -121,11 +120,10 @@ def update_traces_with_cosine_scores( Args: langfuse: Configured Langfuse client - trace_id_mapping: Mapping of item_id to Langfuse trace_id per_item_scores: List of per-item score dictionaries from calculate_average_similarity() Format: [ { - "item_id": "item_123", + "trace_id": "trace-uuid-123", "cosine_similarity": 0.95 }, ... @@ -136,11 +134,11 @@ def update_traces_with_cosine_scores( evaluation completion if Langfuse updates fail. 
""" for score_item in per_item_scores: - item_id = score_item.get("item_id") + trace_id = score_item.get("trace_id") cosine_score = score_item.get("cosine_similarity") - trace_id = trace_id_mapping.get(item_id) if not trace_id: + logger.warning("Score item missing trace_id, skipping") continue try: @@ -152,7 +150,7 @@ def update_traces_with_cosine_scores( ) except Exception as e: logger.error( - f"Failed to add score for trace {trace_id} (item {item_id}): {e}", + f"Failed to add score for trace {trace_id}: {e}", exc_info=True, ) diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index bc6f174df..3f74e2418 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -235,14 +235,14 @@ async def process_completed_evaluation( results=results, ) - # Store trace IDs and S3 URL in database - eval_run.langfuse_trace_ids = trace_id_mapping + # Store S3 URL in database if s3_url: eval_run.s3_url = s3_url - session.add(eval_run) - session.commit() + session.add(eval_run) + session.commit() - # Step 5: Start embedding batch for similarity scoring + # Step 6: Start embedding batch for similarity scoring + # Pass trace_id_mapping directly without storing in DB logger.info(f"{log_prefix} Starting embedding batch for similarity scoring") try: eval_run = start_embedding_batch( @@ -250,6 +250,7 @@ async def process_completed_evaluation( openai_client=openai_client, eval_run=eval_run, results=results, + trace_id_mapping=trace_id_mapping, ) # Note: Status remains "processing" until embeddings complete @@ -372,11 +373,10 @@ async def process_completed_embedding_batch( f"{log_prefix} Updating Langfuse traces with cosine similarity scores" ) per_item_scores = similarity_stats.get("per_item_scores", []) - if per_item_scores and eval_run.langfuse_trace_ids: + if per_item_scores: try: update_traces_with_cosine_scores( langfuse=langfuse, - trace_id_mapping=eval_run.langfuse_trace_ids, 
per_item_scores=per_item_scores, ) except Exception as e: diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 7b9384c51..ab4597d1d 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -110,13 +110,6 @@ class EvaluationRun(SQLModel, table=True): description="Evaluation scores (e.g., correctness, cosine_similarity, etc.)", ) - # Langfuse trace IDs mapping (item_id -> trace_id) - langfuse_trace_ids: dict[str, str] | None = SQLField( - default=None, - sa_column=Column(JSON, nullable=True), - description="Mapping of item_id to Langfuse trace_id for updating traces with scores", - ) - # Error message field error_message: str | None = SQLField( default=None, @@ -163,8 +156,6 @@ class EvaluationRunCreate(SQLModel): class EvaluationRunPublic(SQLModel): """Public model for evaluation runs.""" - model_config = {"json_schema_extra": {"exclude": {"langfuse_trace_ids"}}} - id: int run_name: str dataset_name: str @@ -175,11 +166,6 @@ class EvaluationRunPublic(SQLModel): s3_url: str | None total_items: int score: dict[str, Any] | None - langfuse_trace_ids: dict[str, str] | None = Field( - default=None, - exclude=True, - description="Internal: Trace ID mapping (excluded from API)", - ) error_message: str | None organization_id: int project_id: int From 5bfc8d83fddbcc5e1c536e4fc36fc845e6ac76bb Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 29 Oct 2025 14:14:40 +0530 Subject: [PATCH 29/64] added evaluation dataset --- ...evaluation_dataset_table_and_dataset_id.py | 82 +++ backend/app/api/routes/evaluation.py | 536 +++++++++++++++--- backend/app/crud/evaluation_batch.py | 50 +- backend/app/crud/evaluation_dataset.py | 368 ++++++++++++ backend/app/crud/evaluation_langfuse.py | 125 +++- backend/app/models/__init__.py | 3 + backend/app/models/evaluation.py | 126 +++- backend/app/models/organization.py | 5 +- backend/app/models/project.py | 3 + 9 files changed, 1177 insertions(+), 121 deletions(-) create mode 
100644 backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py create mode 100644 backend/app/crud/evaluation_dataset.py diff --git a/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py b/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py new file mode 100644 index 000000000..f6ca8a79d --- /dev/null +++ b/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py @@ -0,0 +1,82 @@ +"""add_evaluation_dataset_table_and_dataset_id_to_evaluation_run + +Revision ID: b2c3d4e5f6g7 +Revises: a1b2c3d4e5f6 +Create Date: 2025-10-28 00:00:00.000000 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. +revision = "b2c3d4e5f6g7" +down_revision = "a1b2c3d4e5f6" +branch_labels = None +depends_on = None + + +def upgrade(): + # Create evaluation_dataset table + op.create_table( + "evaluation_dataset", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "dataset_metadata", + postgresql.JSONB(astext_type=sa.Text()), + nullable=False, + server_default=sa.text("'{}'::jsonb"), + ), + sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "langfuse_dataset_id", + sqlmodel.sql.sqltypes.AutoString(), + nullable=True, + ), + sa.Column("organization_id", sa.Integer(), nullable=False), + sa.Column("project_id", sa.Integer(), nullable=False), + sa.Column("inserted_at", sa.DateTime(), nullable=False), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), + 
sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_evaluation_dataset_name"), + "evaluation_dataset", + ["name"], + unique=False, + ) + + # Add dataset_id column to evaluation_run table + op.add_column( + "evaluation_run", + sa.Column("dataset_id", sa.Integer(), nullable=True), + ) + op.create_foreign_key( + "fk_evaluation_run_dataset_id", + "evaluation_run", + "evaluation_dataset", + ["dataset_id"], + ["id"], + ) + + +def downgrade(): + # Drop foreign key and column from evaluation_run + op.drop_constraint( + "fk_evaluation_run_dataset_id", + "evaluation_run", + type_="foreignkey", + ) + op.drop_column("evaluation_run", "dataset_id") + + # Drop evaluation_dataset table + op.drop_index(op.f("ix_evaluation_dataset_name"), table_name="evaluation_dataset") + op.drop_table("evaluation_dataset") diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f5667bbb9..e6d9a2d47 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -7,7 +7,6 @@ from app.core.util import configure_langfuse, configure_openai, now from app.crud.assistants import get_assistant_by_id from app.crud.credentials import get_provider_credential -from app.crud.evaluation import upload_dataset_to_langfuse from app.crud.evaluation_batch import start_evaluation_batch from app.crud.evaluation_processing import poll_all_pending_evaluations from app.models import EvaluationRun, UserProjectOrg @@ -26,7 +25,8 @@ async def upload_dataset( file: UploadFile = File( ..., description="CSV file with 'question' and 'answer' columns" ), - dataset_name: str = Form(..., description="Name for the dataset in Langfuse"), + dataset_name: str = Form(..., description="Name for the dataset"), + description: str | None = Form(None, description="Optional dataset description"), duplication_factor: int = Form( default=5, description="Number of times to duplicate each item" ), @@ -34,8 +34,13 @@ async def upload_dataset( _current_user: 
UserProjectOrg = Depends(get_current_user_org_project), ) -> DatasetUploadResponse: """ - Upload a CSV file containing Golden Q&A pairs to Langfuse as a dataset. - Each question will be duplicated N times (default 5) to test LLM flakiness. + Upload a CSV file containing Golden Q&A pairs. + + This endpoint: + 1. Validates and parses the CSV file + 2. Uploads CSV to AWS S3 (if credentials configured) + 3. Uploads dataset to Langfuse (for immediate use) + 4. Stores metadata in database CSV Format: - Must contain 'question' and 'answer' columns @@ -47,40 +52,312 @@ async def upload_dataset( "What is the capital of France?","Paris" "What is 2+2?","4" ``` + + Returns: + DatasetUploadResponse with dataset_id, s3_url, and Langfuse details """ + from app.core.cloud import get_cloud_storage + from app.crud.evaluation_dataset import create_evaluation_dataset, upload_csv_to_s3 + from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv + logger.info( - f"Uploading dataset: {dataset_name} with duplication factor: {duplication_factor}" + f"Uploading dataset: {dataset_name} with duplication factor: " + f"{duplication_factor}, org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id}" ) # Read CSV content - content = await file.read() + csv_content = await file.read() - success, data, error = await upload_dataset_to_langfuse( - csv_content=content, - dataset_name=dataset_name, - duplication_factor=duplication_factor, - _session=_session, - _current_user=_current_user, + # Step 1: Parse and validate CSV + import csv + import io + + try: + csv_text = csv_content.decode("utf-8") + csv_reader = csv.DictReader(io.StringIO(csv_text)) + + # Validate headers + if ( + "question" not in csv_reader.fieldnames + or "answer" not in csv_reader.fieldnames + ): + raise ValueError( + f"CSV must contain 'question' and 'answer' columns. 
" + f"Found columns: {csv_reader.fieldnames}" + ) + + # Count original items + original_items = [] + for row in csv_reader: + question = row.get("question", "").strip() + answer = row.get("answer", "").strip() + if question and answer: + original_items.append({"question": question, "answer": answer}) + + if not original_items: + raise ValueError("No valid items found in CSV file") + + original_items_count = len(original_items) + total_items_count = original_items_count * duplication_factor + + logger.info( + f"Parsed {original_items_count} items from CSV, " + f"will create {total_items_count} total items with duplication" + ) + + except Exception as e: + logger.error(f"Failed to parse CSV: {e}", exc_info=True) + raise ValueError(f"Invalid CSV file: {e}") + + # Step 2: Upload to AWS S3 (if credentials configured) + s3_url = None + try: + storage = get_cloud_storage( + session=_session, project_id=_current_user.project_id + ) + s3_url = upload_csv_to_s3( + storage=storage, csv_content=csv_content, dataset_name=dataset_name + ) + if s3_url: + logger.info(f"Successfully uploaded CSV to S3: {s3_url}") + else: + logger.info("S3 upload returned None, continuing without S3 storage") + except Exception as e: + logger.warning( + f"Failed to upload CSV to S3 (continuing without S3): {e}", exc_info=True + ) + s3_url = None + + # Step 3: Upload to Langfuse + langfuse_dataset_id = None + try: + # Get Langfuse credentials + langfuse_credentials = get_provider_credential( + session=_session, + org_id=_current_user.organization_id, + project_id=_current_user.project_id, + provider="langfuse", + ) + if not langfuse_credentials: + raise ValueError("Langfuse credentials not configured") + + langfuse, langfuse_success = configure_langfuse(langfuse_credentials) + if not langfuse_success: + raise ValueError("Failed to configure Langfuse client") + + # Upload to Langfuse + langfuse_dataset_id, _ = upload_dataset_to_langfuse_from_csv( + langfuse=langfuse, + csv_content=csv_content, + 
dataset_name=dataset_name, + duplication_factor=duplication_factor, + ) + + logger.info( + f"Successfully uploaded dataset to Langfuse: {dataset_name} " + f"(id={langfuse_dataset_id})" + ) + + except Exception as e: + logger.error(f"Failed to upload dataset to Langfuse: {e}", exc_info=True) + raise ValueError(f"Failed to upload dataset to Langfuse: {e}") + + # Step 4: Store metadata in database + try: + metadata = { + "original_items_count": original_items_count, + "total_items_count": total_items_count, + "duplication_factor": duplication_factor, + } + + dataset = create_evaluation_dataset( + session=_session, + name=dataset_name, + description=description, + dataset_metadata=metadata, + s3_url=s3_url, + langfuse_dataset_id=langfuse_dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + + logger.info( + f"Successfully created dataset record in database: id={dataset.id}, " + f"name={dataset_name}" + ) + + # Return response + return DatasetUploadResponse( + dataset_id=dataset.id, + dataset_name=dataset_name, + total_items=total_items_count, + original_items=original_items_count, + duplication_factor=duplication_factor, + langfuse_dataset_id=langfuse_dataset_id, + s3_url=s3_url, + ) + + except Exception as e: + logger.error(f"Failed to create dataset record in database: {e}", exc_info=True) + raise ValueError(f"Failed to save dataset metadata: {e}") + + +@router.get("/dataset/{dataset_id}", response_model=DatasetUploadResponse) +async def get_dataset( + dataset_id: int, + _session: Session = Depends(get_db), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), +) -> DatasetUploadResponse: + """ + Get details of a specific dataset by ID. 
+ + Args: + dataset_id: ID of the dataset to retrieve + + Returns: + DatasetUploadResponse with dataset details + """ + from app.crud.evaluation_dataset import get_dataset_by_id + + logger.info( + f"Fetching dataset: id={dataset_id}, " + f"org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id}" + ) + + dataset = get_dataset_by_id( + session=_session, + dataset_id=dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + + if not dataset: + raise ValueError(f"Dataset {dataset_id} not found or not accessible") + + # Build response + return DatasetUploadResponse( + dataset_id=dataset.id, + dataset_name=dataset.name, + total_items=dataset.dataset_metadata.get("total_items_count", 0), + original_items=dataset.dataset_metadata.get("original_items_count", 0), + duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), + langfuse_dataset_id=dataset.langfuse_dataset_id, + s3_url=dataset.s3_url, + ) + + +@router.get("/datasets", response_model=list[DatasetUploadResponse]) +async def list_datasets_endpoint( + limit: int = 50, + offset: int = 0, + _session: Session = Depends(get_db), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), +) -> list[DatasetUploadResponse]: + """ + List all datasets for the current organization and project. 
+ + Args: + limit: Maximum number of datasets to return (default 50, max 100) + offset: Number of datasets to skip for pagination (default 0) + + Returns: + List of DatasetUploadResponse objects, ordered by most recent first + """ + from app.crud.evaluation_dataset import list_datasets + + # Enforce maximum limit + if limit > 100: + limit = 100 + + logger.info( + f"Listing datasets: org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id}, limit={limit}, " + f"offset={offset}" + ) + + datasets = list_datasets( + session=_session, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + limit=limit, + offset=offset, ) - if not success or data is None: - raise ValueError(error or "Failed to upload dataset") + # Convert to response format + response = [] + for dataset in datasets: + response.append( + DatasetUploadResponse( + dataset_id=dataset.id, + dataset_name=dataset.name, + total_items=dataset.dataset_metadata.get("total_items_count", 0), + original_items=dataset.dataset_metadata.get("original_items_count", 0), + duplication_factor=dataset.dataset_metadata.get( + "duplication_factor", 1 + ), + langfuse_dataset_id=dataset.langfuse_dataset_id, + s3_url=dataset.s3_url, + ) + ) + + logger.info(f"Found {len(response)} datasets") + return response + + +@router.delete("/dataset/{dataset_id}") +async def delete_dataset( + dataset_id: int, + _session: Session = Depends(get_db), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), +) -> dict: + """ + Delete a dataset by ID. + + This will remove the dataset record from the database. The CSV file in S3 + (if exists) will remain for audit purposes, but the dataset will no longer + be accessible for creating new evaluations. 
+ + Args: + dataset_id: ID of the dataset to delete + + Returns: + Success message with deleted dataset details + """ + from app.crud.evaluation_dataset import delete_dataset as delete_dataset_crud logger.info( - f"Successfully uploaded dataset: {dataset_name} with {data.total_items} items " - f"({data.original_items} original items × {duplication_factor})" + f"Deleting dataset: id={dataset_id}, " + f"org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id}" ) - return data + success, message = delete_dataset_crud( + session=_session, + dataset_id=dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + + if not success: + raise ValueError(message) + + logger.info(f"Successfully deleted dataset: id={dataset_id}") + return {"message": message, "dataset_id": dataset_id} @router.post("/evaluate", response_model=EvaluationRunPublic) async def evaluate_threads( - dataset_name: str = Body(..., description="Name of the Langfuse dataset"), + dataset_id: int = Body(..., description="ID of the evaluation dataset"), experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" ), - config: dict = Body(..., description="Evaluation configuration"), + config: dict = Body(default_factory=dict, description="Evaluation configuration"), + assistant_id: str + | None = Body( + None, description="Optional assistant ID to fetch configuration from" + ), _session: Session = Depends(get_db), _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> EvaluationRunPublic: @@ -88,42 +365,102 @@ async def evaluate_threads( Start an evaluation using OpenAI Batch API. This endpoint: - 1. Creates an EvaluationRun record in the database - 2. Fetches dataset items from Langfuse - 3. Builds JSONL for batch processing (using provided config) - 4. Creates a batch job via the generic batch infrastructure - 5. Returns the evaluation run details with batch_job_id + 1. 
Fetches the dataset from database + 2. Ensures dataset is uploaded to Langfuse (re-uploads from S3 if needed) + 3. Creates an EvaluationRun record in the database + 4. Fetches dataset items from Langfuse + 5. Builds JSONL for batch processing (config is used as-is) + 6. Creates a batch job via the generic batch infrastructure + 7. Returns the evaluation run details with batch_job_id The batch will be processed asynchronously by Celery Beat (every 60s). Use GET /evaluate/batch/{run_id}/status to check progress. Args: - dataset_name: Name of the Langfuse dataset + dataset_id: ID of the evaluation dataset (from /dataset/upload) experiment_name: Name for this evaluation experiment/run - config: Configuration dict with optional fields: - - assistant_id (optional): If provided, fetch config from openai_assistant table - - llm (optional): {"model": "gpt-4o", "temperature": 0.2} - - instructions (optional): System instructions - - vector_store_ids (optional): List of vector store IDs + config: Configuration dict that will be used as-is in JSONL generation. + Can include any OpenAI Responses API parameters like: + - model: str (e.g., "gpt-4o", "gpt-5") + - instructions: str + - tools: list (e.g., [{"type": "file_search", "vector_store_ids": [...]}]) + - reasoning: dict (e.g., {"effort": "low"}) + - text: dict (e.g., {"verbosity": "low"}) + - temperature: float + - include: list (e.g., ["file_search_call.results"]) + Note: "input" will be added automatically from the dataset + assistant_id: Optional assistant ID. If provided, configuration will be + fetched from the assistant in the database. Config can be passed as + empty dict {} when using assistant_id. 
+ + Example with config: + { + "dataset_id": 123, + "experiment_name": "test_run", + "config": { + "model": "gpt-4.1", + "instructions": "You are a helpful FAQ assistant.", + "tools": [ + { + "type": "file_search", + "vector_store_ids": ["vs_12345"], + "max_num_results": 3 + } + ], + "include": ["file_search_call.results"] + } + } - Example config: + Example with assistant_id: { - "llm": {"model": "gpt-4o", "temperature": 0.2}, - "instructions": "You are a friendly assistant", - "vector_store_ids": ["vs_abc123"], - "assistant_id": "asst_xyz" # Optional - fetches from DB if provided + "dataset_id": 123, + "experiment_name": "test_run", + "config": {}, + "assistant_id": "asst_xyz" } Returns: EvaluationRunPublic with batch details and status """ + from app.core.cloud import get_cloud_storage + from app.crud.evaluation_dataset import ( + download_csv_from_s3, + get_dataset_by_id, + update_dataset_langfuse_id, + ) + from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv + logger.info( f"Starting evaluation: experiment_name={experiment_name}, " - f"dataset={dataset_name}, " + f"dataset_id={dataset_id}, " f"org_id={_current_user.organization_id}, " + f"assistant_id={assistant_id}, " f"config_keys={list(config.keys())}" ) + # Step 1: Fetch dataset from database + dataset = get_dataset_by_id( + session=_session, + dataset_id=dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + + if not dataset: + raise ValueError( + f"Dataset {dataset_id} not found or not accessible to this " + f"organization/project" + ) + + logger.info( + f"Found dataset: id={dataset.id}, name={dataset.name}, " + f"s3_url={'present' if dataset.s3_url else 'None'}, " + f"langfuse_id={dataset.langfuse_dataset_id}" + ) + + dataset_name = dataset.name + duplication_factor = dataset.dataset_metadata.get("duplication_factor", 5) + # Get credentials openai_credentials = get_provider_credential( session=_session, @@ -148,8 +485,56 @@ async 
def evaluate_threads( if not openai_success or not langfuse_success: raise ValueError("Failed to configure API clients") - # Check if assistant_id is provided in config - assistant_id = config.get("assistant_id") + # Step 2: Ensure dataset is in Langfuse (re-upload from S3 if needed) + if not dataset.langfuse_dataset_id: + logger.info(f"Dataset {dataset_id} not yet in Langfuse, uploading from S3") + + if not dataset.s3_url: + raise ValueError( + f"Dataset {dataset_id} has no S3 URL and no Langfuse ID. " + "Cannot proceed with evaluation." + ) + + try: + # Download CSV from S3 + storage = get_cloud_storage( + session=_session, project_id=_current_user.project_id + ) + csv_content = download_csv_from_s3(storage=storage, s3_url=dataset.s3_url) + + # Upload to Langfuse + langfuse_dataset_id, _ = upload_dataset_to_langfuse_from_csv( + langfuse=langfuse, + csv_content=csv_content, + dataset_name=dataset_name, + duplication_factor=duplication_factor, + ) + + # Update dataset record with langfuse_dataset_id + update_dataset_langfuse_id( + session=_session, + dataset_id=dataset.id, + langfuse_dataset_id=langfuse_dataset_id, + ) + + logger.info( + f"Successfully uploaded dataset {dataset_id} to Langfuse: " + f"langfuse_id={langfuse_dataset_id}" + ) + + except Exception as e: + logger.error( + f"Failed to upload dataset {dataset_id} to Langfuse from S3: {e}", + exc_info=True, + ) + raise ValueError(f"Failed to prepare dataset for evaluation: {e}") + else: + logger.info( + f"Dataset {dataset_id} already in Langfuse: " + f"langfuse_id={dataset.langfuse_dataset_id}" + ) + + # Handle assistant_id if provided if assistant_id: # Fetch assistant details from database assistant = get_assistant_by_id( @@ -158,49 +543,49 @@ async def evaluate_threads( project_id=_current_user.project_id, ) - if assistant: - logger.info( - f"Found assistant in DB: id={assistant.id}, " - f"model={assistant.model}, instructions={assistant.instructions[:50]}..." 
- ) + if not assistant: + raise ValueError(f"Assistant {assistant_id} not found") - # Merge DB config with provided config (provided config takes precedence) - db_config = { - "assistant_id": assistant_id, - "llm": { - "model": assistant.model, - "temperature": assistant.temperature, - }, - "instructions": assistant.instructions, - "vector_store_ids": assistant.vector_store_ids or [], - } - - # Override with provided config values - for key in ["llm", "instructions", "vector_store_ids"]: - if key in config: - db_config[key] = config[key] - - config = db_config - logger.info("Using merged config from DB and provided values") - else: - logger.warning( - f"Assistant {assistant_id} not found in DB, using provided config" - ) - else: - logger.info("No assistant_id provided, using provided config directly") + logger.info( + f"Found assistant in DB: id={assistant.id}, " + f"model={assistant.model}, instructions=" + f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." + ) - # Ensure config has required fields with defaults - if "llm" not in config: - config["llm"] = {"model": "gpt-4o", "temperature": 0.2} - if "instructions" not in config: - config["instructions"] = "You are a helpful assistant" - if "vector_store_ids" not in config: - config["vector_store_ids"] = [] + # Build config from assistant (use provided config values to override + # if present) + config = { + "model": config.get("model", assistant.model), + "instructions": config.get("instructions", assistant.instructions), + "temperature": config.get("temperature", assistant.temperature), + } + + # Add tools if vector stores are available + vector_store_ids = config.get( + "vector_store_ids", assistant.vector_store_ids or [] + ) + if vector_store_ids and len(vector_store_ids) > 0: + config["tools"] = [ + { + "type": "file_search", + "vector_store_ids": vector_store_ids, + } + ] + + logger.info("Using config from assistant") + else: + logger.info("Using provided config directly") + # Validate 
that config has minimum required fields + if not config.get("model"): + raise ValueError( + "Config must include 'model' when assistant_id is not provided" + ) # Create EvaluationRun record eval_run = EvaluationRun( run_name=experiment_name, dataset_name=dataset_name, + dataset_id=dataset_id, config=config, status="pending", organization_id=_current_user.organization_id, @@ -256,7 +641,8 @@ async def poll_evaluation_batches( - Debugging evaluation issues Returns: - Summary of polling results including processed, failed, and still processing counts + Summary of polling results including processed, failed, and still + processing counts """ logger.info( f"Manual polling triggered for org_id={_current_user.organization_id} " diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluation_batch.py index 8f6e267ab..d0adff919 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluation_batch.py @@ -35,8 +35,6 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, Raises: ValueError: If dataset not found or empty """ - logger.info(f"Fetching dataset: {dataset_name}") - try: dataset = langfuse.get_dataset(dataset_name) except Exception as e: @@ -56,8 +54,6 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, "metadata": item.metadata if hasattr(item, "metadata") else {}, } ) - - logger.info(f"Fetched {len(items)} items from dataset '{dataset_name}'") return items @@ -71,23 +67,25 @@ def build_evaluation_jsonl( - custom_id: Unique identifier for the request (dataset item ID) - method: POST - url: /v1/responses - - body: Response request with model, instructions, and input + - body: Response request using config as-is with input from dataset Args: dataset_items: List of dataset items from Langfuse - config: Evaluation configuration dict with llm, instructions, vector_store_ids + config: Evaluation configuration dict with OpenAI Responses API parameters. 
+ This config is used as-is in the body, with only "input" being added + from the dataset. Config can include any fields like: + - model (required) + - instructions + - tools + - reasoning + - text + - temperature + - include + etc. Returns: List of dictionaries (JSONL data) """ - # Extract config values - llm_config = config.get("llm", {}) - model = llm_config.get("model", "gpt-4o") - instructions = config.get("instructions", "You are a helpful assistant") - vector_store_ids = config.get("vector_store_ids", []) - - logger.info(f"Building JSONL for {len(dataset_items)} items with model {model}") - jsonl_data = [] for item in dataset_items: @@ -98,30 +96,18 @@ def build_evaluation_jsonl( continue # Build the batch request object for Responses API + # Use config as-is and only add the input field batch_request = { "custom_id": item["id"], "method": "POST", "url": "/v1/responses", "body": { - "model": model, - "instructions": instructions, - "input": question, + **config, # Use config as-is + "input": question, # Add input from dataset }, } - # Add vector store IDs if available (for file search) - if vector_store_ids and len(vector_store_ids) > 0: - batch_request["body"]["tools"] = [ - { - "type": "file_search", - "vector_store_ids": vector_store_ids, - } - ] - batch_request["body"]["tool_choice"] = "auto" - jsonl_data.append(batch_request) - - logger.info(f"Built {len(jsonl_data)} JSONL lines") return jsonl_data @@ -169,10 +155,8 @@ def start_evaluation_batch( "endpoint": "/v1/responses", "description": f"Evaluation: {eval_run.run_name}", "completion_window": "24h", - # Store complete config including LLM settings for reference - "llm": config.get("llm", {}), - "instructions": config.get("instructions"), - "vector_store_ids": config.get("vector_store_ids", []), + # Store complete config for reference + "evaluation_config": config, } # Step 5: Start batch job using generic infrastructure diff --git a/backend/app/crud/evaluation_dataset.py 
b/backend/app/crud/evaluation_dataset.py new file mode 100644 index 000000000..b9f798565 --- /dev/null +++ b/backend/app/crud/evaluation_dataset.py @@ -0,0 +1,368 @@ +""" +CRUD operations for evaluation datasets. + +This module handles database operations for evaluation datasets including: +1. Creating new datasets +2. Fetching datasets by ID or name +3. Listing datasets with pagination +4. Uploading CSV files to AWS S3 +""" + +import logging +from pathlib import Path +from typing import Any + +from sqlmodel import Session, select + +from app.core.cloud.storage import CloudStorage, CloudStorageError +from app.core.util import now +from app.models import EvaluationDataset + +logger = logging.getLogger(__name__) + + +def create_evaluation_dataset( + session: Session, + name: str, + dataset_metadata: dict[str, Any], + organization_id: int, + project_id: int, + description: str | None = None, + s3_url: str | None = None, + langfuse_dataset_id: str | None = None, +) -> EvaluationDataset: + """ + Create a new evaluation dataset record in the database. 
+ + Args: + session: Database session + name: Name of the dataset + dataset_metadata: Dataset metadata (original_items_count, + total_items_count, duplication_factor) + organization_id: Organization ID + project_id: Project ID + description: Optional dataset description + s3_url: Optional AWS S3 URL where CSV is stored + langfuse_dataset_id: Optional Langfuse dataset ID + + Returns: + Created EvaluationDataset object + """ + dataset = EvaluationDataset( + name=name, + description=description, + dataset_metadata=dataset_metadata, + s3_url=s3_url, + langfuse_dataset_id=langfuse_dataset_id, + organization_id=organization_id, + project_id=project_id, + inserted_at=now(), + updated_at=now(), + ) + + session.add(dataset) + session.commit() + session.refresh(dataset) + + logger.info( + f"Created evaluation dataset: id={dataset.id}, name={name}, " + f"org_id={organization_id}, project_id={project_id}" + ) + + return dataset + + +def get_dataset_by_id( + session: Session, dataset_id: int, organization_id: int, project_id: int +) -> EvaluationDataset | None: + """ + Fetch an evaluation dataset by ID with organization and project validation. 
+ + Args: + session: Database session + dataset_id: Dataset ID + organization_id: Organization ID for validation + project_id: Project ID for validation + + Returns: + EvaluationDataset if found and belongs to the org/project, None otherwise + """ + statement = ( + select(EvaluationDataset) + .where(EvaluationDataset.id == dataset_id) + .where(EvaluationDataset.organization_id == organization_id) + .where(EvaluationDataset.project_id == project_id) + ) + + dataset = session.exec(statement).first() + + if dataset: + logger.info( + f"Found dataset: id={dataset_id}, name={dataset.name}, " + f"org_id={organization_id}, project_id={project_id}" + ) + else: + logger.warning( + f"Dataset not found or not accessible: id={dataset_id}, " + f"org_id={organization_id}, project_id={project_id}" + ) + + return dataset + + +def get_dataset_by_name( + session: Session, name: str, organization_id: int, project_id: int +) -> EvaluationDataset | None: + """ + Fetch an evaluation dataset by name with organization and project validation. + + Args: + session: Database session + name: Dataset name + organization_id: Organization ID for validation + project_id: Project ID for validation + + Returns: + EvaluationDataset if found and belongs to the org/project, None otherwise + """ + statement = ( + select(EvaluationDataset) + .where(EvaluationDataset.name == name) + .where(EvaluationDataset.organization_id == organization_id) + .where(EvaluationDataset.project_id == project_id) + ) + + dataset = session.exec(statement).first() + + if dataset: + logger.info( + f"Found dataset by name: name={name}, id={dataset.id}, " + f"org_id={organization_id}, project_id={project_id}" + ) + + return dataset + + +def list_datasets( + session: Session, + organization_id: int, + project_id: int, + limit: int = 50, + offset: int = 0, +) -> list[EvaluationDataset]: + """ + List all evaluation datasets for an organization and project with pagination. 
+ + Args: + session: Database session + organization_id: Organization ID + project_id: Project ID + limit: Maximum number of datasets to return (default 50) + offset: Number of datasets to skip (for pagination) + + Returns: + List of EvaluationDataset objects, ordered by most recent first + """ + statement = ( + select(EvaluationDataset) + .where(EvaluationDataset.organization_id == organization_id) + .where(EvaluationDataset.project_id == project_id) + .order_by(EvaluationDataset.inserted_at.desc()) + .limit(limit) + .offset(offset) + ) + + datasets = session.exec(statement).all() + + logger.info( + f"Listed {len(datasets)} datasets for org_id={organization_id}, " + f"project_id={project_id} (limit={limit}, offset={offset})" + ) + + return list(datasets) + + +def upload_csv_to_s3( + storage: CloudStorage, + csv_content: bytes, + dataset_name: str, +) -> str | None: + """ + Upload CSV file to AWS S3. + + Args: + storage: CloudStorage instance + csv_content: Raw CSV content as bytes + dataset_name: Name of the dataset (used for file naming) + + Returns: + S3 URL as string if successful, None if failed + + Note: + This function handles errors gracefully and returns None on failure. + Callers should continue without S3 URL when this returns None. 
+ """ + try: + # Create a file path for the CSV + # Format: datasets/{dataset_name}_{timestamp}.csv + from datetime import datetime + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + file_path = Path(f"datasets/{dataset_name}_{timestamp}.csv") + + # Create a mock UploadFile-like object for the storage put method + import io + + class CSVFile: + def __init__(self, content: bytes): + self.file = io.BytesIO(content) + self.content_type = "text/csv" + + csv_file = CSVFile(csv_content) + + # Upload to S3 + destination = storage.put(source=csv_file, file_path=file_path) + s3_url = str(destination) + + logger.info(f"Successfully uploaded CSV to S3: {s3_url}") + return s3_url + + except CloudStorageError as e: + logger.warning( + f"Failed to upload CSV to S3 for dataset '{dataset_name}': {e}. " + "Continuing without S3 storage." + ) + return None + except Exception as e: + logger.warning( + f"Unexpected error uploading CSV to S3 for dataset '{dataset_name}': {e}. " + "Continuing without S3 storage.", + exc_info=True, + ) + return None + + +def download_csv_from_s3(storage: CloudStorage, s3_url: str) -> bytes: + """ + Download CSV file from AWS S3. + + Args: + storage: CloudStorage instance + s3_url: S3 URL of the CSV file + + Returns: + CSV content as bytes + + Raises: + CloudStorageError: If download fails + ValueError: If s3_url is None or empty + """ + if not s3_url: + raise ValueError("s3_url cannot be None or empty") + + try: + logger.info(f"Downloading CSV from S3: {s3_url}") + body = storage.stream(s3_url) + csv_content = body.read() + logger.info(f"Successfully downloaded CSV from S3: {len(csv_content)} bytes") + return csv_content + except Exception as e: + logger.error(f"Failed to download CSV from S3: {s3_url}: {e}", exc_info=True) + raise + + +def update_dataset_langfuse_id( + session: Session, dataset_id: int, langfuse_dataset_id: str +) -> None: + """ + Update the langfuse_dataset_id for an existing dataset. 
+ + Args: + session: Database session + dataset_id: Dataset ID + langfuse_dataset_id: Langfuse dataset ID to store + + Returns: + None + """ + dataset = session.get(EvaluationDataset, dataset_id) + if dataset: + dataset.langfuse_dataset_id = langfuse_dataset_id + dataset.updated_at = now() + session.add(dataset) + session.commit() + logger.info( + f"Updated langfuse_dataset_id for dataset {dataset_id}: " + f"{langfuse_dataset_id}" + ) + else: + logger.warning(f"Dataset {dataset_id} not found for langfuse_id update") + + +def delete_dataset( + session: Session, dataset_id: int, organization_id: int, project_id: int +) -> tuple[bool, str]: + """ + Delete an evaluation dataset by ID. + + This performs a hard delete from the database. The CSV file in S3 (if exists) + will remain for audit purposes. + + Args: + session: Database session + dataset_id: Dataset ID to delete + organization_id: Organization ID for validation + project_id: Project ID for validation + + Returns: + Tuple of (success: bool, message: str) + """ + # First, fetch the dataset to ensure it exists and belongs to the org/project + dataset = get_dataset_by_id( + session=session, + dataset_id=dataset_id, + organization_id=organization_id, + project_id=project_id, + ) + + if not dataset: + return ( + False, + f"Dataset {dataset_id} not found or not accessible", + ) + + # Check if dataset is being used by any evaluation runs + from sqlmodel import select + + from app.models import EvaluationRun + + statement = select(EvaluationRun).where(EvaluationRun.dataset_id == dataset_id) + evaluation_runs = session.exec(statement).all() + + if evaluation_runs: + return ( + False, + f"Cannot delete dataset {dataset_id}: it is being used by " + f"{len(evaluation_runs)} evaluation run(s). 
Please delete " + f"the evaluation runs first.", + ) + + # Delete the dataset + try: + session.delete(dataset) + session.commit() + + logger.info( + f"Deleted dataset: id={dataset_id}, name={dataset.name}, " + f"org_id={organization_id}, project_id={project_id}" + ) + + return ( + True, + f"Successfully deleted dataset '{dataset.name}' (id={dataset_id})", + ) + + except Exception as e: + session.rollback() + logger.error(f"Failed to delete dataset {dataset_id}: {e}", exc_info=True) + return (False, f"Failed to delete dataset: {e}") diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py index 964c296fc..d21130609 100644 --- a/backend/app/crud/evaluation_langfuse.py +++ b/backend/app/crud/evaluation_langfuse.py @@ -96,7 +96,8 @@ def create_langfuse_dataset_run( langfuse.flush() logger.info( - f"Created Langfuse dataset run '{run_name}' with {len(trace_id_mapping)} traces" + f"Created Langfuse dataset run '{run_name}' with " + f"{len(trace_id_mapping)} traces" ) return trace_id_mapping @@ -120,7 +121,8 @@ def update_traces_with_cosine_scores( Args: langfuse: Configured Langfuse client - per_item_scores: List of per-item score dictionaries from calculate_average_similarity() + per_item_scores: List of per-item score dictionaries from + calculate_average_similarity() Format: [ { "trace_id": "trace-uuid-123", @@ -146,7 +148,10 @@ def update_traces_with_cosine_scores( trace_id=trace_id, name="cosine_similarity", value=cosine_score, - comment="Cosine similarity between generated output and ground truth embeddings", + comment=( + "Cosine similarity between generated output and " + "ground truth embeddings" + ), ) except Exception as e: logger.error( @@ -155,3 +160,117 @@ def update_traces_with_cosine_scores( ) langfuse.flush() + + +def upload_dataset_to_langfuse_from_csv( + langfuse: Langfuse, + csv_content: bytes, + dataset_name: str, + duplication_factor: int, +) -> tuple[str, int]: + """ + Upload a dataset to Langfuse from CSV 
content. + + This function parses CSV content and uploads it to Langfuse with duplication. + Used when re-uploading datasets from S3 storage. + + Args: + langfuse: Configured Langfuse client + csv_content: Raw CSV content as bytes + dataset_name: Name for the dataset in Langfuse + duplication_factor: Number of times to duplicate each item + + Returns: + Tuple of (langfuse_dataset_id, total_items_uploaded) + + Raises: + ValueError: If CSV is invalid or empty + Exception: If Langfuse operations fail + """ + import csv + import io + + logger.info( + f"Uploading dataset '{dataset_name}' to Langfuse from CSV " + f"(duplication_factor={duplication_factor})" + ) + + try: + # Parse CSV content + csv_text = csv_content.decode("utf-8") + csv_reader = csv.DictReader(io.StringIO(csv_text)) + + # Validate CSV headers + if ( + "question" not in csv_reader.fieldnames + or "answer" not in csv_reader.fieldnames + ): + raise ValueError( + f"CSV must contain 'question' and 'answer' columns. " + f"Found columns: {csv_reader.fieldnames}" + ) + + # Read all rows from CSV + original_items = [] + for row in csv_reader: + question = row.get("question", "").strip() + answer = row.get("answer", "").strip() + + if not question or not answer: + logger.warning(f"Skipping row with empty question or answer: {row}") + continue + + original_items.append({"question": question, "answer": answer}) + + if not original_items: + raise ValueError("No valid items found in CSV file") + + logger.info( + f"Parsed {len(original_items)} items from CSV. " + f"Will duplicate {duplication_factor}x for a total of " + f"{len(original_items) * duplication_factor} items." 
+ ) + + # Create or get dataset in Langfuse + dataset = langfuse.create_dataset(name=dataset_name) + + # Upload items with duplication + total_uploaded = 0 + for item in original_items: + # Duplicate each item N times + for duplicate_num in range(duplication_factor): + try: + langfuse.create_dataset_item( + dataset_name=dataset_name, + input={"question": item["question"]}, + expected_output={"answer": item["answer"]}, + metadata={ + "original_question": item["question"], + "duplicate_number": duplicate_num + 1, + "duplication_factor": duplication_factor, + }, + ) + total_uploaded += 1 + except Exception as e: + logger.error( + f"Failed to upload item (duplicate {duplicate_num + 1}): " + f"{item['question'][:50]}... Error: {e}" + ) + + # Flush to ensure all items are uploaded + langfuse.flush() + + langfuse_dataset_id = dataset.id if hasattr(dataset, "id") else None + + logger.info( + f"Successfully uploaded {total_uploaded} items to Langfuse dataset " + f"'{dataset_name}' (id={langfuse_dataset_id})" + ) + + return langfuse_dataset_id, total_uploaded + + except Exception as e: + logger.error( + f"Failed to upload dataset '{dataset_name}' to Langfuse: {e}", exc_info=True + ) + raise diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 62cbd2a77..130ffbb9f 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -45,6 +45,9 @@ ) from .evaluation import ( + EvaluationDataset, + EvaluationDatasetCreate, + EvaluationDatasetPublic, EvaluationRun, EvaluationRunCreate, EvaluationRunPublic, diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index ab4597d1d..5000ab24b 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -19,6 +19,7 @@ class DatasetItem(BaseModel): class DatasetUploadResponse(BaseModel): """Response model for dataset upload.""" + dataset_id: int = Field(..., description="Database ID of the created dataset") dataset_name: str = Field(..., 
description="Name of the created dataset") total_items: int = Field( ..., description="Total number of items uploaded (after duplication)" @@ -32,6 +33,7 @@ class DatasetUploadResponse(BaseModel): langfuse_dataset_id: str | None = Field( None, description="Langfuse dataset ID if available" ) + s3_url: str | None = Field(None, description="AWS S3 URL if uploaded") class EvaluationResult(BaseModel): @@ -60,6 +62,61 @@ class Experiment(BaseModel): # Database Models +class EvaluationDataset(SQLModel, table=True): + """Database table for evaluation datasets.""" + + __tablename__ = "evaluation_dataset" + + id: int = SQLField(default=None, primary_key=True) + + # Dataset information + name: str = SQLField(index=True, description="Name of the dataset") + description: str | None = SQLField( + default=None, description="Optional description of the dataset" + ) + + # Dataset metadata stored as JSON + dataset_metadata: dict[str, Any] = SQLField( + default_factory=dict, + sa_column=Column(JSON), + description=( + "Dataset metadata (original_items_count, total_items_count, " + "duplication_factor)" + ), + ) + + # Storage references + s3_url: str | None = SQLField( + default=None, description="AWS S3 URL where CSV is stored" + ) + langfuse_dataset_id: str | None = SQLField( + default=None, description="Langfuse dataset ID for reference" + ) + + # Foreign keys + organization_id: int = SQLField( + foreign_key="organization.id", nullable=False, ondelete="CASCADE" + ) + project_id: int = SQLField( + foreign_key="project.id", nullable=False, ondelete="CASCADE" + ) + + # Timestamps + inserted_at: datetime = SQLField(default_factory=now, nullable=False) + updated_at: datetime = SQLField(default_factory=now, nullable=False) + + # Relationships + project: "Project" = Relationship( + back_populates="evaluation_datasets" + ) # noqa: F821 + organization: "Organization" = Relationship( + back_populates="evaluation_datasets" + ) # noqa: F821 + evaluation_runs: list["EvaluationRun"] = 
Relationship( + back_populates="evaluation_dataset" + ) + + class EvaluationRun(SQLModel, table=True): """Database table for evaluation runs.""" @@ -78,11 +135,20 @@ class EvaluationRun(SQLModel, table=True): description="Evaluation configuration", ) + # Dataset reference + dataset_id: int | None = SQLField( + default=None, + foreign_key="evaluation_dataset.id", + description="Reference to the evaluation_dataset used for this run", + ) + # Batch job references batch_job_id: int | None = SQLField( default=None, foreign_key="batch_job.id", - description="Reference to the batch_job that processes this evaluation (responses)", + description=( + "Reference to the batch_job that processes this evaluation " "(responses)" + ), ) embedding_batch_job_id: int | None = SQLField( default=None, @@ -130,26 +196,34 @@ class EvaluationRun(SQLModel, table=True): updated_at: datetime = SQLField(default_factory=now, nullable=False) # Relationships - project: "Project" = Relationship(back_populates="evaluation_runs") - organization: "Organization" = Relationship(back_populates="evaluation_runs") - batch_job: Optional["BatchJob"] = Relationship( - sa_relationship_kwargs={"foreign_keys": "[EvaluationRun.batch_job_id]"} + project: "Project" = Relationship(back_populates="evaluation_runs") # noqa: F821 + organization: "Organization" = Relationship( + back_populates="evaluation_runs" ) # noqa: F821 - embedding_batch_job: Optional["BatchJob"] = Relationship( + evaluation_dataset: Optional["EvaluationDataset"] = Relationship( + back_populates="evaluation_runs" + ) + batch_job: Optional["BatchJob"] = Relationship( # noqa: F821 + sa_relationship_kwargs={"foreign_keys": "[EvaluationRun.batch_job_id]"} + ) + embedding_batch_job: Optional["BatchJob"] = Relationship( # noqa: F821 sa_relationship_kwargs={ "foreign_keys": "[EvaluationRun.embedding_batch_job_id]" } - ) # noqa: F821 + ) class EvaluationRunCreate(SQLModel): """Model for creating an evaluation run.""" run_name: str = 
Field(description="Name of the evaluation run", min_length=3) - dataset_name: str = Field(description="Name of the Langfuse dataset", min_length=1) + dataset_id: int = Field(description="ID of the evaluation dataset") config: dict[str, Any] = Field( default_factory=dict, - description="Evaluation configuration (flexible dict with llm, instructions, vector_store_ids, etc.)", + description=( + "Evaluation configuration (flexible dict with llm, instructions, " + "vector_store_ids, etc.)" + ), ) @@ -160,6 +234,7 @@ class EvaluationRunPublic(SQLModel): run_name: str dataset_name: str config: dict[str, Any] + dataset_id: int | None batch_job_id: int | None embedding_batch_job_id: int | None status: str @@ -171,3 +246,36 @@ class EvaluationRunPublic(SQLModel): project_id: int inserted_at: datetime updated_at: datetime + + +class EvaluationDatasetCreate(SQLModel): + """Model for creating an evaluation dataset.""" + + name: str = Field(description="Name of the dataset", min_length=1) + description: str | None = Field(None, description="Optional dataset description") + dataset_metadata: dict[str, Any] = Field( + default_factory=dict, + description=( + "Dataset metadata (original_items_count, total_items_count, " + "duplication_factor)" + ), + ) + s3_url: str | None = Field(None, description="AWS S3 URL where CSV is stored") + langfuse_dataset_id: str | None = Field( + None, description="Langfuse dataset ID for reference" + ) + + +class EvaluationDatasetPublic(SQLModel): + """Public model for evaluation datasets.""" + + id: int + name: str + description: str | None + dataset_metadata: dict[str, Any] + s3_url: str | None + langfuse_dataset_id: str | None + organization_id: int + project_id: int + inserted_at: datetime + updated_at: datetime diff --git a/backend/app/models/organization.py b/backend/app/models/organization.py index 7ac42879b..09a1f9af2 100644 --- a/backend/app/models/organization.py +++ b/backend/app/models/organization.py @@ -12,7 +12,7 @@ from .collection 
import Collection from .openai_conversation import OpenAIConversation from .batch_job import BatchJob - from .evaluation import EvaluationRun + from .evaluation import EvaluationRun, EvaluationDataset # Shared properties for an Organization @@ -57,6 +57,9 @@ class Organization(OrganizationBase, table=True): evaluation_runs: list["EvaluationRun"] = Relationship( back_populates="organization", cascade_delete=True ) + evaluation_datasets: list["EvaluationDataset"] = Relationship( + back_populates="organization", cascade_delete=True + ) batch_jobs: list["BatchJob"] = Relationship( back_populates="organization", cascade_delete=True ) diff --git a/backend/app/models/project.py b/backend/app/models/project.py index fc531cc39..ae43b1e5d 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -58,6 +58,9 @@ class Project(ProjectBase, table=True): evaluation_runs: list["EvaluationRun"] = Relationship( back_populates="project", cascade_delete=True ) + evaluation_datasets: list["EvaluationDataset"] = Relationship( + back_populates="project", cascade_delete=True + ) batch_jobs: list["BatchJob"] = Relationship( back_populates="project", cascade_delete=True ) From 3912d9f9d4b58843e8687534b89e2ab70b16b24b Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 29 Oct 2025 14:33:33 +0530 Subject: [PATCH 30/64] update endpoints --- backend/app/api/routes/evaluation.py | 67 +++++-------------- .../app/tests/api/routes/test_evaluation.py | 44 ++++++------ 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index e6d9a2d47..41df3ca61 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -8,7 +8,6 @@ from app.crud.assistants import get_assistant_by_id from app.crud.credentials import get_provider_credential from app.crud.evaluation_batch import start_evaluation_batch -from app.crud.evaluation_processing import 
poll_all_pending_evaluations from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import ( DatasetUploadResponse, @@ -20,7 +19,7 @@ router = APIRouter(tags=["evaluation"]) -@router.post("/dataset/upload", response_model=DatasetUploadResponse) +@router.post("/evaluations/datasets", response_model=DatasetUploadResponse) async def upload_dataset( file: UploadFile = File( ..., description="CSV file with 'question' and 'answer' columns" @@ -203,7 +202,7 @@ async def upload_dataset( raise ValueError(f"Failed to save dataset metadata: {e}") -@router.get("/dataset/{dataset_id}", response_model=DatasetUploadResponse) +@router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) async def get_dataset( dataset_id: int, _session: Session = Depends(get_db), @@ -248,7 +247,7 @@ async def get_dataset( ) -@router.get("/datasets", response_model=list[DatasetUploadResponse]) +@router.get("/evaluations/datasets", response_model=list[DatasetUploadResponse]) async def list_datasets_endpoint( limit: int = 50, offset: int = 0, @@ -306,7 +305,7 @@ async def list_datasets_endpoint( return response -@router.delete("/dataset/{dataset_id}") +@router.delete("/evaluations/datasets/{dataset_id}") async def delete_dataset( dataset_id: int, _session: Session = Depends(get_db), @@ -347,7 +346,7 @@ async def delete_dataset( return {"message": message, "dataset_id": dataset_id} -@router.post("/evaluate", response_model=EvaluationRunPublic) +@router.post("/evaluations", response_model=EvaluationRunPublic) async def evaluate_threads( dataset_id: int = Body(..., description="ID of the evaluation dataset"), experiment_name: str = Body( @@ -374,10 +373,10 @@ async def evaluate_threads( 7. Returns the evaluation run details with batch_job_id The batch will be processed asynchronously by Celery Beat (every 60s). - Use GET /evaluate/batch/{run_id}/status to check progress. + Use GET /evaluations/{evaluation_id} to check progress. 
Args: - dataset_id: ID of the evaluation dataset (from /dataset/upload) + dataset_id: ID of the evaluation dataset (from /evaluations/datasets) experiment_name: Name for this evaluation experiment/run config: Configuration dict that will be used as-is in JSONL generation. Can include any OpenAI Responses API parameters like: @@ -627,45 +626,9 @@ async def evaluate_threads( return eval_run -@router.post("/evaluate/batch/poll") -async def poll_evaluation_batches( - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), -) -> dict: - """ - Manually trigger polling for all pending evaluations in the current organization. - - This endpoint is useful for: - - Testing the evaluation flow - - Immediately checking status instead of waiting for Celery beat - - Debugging evaluation issues - - Returns: - Summary of polling results including processed, failed, and still - processing counts - """ - logger.info( - f"Manual polling triggered for org_id={_current_user.organization_id} " - f"by user_id={_current_user.user_id}" - ) - - summary = await poll_all_pending_evaluations( - session=_session, org_id=_current_user.organization_id - ) - - logger.info( - f"Manual polling completed for org_id={_current_user.organization_id}: " - f"{summary.get('total', 0)} evaluations checked, " - f"{summary.get('processed', 0)} processed, " - f"{summary.get('failed', 0)} failed" - ) - - return summary - - -@router.get("/evaluate/batch/{run_id}/status", response_model=EvaluationRunPublic) +@router.get("/evaluations/{evaluation_id}", response_model=EvaluationRunPublic) async def get_evaluation_run_status( - run_id: int, + evaluation_id: int, _session: Session = Depends(get_db), _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> EvaluationRunPublic: @@ -673,20 +636,20 @@ async def get_evaluation_run_status( Get the current status of a specific evaluation run. 
Args: - run_id: ID of the evaluation run + evaluation_id: ID of the evaluation run Returns: EvaluationRunPublic with current status and results if completed """ logger.info( - f"Fetching status for evaluation run {run_id} " + f"Fetching status for evaluation run {evaluation_id} " f"(org_id={_current_user.organization_id})" ) # Query the evaluation run statement = ( select(EvaluationRun) - .where(EvaluationRun.id == run_id) + .where(EvaluationRun.id == evaluation_id) .where(EvaluationRun.organization_id == _current_user.organization_id) ) @@ -694,18 +657,18 @@ async def get_evaluation_run_status( if not eval_run: raise ValueError( - f"Evaluation run {run_id} not found or not accessible to this organization" + f"Evaluation run {evaluation_id} not found or not accessible to this organization" ) logger.info( - f"Found evaluation run {run_id}: status={eval_run.status}, " + f"Found evaluation run {evaluation_id}: status={eval_run.status}, " f"batch_job_id={eval_run.batch_job_id}" ) return eval_run -@router.get("/evaluate/batch/list", response_model=list[EvaluationRunPublic]) +@router.get("/evaluations", response_model=list[EvaluationRunPublic]) async def list_evaluation_runs( _session: Session = Depends(get_db), _current_user: UserProjectOrg = Depends(get_current_user_org_project), diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 207de6d67..c39dae87c 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -5,7 +5,7 @@ import pytest from sqlmodel import select -from app.crud.evaluation_batch import build_batch_jsonl +from app.crud.evaluation_batch import build_evaluation_jsonl from app.models import EvaluationRun @@ -64,7 +64,7 @@ def test_upload_dataset_valid_csv( filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, 
"text/csv")}, data={ "dataset_name": "test_dataset", @@ -102,7 +102,7 @@ def test_upload_dataset_missing_columns( filename, file_obj = create_csv_file(invalid_csv_missing_columns) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -131,7 +131,7 @@ def test_upload_dataset_empty_rows( filename, file_obj = create_csv_file(csv_with_empty_rows) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -166,7 +166,7 @@ def test_upload_with_default_duplication( filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -197,7 +197,7 @@ def test_upload_with_custom_duplication( filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -228,7 +228,7 @@ def test_upload_metadata_includes_duplicate_number( filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -267,7 +267,7 @@ def test_upload_langfuse_configuration_fails( filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -289,7 +289,7 @@ def test_upload_invalid_csv_format(self, client, user_api_key_header): mock_langfuse.return_value = (mock_client, True) response = client.post( - 
"/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -306,7 +306,7 @@ def test_upload_without_authentication(self, client, valid_csv_content): filename, file_obj = create_csv_file(valid_csv_content) response = client.post( - "/api/v1/dataset/upload", + "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", @@ -392,7 +392,7 @@ def test_start_batch_evaluation_success( mock_langfuse.return_value = (mock_langfuse_client, True) response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_evaluation_run", "dataset_name": "test_dataset", @@ -462,7 +462,7 @@ def test_start_batch_evaluation_with_vector_stores( mock_langfuse.return_value = (mock_langfuse_client, True) response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_with_vector_stores", "dataset_name": "test_dataset", @@ -498,7 +498,7 @@ def test_start_batch_evaluation_invalid_dataset( mock_langfuse.return_value = (mock_langfuse_client, True) response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_evaluation_run", "dataset_name": "invalid_dataset", @@ -534,7 +534,7 @@ def test_start_batch_evaluation_empty_dataset( mock_langfuse.return_value = (mock_langfuse_client, True) response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_evaluation_run", "dataset_name": "empty_dataset", @@ -551,7 +551,7 @@ def test_start_batch_evaluation_without_authentication( ): """Test batch evaluation requires authentication.""" response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_evaluation_run", "dataset_name": "test_dataset", @@ -581,7 +581,7 @@ def test_start_batch_evaluation_invalid_config(self, client, user_api_key_header # This should still work 
because config is flexible (dict) # but build_batch_jsonl will use defaults for missing values response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "test_evaluation_run", "dataset_name": "test_dataset", @@ -634,7 +634,7 @@ def test_start_batch_evaluation_creates_database_record( mock_langfuse.return_value = (mock_langfuse_client, True) response = client.post( - "/api/v1/evaluate/batch", + "/api/v1/evaluations", json={ "run_name": "database_test_run", "dataset_name": "test_dataset", @@ -682,7 +682,7 @@ def test_build_batch_jsonl_basic(self): "vector_store_ids": [], } - batch_file = build_batch_jsonl(dataset_items, config) + batch_file = build_evaluation_jsonl(dataset_items, config) assert len(batch_file) == 1 @@ -713,7 +713,7 @@ def test_build_batch_jsonl_with_vector_stores(self): "vector_store_ids": ["vs_abc123"], } - batch_file = build_batch_jsonl(dataset_items, config) + batch_file = build_evaluation_jsonl(dataset_items, config) assert len(batch_file) == 1 @@ -735,7 +735,7 @@ def test_build_batch_jsonl_uses_defaults(self): config = {} # Empty config, should use defaults - batch_file = build_batch_jsonl(dataset_items, config) + batch_file = build_evaluation_jsonl(dataset_items, config) assert len(batch_file) == 1 @@ -772,7 +772,7 @@ def test_build_batch_jsonl_skips_empty_questions(self): config = {"llm": {"model": "gpt-4o"}, "instructions": "Test"} - batch_file = build_batch_jsonl(dataset_items, config) + batch_file = build_evaluation_jsonl(dataset_items, config) # Should only have 1 valid item assert len(batch_file) == 1 @@ -798,7 +798,7 @@ def test_build_batch_jsonl_multiple_items(self): "vector_store_ids": [], } - batch_file = build_batch_jsonl(dataset_items, config) + batch_file = build_evaluation_jsonl(dataset_items, config) assert len(batch_file) == 5 From 6012d5cab362eb1d295a0529174efb401fa53b5b Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 12:23:05 +0530 Subject: [PATCH 31/64] updated 
testcases --- .../app/tests/api/routes/test_evaluation.py | 615 ++++++------------ .../app/tests/crud/test_evaluation_batch.py | 177 ----- .../app/tests/crud/test_evaluation_dataset.py | 409 ++++++++++++ .../tests/crud/test_evaluation_embeddings.py | 82 ++- .../tests/crud/test_evaluation_langfuse.py | 414 ++++++++++++ 5 files changed, 1053 insertions(+), 644 deletions(-) delete mode 100644 backend/app/tests/crud/test_evaluation_batch.py create mode 100644 backend/app/tests/crud/test_evaluation_dataset.py create mode 100644 backend/app/tests/crud/test_evaluation_langfuse.py diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index c39dae87c..7158460a9 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -49,17 +49,19 @@ class TestDatasetUploadValidation: """Test CSV validation and parsing.""" def test_upload_dataset_valid_csv( - self, client, user_api_key_header, valid_csv_content + self, client, user_api_key_header, valid_csv_content, db ): """Test uploading a valid CSV file.""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - # Mock Langfuse client - mock_client = MagicMock() - mock_dataset = MagicMock() - mock_dataset.id = "test_dataset_id" - mock_client.create_dataset.return_value = mock_dataset - mock_client.create_dataset_item.return_value = None - mock_langfuse.return_value = (mock_client, True) + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload: + # Mock S3 upload + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + + # Mock Langfuse upload + mock_langfuse_upload.return_value = ("test_dataset_id", 9) filename, file_obj = create_csv_file(valid_csv_content) @@ -68,6 +70,7 @@ def 
test_upload_dataset_valid_csv( files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", + "description": "Test dataset description", "duplication_factor": 3, }, headers=user_api_key_header, @@ -81,12 +84,14 @@ def test_upload_dataset_valid_csv( assert data["total_items"] == 9 # 3 items * 3 duplication assert data["duplication_factor"] == 3 assert data["langfuse_dataset_id"] == "test_dataset_id" + assert data["s3_url"] == "s3://bucket/datasets/test_dataset.csv" + assert "dataset_id" in data + + # Verify S3 upload was called + mock_s3_upload.assert_called_once() - # Verify Langfuse was called correctly - mock_client.create_dataset.assert_called_once_with(name="test_dataset") - assert ( - mock_client.create_dataset_item.call_count == 9 - ) # 3 items * 3 duplicates + # Verify Langfuse upload was called + mock_langfuse_upload.assert_called_once() def test_upload_dataset_missing_columns( self, @@ -95,13 +100,12 @@ def test_upload_dataset_missing_columns( invalid_csv_missing_columns, ): """Test uploading CSV with missing required columns.""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_langfuse.return_value = (mock_client, True) + filename, file_obj = create_csv_file(invalid_csv_missing_columns) - filename, file_obj = create_csv_file(invalid_csv_missing_columns) - - response = client.post( + # The CSV validation happens before any mocked functions are called + # so this test checks the actual validation logic + with pytest.raises(Exception) as exc_info: + client.post( "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ @@ -111,22 +115,22 @@ def test_upload_dataset_missing_columns( headers=user_api_key_header, ) - assert response.status_code == 500 # ValueError is raised - assert ( - "question" in response.text.lower() or "answer" in response.text.lower() - ) + # Check that the error message mentions the missing columns + error_str = 
str(exc_info.value) + assert "question" in error_str.lower() or "answer" in error_str.lower() def test_upload_dataset_empty_rows( self, client, user_api_key_header, csv_with_empty_rows ): """Test uploading CSV with empty rows (should skip them).""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_dataset = MagicMock() - mock_dataset.id = "test_dataset_id" - mock_client.create_dataset.return_value = mock_dataset - mock_client.create_dataset_item.return_value = None - mock_langfuse.return_value = (mock_client, True) + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload: + # Mock S3 and Langfuse uploads + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_langfuse_upload.return_value = ("test_dataset_id", 4) filename, file_obj = create_csv_file(csv_with_empty_rows) @@ -155,13 +159,13 @@ def test_upload_with_default_duplication( self, client, user_api_key_header, valid_csv_content ): """Test uploading with default duplication factor (5).""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_dataset = MagicMock() - mock_dataset.id = "test_dataset_id" - mock_client.create_dataset.return_value = mock_dataset - mock_client.create_dataset_item.return_value = None - mock_langfuse.return_value = (mock_client, True) + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload: + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_langfuse_upload.return_value = ("test_dataset_id", 15) filename, file_obj = 
create_csv_file(valid_csv_content) @@ -186,13 +190,13 @@ def test_upload_with_custom_duplication( self, client, user_api_key_header, valid_csv_content ): """Test uploading with custom duplication factor.""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_dataset = MagicMock() - mock_dataset.id = "test_dataset_id" - mock_client.create_dataset.return_value = mock_dataset - mock_client.create_dataset_item.return_value = None - mock_langfuse.return_value = (mock_client, True) + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload: + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_langfuse_upload.return_value = ("test_dataset_id", 30) filename, file_obj = create_csv_file(valid_csv_content) @@ -213,17 +217,17 @@ def test_upload_with_custom_duplication( assert data["original_items"] == 3 assert data["total_items"] == 30 # 3 items * 10 duplication - def test_upload_metadata_includes_duplicate_number( - self, client, user_api_key_header, valid_csv_content + def test_upload_with_description( + self, client, user_api_key_header, valid_csv_content, db ): - """Test that metadata includes duplicate number for each item.""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_dataset = MagicMock() - mock_dataset.id = "test_dataset_id" - mock_client.create_dataset.return_value = mock_dataset - mock_client.create_dataset_item.return_value = None - mock_langfuse.return_value = (mock_client, True) + """Test uploading with a description.""" + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( + 
"app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload: + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_langfuse_upload.return_value = ("test_dataset_id", 9) filename, file_obj = create_csv_file(valid_csv_content) @@ -231,27 +235,29 @@ def test_upload_metadata_includes_duplicate_number( "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ - "dataset_name": "test_dataset", + "dataset_name": "test_dataset_with_description", + "description": "This is a test dataset for evaluation", "duplication_factor": 3, }, headers=user_api_key_header, ) assert response.status_code == 200, response.text + data = response.json() - # Verify metadata was passed correctly - calls = mock_client.create_dataset_item.call_args_list + # Verify the description is stored + from sqlmodel import select - # Check that each duplicate has correct metadata - duplicate_numbers = set() - for call in calls: - metadata = call.kwargs.get("metadata", {}) - duplicate_numbers.add(metadata["duplicate_number"]) - assert metadata["duplication_factor"] == 3 - assert "original_question" in metadata + from app.models import EvaluationDataset - # Should have duplicate numbers 1, 2, 3 - assert duplicate_numbers == {1, 2, 3} + dataset = db.exec( + select(EvaluationDataset).where( + EvaluationDataset.id == data["dataset_id"] + ) + ).first() + + assert dataset is not None + assert dataset.description == "This is a test dataset for evaluation" class TestDatasetUploadErrors: @@ -261,34 +267,39 @@ def test_upload_langfuse_configuration_fails( self, client, user_api_key_header, valid_csv_content ): """Test when Langfuse client configuration fails.""" - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_langfuse.return_value = (None, False) + with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( + "app.crud.evaluation_dataset.upload_csv_to_s3" + ) as mock_s3_upload, patch( 
+ "app.crud.credentials.get_provider_credential" + ) as mock_get_cred: + # Mock S3 upload succeeds + mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + # Mock Langfuse credentials not found + mock_get_cred.return_value = None filename, file_obj = create_csv_file(valid_csv_content) - response = client.post( - "/api/v1/evaluations/datasets", - files={"file": (filename, file_obj, "text/csv")}, - data={ - "dataset_name": "test_dataset", - "duplication_factor": 5, - }, - headers=user_api_key_header, - ) + with pytest.raises(Exception) as exc_info: + client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) - assert response.status_code == 500 - assert "Failed to configure" in response.text or "Langfuse" in response.text + error_str = str(exc_info.value) + assert "langfuse" in error_str.lower() or "credential" in error_str.lower() def test_upload_invalid_csv_format(self, client, user_api_key_header): """Test uploading invalid CSV format.""" invalid_csv = "not,a,valid\ncsv format here!!!" 
filename, file_obj = create_csv_file(invalid_csv) - with patch("app.crud.evaluation.configure_langfuse") as mock_langfuse: - mock_client = MagicMock() - mock_langfuse.return_value = (mock_client, True) - - response = client.post( + with pytest.raises(Exception) as exc_info: + client.post( "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ @@ -298,8 +309,13 @@ def test_upload_invalid_csv_format(self, client, user_api_key_header): headers=user_api_key_header, ) - # Should fail validation - assert response.status_code == 500 + # Should fail validation - check error contains expected message + error_str = str(exc_info.value) + assert ( + "question" in error_str.lower() + or "answer" in error_str.lower() + or "invalid" in error_str.lower() + ) def test_upload_without_authentication(self, client, valid_csv_content): """Test uploading without authentication.""" @@ -324,227 +340,53 @@ class TestBatchEvaluation: def sample_evaluation_config(self): """Sample evaluation configuration.""" return { - "llm": {"model": "gpt-4o", "temperature": 0.2}, + "model": "gpt-4o", + "temperature": 0.2, "instructions": "You are a helpful assistant", - "vector_store_ids": [], } - @pytest.fixture - def sample_evaluation_config_with_vector_stores(self): - """Sample evaluation configuration with vector stores.""" - return { - "llm": {"model": "gpt-4o-mini", "temperature": 0.5}, - "instructions": "You are an expert assistant with access to documents", - "vector_store_ids": ["vs_abc123", "vs_def456"], - } - - def test_start_batch_evaluation_success( - self, - client, - user_api_key_header, - sample_evaluation_config, - ): - """Test successfully starting a batch evaluation.""" - with patch( - "app.crud.evaluation_batch.fetch_dataset_items" - ) as mock_fetch, patch( - "app.crud.evaluation_batch.upload_batch_file" - ) as mock_upload, patch( - "app.crud.evaluation_batch.create_batch_job" - ) as mock_create_batch, patch( - 
"app.api.routes.evaluation.configure_openai" - ) as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - # Mock dataset items from Langfuse - mock_fetch.return_value = [ - { - "id": "item1", - "input": {"question": "What is 2+2?"}, - "expected_output": {"answer": "4"}, - "metadata": {}, - }, - { - "id": "item2", - "input": {"question": "What is the capital of France?"}, - "expected_output": {"answer": "Paris"}, - "metadata": {}, - }, - ] - - # Mock OpenAI file upload - mock_upload.return_value = "file-abc123" - - # Mock batch job creation - mock_create_batch.return_value = { - "id": "batch_abc123", - "status": "validating", - "created_at": 1234567890, - "endpoint": "/v1/responses", - "input_file_id": "file-abc123", - } - - # Mock clients - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) - - response = client.post( - "/api/v1/evaluations", - json={ - "run_name": "test_evaluation_run", - "dataset_name": "test_dataset", - "config": sample_evaluation_config, - }, - headers=user_api_key_header, - ) - - assert response.status_code == 200, response.text - data = response.json() - - # Verify response structure - assert data["run_name"] == "test_evaluation_run" - assert data["dataset_name"] == "test_dataset" - assert data["config"] == sample_evaluation_config - assert data["status"] == "processing" - assert data["batch_status"] == "validating" - assert data["batch_id"] == "batch_abc123" - assert data["batch_file_id"] == "file-abc123" - assert data["total_items"] == 2 - - # Verify mocks were called - mock_fetch.assert_called_once() - mock_upload.assert_called_once() - mock_create_batch.assert_called_once() - - def test_start_batch_evaluation_with_vector_stores( - self, - client, - user_api_key_header, - sample_evaluation_config_with_vector_stores, - ): - """Test batch evaluation with vector stores 
configured.""" - with patch( - "app.crud.evaluation_batch.fetch_dataset_items" - ) as mock_fetch, patch( - "app.crud.evaluation_batch.upload_batch_file" - ) as mock_upload, patch( - "app.crud.evaluation_batch.create_batch_job" - ) as mock_create_batch, patch( - "app.api.routes.evaluation.configure_openai" - ) as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - mock_fetch.return_value = [ - { - "id": "item1", - "input": {"question": "Test question"}, - "expected_output": {"answer": "Test answer"}, - "metadata": {}, - } - ] - - mock_upload.return_value = "file-xyz789" - mock_create_batch.return_value = { - "id": "batch_xyz789", - "status": "validating", - "created_at": 1234567890, - "endpoint": "/v1/responses", - "input_file_id": "file-xyz789", - } - - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) - - response = client.post( - "/api/v1/evaluations", - json={ - "run_name": "test_with_vector_stores", - "dataset_name": "test_dataset", - "config": sample_evaluation_config_with_vector_stores, - }, - headers=user_api_key_header, - ) - - assert response.status_code == 200, response.text - data = response.json() - - assert data["config"]["vector_store_ids"] == ["vs_abc123", "vs_def456"] - assert data["batch_id"] == "batch_xyz789" - - def test_start_batch_evaluation_invalid_dataset( + def test_start_batch_evaluation_invalid_dataset_id( self, client, user_api_key_header, sample_evaluation_config ): - """Test batch evaluation fails with invalid dataset name.""" - with patch( - "app.crud.evaluation_batch.fetch_dataset_items" - ) as mock_fetch, patch( - "app.api.routes.evaluation.configure_openai" - ) as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - # Mock dataset fetch to raise error - mock_fetch.side_effect = ValueError("Dataset 'invalid_dataset' 
not found") - - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) - - response = client.post( + """Test batch evaluation fails with invalid dataset_id.""" + # Try to start evaluation with non-existent dataset_id + with pytest.raises(Exception) as exc_info: + client.post( "/api/v1/evaluations", json={ - "run_name": "test_evaluation_run", - "dataset_name": "invalid_dataset", + "experiment_name": "test_evaluation_run", + "dataset_id": 99999, # Non-existent "config": sample_evaluation_config, }, headers=user_api_key_header, ) - assert response.status_code == 500 - assert ( - "not found" in response.text.lower() - or "failed" in response.text.lower() - ) + error_str = str(exc_info.value) + assert "not found" in error_str.lower() or "not accessible" in error_str.lower() - def test_start_batch_evaluation_empty_dataset( - self, client, user_api_key_header, sample_evaluation_config - ): - """Test batch evaluation fails with empty dataset.""" - with patch( - "app.crud.evaluation_batch.fetch_dataset_items" - ) as mock_fetch, patch( - "app.api.routes.evaluation.configure_openai" - ) as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - # Mock empty dataset - mock_fetch.side_effect = ValueError("Dataset 'empty_dataset' is empty") - - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) + def test_start_batch_evaluation_missing_model(self, client, user_api_key_header): + """Test batch evaluation fails when model is missing from config.""" + # We don't need a real dataset for this test - the validation should happen + # before dataset lookup. 
Use any dataset_id and expect config validation error + invalid_config = { + "instructions": "You are a helpful assistant", + "temperature": 0.5, + } - response = client.post( + with pytest.raises(Exception) as exc_info: + client.post( "/api/v1/evaluations", json={ - "run_name": "test_evaluation_run", - "dataset_name": "empty_dataset", - "config": sample_evaluation_config, + "experiment_name": "test_no_model", + "dataset_id": 1, # Dummy ID, error should come before this is checked + "config": invalid_config, }, headers=user_api_key_header, ) - assert response.status_code == 500 - assert "empty" in response.text.lower() or "failed" in response.text.lower() + error_str = str(exc_info.value) + # Should fail with either "model" missing or "dataset not found" (both acceptable) + assert "model" in error_str.lower() or "not found" in error_str.lower() def test_start_batch_evaluation_without_authentication( self, client, sample_evaluation_config @@ -553,120 +395,22 @@ def test_start_batch_evaluation_without_authentication( response = client.post( "/api/v1/evaluations", json={ - "run_name": "test_evaluation_run", - "dataset_name": "test_dataset", + "experiment_name": "test_evaluation_run", + "dataset_id": 1, "config": sample_evaluation_config, }, ) assert response.status_code == 401 # Unauthorized - def test_start_batch_evaluation_invalid_config(self, client, user_api_key_header): - """Test batch evaluation with invalid config structure.""" - invalid_config = { - "llm": {"model": "gpt-4o"}, - # Missing instructions - "vector_store_ids": "should_be_list_not_string", - } - - with patch("app.api.routes.evaluation.configure_openai") as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) - - # This should still work because config is flexible (dict) - # 
but build_batch_jsonl will use defaults for missing values - response = client.post( - "/api/v1/evaluations", - json={ - "run_name": "test_evaluation_run", - "dataset_name": "test_dataset", - "config": invalid_config, - }, - headers=user_api_key_header, - ) - - # Should succeed because config validation is flexible - # The function will use defaults where needed - assert response.status_code in [200, 500] # Depends on other mocks - - def test_start_batch_evaluation_creates_database_record( - self, client, user_api_key_header, sample_evaluation_config, db - ): - """Test that batch evaluation creates a proper database record.""" - with patch( - "app.crud.evaluation_batch.fetch_dataset_items" - ) as mock_fetch, patch( - "app.crud.evaluation_batch.upload_batch_file" - ) as mock_upload, patch( - "app.crud.evaluation_batch.create_batch_job" - ) as mock_create_batch, patch( - "app.api.routes.evaluation.configure_openai" - ) as mock_openai, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_langfuse: - mock_fetch.return_value = [ - { - "id": "item1", - "input": {"question": "Test?"}, - "expected_output": {"answer": "Test"}, - "metadata": {}, - } - ] - - mock_upload.return_value = "file-test123" - mock_create_batch.return_value = { - "id": "batch_test123", - "status": "validating", - "created_at": 1234567890, - "endpoint": "/v1/responses", - "input_file_id": "file-test123", - } - - mock_openai_client = MagicMock() - mock_openai.return_value = (mock_openai_client, True) - - mock_langfuse_client = MagicMock() - mock_langfuse.return_value = (mock_langfuse_client, True) - - response = client.post( - "/api/v1/evaluations", - json={ - "run_name": "database_test_run", - "dataset_name": "test_dataset", - "config": sample_evaluation_config, - }, - headers=user_api_key_header, - ) - - assert response.status_code == 200, response.text - - # Verify database record was created - eval_run = db.exec( - select(EvaluationRun).where( - EvaluationRun.run_name == 
"database_test_run" - ) - ).first() - - assert eval_run is not None - assert eval_run.dataset_name == "test_dataset" - assert eval_run.config == sample_evaluation_config - assert eval_run.status == "processing" - assert eval_run.batch_status == "validating" - assert eval_run.batch_id == "batch_test123" - assert eval_run.batch_file_id == "file-test123" - assert eval_run.total_items == 1 - class TestBatchEvaluationJSONLBuilding: """Test JSONL building logic for batch evaluation.""" def test_build_batch_jsonl_basic(self): """Test basic JSONL building with minimal config.""" + from app.crud.evaluation_batch import build_evaluation_jsonl + dataset_items = [ { "id": "item1", @@ -677,27 +421,29 @@ def test_build_batch_jsonl_basic(self): ] config = { - "llm": {"model": "gpt-4o", "temperature": 0.2}, + "model": "gpt-4o", + "temperature": 0.2, "instructions": "You are a helpful assistant", - "vector_store_ids": [], } - batch_file = build_evaluation_jsonl(dataset_items, config) - - assert len(batch_file) == 1 + jsonl_data = build_evaluation_jsonl(dataset_items, config) - request = json.loads(batch_file[0]) + assert len(jsonl_data) == 1 + assert isinstance(jsonl_data[0], dict) + request = jsonl_data[0] assert request["custom_id"] == "item1" assert request["method"] == "POST" assert request["url"] == "/v1/responses" assert request["body"]["model"] == "gpt-4o" + assert request["body"]["temperature"] == 0.2 assert request["body"]["instructions"] == "You are a helpful assistant" assert request["body"]["input"] == "What is 2+2?" 
- assert "tools" not in request["body"] - def test_build_batch_jsonl_with_vector_stores(self): - """Test JSONL building with vector stores.""" + def test_build_batch_jsonl_with_tools(self): + """Test JSONL building with tools configuration.""" + from app.crud.evaluation_batch import build_evaluation_jsonl + dataset_items = [ { "id": "item1", @@ -708,22 +454,27 @@ def test_build_batch_jsonl_with_vector_stores(self): ] config = { - "llm": {"model": "gpt-4o-mini"}, + "model": "gpt-4o-mini", "instructions": "Search documents", - "vector_store_ids": ["vs_abc123"], + "tools": [ + { + "type": "file_search", + "vector_store_ids": ["vs_abc123"], + } + ], } - batch_file = build_evaluation_jsonl(dataset_items, config) - - assert len(batch_file) == 1 + jsonl_data = build_evaluation_jsonl(dataset_items, config) - request = json.loads(batch_file[0]) + assert len(jsonl_data) == 1 + request = jsonl_data[0] + assert request["body"]["tools"][0]["type"] == "file_search" + assert "vs_abc123" in request["body"]["tools"][0]["vector_store_ids"] - assert request["body"]["tools"] == [{"type": "file_search"}] - assert request["body"]["tool_choice"] == "auto" + def test_build_batch_jsonl_minimal_config(self): + """Test JSONL building with minimal config (only model required).""" + from app.crud.evaluation_batch import build_evaluation_jsonl - def test_build_batch_jsonl_uses_defaults(self): - """Test JSONL building with missing config values uses defaults.""" dataset_items = [ { "id": "item1", @@ -733,22 +484,19 @@ def test_build_batch_jsonl_uses_defaults(self): } ] - config = {} # Empty config, should use defaults + config = {"model": "gpt-4o"} # Only model provided - batch_file = build_evaluation_jsonl(dataset_items, config) + jsonl_data = build_evaluation_jsonl(dataset_items, config) - assert len(batch_file) == 1 - - request = json.loads(batch_file[0]) - - # Check defaults - assert request["body"]["model"] == "gpt-4o" # Default model - assert ( - request["body"]["instructions"] == "You are 
a helpful assistant" - ) # Default instructions + assert len(jsonl_data) == 1 + request = jsonl_data[0] + assert request["body"]["model"] == "gpt-4o" + assert request["body"]["input"] == "Test question" def test_build_batch_jsonl_skips_empty_questions(self): """Test that items with empty questions are skipped.""" + from app.crud.evaluation_batch import build_evaluation_jsonl + dataset_items = [ { "id": "item1", @@ -770,18 +518,18 @@ def test_build_batch_jsonl_skips_empty_questions(self): }, ] - config = {"llm": {"model": "gpt-4o"}, "instructions": "Test"} + config = {"model": "gpt-4o", "instructions": "Test"} - batch_file = build_evaluation_jsonl(dataset_items, config) + jsonl_data = build_evaluation_jsonl(dataset_items, config) # Should only have 1 valid item - assert len(batch_file) == 1 - - request = json.loads(batch_file[0]) - assert request["custom_id"] == "item1" + assert len(jsonl_data) == 1 + assert jsonl_data[0]["custom_id"] == "item1" def test_build_batch_jsonl_multiple_items(self): """Test JSONL building with multiple items.""" + from app.crud.evaluation_batch import build_evaluation_jsonl + dataset_items = [ { "id": f"item{i}", @@ -793,16 +541,15 @@ def test_build_batch_jsonl_multiple_items(self): ] config = { - "llm": {"model": "gpt-4o"}, + "model": "gpt-4o", "instructions": "Answer questions", - "vector_store_ids": [], } - batch_file = build_evaluation_jsonl(dataset_items, config) + jsonl_data = build_evaluation_jsonl(dataset_items, config) - assert len(batch_file) == 5 + assert len(jsonl_data) == 5 - for i, line in enumerate(batch_file): - request = json.loads(line) - assert request["custom_id"] == f"item{i}" - assert request["body"]["input"] == f"Question {i}" + for i, request_dict in enumerate(jsonl_data): + assert request_dict["custom_id"] == f"item{i}" + assert request_dict["body"]["input"] == f"Question {i}" + assert request_dict["body"]["model"] == "gpt-4o" diff --git a/backend/app/tests/crud/test_evaluation_batch.py 
b/backend/app/tests/crud/test_evaluation_batch.py deleted file mode 100644 index 6431c9ead..000000000 --- a/backend/app/tests/crud/test_evaluation_batch.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Tests for evaluation batch output parsing.""" - -import json -from app.crud.evaluation_batch import parse_batch_output - - -def test_parse_batch_output_complex_structure(): - """Test parsing batch output with complex answer structure.""" - # Batch output JSONL with complex structure - jsonl_content = json.dumps( - { - "custom_id": "item_123", - "response": { - "status_code": 200, - "body": { - "id": "resp_abc", - "output": [ - { - "type": "file_search_call", - "status": "completed", - }, - { - "type": "message", - "content": [ - { - "type": "output_text", - "text": "This is the extracted answer.", - } - ], - }, - ], - }, - }, - } - ) - - # Dataset items - dataset_items = [ - { - "id": "item_123", - "input": {"question": "What is the answer?"}, - "expected_output": {"answer": "Expected answer"}, - } - ] - - results = parse_batch_output(jsonl_content, dataset_items) - - assert len(results) == 1 - assert results[0]["item_id"] == "item_123" - assert results[0]["question"] == "What is the answer?" - assert results[0]["generated_output"] == "This is the extracted answer." 
- assert results[0]["ground_truth"] == "Expected answer" - - -def test_parse_batch_output_simple_string(): - """Test parsing batch output with simple string output.""" - # Batch output JSONL with simple string - jsonl_content = json.dumps( - { - "custom_id": "item_456", - "response": { - "status_code": 200, - "body": { - "id": "resp_def", - "output": "Simple string answer", - }, - }, - } - ) - - # Dataset items - dataset_items = [ - { - "id": "item_456", - "input": {"question": "Simple question?"}, - "expected_output": {"answer": "Simple expected"}, - } - ] - - results = parse_batch_output(jsonl_content, dataset_items) - - assert len(results) == 1 - assert results[0]["item_id"] == "item_456" - assert results[0]["generated_output"] == "Simple string answer" - - -def test_parse_batch_output_error_handling(): - """Test parsing batch output with error response.""" - # Batch output JSONL with error - jsonl_content = json.dumps( - { - "custom_id": "item_789", - "error": { - "message": "Rate limit exceeded", - "type": "rate_limit_error", - }, - } - ) - - # Dataset items - dataset_items = [ - { - "id": "item_789", - "input": {"question": "Error question?"}, - "expected_output": {"answer": "Error expected"}, - } - ] - - results = parse_batch_output(jsonl_content, dataset_items) - - assert len(results) == 1 - assert results[0]["item_id"] == "item_789" - assert "ERROR: Rate limit exceeded" in results[0]["generated_output"] - - -def test_parse_batch_output_stringified_list(): - """Test parsing batch output with stringified Python list (single quotes).""" - # This is the exact format you showed - Python string representation of a list - stringified_output = str( - [ - { - "id": "fs_0a09867e650850280068ee8d506cd081959c3e4891a733e591", - "type": "file_search_call", - "status": "completed", - "queries": [ - "सीएलएफ की आरजीबी बैठक में आय और व्यय का विवरण प्रस्तुत करने के लिए कौन जिम्मेदार है?" 
- ], - "results": None, - }, - { - "id": "msg_0a09867e650850280068ee8d515d5881959de222d6218b4804", - "type": "message", - "status": "completed", - "content": [ - { - "type": "output_text", - "annotations": [], - "logprobs": [], - "text": "I'm sorry, I couldn't find any relevant information regarding who is responsible for presenting the income and expenditure details at the RGB meeting of CLF in the provided file. If there is more data or another file, I can check that for you.", - } - ], - "role": "assistant", - }, - ] - ) - - # Batch output JSONL with stringified list - jsonl_content = json.dumps( - { - "custom_id": "item_stringified", - "response": { - "status_code": 200, - "body": { - "id": "resp_str", - "output": stringified_output, - }, - }, - } - ) - - # Dataset items - dataset_items = [ - { - "id": "item_stringified", - "input": {"question": "Stringified question?"}, - "expected_output": {"answer": "Stringified expected"}, - } - ] - - results = parse_batch_output(jsonl_content, dataset_items) - - assert len(results) == 1 - assert results[0]["item_id"] == "item_stringified" - assert ( - results[0]["generated_output"] - == "I'm sorry, I couldn't find any relevant information regarding who is responsible for presenting the income and expenditure details at the RGB meeting of CLF in the provided file. If there is more data or another file, I can check that for you." - ) diff --git a/backend/app/tests/crud/test_evaluation_dataset.py b/backend/app/tests/crud/test_evaluation_dataset.py new file mode 100644 index 000000000..243fa2b37 --- /dev/null +++ b/backend/app/tests/crud/test_evaluation_dataset.py @@ -0,0 +1,409 @@ +""" +Tests for evaluation_dataset CRUD operations. 
+""" + +from unittest.mock import MagicMock + +import pytest +from sqlmodel import Session, select + +from app.core.cloud.storage import CloudStorageError +from app.crud.evaluation_dataset import ( + create_evaluation_dataset, + download_csv_from_s3, + get_dataset_by_id, + get_dataset_by_name, + list_datasets, + update_dataset_langfuse_id, + upload_csv_to_s3, +) +from app.models import Organization, Project + + +class TestCreateEvaluationDataset: + """Test creating evaluation datasets.""" + + def test_create_evaluation_dataset_minimal(self, db: Session): + """Test creating a dataset with minimal required fields.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="test_dataset", + dataset_metadata={"original_items_count": 10, "total_items_count": 50}, + organization_id=org.id, + project_id=project.id, + ) + + assert dataset.id is not None + assert dataset.name == "test_dataset" + assert dataset.dataset_metadata["original_items_count"] == 10 + assert dataset.dataset_metadata["total_items_count"] == 50 + assert dataset.organization_id == org.id + assert dataset.project_id == project.id + assert dataset.description is None + assert dataset.s3_url is None + assert dataset.langfuse_dataset_id is None + + def test_create_evaluation_dataset_complete(self, db: Session): + """Test creating a dataset with all fields.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + dataset = create_evaluation_dataset( + session=db, + name="complete_dataset", + description="A complete test dataset", + dataset_metadata={ + "original_items_count": 5, + "total_items_count": 25, + "duplication_factor": 5, + }, + 
s3_url="s3://bucket/datasets/complete_dataset.csv", + langfuse_dataset_id="langfuse_123", + organization_id=org.id, + project_id=project.id, + ) + + assert dataset.id is not None + assert dataset.name == "complete_dataset" + assert dataset.description == "A complete test dataset" + assert dataset.dataset_metadata["duplication_factor"] == 5 + assert dataset.s3_url == "s3://bucket/datasets/complete_dataset.csv" + assert dataset.langfuse_dataset_id == "langfuse_123" + assert dataset.inserted_at is not None + assert dataset.updated_at is not None + + +class TestGetDatasetById: + """Test fetching datasets by ID.""" + + def test_get_dataset_by_id_success(self, db: Session): + """Test fetching an existing dataset by ID.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create a dataset + dataset = create_evaluation_dataset( + session=db, + name="test_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + # Fetch it by ID + fetched = get_dataset_by_id( + session=db, + dataset_id=dataset.id, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is not None + assert fetched.id == dataset.id + assert fetched.name == "test_dataset" + + def test_get_dataset_by_id_not_found(self, db: Session): + """Test fetching a non-existent dataset.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + fetched = get_dataset_by_id( + session=db, + dataset_id=99999, + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + + def test_get_dataset_by_id_wrong_org(self, db: Session): + """Test that datasets from other orgs can't be fetched.""" + # Get organization and project from seeded data + org = 
db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create a dataset + dataset = create_evaluation_dataset( + session=db, + name="test_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + # Try to fetch it with wrong org_id + fetched = get_dataset_by_id( + session=db, + dataset_id=dataset.id, + organization_id=99999, # Wrong org + project_id=project.id, + ) + + assert fetched is None + + +class TestGetDatasetByName: + """Test fetching datasets by name.""" + + def test_get_dataset_by_name_success(self, db: Session): + """Test fetching an existing dataset by name.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create a dataset + create_evaluation_dataset( + session=db, + name="unique_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + # Fetch it by name + fetched = get_dataset_by_name( + session=db, + name="unique_dataset", + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is not None + assert fetched.name == "unique_dataset" + + def test_get_dataset_by_name_not_found(self, db: Session): + """Test fetching a non-existent dataset by name.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + fetched = get_dataset_by_name( + session=db, + name="nonexistent_dataset", + organization_id=org.id, + project_id=project.id, + ) + + assert fetched is None + + +class TestListDatasets: + """Test listing datasets.""" + + def test_list_datasets_empty(self, db: Session): + """Test listing datasets when none exist.""" + # Get organization and project from seeded 
data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + datasets = list_datasets( + session=db, organization_id=org.id, project_id=project.id + ) + + assert len(datasets) == 0 + + def test_list_datasets_multiple(self, db: Session): + """Test listing multiple datasets.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create multiple datasets + for i in range(5): + create_evaluation_dataset( + session=db, + name=f"dataset_{i}", + dataset_metadata={"original_items_count": i}, + organization_id=org.id, + project_id=project.id, + ) + + datasets = list_datasets( + session=db, organization_id=org.id, project_id=project.id + ) + + assert len(datasets) == 5 + # Should be ordered by most recent first + assert datasets[0].name == "dataset_4" + assert datasets[4].name == "dataset_0" + + def test_list_datasets_pagination(self, db: Session): + """Test pagination of datasets.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create 10 datasets + for i in range(10): + create_evaluation_dataset( + session=db, + name=f"dataset_{i}", + dataset_metadata={"original_items_count": i}, + organization_id=org.id, + project_id=project.id, + ) + + # Get first page + page1 = list_datasets( + session=db, organization_id=org.id, project_id=project.id, limit=5, offset=0 + ) + + # Get second page + page2 = list_datasets( + session=db, organization_id=org.id, project_id=project.id, limit=5, offset=5 + ) + + assert len(page1) == 5 + assert len(page2) == 5 + # Ensure no overlap + page1_names = [d.name for d in page1] + page2_names = [d.name for d in page2] + assert len(set(page1_names) & set(page2_names)) == 0 + + +class 
TestUploadCsvToS3: + """Test CSV upload to S3.""" + + def test_upload_csv_to_s3_success(self): + """Test successful S3 upload.""" + mock_storage = MagicMock() + mock_storage.put.return_value = "s3://bucket/datasets/test_dataset.csv" + + csv_content = b"question,answer\nWhat is 2+2?,4\n" + + s3_url = upload_csv_to_s3( + storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" + ) + + assert s3_url == "s3://bucket/datasets/test_dataset.csv" + mock_storage.put.assert_called_once() + + def test_upload_csv_to_s3_cloud_storage_error(self): + """Test S3 upload with CloudStorageError.""" + mock_storage = MagicMock() + mock_storage.put.side_effect = CloudStorageError("S3 bucket not found") + + csv_content = b"question,answer\nWhat is 2+2?,4\n" + + # Should return None on error + s3_url = upload_csv_to_s3( + storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" + ) + + assert s3_url is None + + def test_upload_csv_to_s3_unexpected_error(self): + """Test S3 upload with unexpected error.""" + mock_storage = MagicMock() + mock_storage.put.side_effect = Exception("Unexpected error") + + csv_content = b"question,answer\nWhat is 2+2?,4\n" + + # Should return None on error + s3_url = upload_csv_to_s3( + storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" + ) + + assert s3_url is None + + +class TestDownloadCsvFromS3: + """Test CSV download from S3.""" + + def test_download_csv_from_s3_success(self): + """Test successful S3 download.""" + mock_storage = MagicMock() + mock_body = MagicMock() + mock_body.read.return_value = b"question,answer\nWhat is 2+2?,4\n" + mock_storage.stream.return_value = mock_body + + csv_content = download_csv_from_s3( + storage=mock_storage, s3_url="s3://bucket/datasets/test.csv" + ) + + assert csv_content == b"question,answer\nWhat is 2+2?,4\n" + mock_storage.stream.assert_called_once_with("s3://bucket/datasets/test.csv") + + def test_download_csv_from_s3_empty_url(self): + """Test download with 
empty URL.""" + mock_storage = MagicMock() + + with pytest.raises(ValueError, match="s3_url cannot be None or empty"): + download_csv_from_s3(storage=mock_storage, s3_url=None) + + def test_download_csv_from_s3_error(self): + """Test download with storage error.""" + mock_storage = MagicMock() + mock_storage.stream.side_effect = Exception("S3 download failed") + + with pytest.raises(Exception, match="S3 download failed"): + download_csv_from_s3( + storage=mock_storage, s3_url="s3://bucket/datasets/test.csv" + ) + + +class TestUpdateDatasetLangfuseId: + """Test updating Langfuse ID.""" + + def test_update_dataset_langfuse_id(self, db: Session): + """Test updating Langfuse dataset ID.""" + # Get organization and project from seeded data + org = db.exec(select(Organization)).first() + project = db.exec( + select(Project).where(Project.organization_id == org.id) + ).first() + + # Create a dataset without Langfuse ID + dataset = create_evaluation_dataset( + session=db, + name="test_dataset", + dataset_metadata={"original_items_count": 10}, + organization_id=org.id, + project_id=project.id, + ) + + assert dataset.langfuse_dataset_id is None + + # Update Langfuse ID + update_dataset_langfuse_id( + session=db, dataset_id=dataset.id, langfuse_dataset_id="langfuse_123" + ) + + # Refresh and verify + db.refresh(dataset) + assert dataset.langfuse_dataset_id == "langfuse_123" + + def test_update_dataset_langfuse_id_nonexistent(self, db: Session): + """Test updating Langfuse ID for non-existent dataset.""" + # Should not raise an error, just do nothing + update_dataset_langfuse_id( + session=db, dataset_id=99999, langfuse_dataset_id="langfuse_123" + ) + # No assertion needed, just ensuring it doesn't crash diff --git a/backend/app/tests/crud/test_evaluation_embeddings.py b/backend/app/tests/crud/test_evaluation_embeddings.py index 26468ed67..67bb709c4 100644 --- a/backend/app/tests/crud/test_evaluation_embeddings.py +++ b/backend/app/tests/crud/test_evaluation_embeddings.py @@ 
-31,12 +31,17 @@ def test_build_embedding_jsonl_basic(self): }, ] - jsonl_data = build_embedding_jsonl(results) + trace_id_mapping = { + "item_1": "trace_1", + "item_2": "trace_2", + } + + jsonl_data = build_embedding_jsonl(results, trace_id_mapping) assert len(jsonl_data) == 2 - # Check first item - assert jsonl_data[0]["custom_id"] == "item_1" + # Check first item - uses trace_id as custom_id + assert jsonl_data[0]["custom_id"] == "trace_1" assert jsonl_data[0]["method"] == "POST" assert jsonl_data[0]["url"] == "/v1/embeddings" assert jsonl_data[0]["body"]["model"] == "text-embedding-3-large" @@ -54,8 +59,10 @@ def test_build_embedding_jsonl_custom_model(self): } ] + trace_id_mapping = {"item_1": "trace_1"} + jsonl_data = build_embedding_jsonl( - results, embedding_model="text-embedding-3-small" + results, trace_id_mapping, embedding_model="text-embedding-3-small" ) assert len(jsonl_data) == 1 @@ -84,14 +91,20 @@ def test_build_embedding_jsonl_skips_empty(self): }, ] - jsonl_data = build_embedding_jsonl(results) + trace_id_mapping = { + "item_1": "trace_1", + "item_2": "trace_2", + "item_3": "trace_3", + } + + jsonl_data = build_embedding_jsonl(results, trace_id_mapping) # Only item_3 should be included assert len(jsonl_data) == 1 - assert jsonl_data[0]["custom_id"] == "item_3" + assert jsonl_data[0]["custom_id"] == "trace_3" def test_build_embedding_jsonl_missing_item_id(self): - """Test that items without item_id are skipped.""" + """Test that items without item_id or trace_id are skipped.""" results = [ { # Missing item_id @@ -107,11 +120,14 @@ def test_build_embedding_jsonl_missing_item_id(self): }, ] - jsonl_data = build_embedding_jsonl(results) + # Only item_2 has a mapping + trace_id_mapping = {"item_2": "trace_2"} + + jsonl_data = build_embedding_jsonl(results, trace_id_mapping) # Only item_2 should be included assert len(jsonl_data) == 1 - assert jsonl_data[0]["custom_id"] == "item_2" + assert jsonl_data[0]["custom_id"] == "trace_2" class 
TestParseEmbeddingResults: @@ -121,7 +137,7 @@ def test_parse_embedding_results_basic(self): """Test parsing basic embedding results.""" raw_results = [ { - "custom_id": "item_1", + "custom_id": "trace_1", "response": { "body": { "data": [ @@ -132,7 +148,7 @@ def test_parse_embedding_results_basic(self): }, }, { - "custom_id": "item_2", + "custom_id": "trace_2", "response": { "body": { "data": [ @@ -148,13 +164,13 @@ def test_parse_embedding_results_basic(self): assert len(embedding_pairs) == 2 - # Check first pair - assert embedding_pairs[0]["item_id"] == "item_1" + # Check first pair - now uses trace_id + assert embedding_pairs[0]["trace_id"] == "trace_1" assert embedding_pairs[0]["output_embedding"] == [0.1, 0.2, 0.3] assert embedding_pairs[0]["ground_truth_embedding"] == [0.15, 0.22, 0.32] # Check second pair - assert embedding_pairs[1]["item_id"] == "item_2" + assert embedding_pairs[1]["trace_id"] == "trace_2" assert embedding_pairs[1]["output_embedding"] == [0.5, 0.6, 0.7] assert embedding_pairs[1]["ground_truth_embedding"] == [0.55, 0.65, 0.75] @@ -162,11 +178,11 @@ def test_parse_embedding_results_with_error(self): """Test parsing results with errors.""" raw_results = [ { - "custom_id": "item_1", + "custom_id": "trace_1", "error": {"message": "Rate limit exceeded"}, }, { - "custom_id": "item_2", + "custom_id": "trace_2", "response": { "body": { "data": [ @@ -180,15 +196,15 @@ def test_parse_embedding_results_with_error(self): embedding_pairs = parse_embedding_results(raw_results) - # Only item_2 should be included (item_1 had error) + # Only trace_2 should be included (trace_1 had error) assert len(embedding_pairs) == 1 - assert embedding_pairs[0]["item_id"] == "item_2" + assert embedding_pairs[0]["trace_id"] == "trace_2" def test_parse_embedding_results_missing_embedding(self): """Test parsing results with missing embeddings.""" raw_results = [ { - "custom_id": "item_1", + "custom_id": "trace_1", "response": { "body": { "data": [ @@ -199,7 +215,7 @@ def 
test_parse_embedding_results_missing_embedding(self): }, }, { - "custom_id": "item_2", + "custom_id": "trace_2", "response": { "body": { "data": [ @@ -213,9 +229,9 @@ def test_parse_embedding_results_missing_embedding(self): embedding_pairs = parse_embedding_results(raw_results) - # Only item_2 should be included (item_1 missing index 1) + # Only trace_2 should be included (trace_1 missing index 1) assert len(embedding_pairs) == 1 - assert embedding_pairs[0]["item_id"] == "item_2" + assert embedding_pairs[0]["trace_id"] == "trace_2" class TestCalculateCosineSimilarity: @@ -275,17 +291,17 @@ def test_calculate_average_similarity_basic(self): """Test calculating average similarity for basic embedding pairs.""" embedding_pairs = [ { - "item_id": "item_1", + "trace_id": "trace_1", "output_embedding": [1.0, 0.0, 0.0], "ground_truth_embedding": [1.0, 0.0, 0.0], # Similarity = 1.0 }, { - "item_id": "item_2", + "trace_id": "trace_2", "output_embedding": [1.0, 0.0, 0.0], "ground_truth_embedding": [0.0, 1.0, 0.0], # Similarity = 0.0 }, { - "item_id": "item_3", + "trace_id": "trace_3", "output_embedding": [1.0, 1.0, 0.0], "ground_truth_embedding": [1.0, 0.0, 0.0], # Similarity ≈ 0.707 }, @@ -317,12 +333,12 @@ def test_calculate_average_similarity_per_item_scores(self): """Test that per-item scores are correctly calculated.""" embedding_pairs = [ { - "item_id": "item_1", + "trace_id": "trace_1", "output_embedding": [1.0, 0.0], "ground_truth_embedding": [1.0, 0.0], }, { - "item_id": "item_2", + "trace_id": "trace_2", "output_embedding": [0.0, 1.0], "ground_truth_embedding": [0.0, 1.0], }, @@ -331,9 +347,9 @@ def test_calculate_average_similarity_per_item_scores(self): stats = calculate_average_similarity(embedding_pairs) assert len(stats["per_item_scores"]) == 2 - assert stats["per_item_scores"][0]["item_id"] == "item_1" + assert stats["per_item_scores"][0]["trace_id"] == "trace_1" assert stats["per_item_scores"][0]["cosine_similarity"] == pytest.approx(1.0) - assert 
stats["per_item_scores"][1]["item_id"] == "item_2" + assert stats["per_item_scores"][1]["trace_id"] == "trace_2" assert stats["per_item_scores"][1]["cosine_similarity"] == pytest.approx(1.0) def test_calculate_average_similarity_statistics(self): @@ -341,22 +357,22 @@ def test_calculate_average_similarity_statistics(self): # Create pairs with known similarities embedding_pairs = [ { - "item_id": "item_1", + "trace_id": "trace_1", "output_embedding": [1.0, 0.0], "ground_truth_embedding": [1.0, 0.0], # sim = 1.0 }, { - "item_id": "item_2", + "trace_id": "trace_2", "output_embedding": [1.0, 0.0], "ground_truth_embedding": [0.0, 1.0], # sim = 0.0 }, { - "item_id": "item_3", + "trace_id": "trace_3", "output_embedding": [1.0, 0.0], "ground_truth_embedding": [1.0, 0.0], # sim = 1.0 }, { - "item_id": "item_4", + "trace_id": "trace_4", "output_embedding": [1.0, 0.0], "ground_truth_embedding": [0.0, 1.0], # sim = 0.0 }, diff --git a/backend/app/tests/crud/test_evaluation_langfuse.py b/backend/app/tests/crud/test_evaluation_langfuse.py new file mode 100644 index 000000000..1bc8b2198 --- /dev/null +++ b/backend/app/tests/crud/test_evaluation_langfuse.py @@ -0,0 +1,414 @@ +""" +Tests for evaluation_langfuse CRUD operations. 
+""" + +from unittest.mock import MagicMock + +import pytest + +from app.crud.evaluation_langfuse import ( + create_langfuse_dataset_run, + update_traces_with_cosine_scores, + upload_dataset_to_langfuse_from_csv, +) + + +class TestCreateLangfuseDatasetRun: + """Test creating Langfuse dataset runs.""" + + def test_create_langfuse_dataset_run_success(self): + """Test successfully creating a dataset run with traces.""" + # Mock Langfuse client + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + + # Mock dataset items + mock_item1 = MagicMock() + mock_item1.id = "item_1" + mock_item1.observe.return_value.__enter__.return_value = "trace_id_1" + + mock_item2 = MagicMock() + mock_item2.id = "item_2" + mock_item2.observe.return_value.__enter__.return_value = "trace_id_2" + + mock_dataset.items = [mock_item1, mock_item2] + mock_langfuse.get_dataset.return_value = mock_dataset + + # Test data + results = [ + { + "item_id": "item_1", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4", + }, + { + "item_id": "item_2", + "question": "What is the capital of France?", + "generated_output": "Paris", + "ground_truth": "Paris", + }, + ] + + # Call function + trace_id_mapping = create_langfuse_dataset_run( + langfuse=mock_langfuse, + dataset_name="test_dataset", + run_name="test_run", + results=results, + ) + + # Verify results + assert len(trace_id_mapping) == 2 + assert trace_id_mapping["item_1"] == "trace_id_1" + assert trace_id_mapping["item_2"] == "trace_id_2" + + # Verify Langfuse calls + mock_langfuse.get_dataset.assert_called_once_with("test_dataset") + mock_langfuse.flush.assert_called_once() + assert mock_langfuse.trace.call_count == 2 + + def test_create_langfuse_dataset_run_skips_missing_items(self): + """Test that missing dataset items are skipped.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + + # Only one item exists + mock_item1 = MagicMock() + mock_item1.id = "item_1" + 
mock_item1.observe.return_value.__enter__.return_value = "trace_id_1" + + mock_dataset.items = [mock_item1] + mock_langfuse.get_dataset.return_value = mock_dataset + + # Results include an item that doesn't exist in dataset + results = [ + { + "item_id": "item_1", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4", + }, + { + "item_id": "item_nonexistent", + "question": "Invalid question", + "generated_output": "Invalid", + "ground_truth": "Invalid", + }, + ] + + trace_id_mapping = create_langfuse_dataset_run( + langfuse=mock_langfuse, + dataset_name="test_dataset", + run_name="test_run", + results=results, + ) + + # Only the valid item should be in the mapping + assert len(trace_id_mapping) == 1 + assert "item_1" in trace_id_mapping + assert "item_nonexistent" not in trace_id_mapping + + def test_create_langfuse_dataset_run_handles_trace_error(self): + """Test that trace creation errors are handled gracefully.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + + # First item succeeds + mock_item1 = MagicMock() + mock_item1.id = "item_1" + mock_item1.observe.return_value.__enter__.return_value = "trace_id_1" + + # Second item fails + mock_item2 = MagicMock() + mock_item2.id = "item_2" + mock_item2.observe.side_effect = Exception("Trace creation failed") + + mock_dataset.items = [mock_item1, mock_item2] + mock_langfuse.get_dataset.return_value = mock_dataset + + results = [ + { + "item_id": "item_1", + "question": "What is 2+2?", + "generated_output": "4", + "ground_truth": "4", + }, + { + "item_id": "item_2", + "question": "What is the capital?", + "generated_output": "Paris", + "ground_truth": "Paris", + }, + ] + + trace_id_mapping = create_langfuse_dataset_run( + langfuse=mock_langfuse, + dataset_name="test_dataset", + run_name="test_run", + results=results, + ) + + # Only successful item should be in mapping + assert len(trace_id_mapping) == 1 + assert "item_1" in trace_id_mapping + assert "item_2" not in trace_id_mapping 
+ + def test_create_langfuse_dataset_run_empty_results(self): + """Test with empty results list.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + mock_dataset.items = [] + mock_langfuse.get_dataset.return_value = mock_dataset + + trace_id_mapping = create_langfuse_dataset_run( + langfuse=mock_langfuse, + dataset_name="test_dataset", + run_name="test_run", + results=[], + ) + + assert len(trace_id_mapping) == 0 + mock_langfuse.flush.assert_called_once() + + +class TestUpdateTracesWithCosineScores: + """Test updating Langfuse traces with cosine similarity scores.""" + + def test_update_traces_with_cosine_scores_success(self): + """Test successfully updating traces with scores.""" + mock_langfuse = MagicMock() + + per_item_scores = [ + {"trace_id": "trace_1", "cosine_similarity": 0.95}, + {"trace_id": "trace_2", "cosine_similarity": 0.87}, + {"trace_id": "trace_3", "cosine_similarity": 0.92}, + ] + + update_traces_with_cosine_scores( + langfuse=mock_langfuse, per_item_scores=per_item_scores + ) + + # Verify score was called for each item + assert mock_langfuse.score.call_count == 3 + + # Verify the score calls + calls = mock_langfuse.score.call_args_list + assert calls[0].kwargs["trace_id"] == "trace_1" + assert calls[0].kwargs["name"] == "cosine_similarity" + assert calls[0].kwargs["value"] == 0.95 + assert "cosine similarity" in calls[0].kwargs["comment"].lower() + + assert calls[1].kwargs["trace_id"] == "trace_2" + assert calls[1].kwargs["value"] == 0.87 + + mock_langfuse.flush.assert_called_once() + + def test_update_traces_with_cosine_scores_missing_trace_id(self): + """Test that items without trace_id are skipped.""" + mock_langfuse = MagicMock() + + per_item_scores = [ + {"trace_id": "trace_1", "cosine_similarity": 0.95}, + {"cosine_similarity": 0.87}, # Missing trace_id + {"trace_id": "trace_3", "cosine_similarity": 0.92}, + ] + + update_traces_with_cosine_scores( + langfuse=mock_langfuse, per_item_scores=per_item_scores + ) + + # Should only 
call score for items with trace_id + assert mock_langfuse.score.call_count == 2 + + def test_update_traces_with_cosine_scores_error_handling(self): + """Test that score errors don't stop processing.""" + mock_langfuse = MagicMock() + + # First call succeeds, second fails, third succeeds + mock_langfuse.score.side_effect = [None, Exception("Score failed"), None] + + per_item_scores = [ + {"trace_id": "trace_1", "cosine_similarity": 0.95}, + {"trace_id": "trace_2", "cosine_similarity": 0.87}, + {"trace_id": "trace_3", "cosine_similarity": 0.92}, + ] + + # Should not raise exception + update_traces_with_cosine_scores( + langfuse=mock_langfuse, per_item_scores=per_item_scores + ) + + # All three should have been attempted + assert mock_langfuse.score.call_count == 3 + mock_langfuse.flush.assert_called_once() + + def test_update_traces_with_cosine_scores_empty_list(self): + """Test with empty scores list.""" + mock_langfuse = MagicMock() + + update_traces_with_cosine_scores(langfuse=mock_langfuse, per_item_scores=[]) + + mock_langfuse.score.assert_not_called() + mock_langfuse.flush.assert_called_once() + + +class TestUploadDatasetToLangfuseFromCsv: + """Test uploading datasets to Langfuse from CSV content.""" + + @pytest.fixture + def valid_csv_content(self): + """Valid CSV content.""" + csv_string = """question,answer +"What is 2+2?","4" +"What is the capital of France?","Paris" +"Who wrote Romeo and Juliet?","Shakespeare" +""" + return csv_string.encode("utf-8") + + def test_upload_dataset_to_langfuse_from_csv_success(self, valid_csv_content): + """Test successful upload with duplication.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "dataset_123" + mock_langfuse.create_dataset.return_value = mock_dataset + + langfuse_id, total_items = upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=valid_csv_content, + dataset_name="test_dataset", + duplication_factor=5, + ) + + assert langfuse_id == "dataset_123" + 
assert total_items == 15 # 3 items * 5 duplication + + # Verify dataset creation + mock_langfuse.create_dataset.assert_called_once_with(name="test_dataset") + + # Verify dataset items were created (3 original * 5 duplicates = 15) + assert mock_langfuse.create_dataset_item.call_count == 15 + + mock_langfuse.flush.assert_called_once() + + def test_upload_dataset_to_langfuse_from_csv_duplication_metadata( + self, valid_csv_content + ): + """Test that duplication metadata is included.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "dataset_123" + mock_langfuse.create_dataset.return_value = mock_dataset + + upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=valid_csv_content, + dataset_name="test_dataset", + duplication_factor=3, + ) + + # Check metadata in create_dataset_item calls + calls = mock_langfuse.create_dataset_item.call_args_list + + # Each original item should have 3 duplicates + duplicate_numbers = [] + for call_args in calls: + metadata = call_args.kwargs.get("metadata", {}) + duplicate_numbers.append(metadata.get("duplicate_number")) + assert metadata.get("duplication_factor") == 3 + + # Should have 3 sets of duplicates (1, 2, 3) + assert duplicate_numbers.count(1) == 3 # 3 original items, each with dup #1 + assert duplicate_numbers.count(2) == 3 # 3 original items, each with dup #2 + assert duplicate_numbers.count(3) == 3 # 3 original items, each with dup #3 + + def test_upload_dataset_to_langfuse_from_csv_missing_columns(self): + """Test with CSV missing required columns.""" + mock_langfuse = MagicMock() + + invalid_csv = b"query,response\nWhat is 2+2?,4\n" + + with pytest.raises(ValueError, match="question.*answer"): + upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=invalid_csv, + dataset_name="test_dataset", + duplication_factor=1, + ) + + def test_upload_dataset_to_langfuse_from_csv_empty_rows(self): + """Test that empty rows are skipped.""" + mock_langfuse = 
MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "dataset_123" + mock_langfuse.create_dataset.return_value = mock_dataset + + # CSV with some empty rows + csv_with_empty = b"""question,answer +"Valid question 1","Valid answer 1" +"","Empty answer" +"Valid question 2","" +"Valid question 3","Valid answer 3" +""" + + langfuse_id, total_items = upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=csv_with_empty, + dataset_name="test_dataset", + duplication_factor=2, + ) + + # Should only process 2 valid items (first and last) + assert total_items == 4 # 2 valid items * 2 duplication + assert mock_langfuse.create_dataset_item.call_count == 4 + + def test_upload_dataset_to_langfuse_from_csv_empty_dataset(self): + """Test with CSV that has no valid items.""" + mock_langfuse = MagicMock() + + empty_csv = b"""question,answer +"","" +"","answer without question" +""" + + with pytest.raises(ValueError, match="No valid items found"): + upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=empty_csv, + dataset_name="test_dataset", + duplication_factor=1, + ) + + def test_upload_dataset_to_langfuse_from_csv_invalid_encoding(self): + """Test with invalid CSV encoding.""" + mock_langfuse = MagicMock() + + # Invalid UTF-8 bytes + invalid_csv = b"\xff\xfe Invalid UTF-8" + + with pytest.raises((ValueError, Exception)): + upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=invalid_csv, + dataset_name="test_dataset", + duplication_factor=1, + ) + + def test_upload_dataset_to_langfuse_from_csv_default_duplication( + self, valid_csv_content + ): + """Test upload with duplication factor of 1.""" + mock_langfuse = MagicMock() + mock_dataset = MagicMock() + mock_dataset.id = "dataset_123" + mock_langfuse.create_dataset.return_value = mock_dataset + + langfuse_id, total_items = upload_dataset_to_langfuse_from_csv( + langfuse=mock_langfuse, + csv_content=valid_csv_content, + dataset_name="test_dataset", + 
duplication_factor=1, + ) + + assert total_items == 3 # 3 items * 1 duplication + assert mock_langfuse.create_dataset_item.call_count == 3 From d289794f9b9408cad5c754d64f30898029c234cb Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 13:05:14 +0530 Subject: [PATCH 32/64] using single migration file --- ...mbedding_batch_job_id_to_evaluation_run.py | 50 ----------- ...evaluation_dataset_table_and_dataset_id.py | 82 ------------------- ...5747495bd7c_create_evaluation_run_table.py | 62 +++++++++++++- 3 files changed, 59 insertions(+), 135 deletions(-) delete mode 100644 backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py delete mode 100644 backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py diff --git a/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py b/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py deleted file mode 100644 index 195c8a81c..000000000 --- a/backend/app/alembic/versions/a1b2c3d4e5f6_add_embedding_batch_job_id_to_evaluation_run.py +++ /dev/null @@ -1,50 +0,0 @@ -"""add_embedding_batch_job_id_to_evaluation_run - -Revision ID: a1b2c3d4e5f6 -Revises: d5747495bd7c -Create Date: 2025-10-22 00:00:00.000000 - -""" -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. 
-revision = "a1b2c3d4e5f6" -down_revision = "d5747495bd7c" -branch_labels = None -depends_on = None - - -def upgrade(): - # Add embedding_batch_job_id column to evaluation_run table - op.add_column( - "evaluation_run", - sa.Column( - "embedding_batch_job_id", - sa.Integer(), - nullable=True, - comment="Reference to the batch_job for embedding-based similarity scoring", - ), - ) - - # Add foreign key constraint to batch_job table - op.create_foreign_key( - "fk_evaluation_run_embedding_batch_job_id", - "evaluation_run", - "batch_job", - ["embedding_batch_job_id"], - ["id"], - ) - - -def downgrade(): - # Drop foreign key constraint - op.drop_constraint( - "fk_evaluation_run_embedding_batch_job_id", - "evaluation_run", - type_="foreignkey", - ) - - # Drop embedding_batch_job_id column - op.drop_column("evaluation_run", "embedding_batch_job_id") diff --git a/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py b/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py deleted file mode 100644 index f6ca8a79d..000000000 --- a/backend/app/alembic/versions/b2c3d4e5f6g7_add_evaluation_dataset_table_and_dataset_id.py +++ /dev/null @@ -1,82 +0,0 @@ -"""add_evaluation_dataset_table_and_dataset_id_to_evaluation_run - -Revision ID: b2c3d4e5f6g7 -Revises: a1b2c3d4e5f6 -Create Date: 2025-10-28 00:00:00.000000 - -""" -from alembic import op -import sqlalchemy as sa -import sqlmodel.sql.sqltypes -from sqlalchemy.dialects import postgresql - - -# revision identifiers, used by Alembic. 
-revision = "b2c3d4e5f6g7" -down_revision = "a1b2c3d4e5f6" -branch_labels = None -depends_on = None - - -def upgrade(): - # Create evaluation_dataset table - op.create_table( - "evaluation_dataset", - sa.Column("id", sa.Integer(), nullable=False), - sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), - sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - sa.Column( - "dataset_metadata", - postgresql.JSONB(astext_type=sa.Text()), - nullable=False, - server_default=sa.text("'{}'::jsonb"), - ), - sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - sa.Column( - "langfuse_dataset_id", - sqlmodel.sql.sqltypes.AutoString(), - nullable=True, - ), - sa.Column("organization_id", sa.Integer(), nullable=False), - sa.Column("project_id", sa.Integer(), nullable=False), - sa.Column("inserted_at", sa.DateTime(), nullable=False), - sa.Column("updated_at", sa.DateTime(), nullable=False), - sa.ForeignKeyConstraint( - ["organization_id"], ["organization.id"], ondelete="CASCADE" - ), - sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), - sa.PrimaryKeyConstraint("id"), - ) - op.create_index( - op.f("ix_evaluation_dataset_name"), - "evaluation_dataset", - ["name"], - unique=False, - ) - - # Add dataset_id column to evaluation_run table - op.add_column( - "evaluation_run", - sa.Column("dataset_id", sa.Integer(), nullable=True), - ) - op.create_foreign_key( - "fk_evaluation_run_dataset_id", - "evaluation_run", - "evaluation_dataset", - ["dataset_id"], - ["id"], - ) - - -def downgrade(): - # Drop foreign key and column from evaluation_run - op.drop_constraint( - "fk_evaluation_run_dataset_id", - "evaluation_run", - type_="foreignkey", - ) - op.drop_column("evaluation_run", "dataset_id") - - # Drop evaluation_dataset table - op.drop_index(op.f("ix_evaluation_dataset_name"), table_name="evaluation_dataset") - op.drop_table("evaluation_dataset") diff --git 
a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 79954049d..1b03f490b 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -1,4 +1,4 @@ -"""create_evaluation_run_table and batch_job_table +"""create_evaluation_run_table, batch_job_table, and evaluation_dataset_table Revision ID: d5747495bd7c Revises: e7c68e43ce6f @@ -108,13 +108,55 @@ def upgrade(): op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False ) - # Create evaluation_run table with batch_job_id reference (no old batch columns) + # Create evaluation_dataset table + op.create_table( + "evaluation_dataset", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "dataset_metadata", + postgresql.JSONB(astext_type=sa.Text()), + nullable=False, + server_default=sa.text("'{}'::jsonb"), + ), + sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "langfuse_dataset_id", + sqlmodel.sql.sqltypes.AutoString(), + nullable=True, + ), + sa.Column("organization_id", sa.Integer(), nullable=False), + sa.Column("project_id", sa.Integer(), nullable=False), + sa.Column("inserted_at", sa.DateTime(), nullable=False), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_evaluation_dataset_name"), + "evaluation_dataset", + ["name"], + unique=False, + ) + + # Create evaluation_run table with all columns and foreign key references op.create_table( 
"evaluation_run", sa.Column("run_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("dataset_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("config", sa.JSON(), nullable=False), sa.Column("batch_job_id", sa.Integer(), nullable=True), + sa.Column( + "embedding_batch_job_id", + sa.Integer(), + nullable=True, + comment="Reference to the batch_job for embedding-based similarity scoring", + ), + sa.Column("dataset_id", sa.Integer(), nullable=True), sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), sa.Column("total_items", sa.Integer(), nullable=False), @@ -126,6 +168,16 @@ def upgrade(): sa.Column("inserted_at", sa.DateTime(), nullable=False), sa.Column("updated_at", sa.DateTime(), nullable=False), sa.ForeignKeyConstraint(["batch_job_id"], ["batch_job.id"]), + sa.ForeignKeyConstraint( + ["embedding_batch_job_id"], + ["batch_job.id"], + name="fk_evaluation_run_embedding_batch_job_id", + ), + sa.ForeignKeyConstraint( + ["dataset_id"], + ["evaluation_dataset.id"], + name="fk_evaluation_run_dataset_id", + ), sa.ForeignKeyConstraint( ["organization_id"], ["organization.id"], ondelete="CASCADE" ), @@ -138,10 +190,14 @@ def upgrade(): def downgrade(): - # Drop evaluation_run table first + # Drop evaluation_run table first (has foreign keys to batch_job and evaluation_dataset) op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run") op.drop_table("evaluation_run") + # Drop evaluation_dataset table + op.drop_index(op.f("ix_evaluation_dataset_name"), table_name="evaluation_dataset") + op.drop_table("evaluation_dataset") + # Drop batch_job table op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job") op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job") From e98dae62ff016bc47e521206f014532feeed74fe Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 15:48:10 +0530 Subject: 
[PATCH 33/64] code cleanups --- backend/app/api/deps.py | 2 +- backend/app/api/routes/evaluation.py | 79 +++++++---- backend/app/crud/evaluation_dataset.py | 11 +- backend/app/crud/evaluation_processing.py | 6 - .../app/tests/api/routes/test_evaluation.py | 133 ++++++++++-------- 5 files changed, 135 insertions(+), 96 deletions(-) diff --git a/backend/app/api/deps.py b/backend/app/api/deps.py index 59678d2f9..73cb77427 100644 --- a/backend/app/api/deps.py +++ b/backend/app/api/deps.py @@ -70,7 +70,7 @@ def get_current_user( if not user: raise HTTPException(status_code=404, detail="User not found") if not user.is_active: - raise HTTPException(status_code=400, detail="Inactive user") + raise HTTPException(status_code=403, detail="Inactive user") return user # Return only User object diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 41df3ca61..d013e90cf 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,6 +1,6 @@ import logging -from fastapi import APIRouter, Body, Depends, File, Form, UploadFile +from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile from sqlmodel import Session, select from app.api.deps import get_current_user_org_project, get_db @@ -81,9 +81,10 @@ async def upload_dataset( "question" not in csv_reader.fieldnames or "answer" not in csv_reader.fieldnames ): - raise ValueError( - f"CSV must contain 'question' and 'answer' columns. " - f"Found columns: {csv_reader.fieldnames}" + raise HTTPException( + status_code=400, + detail=f"CSV must contain 'question' and 'answer' columns. 
" + f"Found columns: {csv_reader.fieldnames}", ) # Count original items @@ -95,7 +96,9 @@ async def upload_dataset( original_items.append({"question": question, "answer": answer}) if not original_items: - raise ValueError("No valid items found in CSV file") + raise HTTPException( + status_code=400, detail="No valid items found in CSV file" + ) original_items_count = len(original_items) total_items_count = original_items_count * duplication_factor @@ -107,7 +110,7 @@ async def upload_dataset( except Exception as e: logger.error(f"Failed to parse CSV: {e}", exc_info=True) - raise ValueError(f"Invalid CSV file: {e}") + raise HTTPException(status_code=400, detail=f"Invalid CSV file: {e}") # Step 2: Upload to AWS S3 (if credentials configured) s3_url = None @@ -139,11 +142,15 @@ async def upload_dataset( provider="langfuse", ) if not langfuse_credentials: - raise ValueError("Langfuse credentials not configured") + raise HTTPException( + status_code=400, detail="Langfuse credentials not configured" + ) langfuse, langfuse_success = configure_langfuse(langfuse_credentials) if not langfuse_success: - raise ValueError("Failed to configure Langfuse client") + raise HTTPException( + status_code=500, detail="Failed to configure Langfuse client" + ) # Upload to Langfuse langfuse_dataset_id, _ = upload_dataset_to_langfuse_from_csv( @@ -160,7 +167,9 @@ async def upload_dataset( except Exception as e: logger.error(f"Failed to upload dataset to Langfuse: {e}", exc_info=True) - raise ValueError(f"Failed to upload dataset to Langfuse: {e}") + raise HTTPException( + status_code=500, detail=f"Failed to upload dataset to Langfuse: {e}" + ) # Step 4: Store metadata in database try: @@ -199,7 +208,9 @@ async def upload_dataset( except Exception as e: logger.error(f"Failed to create dataset record in database: {e}", exc_info=True) - raise ValueError(f"Failed to save dataset metadata: {e}") + raise HTTPException( + status_code=500, detail=f"Failed to save dataset metadata: {e}" + ) 
@router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) @@ -233,7 +244,9 @@ async def get_dataset( ) if not dataset: - raise ValueError(f"Dataset {dataset_id} not found or not accessible") + raise HTTPException( + status_code=404, detail=f"Dataset {dataset_id} not found or not accessible" + ) # Build response return DatasetUploadResponse( @@ -340,7 +353,11 @@ async def delete_dataset( ) if not success: - raise ValueError(message) + # Check if it's a not found error or other error type + if "not found" in message.lower(): + raise HTTPException(status_code=404, detail=message) + else: + raise HTTPException(status_code=400, detail=message) logger.info(f"Successfully deleted dataset: id={dataset_id}") return {"message": message, "dataset_id": dataset_id} @@ -446,9 +463,10 @@ async def evaluate_threads( ) if not dataset: - raise ValueError( - f"Dataset {dataset_id} not found or not accessible to this " - f"organization/project" + raise HTTPException( + status_code=404, + detail=f"Dataset {dataset_id} not found or not accessible to this " + f"organization/project", ) logger.info( @@ -475,23 +493,26 @@ async def evaluate_threads( ) if not openai_credentials or not langfuse_credentials: - raise ValueError("OpenAI or Langfuse credentials not configured") + raise HTTPException( + status_code=400, detail="OpenAI or Langfuse credentials not configured" + ) # Configure clients openai_client, openai_success = configure_openai(openai_credentials) langfuse, langfuse_success = configure_langfuse(langfuse_credentials) if not openai_success or not langfuse_success: - raise ValueError("Failed to configure API clients") + raise HTTPException(status_code=500, detail="Failed to configure API clients") # Step 2: Ensure dataset is in Langfuse (re-upload from S3 if needed) if not dataset.langfuse_dataset_id: logger.info(f"Dataset {dataset_id} not yet in Langfuse, uploading from S3") if not dataset.s3_url: - raise ValueError( - f"Dataset {dataset_id} has no S3 
URL and no Langfuse ID. " - "Cannot proceed with evaluation." + raise HTTPException( + status_code=400, + detail=f"Dataset {dataset_id} has no S3 URL and no Langfuse ID. " + "Cannot proceed with evaluation.", ) try: @@ -526,7 +547,9 @@ async def evaluate_threads( f"Failed to upload dataset {dataset_id} to Langfuse from S3: {e}", exc_info=True, ) - raise ValueError(f"Failed to prepare dataset for evaluation: {e}") + raise HTTPException( + status_code=500, detail=f"Failed to prepare dataset for evaluation: {e}" + ) else: logger.info( f"Dataset {dataset_id} already in Langfuse: " @@ -543,7 +566,9 @@ async def evaluate_threads( ) if not assistant: - raise ValueError(f"Assistant {assistant_id} not found") + raise HTTPException( + status_code=404, detail=f"Assistant {assistant_id} not found" + ) logger.info( f"Found assistant in DB: id={assistant.id}, " @@ -576,8 +601,9 @@ async def evaluate_threads( logger.info("Using provided config directly") # Validate that config has minimum required fields if not config.get("model"): - raise ValueError( - "Config must include 'model' when assistant_id is not provided" + raise HTTPException( + status_code=400, + detail="Config must include 'model' when assistant_id is not provided", ) # Create EvaluationRun record @@ -656,8 +682,9 @@ async def get_evaluation_run_status( eval_run = _session.exec(statement).first() if not eval_run: - raise ValueError( - f"Evaluation run {evaluation_id} not found or not accessible to this organization" + raise HTTPException( + status_code=404, + detail=f"Evaluation run {evaluation_id} not found or not accessible to this organization", ) logger.info( diff --git a/backend/app/crud/evaluation_dataset.py b/backend/app/crud/evaluation_dataset.py index b9f798565..c3dfc22c4 100644 --- a/backend/app/crud/evaluation_dataset.py +++ b/backend/app/crud/evaluation_dataset.py @@ -9,14 +9,17 @@ """ import logging + from pathlib import Path from typing import Any - from sqlmodel import Session, select +from datetime 
import datetime from app.core.cloud.storage import CloudStorage, CloudStorageError from app.core.util import now from app.models import EvaluationDataset +from app.models import EvaluationRun + logger = logging.getLogger(__name__) @@ -205,8 +208,6 @@ def upload_csv_to_s3( try: # Create a file path for the CSV # Format: datasets/{dataset_name}_{timestamp}.csv - from datetime import datetime - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") file_path = Path(f"datasets/{dataset_name}_{timestamp}.csv") @@ -332,10 +333,6 @@ def delete_dataset( ) # Check if dataset is being used by any evaluation runs - from sqlmodel import select - - from app.models import EvaluationRun - statement = select(EvaluationRun).where(EvaluationRun.dataset_id == dataset_id) evaluation_runs = session.exec(statement).all() diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index 3f74e2418..1851a7e37 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -63,8 +63,6 @@ def parse_evaluation_output( ... 
] """ - logger.info("Parsing evaluation results") - # Create lookup map for dataset items by ID dataset_map = {item["id"]: item for item in dataset_items} @@ -218,7 +216,6 @@ async def process_completed_evaluation( ) # Step 4: Parse evaluation results - logger.info(f"{log_prefix} Parsing evaluation results") results = parse_evaluation_output( raw_results=raw_results, dataset_items=dataset_items ) @@ -227,7 +224,6 @@ async def process_completed_evaluation( raise ValueError("No valid results found in batch output") # Step 5: Create Langfuse dataset run with traces - logger.info(f"{log_prefix} Creating Langfuse dataset run with traces") trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, dataset_name=eval_run.dataset_name, @@ -243,7 +239,6 @@ async def process_completed_evaluation( # Step 6: Start embedding batch for similarity scoring # Pass trace_id_mapping directly without storing in DB - logger.info(f"{log_prefix} Starting embedding batch for similarity scoring") try: eval_run = start_embedding_batch( session=session, @@ -347,7 +342,6 @@ async def process_completed_embedding_batch( raise ValueError("No valid embedding pairs found in batch output") # Step 4: Calculate similarity scores - logger.info(f"{log_prefix} Calculating cosine similarity scores") similarity_stats = calculate_average_similarity(embedding_pairs=embedding_pairs) # Step 5: Update evaluation_run with scores diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 7158460a9..5a641d7c3 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -104,19 +104,22 @@ def test_upload_dataset_missing_columns( # The CSV validation happens before any mocked functions are called # so this test checks the actual validation logic - with pytest.raises(Exception) as exc_info: - client.post( - "/api/v1/evaluations/datasets", - files={"file": (filename, file_obj, "text/csv")}, - 
data={ - "dataset_name": "test_dataset", - "duplication_factor": 5, - }, - headers=user_api_key_header, - ) + response = client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) - # Check that the error message mentions the missing columns - error_str = str(exc_info.value) + # Check that the response indicates a bad request + assert response.status_code == 400 + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("message", str(response_data)) + ) assert "question" in error_str.lower() or "answer" in error_str.lower() def test_upload_dataset_empty_rows( @@ -279,27 +282,7 @@ def test_upload_langfuse_configuration_fails( filename, file_obj = create_csv_file(valid_csv_content) - with pytest.raises(Exception) as exc_info: - client.post( - "/api/v1/evaluations/datasets", - files={"file": (filename, file_obj, "text/csv")}, - data={ - "dataset_name": "test_dataset", - "duplication_factor": 5, - }, - headers=user_api_key_header, - ) - - error_str = str(exc_info.value) - assert "langfuse" in error_str.lower() or "credential" in error_str.lower() - - def test_upload_invalid_csv_format(self, client, user_api_key_header): - """Test uploading invalid CSV format.""" - invalid_csv = "not,a,valid\ncsv format here!!!" 
- filename, file_obj = create_csv_file(invalid_csv) - - with pytest.raises(Exception) as exc_info: - client.post( + response = client.post( "/api/v1/evaluations/datasets", files={"file": (filename, file_obj, "text/csv")}, data={ @@ -309,8 +292,39 @@ def test_upload_invalid_csv_format(self, client, user_api_key_header): headers=user_api_key_header, ) + # Accept either 400 (credentials not configured) or 500 (configuration/auth fails) + assert response.status_code in [400, 500] + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("message", str(response_data)) + ) + assert ( + "langfuse" in error_str.lower() + or "credential" in error_str.lower() + or "unauthorized" in error_str.lower() + ) + + def test_upload_invalid_csv_format(self, client, user_api_key_header): + """Test uploading invalid CSV format.""" + invalid_csv = "not,a,valid\ncsv format here!!!" + filename, file_obj = create_csv_file(invalid_csv) + + response = client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 5, + }, + headers=user_api_key_header, + ) + # Should fail validation - check error contains expected message - error_str = str(exc_info.value) + assert response.status_code == 400 + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("message", str(response_data)) + ) assert ( "question" in error_str.lower() or "answer" in error_str.lower() @@ -350,18 +364,21 @@ def test_start_batch_evaluation_invalid_dataset_id( ): """Test batch evaluation fails with invalid dataset_id.""" # Try to start evaluation with non-existent dataset_id - with pytest.raises(Exception) as exc_info: - client.post( - "/api/v1/evaluations", - json={ - "experiment_name": "test_evaluation_run", - "dataset_id": 99999, # Non-existent - "config": sample_evaluation_config, - }, - headers=user_api_key_header, - ) + response = 
client.post( + "/api/v1/evaluations", + json={ + "experiment_name": "test_evaluation_run", + "dataset_id": 99999, # Non-existent + "config": sample_evaluation_config, + }, + headers=user_api_key_header, + ) - error_str = str(exc_info.value) + assert response.status_code == 404 + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("message", str(response_data)) + ) assert "not found" in error_str.lower() or "not accessible" in error_str.lower() def test_start_batch_evaluation_missing_model(self, client, user_api_key_header): @@ -373,18 +390,22 @@ def test_start_batch_evaluation_missing_model(self, client, user_api_key_header) "temperature": 0.5, } - with pytest.raises(Exception) as exc_info: - client.post( - "/api/v1/evaluations", - json={ - "experiment_name": "test_no_model", - "dataset_id": 1, # Dummy ID, error should come before this is checked - "config": invalid_config, - }, - headers=user_api_key_header, - ) + response = client.post( + "/api/v1/evaluations", + json={ + "experiment_name": "test_no_model", + "dataset_id": 1, # Dummy ID, error should come before this is checked + "config": invalid_config, + }, + headers=user_api_key_header, + ) - error_str = str(exc_info.value) + # Should fail with either 400 (model missing) or 404 (dataset not found) + assert response.status_code in [400, 404] + response_data = response.json() + error_str = response_data.get( + "detail", response_data.get("message", str(response_data)) + ) # Should fail with either "model" missing or "dataset not found" (both acceptable) assert "model" in error_str.lower() or "not found" in error_str.lower() From cc2df27cc177c0ef5ae65e940dfc7cff10942d5b Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 21:22:33 +0530 Subject: [PATCH 34/64] few more cleanups and tests --- ...5747495bd7c_create_evaluation_run_table.py | 6 + backend/app/api/routes/evaluation.py | 184 ++++++++++-------- backend/app/crud/evaluation_langfuse.py | 1 + 
backend/app/models/evaluation.py | 10 +- 4 files changed, 115 insertions(+), 86 deletions(-) diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 1b03f490b..56bc5508f 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -135,6 +135,12 @@ def upgrade(): ), sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint( + "name", + "organization_id", + "project_id", + name="uq_evaluation_dataset_name_org_project", + ), ) op.create_index( op.f("ix_evaluation_dataset_name"), diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index d013e90cf..fedd2f32f 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,6 +1,7 @@ import logging from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile +from sqlalchemy.exc import IntegrityError from sqlmodel import Session, select from app.api.deps import get_current_user_org_project, get_db @@ -44,6 +45,7 @@ async def upload_dataset( CSV Format: - Must contain 'question' and 'answer' columns - Can have additional columns (will be ignored) + - Missing values in 'question' or 'answer' rows will be skipped Example CSV: ``` @@ -75,6 +77,7 @@ async def upload_dataset( try: csv_text = csv_content.decode("utf-8") csv_reader = csv.DictReader(io.StringIO(csv_text)) + csv_reader.fieldnames = [name.strip() for name in csv_reader.fieldnames] # Validate headers if ( @@ -206,61 +209,25 @@ async def upload_dataset( s3_url=s3_url, ) - except Exception as e: - logger.error(f"Failed to create dataset record in database: {e}", exc_info=True) + except IntegrityError as e: + logger.error( + f"Database integrity error creating dataset '{dataset_name}': {e}", 
+ exc_info=True, + ) raise HTTPException( - status_code=500, detail=f"Failed to save dataset metadata: {e}" + status_code=400, + detail=f"Dataset with name '{dataset_name}' already exists in this " + "organization and project. Please choose a different name.", ) - -@router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) -async def get_dataset( - dataset_id: int, - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), -) -> DatasetUploadResponse: - """ - Get details of a specific dataset by ID. - - Args: - dataset_id: ID of the dataset to retrieve - - Returns: - DatasetUploadResponse with dataset details - """ - from app.crud.evaluation_dataset import get_dataset_by_id - - logger.info( - f"Fetching dataset: id={dataset_id}, " - f"org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id}" - ) - - dataset = get_dataset_by_id( - session=_session, - dataset_id=dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, - ) - - if not dataset: + except Exception as e: + logger.error(f"Failed to create dataset record in database: {e}", exc_info=True) raise HTTPException( - status_code=404, detail=f"Dataset {dataset_id} not found or not accessible" + status_code=500, detail=f"Failed to save dataset metadata: {e}" ) - # Build response - return DatasetUploadResponse( - dataset_id=dataset.id, - dataset_name=dataset.name, - total_items=dataset.dataset_metadata.get("total_items_count", 0), - original_items=dataset.dataset_metadata.get("original_items_count", 0), - duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), - langfuse_dataset_id=dataset.langfuse_dataset_id, - s3_url=dataset.s3_url, - ) - -@router.get("/evaluations/datasets", response_model=list[DatasetUploadResponse]) +@router.get("/evaluations/datasets/list", response_model=list[DatasetUploadResponse]) async def list_datasets_endpoint( limit: int 
= 50, offset: int = 0, @@ -318,6 +285,53 @@ async def list_datasets_endpoint( return response +@router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) +async def get_dataset( + dataset_id: int, + _session: Session = Depends(get_db), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), +) -> DatasetUploadResponse: + """ + Get details of a specific dataset by ID. + + Args: + dataset_id: ID of the dataset to retrieve + + Returns: + DatasetUploadResponse with dataset details + """ + from app.crud.evaluation_dataset import get_dataset_by_id + + logger.info( + f"Fetching dataset: id={dataset_id}, " + f"org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id}" + ) + + dataset = get_dataset_by_id( + session=_session, + dataset_id=dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + + if not dataset: + raise HTTPException( + status_code=404, detail=f"Dataset {dataset_id} not found or not accessible" + ) + + # Build response + return DatasetUploadResponse( + dataset_id=dataset.id, + dataset_name=dataset.name, + total_items=dataset.dataset_metadata.get("total_items_count", 0), + original_items=dataset.dataset_metadata.get("original_items_count", 0), + duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), + langfuse_dataset_id=dataset.langfuse_dataset_id, + s3_url=dataset.s3_url, + ) + + @router.delete("/evaluations/datasets/{dataset_id}") async def delete_dataset( dataset_id: int, @@ -652,6 +666,43 @@ async def evaluate_threads( return eval_run +@router.get("/evaluations/list", response_model=list[EvaluationRunPublic]) +async def list_evaluation_runs( + _session: Session = Depends(get_db), + _current_user: UserProjectOrg = Depends(get_current_user_org_project), + limit: int = 50, + offset: int = 0, +) -> list[EvaluationRunPublic]: + """ + List all evaluation runs for the current organization. 
+ + Args: + limit: Maximum number of runs to return (default 50) + offset: Number of runs to skip (for pagination) + + Returns: + List of EvaluationRunPublic objects, ordered by most recent first + """ + logger.info( + f"Listing evaluation runs for org_id={_current_user.organization_id} " + f"(limit={limit}, offset={offset})" + ) + + statement = ( + select(EvaluationRun) + .where(EvaluationRun.organization_id == _current_user.organization_id) + .order_by(EvaluationRun.inserted_at.desc()) + .limit(limit) + .offset(offset) + ) + + runs = _session.exec(statement).all() + + logger.info(f"Found {len(runs)} evaluation runs") + + return list(runs) + + @router.get("/evaluations/{evaluation_id}", response_model=EvaluationRunPublic) async def get_evaluation_run_status( evaluation_id: int, @@ -693,40 +744,3 @@ async def get_evaluation_run_status( ) return eval_run - - -@router.get("/evaluations", response_model=list[EvaluationRunPublic]) -async def list_evaluation_runs( - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), - limit: int = 50, - offset: int = 0, -) -> list[EvaluationRunPublic]: - """ - List all evaluation runs for the current organization. 
- - Args: - limit: Maximum number of runs to return (default 50) - offset: Number of runs to skip (for pagination) - - Returns: - List of EvaluationRunPublic objects, ordered by most recent first - """ - logger.info( - f"Listing evaluation runs for org_id={_current_user.organization_id} " - f"(limit={limit}, offset={offset})" - ) - - statement = ( - select(EvaluationRun) - .where(EvaluationRun.organization_id == _current_user.organization_id) - .order_by(EvaluationRun.inserted_at.desc()) - .limit(limit) - .offset(offset) - ) - - runs = _session.exec(statement).all() - - logger.info(f"Found {len(runs)} evaluation runs") - - return list(runs) diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluation_langfuse.py index d21130609..6c91d4252 100644 --- a/backend/app/crud/evaluation_langfuse.py +++ b/backend/app/crud/evaluation_langfuse.py @@ -199,6 +199,7 @@ def upload_dataset_to_langfuse_from_csv( # Parse CSV content csv_text = csv_content.decode("utf-8") csv_reader = csv.DictReader(io.StringIO(csv_text)) + csv_reader.fieldnames = [name.strip() for name in csv_reader.fieldnames] # Validate CSV headers if ( diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 5000ab24b..7750c01a2 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -2,7 +2,7 @@ from typing import Any, Optional from pydantic import BaseModel, Field -from sqlalchemy import JSON, Column, Text +from sqlalchemy import JSON, Column, Text, UniqueConstraint from sqlmodel import Field as SQLField from sqlmodel import Relationship, SQLModel @@ -66,6 +66,14 @@ class EvaluationDataset(SQLModel, table=True): """Database table for evaluation datasets.""" __tablename__ = "evaluation_dataset" + __table_args__ = ( + UniqueConstraint( + "name", + "organization_id", + "project_id", + name="uq_evaluation_dataset_name_org_project", + ), + ) id: int = SQLField(default=None, primary_key=True) From 
ebafe8be8a76aa2df5d04550bc5a0e285d254584 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 22:24:12 +0530 Subject: [PATCH 35/64] added support for sanitizing dataset name --- backend/app/api/routes/evaluation.py | 82 ++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 6 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index fedd2f32f..5e8c9fde0 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,4 +1,4 @@ -import logging +import logging, re from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile from sqlalchemy.exc import IntegrityError @@ -20,6 +20,54 @@ router = APIRouter(tags=["evaluation"]) +def sanitize_dataset_name(name: str) -> str: + """ + Sanitize dataset name for Langfuse compatibility. + + Langfuse has issues with spaces and special characters in dataset names. + This function ensures the name can be both created and fetched. + + Rules: + - Replace spaces with underscores + - Replace hyphens with underscores + - Keep only alphanumeric characters and underscores + - Convert to lowercase for consistency + - Remove leading/trailing underscores + - Collapse multiple consecutive underscores into one + + Args: + name: Original dataset name + + Returns: + Sanitized dataset name safe for Langfuse + + Examples: + "testing 0001" -> "testing_0001" + "My Dataset!" 
-> "my_dataset" + "Test--Data__Set" -> "test_data_set" + """ + # Convert to lowercase + sanitized = name.lower() + + # Replace spaces and hyphens with underscores + sanitized = sanitized.replace(" ", "_").replace("-", "_") + + # Keep only alphanumeric characters and underscores + sanitized = re.sub(r"[^a-z0-9_]", "", sanitized) + + # Collapse multiple underscores into one + sanitized = re.sub(r"_+", "_", sanitized) + + # Remove leading/trailing underscores + sanitized = sanitized.strip("_") + + # Ensure name is not empty + if not sanitized: + raise ValueError("Dataset name cannot be empty after sanitization") + + return sanitized + + @router.post("/evaluations/datasets", response_model=DatasetUploadResponse) async def upload_dataset( file: UploadFile = File( @@ -37,10 +85,18 @@ async def upload_dataset( Upload a CSV file containing Golden Q&A pairs. This endpoint: - 1. Validates and parses the CSV file - 2. Uploads CSV to AWS S3 (if credentials configured) - 3. Uploads dataset to Langfuse (for immediate use) - 4. Stores metadata in database + 1. Sanitizes the dataset name (removes spaces, special characters) + 2. Validates and parses the CSV file + 3. Uploads CSV to AWS S3 (if credentials configured) + 4. Uploads dataset to Langfuse (for immediate use) + 5. Stores metadata in database + + Dataset Name: + - Will be sanitized for Langfuse compatibility + - Spaces replaced with underscores + - Special characters removed + - Converted to lowercase + - Example: "My Dataset 01!" 
becomes "my_dataset_01" CSV Format: - Must contain 'question' and 'answer' columns @@ -56,11 +112,22 @@ async def upload_dataset( Returns: DatasetUploadResponse with dataset_id, s3_url, and Langfuse details + (dataset_name in response will be the sanitized version) """ from app.core.cloud import get_cloud_storage from app.crud.evaluation_dataset import create_evaluation_dataset, upload_csv_to_s3 from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv + # Sanitize dataset name for Langfuse compatibility + original_name = dataset_name + try: + dataset_name = sanitize_dataset_name(dataset_name) + except ValueError as e: + raise HTTPException(status_code=400, detail=f"Invalid dataset name: {str(e)}") + + if original_name != dataset_name: + logger.info(f"Dataset name sanitized: '{original_name}' -> '{dataset_name}'") + logger.info( f"Uploading dataset: {dataset_name} with duplication factor: " f"{duplication_factor}, org_id={_current_user.organization_id}, " @@ -735,7 +802,10 @@ async def get_evaluation_run_status( if not eval_run: raise HTTPException( status_code=404, - detail=f"Evaluation run {evaluation_id} not found or not accessible to this organization", + detail=( + f"Evaluation run {evaluation_id} not found or not accessible " + "to this organization" + ), ) logger.info( From a21709fbc12829b641655dbe390095034fa37f19 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 30 Oct 2025 22:59:11 +0530 Subject: [PATCH 36/64] fix import issues in testcases --- .../app/tests/api/routes/test_evaluation.py | 89 +++++++++---------- 1 file changed, 42 insertions(+), 47 deletions(-) diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 5a641d7c3..33f8f4740 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,12 +1,11 @@ import io -import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch import 
pytest from sqlmodel import select from app.crud.evaluation_batch import build_evaluation_jsonl -from app.models import EvaluationRun +from app.models import EvaluationDataset # Helper function to create CSV file-like object @@ -52,11 +51,13 @@ def test_upload_dataset_valid_csv( self, client, user_api_key_header, valid_csv_content, db ): """Test uploading a valid CSV file.""" - with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" - ) as mock_langfuse_upload: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): # Mock S3 upload mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" @@ -126,11 +127,13 @@ def test_upload_dataset_empty_rows( self, client, user_api_key_header, csv_with_empty_rows ): """Test uploading CSV with empty rows (should skip them).""" - with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" - ) as mock_langfuse_upload: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): # Mock S3 and Langfuse uploads mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_langfuse_upload.return_value = ("test_dataset_id", 4) @@ -162,11 +165,13 @@ def test_upload_with_default_duplication( self, client, user_api_key_header, valid_csv_content ): """Test uploading with default duplication factor (5).""" 
- with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" - ) as mock_langfuse_upload: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_langfuse_upload.return_value = ("test_dataset_id", 15) @@ -193,11 +198,13 @@ def test_upload_with_custom_duplication( self, client, user_api_key_header, valid_csv_content ): """Test uploading with custom duplication factor.""" - with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" - ) as mock_langfuse_upload: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_langfuse_upload.return_value = ("test_dataset_id", 30) @@ -224,11 +231,13 @@ def test_upload_with_description( self, client, user_api_key_header, valid_csv_content, db ): """Test uploading with a description.""" - with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" - ) as mock_langfuse_upload: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as 
mock_s3_upload, + patch( + "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_langfuse_upload.return_value = ("test_dataset_id", 9) @@ -249,10 +258,6 @@ def test_upload_with_description( data = response.json() # Verify the description is stored - from sqlmodel import select - - from app.models import EvaluationDataset - dataset = db.exec( select(EvaluationDataset).where( EvaluationDataset.id == data["dataset_id"] @@ -270,11 +275,11 @@ def test_upload_langfuse_configuration_fails( self, client, user_api_key_header, valid_csv_content ): """Test when Langfuse client configuration fails.""" - with patch("app.core.cloud.get_cloud_storage") as _mock_storage, patch( - "app.crud.evaluation_dataset.upload_csv_to_s3" - ) as mock_s3_upload, patch( - "app.crud.credentials.get_provider_credential" - ) as mock_get_cred: + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.crud.credentials.get_provider_credential") as mock_get_cred, + ): # Mock S3 upload succeeds mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" # Mock Langfuse credentials not found @@ -430,8 +435,6 @@ class TestBatchEvaluationJSONLBuilding: def test_build_batch_jsonl_basic(self): """Test basic JSONL building with minimal config.""" - from app.crud.evaluation_batch import build_evaluation_jsonl - dataset_items = [ { "id": "item1", @@ -463,8 +466,6 @@ def test_build_batch_jsonl_basic(self): def test_build_batch_jsonl_with_tools(self): """Test JSONL building with tools configuration.""" - from app.crud.evaluation_batch import build_evaluation_jsonl - dataset_items = [ { "id": "item1", @@ -494,8 +495,6 @@ def test_build_batch_jsonl_with_tools(self): def test_build_batch_jsonl_minimal_config(self): """Test JSONL building with minimal config (only model required).""" - 
from app.crud.evaluation_batch import build_evaluation_jsonl - dataset_items = [ { "id": "item1", @@ -516,8 +515,6 @@ def test_build_batch_jsonl_minimal_config(self): def test_build_batch_jsonl_skips_empty_questions(self): """Test that items with empty questions are skipped.""" - from app.crud.evaluation_batch import build_evaluation_jsonl - dataset_items = [ { "id": "item1", @@ -549,8 +546,6 @@ def test_build_batch_jsonl_skips_empty_questions(self): def test_build_batch_jsonl_multiple_items(self): """Test JSONL building with multiple items.""" - from app.crud.evaluation_batch import build_evaluation_jsonl - dataset_items = [ { "id": f"item{i}", From ed0da5856a362d9e3ba1867ee3c0a663d5c4465a Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 31 Oct 2025 11:16:35 +0530 Subject: [PATCH 37/64] fixing imports --- backend/app/api/routes/evaluation.py | 39 ++++++++++++---------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 5e8c9fde0..851561f40 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -1,14 +1,30 @@ -import logging, re +import csv +import io +import logging +import re from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile from sqlalchemy.exc import IntegrityError from sqlmodel import Session, select from app.api.deps import get_current_user_org_project, get_db +from app.core.cloud import get_cloud_storage from app.core.util import configure_langfuse, configure_openai, now from app.crud.assistants import get_assistant_by_id from app.crud.credentials import get_provider_credential from app.crud.evaluation_batch import start_evaluation_batch +from app.crud.evaluation_dataset import ( + create_evaluation_dataset, + download_csv_from_s3, + get_dataset_by_id, + list_datasets, + update_dataset_langfuse_id, + upload_csv_to_s3, +) +from app.crud.evaluation_dataset import ( + delete_dataset 
as delete_dataset_crud, +) +from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import ( DatasetUploadResponse, @@ -114,10 +130,6 @@ async def upload_dataset( DatasetUploadResponse with dataset_id, s3_url, and Langfuse details (dataset_name in response will be the sanitized version) """ - from app.core.cloud import get_cloud_storage - from app.crud.evaluation_dataset import create_evaluation_dataset, upload_csv_to_s3 - from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv - # Sanitize dataset name for Langfuse compatibility original_name = dataset_name try: @@ -138,9 +150,6 @@ async def upload_dataset( csv_content = await file.read() # Step 1: Parse and validate CSV - import csv - import io - try: csv_text = csv_content.decode("utf-8") csv_reader = csv.DictReader(io.StringIO(csv_text)) @@ -311,8 +320,6 @@ async def list_datasets_endpoint( Returns: List of DatasetUploadResponse objects, ordered by most recent first """ - from app.crud.evaluation_dataset import list_datasets - # Enforce maximum limit if limit > 100: limit = 100 @@ -367,8 +374,6 @@ async def get_dataset( Returns: DatasetUploadResponse with dataset details """ - from app.crud.evaluation_dataset import get_dataset_by_id - logger.info( f"Fetching dataset: id={dataset_id}, " f"org_id={_current_user.organization_id}, " @@ -418,8 +423,6 @@ async def delete_dataset( Returns: Success message with deleted dataset details """ - from app.crud.evaluation_dataset import delete_dataset as delete_dataset_crud - logger.info( f"Deleting dataset: id={dataset_id}, " f"org_id={_current_user.organization_id}, " @@ -519,14 +522,6 @@ async def evaluate_threads( Returns: EvaluationRunPublic with batch details and status """ - from app.core.cloud import get_cloud_storage - from app.crud.evaluation_dataset import ( - download_csv_from_s3, - get_dataset_by_id, - update_dataset_langfuse_id, - ) - from 
app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv - logger.info( f"Starting evaluation: experiment_name={experiment_name}, " f"dataset_id={dataset_id}, " From 11663da3821e7643991148ee469cf6d821807352 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 31 Oct 2025 13:14:15 +0530 Subject: [PATCH 38/64] minor cleanups for evaluation --- backend/app/api/routes/evaluation.py | 70 +++++----------------------- 1 file changed, 11 insertions(+), 59 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 851561f40..1cd05e956 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -15,10 +15,8 @@ from app.crud.evaluation_batch import start_evaluation_batch from app.crud.evaluation_dataset import ( create_evaluation_dataset, - download_csv_from_s3, get_dataset_by_id, list_datasets, - update_dataset_langfuse_id, upload_csv_to_s3, ) from app.crud.evaluation_dataset import ( @@ -465,13 +463,12 @@ async def evaluate_threads( Start an evaluation using OpenAI Batch API. This endpoint: - 1. Fetches the dataset from database - 2. Ensures dataset is uploaded to Langfuse (re-uploads from S3 if needed) - 3. Creates an EvaluationRun record in the database - 4. Fetches dataset items from Langfuse - 5. Builds JSONL for batch processing (config is used as-is) - 6. Creates a batch job via the generic batch infrastructure - 7. Returns the evaluation run details with batch_job_id + 1. Fetches the dataset from database and validates it has Langfuse dataset ID + 2. Creates an EvaluationRun record in the database + 3. Fetches dataset items from Langfuse + 4. Builds JSONL for batch processing (config is used as-is) + 5. Creates a batch job via the generic batch infrastructure + 6. Returns the evaluation run details with batch_job_id The batch will be processed asynchronously by Celery Beat (every 60s). Use GET /evaluations/{evaluation_id} to check progress. 
@@ -552,7 +549,6 @@ async def evaluate_threads( ) dataset_name = dataset.name - duplication_factor = dataset.dataset_metadata.get("duplication_factor", 5) # Get credentials openai_credentials = get_provider_credential( @@ -580,56 +576,12 @@ async def evaluate_threads( if not openai_success or not langfuse_success: raise HTTPException(status_code=500, detail="Failed to configure API clients") - # Step 2: Ensure dataset is in Langfuse (re-upload from S3 if needed) + # Validate dataset has Langfuse ID (should have been set during dataset creation) if not dataset.langfuse_dataset_id: - logger.info(f"Dataset {dataset_id} not yet in Langfuse, uploading from S3") - - if not dataset.s3_url: - raise HTTPException( - status_code=400, - detail=f"Dataset {dataset_id} has no S3 URL and no Langfuse ID. " - "Cannot proceed with evaluation.", - ) - - try: - # Download CSV from S3 - storage = get_cloud_storage( - session=_session, project_id=_current_user.project_id - ) - csv_content = download_csv_from_s3(storage=storage, s3_url=dataset.s3_url) - - # Upload to Langfuse - langfuse_dataset_id, _ = upload_dataset_to_langfuse_from_csv( - langfuse=langfuse, - csv_content=csv_content, - dataset_name=dataset_name, - duplication_factor=duplication_factor, - ) - - # Update dataset record with langfuse_dataset_id - update_dataset_langfuse_id( - session=_session, - dataset_id=dataset.id, - langfuse_dataset_id=langfuse_dataset_id, - ) - - logger.info( - f"Successfully uploaded dataset {dataset_id} to Langfuse: " - f"langfuse_id={langfuse_dataset_id}" - ) - - except Exception as e: - logger.error( - f"Failed to upload dataset {dataset_id} to Langfuse from S3: {e}", - exc_info=True, - ) - raise HTTPException( - status_code=500, detail=f"Failed to prepare dataset for evaluation: {e}" - ) - else: - logger.info( - f"Dataset {dataset_id} already in Langfuse: " - f"langfuse_id={dataset.langfuse_dataset_id}" + raise HTTPException( + status_code=400, + detail=f"Dataset {dataset_id} does not have a 
Langfuse dataset ID. " + "Please ensure Langfuse credentials were configured when the dataset was created.", ) # Handle assistant_id if provided From 5988f8071e03e200e008dfc17a405a7e47d4fee3 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 31 Oct 2025 15:52:27 +0530 Subject: [PATCH 39/64] passing project id as well --- backend/app/api/routes/evaluation.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 1cd05e956..7215b7164 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -322,12 +322,6 @@ async def list_datasets_endpoint( if limit > 100: limit = 100 - logger.info( - f"Listing datasets: org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id}, limit={limit}, " - f"offset={offset}" - ) - datasets = list_datasets( session=_session, organization_id=_current_user.organization_id, @@ -353,7 +347,6 @@ async def list_datasets_endpoint( ) ) - logger.info(f"Found {len(response)} datasets") return response @@ -390,7 +383,6 @@ async def get_dataset( status_code=404, detail=f"Dataset {dataset_id} not found or not accessible" ) - # Build response return DatasetUploadResponse( dataset_id=dataset.id, dataset_name=dataset.name, @@ -698,13 +690,14 @@ async def list_evaluation_runs( List of EvaluationRunPublic objects, ordered by most recent first """ logger.info( - f"Listing evaluation runs for org_id={_current_user.organization_id} " - f"(limit={limit}, offset={offset})" + f"Listing evaluation runs for org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id} (limit={limit}, offset={offset})" ) statement = ( select(EvaluationRun) .where(EvaluationRun.organization_id == _current_user.organization_id) + .where(EvaluationRun.project_id == _current_user.project_id) .order_by(EvaluationRun.inserted_at.desc()) .limit(limit) .offset(offset) @@ -734,7 +727,8 @@ async 
def get_evaluation_run_status( """ logger.info( f"Fetching status for evaluation run {evaluation_id} " - f"(org_id={_current_user.organization_id})" + f"(org_id={_current_user.organization_id}, " + f"project_id={_current_user.project_id})" ) # Query the evaluation run @@ -742,6 +736,7 @@ async def get_evaluation_run_status( select(EvaluationRun) .where(EvaluationRun.id == evaluation_id) .where(EvaluationRun.organization_id == _current_user.organization_id) + .where(EvaluationRun.project_id == _current_user.project_id) ) eval_run = _session.exec(statement).first() From 22361fe8f0895302fdb93fc00d25ce6705129884 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 11:08:04 +0530 Subject: [PATCH 40/64] updated testcases and error codes --- backend/app/api/routes/evaluation.py | 8 +-- .../app/tests/api/routes/test_evaluation.py | 50 +++++++++++++------ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 7215b7164..af08d8260 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -159,7 +159,7 @@ async def upload_dataset( or "answer" not in csv_reader.fieldnames ): raise HTTPException( - status_code=400, + status_code=422, detail=f"CSV must contain 'question' and 'answer' columns. 
" f"Found columns: {csv_reader.fieldnames}", ) @@ -174,7 +174,7 @@ async def upload_dataset( if not original_items: raise HTTPException( - status_code=400, detail="No valid items found in CSV file" + status_code=422, detail="No valid items found in CSV file" ) original_items_count = len(original_items) @@ -187,7 +187,7 @@ async def upload_dataset( except Exception as e: logger.error(f"Failed to parse CSV: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=f"Invalid CSV file: {e}") + raise HTTPException(status_code=422, detail=f"Invalid CSV file: {e}") # Step 2: Upload to AWS S3 (if credentials configured) s3_url = None @@ -289,7 +289,7 @@ async def upload_dataset( exc_info=True, ) raise HTTPException( - status_code=400, + status_code=409, detail=f"Dataset with name '{dataset_name}' already exists in this " "organization and project. Please choose a different name.", ) diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 33f8f4740..e8cb3cb55 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -53,14 +53,20 @@ def test_upload_dataset_valid_csv( """Test uploading a valid CSV file.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, + patch( + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): # Mock S3 upload mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + # Mock Langfuse configuration + mock_configure_langfuse.return_value = (None, True) + # Mock Langfuse upload mock_langfuse_upload.return_value = ("test_dataset_id", 9) @@ -115,8 +121,8 @@ 
def test_upload_dataset_missing_columns( headers=user_api_key_header, ) - # Check that the response indicates a bad request - assert response.status_code == 400 + # Check that the response indicates unprocessable entity + assert response.status_code == 422 response_data = response.json() error_str = response_data.get( "detail", response_data.get("message", str(response_data)) @@ -129,13 +135,17 @@ def test_upload_dataset_empty_rows( """Test uploading CSV with empty rows (should skip them).""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): # Mock S3 and Langfuse uploads mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 4) filename, file_obj = create_csv_file(csv_with_empty_rows) @@ -167,12 +177,16 @@ def test_upload_with_default_duplication( """Test uploading with default duplication factor (5).""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, + patch( + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_configure_langfuse.return_value = (None, True) 
mock_langfuse_upload.return_value = ("test_dataset_id", 15) filename, file_obj = create_csv_file(valid_csv_content) @@ -200,12 +214,16 @@ def test_upload_with_custom_duplication( """Test uploading with custom duplication factor.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 30) filename, file_obj = create_csv_file(valid_csv_content) @@ -233,12 +251,16 @@ def test_upload_with_description( """Test uploading with a description.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, patch( - "app.crud.evaluation_langfuse.upload_dataset_to_langfuse_from_csv" + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 9) filename, file_obj = create_csv_file(valid_csv_content) @@ -277,7 +299,7 @@ def test_upload_langfuse_configuration_fails( """Test when Langfuse client configuration fails.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - 
patch("app.crud.evaluation_dataset.upload_csv_to_s3") as mock_s3_upload, + patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, patch("app.crud.credentials.get_provider_credential") as mock_get_cred, ): # Mock S3 upload succeeds @@ -325,7 +347,7 @@ def test_upload_invalid_csv_format(self, client, user_api_key_header): ) # Should fail validation - check error contains expected message - assert response.status_code == 400 + assert response.status_code == 422 response_data = response.json() error_str = response_data.get( "detail", response_data.get("message", str(response_data)) From d9704e38b5ac62ed87addb452537296586363fff Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 12:33:39 +0530 Subject: [PATCH 41/64] using util for file uploads --- ...5747495bd7c_create_evaluation_run_table.py | 11 +- backend/app/api/routes/evaluation.py | 39 ++-- backend/app/core/storage_utils.py | 167 ++++++++++++++++++ backend/app/crud/batch_operations.py | 104 ++++++----- backend/app/crud/evaluation_dataset.py | 112 ++++++------ backend/app/crud/evaluation_processing.py | 23 +-- backend/app/models/evaluation.py | 20 ++- .../app/tests/api/routes/test_evaluation.py | 48 +++-- .../app/tests/crud/test_evaluation_dataset.py | 72 ++++---- 9 files changed, 393 insertions(+), 203 deletions(-) create mode 100644 backend/app/core/storage_utils.py diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 56bc5508f..737b4f40c 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -5,12 +5,11 @@ Create Date: 2025-10-14 12:42:15.464302 """ -from alembic import op import sqlalchemy as sa import sqlmodel.sql.sqltypes +from alembic import op from sqlalchemy.dialects import postgresql - # revision identifiers, used by Alembic. 
revision = "d5747495bd7c" down_revision = "e7c68e43ce6f" @@ -120,7 +119,9 @@ def upgrade(): nullable=False, server_default=sa.text("'{}'::jsonb"), ), - sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "object_store_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True + ), sa.Column( "langfuse_dataset_id", sqlmodel.sql.sqltypes.AutoString(), @@ -164,7 +165,9 @@ def upgrade(): ), sa.Column("dataset_id", sa.Integer(), nullable=True), sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False), - sa.Column("s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "object_store_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True + ), sa.Column("total_items", sa.Integer(), nullable=False), sa.Column("score", sa.JSON(), nullable=True), sa.Column("error_message", sa.Text(), nullable=True), diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index af08d8260..dd4dfe19d 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -17,7 +17,7 @@ create_evaluation_dataset, get_dataset_by_id, list_datasets, - upload_csv_to_s3, + upload_csv_to_object_store, ) from app.crud.evaluation_dataset import ( delete_dataset as delete_dataset_crud, @@ -101,7 +101,7 @@ async def upload_dataset( This endpoint: 1. Sanitizes the dataset name (removes spaces, special characters) 2. Validates and parses the CSV file - 3. Uploads CSV to AWS S3 (if credentials configured) + 3. Uploads CSV to object store (if credentials configured) 4. Uploads dataset to Langfuse (for immediate use) 5. 
Stores metadata in database @@ -125,7 +125,7 @@ async def upload_dataset( ``` Returns: - DatasetUploadResponse with dataset_id, s3_url, and Langfuse details + DatasetUploadResponse with dataset_id, object_store_url, and Langfuse details (dataset_name in response will be the sanitized version) """ # Sanitize dataset name for Langfuse compatibility @@ -189,24 +189,29 @@ async def upload_dataset( logger.error(f"Failed to parse CSV: {e}", exc_info=True) raise HTTPException(status_code=422, detail=f"Invalid CSV file: {e}") - # Step 2: Upload to AWS S3 (if credentials configured) - s3_url = None + # Step 2: Upload to object store (if credentials configured) + object_store_url = None try: storage = get_cloud_storage( session=_session, project_id=_current_user.project_id ) - s3_url = upload_csv_to_s3( + object_store_url = upload_csv_to_object_store( storage=storage, csv_content=csv_content, dataset_name=dataset_name ) - if s3_url: - logger.info(f"Successfully uploaded CSV to S3: {s3_url}") + if object_store_url: + logger.info( + f"Successfully uploaded CSV to object store: {object_store_url}" + ) else: - logger.info("S3 upload returned None, continuing without S3 storage") + logger.info( + "Object store upload returned None, continuing without object store storage" + ) except Exception as e: logger.warning( - f"Failed to upload CSV to S3 (continuing without S3): {e}", exc_info=True + f"Failed to upload CSV to object store (continuing without object store): {e}", + exc_info=True, ) - s3_url = None + object_store_url = None # Step 3: Upload to Langfuse langfuse_dataset_id = None @@ -261,7 +266,7 @@ async def upload_dataset( name=dataset_name, description=description, dataset_metadata=metadata, - s3_url=s3_url, + object_store_url=object_store_url, langfuse_dataset_id=langfuse_dataset_id, organization_id=_current_user.organization_id, project_id=_current_user.project_id, @@ -280,7 +285,7 @@ async def upload_dataset( original_items=original_items_count, 
duplication_factor=duplication_factor, langfuse_dataset_id=langfuse_dataset_id, - s3_url=s3_url, + object_store_url=object_store_url, ) except IntegrityError as e: @@ -343,7 +348,7 @@ async def list_datasets_endpoint( "duplication_factor", 1 ), langfuse_dataset_id=dataset.langfuse_dataset_id, - s3_url=dataset.s3_url, + object_store_url=dataset.object_store_url, ) ) @@ -390,7 +395,7 @@ async def get_dataset( original_items=dataset.dataset_metadata.get("original_items_count", 0), duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), langfuse_dataset_id=dataset.langfuse_dataset_id, - s3_url=dataset.s3_url, + object_store_url=dataset.object_store_url, ) @@ -403,7 +408,7 @@ async def delete_dataset( """ Delete a dataset by ID. - This will remove the dataset record from the database. The CSV file in S3 + This will remove the dataset record from the database. The CSV file in object store (if exists) will remain for audit purposes, but the dataset will no longer be accessible for creating new evaluations. @@ -536,7 +541,7 @@ async def evaluate_threads( logger.info( f"Found dataset: id={dataset.id}, name={dataset.name}, " - f"s3_url={'present' if dataset.s3_url else 'None'}, " + f"object_store_url={'present' if dataset.object_store_url else 'None'}, " f"langfuse_id={dataset.langfuse_dataset_id}" ) diff --git a/backend/app/core/storage_utils.py b/backend/app/core/storage_utils.py new file mode 100644 index 000000000..63830d7d0 --- /dev/null +++ b/backend/app/core/storage_utils.py @@ -0,0 +1,167 @@ +""" +Shared storage utilities for uploading files to object store. + +This module provides common functions for uploading various file types +to cloud object storage, abstracting away provider-specific details. 
+"""
+
+import io
+import json
+import logging
+from datetime import datetime
+from io import BytesIO
+from pathlib import Path
+
+from starlette.datastructures import Headers, UploadFile
+
+from app.core.cloud.storage import CloudStorage, CloudStorageError
+
+logger = logging.getLogger(__name__)
+
+
+def upload_csv_to_object_store(
+    storage: CloudStorage,
+    csv_content: bytes,
+    filename: str,
+    subdirectory: str = "datasets",
+) -> str | None:
+    """
+    Upload CSV content to object store.
+
+    Args:
+        storage: CloudStorage instance
+        csv_content: Raw CSV content as bytes
+        filename: Name of the file (can include timestamp)
+        subdirectory: Subdirectory path in object store (default: "datasets")
+
+    Returns:
+        Object store URL as string if successful, None if failed
+
+    Note:
+        This function handles errors gracefully and returns None on failure.
+        Callers should continue without object store URL when this returns None.
+    """
+    logger.info(
+        f"[upload_csv_to_object_store] Preparing to upload '{filename}' | "
+        f"size={len(csv_content)} bytes, subdirectory='{subdirectory}'"
+    )
+
+    try:
+        # Create file path
+        file_path = Path(subdirectory) / filename
+
+        # Create a mock UploadFile-like object for the storage put method
+        class CSVFile:
+            def __init__(self, content: bytes):
+                self.file = io.BytesIO(content)
+                self.content_type = "text/csv"
+
+        csv_file = CSVFile(csv_content)
+
+        # Upload to object store
+        destination = storage.put(source=csv_file, file_path=file_path)
+        object_store_url = str(destination)
+
+        logger.info(
+            f"[upload_csv_to_object_store] Upload successful | "
+            f"filename='{filename}', url='{object_store_url}'"
+        )
+        return object_store_url
+
+    except CloudStorageError as e:
+        logger.warning(
+            f"[upload_csv_to_object_store] Upload failed for '{filename}': {e}. "
+            "Continuing without object store storage."
+        )
+        return None
+    except Exception as e:
+        logger.warning(
+            f"[upload_csv_to_object_store] Unexpected error uploading '{filename}': {e}. 
"
+            "Continuing without object store storage.",
+            exc_info=True,
+        )
+        return None
+
+
+def upload_jsonl_to_object_store(
+    storage: CloudStorage,
+    results: list[dict],
+    filename: str,
+    subdirectory: str,
+) -> str | None:
+    """
+    Upload JSONL (JSON Lines) content to object store.
+
+    Args:
+        storage: CloudStorage instance
+        results: List of dictionaries to be converted to JSONL
+        filename: Name of the file
+        subdirectory: Subdirectory path in object store (e.g., "evaluation/batch-123")
+
+    Returns:
+        Object store URL as string if successful, None if failed
+
+    Note:
+        This function handles errors gracefully and returns None on failure.
+        Callers should continue without object store URL when this returns None.
+    """
+    logger.info(
+        f"[upload_jsonl_to_object_store] Preparing to upload '{filename}' | "
+        f"items={len(results)}, subdirectory='{subdirectory}'"
+    )
+
+    try:
+        # Create file path
+        file_path = Path(subdirectory) / filename
+
+        # Convert results to JSONL
+        jsonl_content = "\n".join([json.dumps(result) for result in results])
+        content_bytes = jsonl_content.encode("utf-8")
+
+        # Create UploadFile-like object
+        headers = Headers({"content-type": "application/jsonl"})
+        upload_file = UploadFile(
+            filename=filename,
+            file=BytesIO(content_bytes),
+            headers=headers,
+        )
+
+        # Upload to object store
+        destination = storage.put(source=upload_file, file_path=file_path)
+        object_store_url = str(destination)
+
+        logger.info(
+            f"[upload_jsonl_to_object_store] Upload successful | "
+            f"filename='{filename}', url='{object_store_url}', "
+            f"size={len(content_bytes)} bytes"
+        )
+        return object_store_url
+
+    except CloudStorageError as e:
+        logger.warning(
+            f"[upload_jsonl_to_object_store] Upload failed for '{filename}': {e}. "
+            "Continuing without object store storage."
+        )
+        return None
+    except Exception as e:
+        logger.warning(
+            f"[upload_jsonl_to_object_store] Unexpected error uploading '{filename}': {e}. 
" + "Continuing without object store storage.", + exc_info=True, + ) + return None + + +def generate_timestamped_filename(base_name: str, extension: str = "csv") -> str: + """ + Generate a filename with timestamp. + + Args: + base_name: Base name for the file (e.g., "dataset_name" or "batch-123") + extension: File extension without dot (default: "csv") + + Returns: + Filename with timestamp (e.g., "dataset_name_20250114_153045.csv") + """ + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{base_name}_{timestamp}.{extension}" diff --git a/backend/app/crud/batch_operations.py b/backend/app/crud/batch_operations.py index 71998f816..b0806efaf 100644 --- a/backend/app/crud/batch_operations.py +++ b/backend/app/crud/batch_operations.py @@ -1,14 +1,13 @@ """Generic batch operations orchestrator.""" -import json import logging -from io import BytesIO from typing import Any from sqlmodel import Session from app.core.batch.provider_interface import BatchProvider -from app.core.cloud.storage import AmazonCloudStorageClient, SimpleStorageName +from app.core.cloud import get_cloud_storage +from app.core.storage_utils import upload_jsonl_to_object_store as shared_upload_jsonl from app.crud.batch_job import ( create_batch_job, update_batch_job, @@ -209,21 +208,21 @@ def process_completed_batch( session: Session, provider: BatchProvider, batch_job: BatchJob, - upload_to_s3: bool = True, + upload_to_object_store: bool = True, ) -> tuple[list[dict[str, Any]], str | None]: """ - Process a completed batch: download results and optionally upload to S3. + Process a completed batch: download results and optionally upload to object store. 
Args: session: Database session provider: BatchProvider instance batch_job: BatchJob object - upload_to_s3: Whether to upload raw results to S3 + upload_to_object_store: Whether to upload raw results to object store Returns: - Tuple of (results, s3_url) + Tuple of (results, object_store_url) - results: List of result dictionaries - - s3_url: S3 URL if uploaded, None otherwise + - object_store_url: Object store URL if uploaded, None otherwise Raises: Exception: If processing fails @@ -234,87 +233,86 @@ def process_completed_batch( # Download results results = download_batch_results(provider=provider, batch_job=batch_job) - # Upload to S3 if requested - s3_url = None - if upload_to_s3: + # Upload to object store if requested + object_store_url = None + if upload_to_object_store: try: - s3_url = upload_batch_results_to_s3( - batch_job=batch_job, results=results + object_store_url = upload_batch_results_to_object_store( + session=session, batch_job=batch_job, results=results ) - logger.info(f"Uploaded batch results to S3: {s3_url}") - except Exception as s3_error: + logger.info( + f"Uploaded batch results to object store: {object_store_url}" + ) + except Exception as store_error: logger.warning( - f"S3 upload failed (AWS credentials may not be configured): {s3_error}. " - f"Continuing without S3 storage.", + f"Object store upload failed (credentials may not be configured): {store_error}. 
" + f"Continuing without object store storage.", exc_info=True, ) - # Update batch_job with S3 URL - if s3_url: - batch_job_update = BatchJobUpdate(raw_output_url=s3_url) + # Update batch_job with object store URL + if object_store_url: + batch_job_update = BatchJobUpdate(raw_output_url=object_store_url) update_batch_job( session=session, batch_job=batch_job, batch_job_update=batch_job_update ) - return results, s3_url + return results, object_store_url except Exception as e: logger.error(f"Failed to process completed batch: {e}", exc_info=True) raise -def upload_batch_results_to_s3( - batch_job: BatchJob, results: list[dict[str, Any]] -) -> str: +def upload_batch_results_to_object_store( + session: Session, batch_job: BatchJob, results: list[dict[str, Any]] +) -> str | None: """ - Upload batch results to S3. + Upload batch results to object store. + + This function uses the shared storage utility for consistent upload behavior. Args: + session: Database session (for getting cloud storage) batch_job: BatchJob object results: List of result dictionaries Returns: - S3 URL + Object store URL if successful, None if failed Raises: Exception: If upload fails """ - logger.info(f"Uploading batch results to S3 for batch_job {batch_job.id}") + logger.info(f"Uploading batch results to object store for batch_job {batch_job.id}") try: - # Create S3 key path + # Get cloud storage instance + storage = get_cloud_storage(session=session, project_id=batch_job.project_id) + + # Define subdirectory and filename # Format: {job_type}/batch-{id}/results.jsonl - s3_key = f"{batch_job.job_type}/batch-{batch_job.id}/results.jsonl" - - # Convert results to JSONL - jsonl_content = "\n".join([json.dumps(result) for result in results]) - content_bytes = jsonl_content.encode("utf-8") - file_like = BytesIO(content_bytes) - - # Upload to S3 - aws_client = AmazonCloudStorageClient() - aws_client.client.upload_fileobj( - file_like, - Bucket=aws_client.client._client_config.__dict__.get( - "bucket", 
"kaapi-storage" - ), - Key=s3_key, - ExtraArgs={"ContentType": "application/jsonl"}, + subdirectory = f"{batch_job.job_type}/batch-{batch_job.id}" + filename = "results.jsonl" + + # Use shared utility for upload + object_store_url = shared_upload_jsonl( + storage=storage, + results=results, + filename=filename, + subdirectory=subdirectory, ) - # Construct S3 URL - storage_name = SimpleStorageName(Key=s3_key) - s3_url = str(storage_name) + return object_store_url - logger.info( - f"Successfully uploaded batch results to S3: {s3_url} ({len(content_bytes)} bytes)" + except Exception as e: + logger.error( + f"Failed to upload batch results to object store: {e}", exc_info=True ) + raise - return s3_url - except Exception as e: - logger.error(f"Failed to upload batch results to S3: {e}", exc_info=True) - raise +# Backward compatibility alias +upload_batch_results_to_s3 = upload_batch_results_to_object_store # NOTE: Batch-level polling has been removed from this module. diff --git a/backend/app/crud/evaluation_dataset.py b/backend/app/crud/evaluation_dataset.py index c3dfc22c4..f00175020 100644 --- a/backend/app/crud/evaluation_dataset.py +++ b/backend/app/crud/evaluation_dataset.py @@ -9,17 +9,19 @@ """ import logging - -from pathlib import Path from typing import Any + from sqlmodel import Session, select -from datetime import datetime -from app.core.cloud.storage import CloudStorage, CloudStorageError +from app.core.cloud.storage import CloudStorage +from app.core.storage_utils import ( + generate_timestamped_filename, +) +from app.core.storage_utils import ( + upload_csv_to_object_store as shared_upload_csv, +) from app.core.util import now -from app.models import EvaluationDataset -from app.models import EvaluationRun - +from app.models import EvaluationDataset, EvaluationRun logger = logging.getLogger(__name__) @@ -31,7 +33,7 @@ def create_evaluation_dataset( organization_id: int, project_id: int, description: str | None = None, - s3_url: str | None = None, + 
object_store_url: str | None = None, langfuse_dataset_id: str | None = None, ) -> EvaluationDataset: """ @@ -45,7 +47,7 @@ def create_evaluation_dataset( organization_id: Organization ID project_id: Project ID description: Optional dataset description - s3_url: Optional AWS S3 URL where CSV is stored + object_store_url: Optional object store URL where CSV is stored langfuse_dataset_id: Optional Langfuse dataset ID Returns: @@ -55,7 +57,7 @@ def create_evaluation_dataset( name=name, description=description, dataset_metadata=dataset_metadata, - s3_url=s3_url, + object_store_url=object_store_url, langfuse_dataset_id=langfuse_dataset_id, organization_id=organization_id, project_id=project_id, @@ -185,13 +187,16 @@ def list_datasets( return list(datasets) -def upload_csv_to_s3( +def upload_csv_to_object_store( storage: CloudStorage, csv_content: bytes, dataset_name: str, ) -> str | None: """ - Upload CSV file to AWS S3. + Upload CSV file to object store. + + This is a wrapper around the shared storage utility function, + providing dataset-specific file naming. Args: storage: CloudStorage instance @@ -199,79 +204,68 @@ def upload_csv_to_s3( dataset_name: Name of the dataset (used for file naming) Returns: - S3 URL as string if successful, None if failed + Object store URL as string if successful, None if failed Note: This function handles errors gracefully and returns None on failure. - Callers should continue without S3 URL when this returns None. + Callers should continue without object store URL when this returns None. 
""" - try: - # Create a file path for the CSV - # Format: datasets/{dataset_name}_{timestamp}.csv - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - file_path = Path(f"datasets/{dataset_name}_{timestamp}.csv") - - # Create a mock UploadFile-like object for the storage put method - import io - - class CSVFile: - def __init__(self, content: bytes): - self.file = io.BytesIO(content) - self.content_type = "text/csv" - - csv_file = CSVFile(csv_content) - - # Upload to S3 - destination = storage.put(source=csv_file, file_path=file_path) - s3_url = str(destination) + # Generate timestamped filename + filename = generate_timestamped_filename(dataset_name, extension="csv") + + # Use shared utility for upload + return shared_upload_csv( + storage=storage, + csv_content=csv_content, + filename=filename, + subdirectory="datasets", + ) - logger.info(f"Successfully uploaded CSV to S3: {s3_url}") - return s3_url - except CloudStorageError as e: - logger.warning( - f"Failed to upload CSV to S3 for dataset '{dataset_name}': {e}. " - "Continuing without S3 storage." - ) - return None - except Exception as e: - logger.warning( - f"Unexpected error uploading CSV to S3 for dataset '{dataset_name}': {e}. " - "Continuing without S3 storage.", - exc_info=True, - ) - return None +# Backward compatibility alias +upload_csv_to_s3 = upload_csv_to_object_store -def download_csv_from_s3(storage: CloudStorage, s3_url: str) -> bytes: +def download_csv_from_object_store( + storage: CloudStorage, object_store_url: str +) -> bytes: """ - Download CSV file from AWS S3. + Download CSV file from object store. 
Args: storage: CloudStorage instance - s3_url: S3 URL of the CSV file + object_store_url: Object store URL of the CSV file Returns: CSV content as bytes Raises: CloudStorageError: If download fails - ValueError: If s3_url is None or empty + ValueError: If object_store_url is None or empty """ - if not s3_url: - raise ValueError("s3_url cannot be None or empty") + if not object_store_url: + raise ValueError("object_store_url cannot be None or empty") try: - logger.info(f"Downloading CSV from S3: {s3_url}") - body = storage.stream(s3_url) + logger.info(f"Downloading CSV from object store: {object_store_url}") + body = storage.stream(object_store_url) csv_content = body.read() - logger.info(f"Successfully downloaded CSV from S3: {len(csv_content)} bytes") + logger.info( + f"Successfully downloaded CSV from object store: {len(csv_content)} bytes" + ) return csv_content except Exception as e: - logger.error(f"Failed to download CSV from S3: {s3_url}: {e}", exc_info=True) + logger.error( + f"Failed to download CSV from object store: {object_store_url}: {e}", + exc_info=True, + ) raise +# Backward compatibility alias +download_csv_from_s3 = download_csv_from_object_store + + def update_dataset_langfuse_id( session: Session, dataset_id: int, langfuse_dataset_id: str ) -> None: @@ -306,7 +300,7 @@ def delete_dataset( """ Delete an evaluation dataset by ID. - This performs a hard delete from the database. The CSV file in S3 (if exists) + This performs a hard delete from the database. The CSV file in object store (if exists) will remain for audit purposes. 
Args: diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index 1851a7e37..c4b5c92ef 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -21,7 +21,10 @@ from app.core.batch.openai_provider import OpenAIBatchProvider from app.core.util import configure_langfuse, configure_openai, now from app.crud.batch_job import get_batch_job -from app.crud.batch_operations import download_batch_results, upload_batch_results_to_s3 +from app.crud.batch_operations import ( + download_batch_results, + upload_batch_results_to_object_store, +) from app.crud.credentials import get_provider_credential from app.crud.evaluation_batch import fetch_dataset_items from app.crud.evaluation_embeddings import ( @@ -198,14 +201,14 @@ async def process_completed_evaluation( provider = OpenAIBatchProvider(client=openai_client) raw_results = download_batch_results(provider=provider, batch_job=batch_job) - # Step 2a: Upload raw results to S3 for evaluation_run - s3_url = None + # Step 2a: Upload raw results to object store for evaluation_run + object_store_url = None try: - s3_url = upload_batch_results_to_s3( - batch_job=batch_job, results=raw_results + object_store_url = upload_batch_results_to_object_store( + session=session, batch_job=batch_job, results=raw_results ) - except Exception as s3_error: - logger.warning(f"{log_prefix} S3 upload failed: {s3_error}") + except Exception as store_error: + logger.warning(f"{log_prefix} Object store upload failed: {store_error}") # Step 3: Fetch dataset items (needed for matching ground truth) logger.info( @@ -231,9 +234,9 @@ async def process_completed_evaluation( results=results, ) - # Store S3 URL in database - if s3_url: - eval_run.s3_url = s3_url + # Store object store URL in database + if object_store_url: + eval_run.object_store_url = object_store_url session.add(eval_run) session.commit() diff --git a/backend/app/models/evaluation.py 
b/backend/app/models/evaluation.py index 7750c01a2..c64d92795 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -33,7 +33,9 @@ class DatasetUploadResponse(BaseModel): langfuse_dataset_id: str | None = Field( None, description="Langfuse dataset ID if available" ) - s3_url: str | None = Field(None, description="AWS S3 URL if uploaded") + object_store_url: str | None = Field( + None, description="Object store URL if uploaded" + ) class EvaluationResult(BaseModel): @@ -94,8 +96,8 @@ class EvaluationDataset(SQLModel, table=True): ) # Storage references - s3_url: str | None = SQLField( - default=None, description="AWS S3 URL where CSV is stored" + object_store_url: str | None = SQLField( + default=None, description="Object store URL where CSV is stored" ) langfuse_dataset_id: str | None = SQLField( default=None, description="Langfuse dataset ID for reference" @@ -169,9 +171,9 @@ class EvaluationRun(SQLModel, table=True): default="pending", description="Overall evaluation status: pending, processing, completed, failed", ) - s3_url: str | None = SQLField( + object_store_url: str | None = SQLField( default=None, - description="S3 URL of processed evaluation results for future reference", + description="Object store URL of processed evaluation results for future reference", ) total_items: int = SQLField( default=0, description="Total number of items evaluated (set during processing)" @@ -246,7 +248,7 @@ class EvaluationRunPublic(SQLModel): batch_job_id: int | None embedding_batch_job_id: int | None status: str - s3_url: str | None + object_store_url: str | None total_items: int score: dict[str, Any] | None error_message: str | None @@ -268,7 +270,9 @@ class EvaluationDatasetCreate(SQLModel): "duplication_factor)" ), ) - s3_url: str | None = Field(None, description="AWS S3 URL where CSV is stored") + object_store_url: str | None = Field( + None, description="Object store URL where CSV is stored" + ) langfuse_dataset_id: str | None = 
Field( None, description="Langfuse dataset ID for reference" ) @@ -281,7 +285,7 @@ class EvaluationDatasetPublic(SQLModel): name: str description: str | None dataset_metadata: dict[str, Any] - s3_url: str | None + object_store_url: str | None langfuse_dataset_id: str | None organization_id: int project_id: int diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index e8cb3cb55..a257af36f 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -53,7 +53,9 @@ def test_upload_dataset_valid_csv( """Test uploading a valid CSV file.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch( "app.api.routes.evaluation.configure_langfuse" ) as mock_configure_langfuse, @@ -61,8 +63,8 @@ def test_upload_dataset_valid_csv( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): - # Mock S3 upload - mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + # Mock object store upload + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" # Mock Langfuse configuration mock_configure_langfuse.return_value = (None, True) @@ -91,11 +93,11 @@ def test_upload_dataset_valid_csv( assert data["total_items"] == 9 # 3 items * 3 duplication assert data["duplication_factor"] == 3 assert data["langfuse_dataset_id"] == "test_dataset_id" - assert data["s3_url"] == "s3://bucket/datasets/test_dataset.csv" + assert data["object_store_url"] == "s3://bucket/datasets/test_dataset.csv" assert "dataset_id" in data - # Verify S3 upload was called - mock_s3_upload.assert_called_once() + # Verify object store upload was called + mock_store_upload.assert_called_once() # Verify Langfuse upload was called 
mock_langfuse_upload.assert_called_once() @@ -135,7 +137,9 @@ def test_upload_dataset_empty_rows( """Test uploading CSV with empty rows (should skip them).""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch( "app.api.routes.evaluation.configure_langfuse" ) as mock_configure_langfuse, @@ -143,8 +147,8 @@ def test_upload_dataset_empty_rows( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): - # Mock S3 and Langfuse uploads - mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + # Mock object store and Langfuse uploads + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 4) @@ -177,7 +181,9 @@ def test_upload_with_default_duplication( """Test uploading with default duplication factor (5).""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch( "app.api.routes.evaluation.configure_langfuse" ) as mock_configure_langfuse, @@ -185,7 +191,7 @@ def test_upload_with_default_duplication( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): - mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 15) @@ -214,7 +220,9 @@ def test_upload_with_custom_duplication( """Test uploading with custom duplication factor.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - 
patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch( "app.api.routes.evaluation.configure_langfuse" ) as mock_configure_langfuse, @@ -222,7 +230,7 @@ def test_upload_with_custom_duplication( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): - mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 30) @@ -251,7 +259,9 @@ def test_upload_with_description( """Test uploading with a description.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch( "app.api.routes.evaluation.configure_langfuse" ) as mock_configure_langfuse, @@ -259,7 +269,7 @@ def test_upload_with_description( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): - mock_s3_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_configure_langfuse.return_value = (None, True) mock_langfuse_upload.return_value = ("test_dataset_id", 9) @@ -299,11 +309,13 @@ def test_upload_langfuse_configuration_fails( """Test when Langfuse client configuration fails.""" with ( patch("app.core.cloud.get_cloud_storage") as _mock_storage, - patch("app.api.routes.evaluation.upload_csv_to_s3") as mock_s3_upload, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, patch("app.crud.credentials.get_provider_credential") as mock_get_cred, ): - # Mock S3 upload succeeds - mock_s3_upload.return_value = 
"s3://bucket/datasets/test_dataset.csv" + # Mock object store upload succeeds + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" # Mock Langfuse credentials not found mock_get_cred.return_value = None diff --git a/backend/app/tests/crud/test_evaluation_dataset.py b/backend/app/tests/crud/test_evaluation_dataset.py index 243fa2b37..82a8d1d74 100644 --- a/backend/app/tests/crud/test_evaluation_dataset.py +++ b/backend/app/tests/crud/test_evaluation_dataset.py @@ -10,12 +10,12 @@ from app.core.cloud.storage import CloudStorageError from app.crud.evaluation_dataset import ( create_evaluation_dataset, - download_csv_from_s3, + download_csv_from_object_store, get_dataset_by_id, get_dataset_by_name, list_datasets, update_dataset_langfuse_id, - upload_csv_to_s3, + upload_csv_to_object_store, ) from app.models import Organization, Project @@ -46,7 +46,7 @@ def test_create_evaluation_dataset_minimal(self, db: Session): assert dataset.organization_id == org.id assert dataset.project_id == project.id assert dataset.description is None - assert dataset.s3_url is None + assert dataset.object_store_url is None assert dataset.langfuse_dataset_id is None def test_create_evaluation_dataset_complete(self, db: Session): @@ -66,7 +66,7 @@ def test_create_evaluation_dataset_complete(self, db: Session): "total_items_count": 25, "duplication_factor": 5, }, - s3_url="s3://bucket/datasets/complete_dataset.csv", + object_store_url="s3://bucket/datasets/complete_dataset.csv", langfuse_dataset_id="langfuse_123", organization_id=org.id, project_id=project.id, @@ -76,7 +76,7 @@ def test_create_evaluation_dataset_complete(self, db: Session): assert dataset.name == "complete_dataset" assert dataset.description == "A complete test dataset" assert dataset.dataset_metadata["duplication_factor"] == 5 - assert dataset.s3_url == "s3://bucket/datasets/complete_dataset.csv" + assert dataset.object_store_url == "s3://bucket/datasets/complete_dataset.csv" assert 
dataset.langfuse_dataset_id == "langfuse_123" assert dataset.inserted_at is not None assert dataset.updated_at is not None @@ -288,84 +288,88 @@ def test_list_datasets_pagination(self, db: Session): assert len(set(page1_names) & set(page2_names)) == 0 -class TestUploadCsvToS3: - """Test CSV upload to S3.""" +class TestUploadCsvToObjectStore: + """Test CSV upload to object store.""" - def test_upload_csv_to_s3_success(self): - """Test successful S3 upload.""" + def test_upload_csv_to_object_store_success(self): + """Test successful object store upload.""" mock_storage = MagicMock() mock_storage.put.return_value = "s3://bucket/datasets/test_dataset.csv" csv_content = b"question,answer\nWhat is 2+2?,4\n" - s3_url = upload_csv_to_s3( + object_store_url = upload_csv_to_object_store( storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" ) - assert s3_url == "s3://bucket/datasets/test_dataset.csv" + assert object_store_url == "s3://bucket/datasets/test_dataset.csv" mock_storage.put.assert_called_once() - def test_upload_csv_to_s3_cloud_storage_error(self): - """Test S3 upload with CloudStorageError.""" + def test_upload_csv_to_object_store_cloud_storage_error(self): + """Test object store upload with CloudStorageError.""" mock_storage = MagicMock() - mock_storage.put.side_effect = CloudStorageError("S3 bucket not found") + mock_storage.put.side_effect = CloudStorageError( + "Object store bucket not found" + ) csv_content = b"question,answer\nWhat is 2+2?,4\n" # Should return None on error - s3_url = upload_csv_to_s3( + object_store_url = upload_csv_to_object_store( storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" ) - assert s3_url is None + assert object_store_url is None - def test_upload_csv_to_s3_unexpected_error(self): - """Test S3 upload with unexpected error.""" + def test_upload_csv_to_object_store_unexpected_error(self): + """Test object store upload with unexpected error.""" mock_storage = MagicMock() 
mock_storage.put.side_effect = Exception("Unexpected error") csv_content = b"question,answer\nWhat is 2+2?,4\n" # Should return None on error - s3_url = upload_csv_to_s3( + object_store_url = upload_csv_to_object_store( storage=mock_storage, csv_content=csv_content, dataset_name="test_dataset" ) - assert s3_url is None + assert object_store_url is None -class TestDownloadCsvFromS3: - """Test CSV download from S3.""" +class TestDownloadCsvFromObjectStore: + """Test CSV download from object store.""" - def test_download_csv_from_s3_success(self): - """Test successful S3 download.""" + def test_download_csv_from_object_store_success(self): + """Test successful object store download.""" mock_storage = MagicMock() mock_body = MagicMock() mock_body.read.return_value = b"question,answer\nWhat is 2+2?,4\n" mock_storage.stream.return_value = mock_body - csv_content = download_csv_from_s3( - storage=mock_storage, s3_url="s3://bucket/datasets/test.csv" + csv_content = download_csv_from_object_store( + storage=mock_storage, object_store_url="s3://bucket/datasets/test.csv" ) assert csv_content == b"question,answer\nWhat is 2+2?,4\n" mock_storage.stream.assert_called_once_with("s3://bucket/datasets/test.csv") - def test_download_csv_from_s3_empty_url(self): + def test_download_csv_from_object_store_empty_url(self): """Test download with empty URL.""" mock_storage = MagicMock() - with pytest.raises(ValueError, match="s3_url cannot be None or empty"): - download_csv_from_s3(storage=mock_storage, s3_url=None) + with pytest.raises( + ValueError, match="object_store_url cannot be None or empty" + ): + download_csv_from_object_store(storage=mock_storage, object_store_url=None) - def test_download_csv_from_s3_error(self): + def test_download_csv_from_object_store_error(self): """Test download with storage error.""" mock_storage = MagicMock() - mock_storage.stream.side_effect = Exception("S3 download failed") + mock_storage.stream.side_effect = Exception("Object store download failed") 
- with pytest.raises(Exception, match="S3 download failed"): - download_csv_from_s3( - storage=mock_storage, s3_url="s3://bucket/datasets/test.csv" + with pytest.raises(Exception, match="Object store download failed"): + download_csv_from_object_store( + storage=mock_storage, object_store_url="s3://bucket/datasets/test.csv" ) From 0fd0842c89fa62f98e4815d22a8f4ab3bcc6ac4c Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 14:18:24 +0530 Subject: [PATCH 42/64] optimizing cosine similarities --- backend/app/api/routes/evaluation.py | 62 +++------ .../app/celery/tasks/evaluation_polling.py | 1 + backend/app/crud/evaluation.py | 131 +++++++++++++++++- backend/app/crud/evaluation_embeddings.py | 10 -- backend/app/crud/evaluation_processing.py | 2 - .../tests/crud/test_evaluation_embeddings.py | 7 +- 6 files changed, 152 insertions(+), 61 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index dd4dfe19d..f7280e19f 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -5,13 +5,18 @@ from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile from sqlalchemy.exc import IntegrityError -from sqlmodel import Session, select +from sqlmodel import Session from app.api.deps import get_current_user_org_project, get_db from app.core.cloud import get_cloud_storage -from app.core.util import configure_langfuse, configure_openai, now +from app.core.util import configure_langfuse, configure_openai from app.crud.assistants import get_assistant_by_id from app.crud.credentials import get_provider_credential +from app.crud.evaluation import ( + create_evaluation_run, + get_evaluation_run_by_id, + list_evaluation_runs as list_evaluation_runs_crud, +) from app.crud.evaluation_batch import start_evaluation_batch from app.crud.evaluation_dataset import ( create_evaluation_dataset, @@ -23,7 +28,7 @@ delete_dataset as delete_dataset_crud, ) from 
app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv -from app.models import EvaluationRun, UserProjectOrg +from app.models import UserProjectOrg from app.models.evaluation import ( DatasetUploadResponse, EvaluationRunPublic, @@ -443,7 +448,7 @@ async def delete_dataset( @router.post("/evaluations", response_model=EvaluationRunPublic) -async def evaluate_threads( +async def evaluate( dataset_id: int = Body(..., description="ID of the evaluation dataset"), experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" @@ -632,24 +637,16 @@ async def evaluate_threads( ) # Create EvaluationRun record - eval_run = EvaluationRun( + eval_run = create_evaluation_run( + session=_session, run_name=experiment_name, dataset_name=dataset_name, dataset_id=dataset_id, config=config, - status="pending", organization_id=_current_user.organization_id, project_id=_current_user.project_id, - inserted_at=now(), - updated_at=now(), ) - _session.add(eval_run) - _session.commit() - _session.refresh(eval_run) - - logger.info(f"Created EvaluationRun record: id={eval_run.id}") - # Start the batch evaluation try: eval_run = start_evaluation_batch( @@ -699,21 +696,14 @@ async def list_evaluation_runs( f"project_id={_current_user.project_id} (limit={limit}, offset={offset})" ) - statement = ( - select(EvaluationRun) - .where(EvaluationRun.organization_id == _current_user.organization_id) - .where(EvaluationRun.project_id == _current_user.project_id) - .order_by(EvaluationRun.inserted_at.desc()) - .limit(limit) - .offset(offset) + return list_evaluation_runs_crud( + session=_session, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + limit=limit, + offset=offset, ) - runs = _session.exec(statement).all() - - logger.info(f"Found {len(runs)} evaluation runs") - - return list(runs) - @router.get("/evaluations/{evaluation_id}", response_model=EvaluationRunPublic) async def get_evaluation_run_status( @@ -736,16 
+726,13 @@ async def get_evaluation_run_status( f"project_id={_current_user.project_id})" ) - # Query the evaluation run - statement = ( - select(EvaluationRun) - .where(EvaluationRun.id == evaluation_id) - .where(EvaluationRun.organization_id == _current_user.organization_id) - .where(EvaluationRun.project_id == _current_user.project_id) + eval_run = get_evaluation_run_by_id( + session=_session, + evaluation_id=evaluation_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, ) - eval_run = _session.exec(statement).first() - if not eval_run: raise HTTPException( status_code=404, @@ -755,9 +742,4 @@ async def get_evaluation_run_status( ), ) - logger.info( - f"Found evaluation run {evaluation_id}: status={eval_run.status}, " - f"batch_job_id={eval_run.batch_job_id}" - ) - return eval_run diff --git a/backend/app/celery/tasks/evaluation_polling.py b/backend/app/celery/tasks/evaluation_polling.py index d5ccad192..6d7879e76 100644 --- a/backend/app/celery/tasks/evaluation_polling.py +++ b/backend/app/celery/tasks/evaluation_polling.py @@ -86,6 +86,7 @@ def poll_evaluation_batches_task(self): f"[poll_evaluation_batches] Error processing org_id={org.id}: {e}", exc_info=True, ) + session.rollback() results.append( {"org_id": org.id, "org_name": org.name, "error": str(e)} ) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluation.py index 49f3d1e5f..0e57df9c4 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluation.py @@ -3,11 +3,11 @@ import logging from langfuse import Langfuse -from sqlmodel import Session +from sqlmodel import Session, select -from app.core.util import configure_langfuse, configure_openai +from app.core.util import configure_langfuse, configure_openai, now from app.crud.credentials import get_provider_credential -from app.models import UserProjectOrg +from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import DatasetUploadResponse logger = 
logging.getLogger(__name__) @@ -133,3 +133,128 @@ async def upload_dataset_to_langfuse( except Exception as e: logger.error(f"Error uploading dataset: {str(e)}", exc_info=True) return False, None, f"Failed to upload dataset: {str(e)}" + + +def create_evaluation_run( + session: Session, + run_name: str, + dataset_name: str, + dataset_id: int, + config: dict, + organization_id: int, + project_id: int, +) -> EvaluationRun: + """ + Create a new evaluation run record in the database. + + Args: + session: Database session + run_name: Name of the evaluation run/experiment + dataset_name: Name of the dataset being used + dataset_id: ID of the dataset + config: Configuration dict for the evaluation + organization_id: Organization ID + project_id: Project ID + + Returns: + The created EvaluationRun instance + """ + eval_run = EvaluationRun( + run_name=run_name, + dataset_name=dataset_name, + dataset_id=dataset_id, + config=config, + status="pending", + organization_id=organization_id, + project_id=project_id, + inserted_at=now(), + updated_at=now(), + ) + + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") + + return eval_run + + +def list_evaluation_runs( + session: Session, + organization_id: int, + project_id: int, + limit: int = 50, + offset: int = 0, +) -> list[EvaluationRun]: + """ + List all evaluation runs for an organization and project. 
+ + Args: + session: Database session + organization_id: Organization ID to filter by + project_id: Project ID to filter by + limit: Maximum number of runs to return (default 50) + offset: Number of runs to skip (for pagination) + + Returns: + List of EvaluationRun objects, ordered by most recent first + """ + statement = ( + select(EvaluationRun) + .where(EvaluationRun.organization_id == organization_id) + .where(EvaluationRun.project_id == project_id) + .order_by(EvaluationRun.inserted_at.desc()) + .limit(limit) + .offset(offset) + ) + + runs = session.exec(statement).all() + + logger.info( + f"Found {len(runs)} evaluation runs for org_id={organization_id}, " + f"project_id={project_id}" + ) + + return list(runs) + + +def get_evaluation_run_by_id( + session: Session, + evaluation_id: int, + organization_id: int, + project_id: int, +) -> EvaluationRun | None: + """ + Get a specific evaluation run by ID. + + Args: + session: Database session + evaluation_id: ID of the evaluation run + organization_id: Organization ID (for access control) + project_id: Project ID (for access control) + + Returns: + EvaluationRun if found and accessible, None otherwise + """ + statement = ( + select(EvaluationRun) + .where(EvaluationRun.id == evaluation_id) + .where(EvaluationRun.organization_id == organization_id) + .where(EvaluationRun.project_id == project_id) + ) + + eval_run = session.exec(statement).first() + + if eval_run: + logger.info( + f"Found evaluation run {evaluation_id}: status={eval_run.status}, " + f"batch_job_id={eval_run.batch_job_id}" + ) + else: + logger.warning( + f"Evaluation run {evaluation_id} not found or not accessible " + f"for org_id={organization_id}, project_id={project_id}" + ) + + return eval_run diff --git a/backend/app/crud/evaluation_embeddings.py b/backend/app/crud/evaluation_embeddings.py index 7f423c85e..77ea39251 100644 --- a/backend/app/crud/evaluation_embeddings.py +++ b/backend/app/crud/evaluation_embeddings.py @@ -265,8 +265,6 @@ def 
calculate_average_similarity( Dictionary with similarity statistics: { "cosine_similarity_avg": 0.87, - "cosine_similarity_min": 0.65, - "cosine_similarity_max": 0.98, "cosine_similarity_std": 0.12, "total_pairs": 50, "per_item_scores": [...] # Individual scores with trace_ids @@ -277,8 +275,6 @@ def calculate_average_similarity( if not embedding_pairs: return { "cosine_similarity_avg": 0.0, - "cosine_similarity_min": 0.0, - "cosine_similarity_max": 0.0, "cosine_similarity_std": 0.0, "total_pairs": 0, "per_item_scores": [], @@ -312,8 +308,6 @@ def calculate_average_similarity( logger.warning("No valid similarities calculated") return { "cosine_similarity_avg": 0.0, - "cosine_similarity_min": 0.0, - "cosine_similarity_max": 0.0, "cosine_similarity_std": 0.0, "total_pairs": 0, "per_item_scores": [], @@ -324,8 +318,6 @@ def calculate_average_similarity( stats = { "cosine_similarity_avg": float(np.mean(similarities_array)), - "cosine_similarity_min": float(np.min(similarities_array)), - "cosine_similarity_max": float(np.max(similarities_array)), "cosine_similarity_std": float(np.std(similarities_array)), "total_pairs": len(similarities), "per_item_scores": per_item_scores, @@ -333,8 +325,6 @@ def calculate_average_similarity( logger.info( f"Calculated similarity stats: avg={stats['cosine_similarity_avg']:.3f}, " - f"min={stats['cosine_similarity_min']:.3f}, " - f"max={stats['cosine_similarity_max']:.3f}, " f"std={stats['cosine_similarity_std']:.3f}" ) diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluation_processing.py index c4b5c92ef..3da0695e7 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluation_processing.py @@ -353,8 +353,6 @@ async def process_completed_embedding_batch( eval_run.score["cosine_similarity"] = { "avg": similarity_stats["cosine_similarity_avg"], - "min": similarity_stats["cosine_similarity_min"], - "max": similarity_stats["cosine_similarity_max"], "std": 
similarity_stats["cosine_similarity_std"], "total_pairs": similarity_stats["total_pairs"], } diff --git a/backend/app/tests/crud/test_evaluation_embeddings.py b/backend/app/tests/crud/test_evaluation_embeddings.py index 67bb709c4..7556fa7df 100644 --- a/backend/app/tests/crud/test_evaluation_embeddings.py +++ b/backend/app/tests/crud/test_evaluation_embeddings.py @@ -310,8 +310,6 @@ def test_calculate_average_similarity_basic(self): stats = calculate_average_similarity(embedding_pairs) assert stats["total_pairs"] == 3 - assert stats["cosine_similarity_min"] == pytest.approx(0.0) - assert stats["cosine_similarity_max"] == pytest.approx(1.0) # Average of [1.0, 0.0, 0.707] ≈ 0.569 assert stats["cosine_similarity_avg"] == pytest.approx(0.569, abs=0.01) assert "cosine_similarity_std" in stats @@ -325,8 +323,7 @@ def test_calculate_average_similarity_empty(self): assert stats["total_pairs"] == 0 assert stats["cosine_similarity_avg"] == 0.0 - assert stats["cosine_similarity_min"] == 0.0 - assert stats["cosine_similarity_max"] == 0.0 + assert stats["cosine_similarity_std"] == 0.0 assert stats["per_item_scores"] == [] def test_calculate_average_similarity_per_item_scores(self): @@ -382,8 +379,6 @@ def test_calculate_average_similarity_statistics(self): # Similarities = [1.0, 0.0, 1.0, 0.0] assert stats["cosine_similarity_avg"] == pytest.approx(0.5) - assert stats["cosine_similarity_min"] == pytest.approx(0.0) - assert stats["cosine_similarity_max"] == pytest.approx(1.0) # Standard deviation of [1, 0, 1, 0] = 0.5 assert stats["cosine_similarity_std"] == pytest.approx(0.5) assert stats["total_pairs"] == 4 From cd757bd086b38e5d9ccbd5d580bee1f9144e5507 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 16:23:42 +0530 Subject: [PATCH 43/64] added support for duplication factor limit --- backend/app/api/routes/evaluation.py | 14 ++- .../app/tests/api/routes/test_evaluation.py | 91 ++++++++++++++++++- 2 files changed, 100 insertions(+), 5 deletions(-) diff --git 
a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index f7280e19f..c03fd47e3 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -15,6 +15,8 @@ from app.crud.evaluation import ( create_evaluation_run, get_evaluation_run_by_id, +) +from app.crud.evaluation import ( list_evaluation_runs as list_evaluation_runs_crud, ) from app.crud.evaluation_batch import start_evaluation_batch @@ -95,7 +97,10 @@ async def upload_dataset( dataset_name: str = Form(..., description="Name for the dataset"), description: str | None = Form(None, description="Optional dataset description"), duplication_factor: int = Form( - default=5, description="Number of times to duplicate each item" + default=5, + ge=1, + le=5, + description="Number of times to duplicate each item (min: 1, max: 5)", ), _session: Session = Depends(get_db), _current_user: UserProjectOrg = Depends(get_current_user_org_project), @@ -122,6 +127,13 @@ async def upload_dataset( - Can have additional columns (will be ignored) - Missing values in 'question' or 'answer' rows will be skipped + Duplication Factor: + - Minimum: 1 (no duplication) + - Maximum: 5 + - Default: 5 + - Each item in the dataset will be duplicated this many times + - Used to ensure statistical significance in evaluation results + Example CSV: ``` question,answer diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index a257af36f..17e4ee339 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -232,7 +232,7 @@ def test_upload_with_custom_duplication( ): mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" mock_configure_langfuse.return_value = (None, True) - mock_langfuse_upload.return_value = ("test_dataset_id", 30) + mock_langfuse_upload.return_value = ("test_dataset_id", 12) filename, file_obj = create_csv_file(valid_csv_content) @@ -241,7 
+241,7 @@ def test_upload_with_custom_duplication( files={"file": (filename, file_obj, "text/csv")}, data={ "dataset_name": "test_dataset", - "duplication_factor": 10, + "duplication_factor": 4, }, headers=user_api_key_header, ) @@ -249,9 +249,9 @@ def test_upload_with_custom_duplication( assert response.status_code == 200, response.text data = response.json() - assert data["duplication_factor"] == 10 + assert data["duplication_factor"] == 4 assert data["original_items"] == 3 - assert data["total_items"] == 30 # 3 items * 10 duplication + assert data["total_items"] == 12 # 3 items * 4 duplication def test_upload_with_description( self, client, user_api_key_header, valid_csv_content, db @@ -299,6 +299,89 @@ def test_upload_with_description( assert dataset is not None assert dataset.description == "This is a test dataset for evaluation" + def test_upload_with_duplication_factor_below_minimum( + self, client, user_api_key_header, valid_csv_content + ): + """Test uploading with duplication factor below minimum (0).""" + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 0, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 422 + response_data = response.json() + # Check that the error mentions validation and minimum value + assert "error" in response_data + assert "greater than or equal to 1" in response_data["error"] + + def test_upload_with_duplication_factor_above_maximum( + self, client, user_api_key_header, valid_csv_content + ): + """Test uploading with duplication factor above maximum (6).""" + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 6, + }, + 
headers=user_api_key_header, + ) + + assert response.status_code == 422 + response_data = response.json() + # Check that the error mentions validation and maximum value + assert "error" in response_data + assert "less than or equal to 5" in response_data["error"] + + def test_upload_with_duplication_factor_boundary_minimum( + self, client, user_api_key_header, valid_csv_content + ): + """Test uploading with duplication factor at minimum boundary (1).""" + with ( + patch("app.core.cloud.get_cloud_storage") as _mock_storage, + patch( + "app.api.routes.evaluation.upload_csv_to_object_store" + ) as mock_store_upload, + patch( + "app.api.routes.evaluation.configure_langfuse" + ) as mock_configure_langfuse, + patch( + "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" + ) as mock_langfuse_upload, + ): + mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" + mock_configure_langfuse.return_value = (None, True) + mock_langfuse_upload.return_value = ("test_dataset_id", 3) + + filename, file_obj = create_csv_file(valid_csv_content) + + response = client.post( + "/api/v1/evaluations/datasets", + files={"file": (filename, file_obj, "text/csv")}, + data={ + "dataset_name": "test_dataset", + "duplication_factor": 1, + }, + headers=user_api_key_header, + ) + + assert response.status_code == 200, response.text + data = response.json() + + assert data["duplication_factor"] == 1 + assert data["original_items"] == 3 + assert data["total_items"] == 3 # 3 items * 1 duplication + class TestDatasetUploadErrors: """Test error handling.""" From f2ec2a517667cef3984095f3e44239cd803d3518 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 20:44:16 +0530 Subject: [PATCH 44/64] cleanup for dataset id in evaluation --- .../versions/d5747495bd7c_create_evaluation_run_table.py | 3 ++- backend/app/api/routes/evaluation.py | 2 +- backend/app/models/evaluation.py | 9 +++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git 
a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 737b4f40c..f7225a2bf 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -163,7 +163,7 @@ def upgrade(): nullable=True, comment="Reference to the batch_job for embedding-based similarity scoring", ), - sa.Column("dataset_id", sa.Integer(), nullable=True), + sa.Column("dataset_id", sa.Integer(), nullable=False), sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column( "object_store_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True @@ -186,6 +186,7 @@ def upgrade(): ["dataset_id"], ["evaluation_dataset.id"], name="fk_evaluation_run_dataset_id", + ondelete="CASCADE", ), sa.ForeignKeyConstraint( ["organization_id"], ["organization.id"], ondelete="CASCADE" diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index c03fd47e3..874dbe7fd 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -150,7 +150,7 @@ async def upload_dataset( try: dataset_name = sanitize_dataset_name(dataset_name) except ValueError as e: - raise HTTPException(status_code=400, detail=f"Invalid dataset name: {str(e)}") + raise HTTPException(status_code=422, detail=f"Invalid dataset name: {str(e)}") if original_name != dataset_name: logger.info(f"Dataset name sanitized: '{original_name}' -> '{dataset_name}'") diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index c64d92795..af1bcfa22 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -146,9 +146,10 @@ class EvaluationRun(SQLModel, table=True): ) # Dataset reference - dataset_id: int | None = SQLField( - default=None, + dataset_id: int = SQLField( foreign_key="evaluation_dataset.id", + nullable=False, + 
ondelete="CASCADE", description="Reference to the evaluation_dataset used for this run", ) @@ -210,7 +211,7 @@ class EvaluationRun(SQLModel, table=True): organization: "Organization" = Relationship( back_populates="evaluation_runs" ) # noqa: F821 - evaluation_dataset: Optional["EvaluationDataset"] = Relationship( + evaluation_dataset: "EvaluationDataset" = Relationship( back_populates="evaluation_runs" ) batch_job: Optional["BatchJob"] = Relationship( # noqa: F821 @@ -244,7 +245,7 @@ class EvaluationRunPublic(SQLModel): run_name: str dataset_name: str config: dict[str, Any] - dataset_id: int | None + dataset_id: int batch_job_id: int | None embedding_batch_job_id: int | None status: str From f7ca621819c6a101ee1008282b9dc64fd5023605 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Mon, 3 Nov 2025 22:43:35 +0530 Subject: [PATCH 45/64] file validations --- ...5747495bd7c_create_evaluation_run_table.py | 7 +++- backend/app/api/routes/evaluation.py | 40 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index f7225a2bf..681e881ae 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -176,11 +176,16 @@ def upgrade(): sa.Column("id", sa.Integer(), nullable=False), sa.Column("inserted_at", sa.DateTime(), nullable=False), sa.Column("updated_at", sa.DateTime(), nullable=False), - sa.ForeignKeyConstraint(["batch_job_id"], ["batch_job.id"]), + sa.ForeignKeyConstraint( + ["batch_job_id"], + ["batch_job.id"], + ondelete="SET NULL", + ), sa.ForeignKeyConstraint( ["embedding_batch_job_id"], ["batch_job.id"], name="fk_evaluation_run_embedding_batch_job_id", + ondelete="SET NULL", ), sa.ForeignKeyConstraint( ["dataset_id"], diff --git a/backend/app/api/routes/evaluation.py 
b/backend/app/api/routes/evaluation.py index 874dbe7fd..a9b8ad4dd 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -2,6 +2,7 @@ import io import logging import re +from pathlib import Path from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile from sqlalchemy.exc import IntegrityError @@ -38,6 +39,15 @@ logger = logging.getLogger(__name__) +# File upload security constants +MAX_FILE_SIZE = 1024 * 1024 # 1 MB +ALLOWED_EXTENSIONS = {".csv"} +ALLOWED_MIME_TYPES = { + "text/csv", + "application/csv", + "text/plain", # Some systems report CSV as text/plain +} + router = APIRouter(tags=["evaluation"]) @@ -161,6 +171,36 @@ async def upload_dataset( f"project_id={_current_user.project_id}" ) + # Security validation: Check file extension + file_ext = Path(file.filename).suffix.lower() + if file_ext not in ALLOWED_EXTENSIONS: + raise HTTPException( + status_code=422, + detail=f"Invalid file type. Only CSV files are allowed. Got: {file_ext}", + ) + + # Security validation: Check MIME type + content_type = file.content_type + if content_type not in ALLOWED_MIME_TYPES: + raise HTTPException( + status_code=422, + detail=f"Invalid content type. Expected CSV, got: {content_type}", + ) + + # Security validation: Check file size + file.file.seek(0, 2) # Seek to end + file_size = file.file.tell() + file.file.seek(0) # Reset to beginning + + if file_size > MAX_FILE_SIZE: + raise HTTPException( + status_code=413, + detail=f"File too large. 
Maximum size: {MAX_FILE_SIZE / (1024*1024):.0f}MB", + ) + + if file_size == 0: + raise HTTPException(status_code=422, detail="Empty file uploaded") + # Read CSV content csv_content = await file.read() From e74ea094d4647dd61b53987f976d6ee7ad3700d4 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 00:42:59 +0530 Subject: [PATCH 46/64] refactoring file structure --- backend/app/api/routes/evaluation.py | 20 +++---- .../app/celery/tasks/evaluation_polling.py | 2 +- .../app/celery/tasks/evaluation_score_sync.py | 2 +- backend/app/crud/evaluations/__init__.py | 59 +++++++++++++++++++ .../batch.py} | 5 ++ .../{evaluation.py => evaluations/core.py} | 3 + .../dataset.py} | 0 .../embeddings.py} | 0 .../langfuse.py} | 0 .../processing.py} | 6 +- backend/app/models/batch_job.py | 10 +++- backend/app/models/evaluation.py | 8 ++- .../app/tests/api/routes/test_evaluation.py | 2 +- .../app/tests/crud/evaluations/__init__.py | 1 + .../test_dataset.py} | 2 +- .../test_embeddings.py} | 2 +- .../test_langfuse.py} | 2 +- 17 files changed, 98 insertions(+), 26 deletions(-) create mode 100644 backend/app/crud/evaluations/__init__.py rename backend/app/crud/{evaluation_batch.py => evaluations/batch.py} (97%) rename backend/app/crud/{evaluation.py => evaluations/core.py} (98%) rename backend/app/crud/{evaluation_dataset.py => evaluations/dataset.py} (100%) rename backend/app/crud/{evaluation_embeddings.py => evaluations/embeddings.py} (100%) rename backend/app/crud/{evaluation_langfuse.py => evaluations/langfuse.py} (100%) rename backend/app/crud/{evaluation_processing.py => evaluations/processing.py} (99%) create mode 100644 backend/app/tests/crud/evaluations/__init__.py rename backend/app/tests/crud/{test_evaluation_dataset.py => evaluations/test_dataset.py} (99%) rename backend/app/tests/crud/{test_evaluation_embeddings.py => evaluations/test_embeddings.py} (99%) rename backend/app/tests/crud/{test_evaluation_langfuse.py => evaluations/test_langfuse.py} (99%) diff --git 
a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index a9b8ad4dd..e94f6b800 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -13,24 +13,18 @@ from app.core.util import configure_langfuse, configure_openai from app.crud.assistants import get_assistant_by_id from app.crud.credentials import get_provider_credential -from app.crud.evaluation import ( - create_evaluation_run, - get_evaluation_run_by_id, -) -from app.crud.evaluation import ( - list_evaluation_runs as list_evaluation_runs_crud, -) -from app.crud.evaluation_batch import start_evaluation_batch -from app.crud.evaluation_dataset import ( +from app.crud.evaluations import ( create_evaluation_dataset, + create_evaluation_run, get_dataset_by_id, + get_evaluation_run_by_id, list_datasets, + start_evaluation_batch, upload_csv_to_object_store, + upload_dataset_to_langfuse_from_csv, ) -from app.crud.evaluation_dataset import ( - delete_dataset as delete_dataset_crud, -) -from app.crud.evaluation_langfuse import upload_dataset_to_langfuse_from_csv +from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud +from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.models import UserProjectOrg from app.models.evaluation import ( DatasetUploadResponse, diff --git a/backend/app/celery/tasks/evaluation_polling.py b/backend/app/celery/tasks/evaluation_polling.py index 6d7879e76..4a437da4b 100644 --- a/backend/app/celery/tasks/evaluation_polling.py +++ b/backend/app/celery/tasks/evaluation_polling.py @@ -12,7 +12,7 @@ from sqlmodel import Session, select from app.core.db import get_engine -from app.crud.evaluation_processing import poll_all_pending_evaluations +from app.crud.evaluations.processing import poll_all_pending_evaluations from app.models import Organization logger = logging.getLogger(__name__) diff --git a/backend/app/celery/tasks/evaluation_score_sync.py 
b/backend/app/celery/tasks/evaluation_score_sync.py index 77320f733..d1b0373c8 100644 --- a/backend/app/celery/tasks/evaluation_score_sync.py +++ b/backend/app/celery/tasks/evaluation_score_sync.py @@ -12,7 +12,7 @@ from sqlmodel import Session, select from app.core.db import get_engine -from app.crud.evaluation_processing import poll_all_pending_evaluations +from app.crud.evaluations.processing import poll_all_pending_evaluations from app.models import Organization logger = logging.getLogger(__name__) diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py new file mode 100644 index 000000000..2239c3daa --- /dev/null +++ b/backend/app/crud/evaluations/__init__.py @@ -0,0 +1,59 @@ +"""Evaluation-related CRUD operations.""" + +from app.crud.evaluations.batch import start_evaluation_batch +from app.crud.evaluations.core import ( + create_evaluation_run, + get_evaluation_run_by_id, + list_evaluation_runs, +) +from app.crud.evaluations.dataset import ( + create_evaluation_dataset, + delete_dataset, + get_dataset_by_id, + list_datasets, + upload_csv_to_object_store, +) +from app.crud.evaluations.embeddings import ( + calculate_average_similarity, + calculate_cosine_similarity, + start_embedding_batch, +) +from app.crud.evaluations.langfuse import ( + create_langfuse_dataset_run, + update_traces_with_cosine_scores, + upload_dataset_to_langfuse_from_csv, +) +from app.crud.evaluations.processing import ( + check_and_process_evaluation, + poll_all_pending_evaluations, + process_completed_embedding_batch, + process_completed_evaluation, +) + +__all__ = [ + # Core + "create_evaluation_run", + "get_evaluation_run_by_id", + "list_evaluation_runs", + # Dataset + "create_evaluation_dataset", + "delete_dataset", + "get_dataset_by_id", + "list_datasets", + "upload_csv_to_object_store", + # Batch + "start_evaluation_batch", + # Processing + "check_and_process_evaluation", + "poll_all_pending_evaluations", + 
"process_completed_embedding_batch", + "process_completed_evaluation", + # Embeddings + "calculate_average_similarity", + "calculate_cosine_similarity", + "start_embedding_batch", + # Langfuse + "create_langfuse_dataset_run", + "update_traces_with_cosine_scores", + "upload_dataset_to_langfuse_from_csv", +] diff --git a/backend/app/crud/evaluation_batch.py b/backend/app/crud/evaluations/batch.py similarity index 97% rename from backend/app/crud/evaluation_batch.py rename to backend/app/crud/evaluations/batch.py index d0adff919..4a67d7e34 100644 --- a/backend/app/crud/evaluation_batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -144,6 +144,11 @@ def start_evaluation_batch( langfuse=langfuse, dataset_name=eval_run.dataset_name ) + if not jsonl_data: + raise ValueError( + "Evaluation dataset did not produce any JSONL entries (missing questions?)." + ) + # Step 2: Build evaluation-specific JSONL jsonl_data = build_evaluation_jsonl(dataset_items=dataset_items, config=config) diff --git a/backend/app/crud/evaluation.py b/backend/app/crud/evaluations/core.py similarity index 98% rename from backend/app/crud/evaluation.py rename to backend/app/crud/evaluations/core.py index 0e57df9c4..66560c6fa 100644 --- a/backend/app/crud/evaluation.py +++ b/backend/app/crud/evaluations/core.py @@ -16,6 +16,7 @@ async def upload_dataset_to_langfuse( csv_content: bytes, dataset_name: str, + dataset_id: int, duplication_factor: int, _session: Session, _current_user: UserProjectOrg, @@ -26,6 +27,7 @@ async def upload_dataset_to_langfuse( Args: csv_content: Raw CSV file content as bytes dataset_name: Name for the dataset in Langfuse + dataset_id: Database ID of the created dataset duplication_factor: Number of times to duplicate each item (default 5) _session: Database session _current_user: Current user organization @@ -121,6 +123,7 @@ async def upload_dataset_to_langfuse( return ( True, DatasetUploadResponse( + dataset_id=dataset_id, dataset_name=dataset_name, 
total_items=total_uploaded, original_items=len(original_items), diff --git a/backend/app/crud/evaluation_dataset.py b/backend/app/crud/evaluations/dataset.py similarity index 100% rename from backend/app/crud/evaluation_dataset.py rename to backend/app/crud/evaluations/dataset.py diff --git a/backend/app/crud/evaluation_embeddings.py b/backend/app/crud/evaluations/embeddings.py similarity index 100% rename from backend/app/crud/evaluation_embeddings.py rename to backend/app/crud/evaluations/embeddings.py diff --git a/backend/app/crud/evaluation_langfuse.py b/backend/app/crud/evaluations/langfuse.py similarity index 100% rename from backend/app/crud/evaluation_langfuse.py rename to backend/app/crud/evaluations/langfuse.py diff --git a/backend/app/crud/evaluation_processing.py b/backend/app/crud/evaluations/processing.py similarity index 99% rename from backend/app/crud/evaluation_processing.py rename to backend/app/crud/evaluations/processing.py index 3da0695e7..d4da792d4 100644 --- a/backend/app/crud/evaluation_processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -26,13 +26,13 @@ upload_batch_results_to_object_store, ) from app.crud.credentials import get_provider_credential -from app.crud.evaluation_batch import fetch_dataset_items -from app.crud.evaluation_embeddings import ( +from app.crud.evaluations.batch import fetch_dataset_items +from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, start_embedding_batch, ) -from app.crud.evaluation_langfuse import ( +from app.crud.evaluations.langfuse import ( create_langfuse_dataset_run, update_traces_with_cosine_scores, ) diff --git a/backend/app/models/batch_job.py b/backend/app/models/batch_job.py index 6d44c81d8..183dc8481 100644 --- a/backend/app/models/batch_job.py +++ b/backend/app/models/batch_job.py @@ -5,6 +5,8 @@ from sqlalchemy.dialects.postgresql import JSONB from sqlmodel import Field, Relationship, SQLModel +from app.core.util import now + class 
BatchJob(SQLModel, table=True): """Batch job table for tracking async LLM batch operations.""" @@ -61,8 +63,12 @@ class BatchJob(SQLModel, table=True): project_id: int = Field(foreign_key="project.id") # Timestamps - inserted_at: datetime = Field(default_factory=datetime.utcnow) - updated_at: datetime = Field(default_factory=datetime.utcnow) + inserted_at: datetime = Field( + default_factory=now, description="The timestamp when the document was inserted" + ) + updated_at: datetime = Field( + default_factory=now, description="The timestamp when the document was inserted" + ) # Relationships organization: Optional["Organization"] = Relationship( # noqa: F821 diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index af1bcfa22..a7b6f08af 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -203,8 +203,12 @@ class EvaluationRun(SQLModel, table=True): ) # Timestamps - inserted_at: datetime = SQLField(default_factory=now, nullable=False) - updated_at: datetime = SQLField(default_factory=now, nullable=False) + inserted_at: datetime = Field( + default_factory=now, description="The timestamp when the document was inserted" + ) + updated_at: datetime = Field( + default_factory=now, description="The timestamp when the document was inserted" + ) # Relationships project: "Project" = Relationship(back_populates="evaluation_runs") # noqa: F821 diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 17e4ee339..fbf39794a 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -4,7 +4,7 @@ import pytest from sqlmodel import select -from app.crud.evaluation_batch import build_evaluation_jsonl +from app.crud.evaluations.batch import build_evaluation_jsonl from app.models import EvaluationDataset diff --git a/backend/app/tests/crud/evaluations/__init__.py b/backend/app/tests/crud/evaluations/__init__.py 
new file mode 100644 index 000000000..e99ebf01b --- /dev/null +++ b/backend/app/tests/crud/evaluations/__init__.py @@ -0,0 +1 @@ +"""Tests for evaluation-related CRUD operations.""" diff --git a/backend/app/tests/crud/test_evaluation_dataset.py b/backend/app/tests/crud/evaluations/test_dataset.py similarity index 99% rename from backend/app/tests/crud/test_evaluation_dataset.py rename to backend/app/tests/crud/evaluations/test_dataset.py index 82a8d1d74..ccd2e4f34 100644 --- a/backend/app/tests/crud/test_evaluation_dataset.py +++ b/backend/app/tests/crud/evaluations/test_dataset.py @@ -8,7 +8,7 @@ from sqlmodel import Session, select from app.core.cloud.storage import CloudStorageError -from app.crud.evaluation_dataset import ( +from app.crud.evaluations.dataset import ( create_evaluation_dataset, download_csv_from_object_store, get_dataset_by_id, diff --git a/backend/app/tests/crud/test_evaluation_embeddings.py b/backend/app/tests/crud/evaluations/test_embeddings.py similarity index 99% rename from backend/app/tests/crud/test_evaluation_embeddings.py rename to backend/app/tests/crud/evaluations/test_embeddings.py index 7556fa7df..c06d78250 100644 --- a/backend/app/tests/crud/test_evaluation_embeddings.py +++ b/backend/app/tests/crud/evaluations/test_embeddings.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from app.crud.evaluation_embeddings import ( +from app.crud.evaluations.embeddings import ( build_embedding_jsonl, calculate_average_similarity, calculate_cosine_similarity, diff --git a/backend/app/tests/crud/test_evaluation_langfuse.py b/backend/app/tests/crud/evaluations/test_langfuse.py similarity index 99% rename from backend/app/tests/crud/test_evaluation_langfuse.py rename to backend/app/tests/crud/evaluations/test_langfuse.py index 1bc8b2198..4717ca6c8 100644 --- a/backend/app/tests/crud/test_evaluation_langfuse.py +++ b/backend/app/tests/crud/evaluations/test_langfuse.py @@ -6,7 +6,7 @@ import pytest -from app.crud.evaluation_langfuse import ( 
+from app.crud.evaluations.langfuse import ( create_langfuse_dataset_run, update_traces_with_cosine_scores, upload_dataset_to_langfuse_from_csv, From 4f4cea1a89b1a861ea1a22a5c602a3f7808b04d3 Mon Sep 17 00:00:00 2001 From: Kartikeya Pophali Date: Tue, 4 Nov 2025 17:12:05 +0530 Subject: [PATCH 47/64] Evaluation: Add cron job endpoint and script for periodic evaluation processing (#428) * add cron job endpoint and script for periodic evaluation processing * add cron job endpoint and update invocation script for periodic evaluation processing * refactor: remove redundant dependency from evaluation cron job --- backend/app/api/main.py | 2 + backend/app/api/routes/cron.py | 63 ++++++++ backend/app/crud/evaluations/__init__.py | 7 + backend/app/crud/evaluations/cron.py | 154 ++++++++++++++++++ scripts/python/invoke-cron.py | 189 +++++++++++++++++++++++ 5 files changed, 415 insertions(+) create mode 100644 backend/app/api/routes/cron.py create mode 100644 backend/app/crud/evaluations/cron.py create mode 100644 scripts/python/invoke-cron.py diff --git a/backend/app/api/main.py b/backend/app/api/main.py index 500fdccc1..a44726b0b 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -17,6 +17,7 @@ utils, onboarding, credentials, + cron, evaluation, fine_tuning, model_evaluation, @@ -30,6 +31,7 @@ api_router.include_router(collections.router) api_router.include_router(collection_job.router) api_router.include_router(credentials.router) +api_router.include_router(cron.router) api_router.include_router(documents.router) api_router.include_router(doc_transformation_job.router) api_router.include_router(evaluation.router) diff --git a/backend/app/api/routes/cron.py b/backend/app/api/routes/cron.py new file mode 100644 index 000000000..1a13c6f76 --- /dev/null +++ b/backend/app/api/routes/cron.py @@ -0,0 +1,63 @@ +import logging + +from fastapi import APIRouter, Depends +from sqlmodel import Session + +from app.api.deps import get_current_active_superuser, get_db 
+from app.crud.evaluations import process_all_pending_evaluations_sync +from app.models import User + +logger = logging.getLogger(__name__) + +router = APIRouter(tags=["cron"]) + + +@router.get( + "/cron/evaluations", + include_in_schema=True, + dependencies=[Depends(get_current_active_superuser)], +) +def evaluation_cron_job( + session: Session = Depends(get_db), +) -> dict: + """ + Cron job endpoint for periodic evaluation tasks. + + This endpoint: + 1. Gets all organizations + 2. For each org, polls their pending evaluations + 3. Processes completed batches automatically + 4. Returns aggregated results + + Hidden from Swagger documentation. + Requires authentication via FIRST_SUPERUSER credentials. + """ + logger.info("[evaluation_cron_job] Cron job invoked") + + try: + # Process all pending evaluations across all organizations + result = process_all_pending_evaluations_sync(session=session) + + logger.info( + f"[evaluation_cron_job] Completed: " + f"orgs={result.get('organizations_processed', 0)}, " + f"processed={result.get('total_processed', 0)}, " + f"failed={result.get('total_failed', 0)}, " + f"still_processing={result.get('total_still_processing', 0)}" + ) + + return result + + except Exception as e: + logger.error( + f"[evaluation_cron_job] Error executing cron job: {e}", + exc_info=True, + ) + return { + "status": "error", + "error": str(e), + "organizations_processed": 0, + "total_processed": 0, + "total_failed": 0, + "total_still_processing": 0, + } diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index 2239c3daa..d07cf8676 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -6,6 +6,10 @@ get_evaluation_run_by_id, list_evaluation_runs, ) +from app.crud.evaluations.cron import ( + process_all_pending_evaluations, + process_all_pending_evaluations_sync, +) from app.crud.evaluations.dataset import ( create_evaluation_dataset, delete_dataset, @@ -35,6 +39,9 
@@ "create_evaluation_run", "get_evaluation_run_by_id", "list_evaluation_runs", + # Cron + "process_all_pending_evaluations", + "process_all_pending_evaluations_sync", # Dataset "create_evaluation_dataset", "delete_dataset", diff --git a/backend/app/crud/evaluations/cron.py b/backend/app/crud/evaluations/cron.py new file mode 100644 index 000000000..a670a547f --- /dev/null +++ b/backend/app/crud/evaluations/cron.py @@ -0,0 +1,154 @@ +""" +CRUD operations for evaluation cron jobs. + +This module provides functions that can be invoked periodically to process +pending evaluations across all organizations. +""" + +import asyncio +import logging +from typing import Any + +from sqlmodel import Session, select + +from app.crud.evaluations.processing import poll_all_pending_evaluations +from app.models import Organization + +logger = logging.getLogger(__name__) + + +async def process_all_pending_evaluations(session: Session) -> dict[str, Any]: + """ + Process all pending evaluations across all organizations. + + This function: + 1. Gets all organizations + 2. For each org, polls their pending evaluations + 3. Processes completed batches automatically + 4. Returns aggregated results + + This is the main function that should be called by the cron endpoint. + + Args: + session: Database session + + Returns: + Dict with aggregated results: + { + "status": "success", + "organizations_processed": 3, + "total_processed": 5, + "total_failed": 1, + "total_still_processing": 2, + "results": [ + { + "org_id": 1, + "org_name": "Org 1", + "summary": {...} + }, + ... 
+ ] + } + """ + logger.info("[process_all_pending_evaluations] Starting evaluation processing") + + try: + # Get all organizations + orgs = session.exec(select(Organization)).all() + + if not orgs: + logger.info("[process_all_pending_evaluations] No organizations found") + return { + "status": "success", + "organizations_processed": 0, + "total_processed": 0, + "total_failed": 0, + "total_still_processing": 0, + "message": "No organizations to process", + "results": [], + } + + logger.info( + f"[process_all_pending_evaluations] Found {len(orgs)} organizations to process" + ) + + results = [] + total_processed = 0 + total_failed = 0 + total_still_processing = 0 + + # Process each organization + for org in orgs: + try: + logger.info( + f"[process_all_pending_evaluations] Processing org_id={org.id} ({org.name})" + ) + + # Poll all pending evaluations for this org + summary = await poll_all_pending_evaluations(session=session, org_id=org.id) + + results.append( + { + "org_id": org.id, + "org_name": org.name, + "summary": summary, + } + ) + + total_processed += summary.get("processed", 0) + total_failed += summary.get("failed", 0) + total_still_processing += summary.get("still_processing", 0) + + except Exception as e: + logger.error( + f"[process_all_pending_evaluations] Error processing org_id={org.id}: {e}", + exc_info=True, + ) + session.rollback() + results.append({"org_id": org.id, "org_name": org.name, "error": str(e)}) + total_failed += 1 + + logger.info( + f"[process_all_pending_evaluations] Completed: " + f"{total_processed} processed, {total_failed} failed, " + f"{total_still_processing} still processing" + ) + + return { + "status": "success", + "organizations_processed": len(orgs), + "total_processed": total_processed, + "total_failed": total_failed, + "total_still_processing": total_still_processing, + "results": results, + } + + except Exception as e: + logger.error( + f"[process_all_pending_evaluations] Fatal error: {e}", + exc_info=True, + ) + return { 
+ "status": "error", + "organizations_processed": 0, + "total_processed": 0, + "total_failed": 0, + "total_still_processing": 0, + "error": str(e), + "results": [], + } + + +def process_all_pending_evaluations_sync(session: Session) -> dict[str, Any]: + """ + Synchronous wrapper for process_all_pending_evaluations. + + This function can be called from synchronous contexts (like FastAPI endpoints). + + Args: + session: Database session + + Returns: + Dict with aggregated results (same as process_all_pending_evaluations) + """ + return asyncio.run(process_all_pending_evaluations(session=session)) diff --git a/scripts/python/invoke-cron.py b/scripts/python/invoke-cron.py new file mode 100644 index 000000000..306f2711c --- /dev/null +++ b/scripts/python/invoke-cron.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Cron script to invoke an API endpoint periodically. +Uses async HTTP client to be resource-efficient. +""" + +import asyncio +import logging +import os +import sys +from datetime import datetime +from pathlib import Path + +import httpx +from dotenv import load_dotenv + +# Configuration +INTERVAL_MINUTES = 1 # How often to invoke the endpoint +BASE_URL = "http://localhost:8000" # Base URL of the API +ENDPOINT = "/api/v1/cron/evaluations" # Endpoint to invoke +REQUEST_TIMEOUT = 30 # Timeout for requests in seconds + +# Setup logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +class EndpointInvoker: + """Handles periodic endpoint invocation with authentication.""" + + def __init__(self): + self.base_url = BASE_URL.rstrip("/") + self.endpoint = ENDPOINT + self.interval_seconds = INTERVAL_MINUTES * 60 + self.access_token = None + self.token_expiry = None + + # Load credentials from environment + self.email = os.getenv("FIRST_SUPERUSER") + self.password = os.getenv("FIRST_SUPERUSER_PASSWORD") + + if not self.email or not self.password: + raise 
ValueError( + "FIRST_SUPERUSER and FIRST_SUPERUSER_PASSWORD must be set in environment" + ) + + async def authenticate(self, client: httpx.AsyncClient) -> str: + """Authenticate and get access token.""" + logger.info("Authenticating with API...") + + login_data = { + "username": self.email, + "password": self.password, + } + + try: + response = await client.post( + f"{self.base_url}/api/v1/login/access-token", + data=login_data, + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + + data = response.json() + self.access_token = data.get("access_token") + + if not self.access_token: + raise ValueError("No access token in response") + + logger.info("Authentication successful") + return self.access_token + + except httpx.HTTPStatusError as e: + logger.error(f"Authentication failed with status {e.response.status_code}") + raise + except Exception as e: + logger.error(f"Authentication error: {e}") + raise + + async def invoke_endpoint(self, client: httpx.AsyncClient) -> dict: + """Invoke the configured endpoint.""" + if not self.access_token: + await self.authenticate(client) + + headers = {"Authorization": f"Bearer {self.access_token}"} + + # Debug: Log what we're sending + logger.debug(f"Request URL: {self.base_url}{self.endpoint}") + logger.debug(f"Request headers: {headers}") + + try: + response = await client.get( + f"{self.base_url}{self.endpoint}", + headers=headers, + timeout=REQUEST_TIMEOUT, + ) + + # Debug: Log response headers and first part of body + logger.debug(f"Response status: {response.status_code}") + logger.debug(f"Response headers: {dict(response.headers)}") + + # If unauthorized, re-authenticate and retry once + if response.status_code == 401: + logger.info("Token expired, re-authenticating...") + await self.authenticate(client) + headers = {"Authorization": f"Bearer {self.access_token}"} + response = await client.get( + f"{self.base_url}{self.endpoint}", + headers=headers, + timeout=REQUEST_TIMEOUT, + ) + + response.raise_for_status() + 
return response.json() + + except httpx.HTTPStatusError as e: + logger.error( + f"Endpoint invocation failed with status {e.response.status_code}: {e.response.text}" + ) + raise + except Exception as e: + logger.error(f"Endpoint invocation error: {e}") + raise + + async def run(self): + """Main loop to invoke endpoint periodically.""" + logger.info( + f"Starting cron job - invoking {self.endpoint} every {INTERVAL_MINUTES} minutes" + ) + + # Use async context manager to ensure proper cleanup + async with httpx.AsyncClient() as client: + # Authenticate once at startup + await self.authenticate(client) + + while True: + try: + start_time = datetime.now() + logger.info(f"Invoking endpoint at {start_time}") + + result = await self.invoke_endpoint(client) + logger.info(f"Endpoint invoked successfully: {result}") + + # Calculate next invocation time + elapsed = (datetime.now() - start_time).total_seconds() + sleep_time = max(0, self.interval_seconds - elapsed) + + if sleep_time > 0: + logger.info( + f"Sleeping for {sleep_time:.1f} seconds until next invocation" + ) + await asyncio.sleep(sleep_time) + + except KeyboardInterrupt: + logger.info("Shutting down gracefully...") + break + except Exception as e: + logger.error(f"Error during invocation: {e}") + # Wait before retrying on error + logger.info(f"Waiting {self.interval_seconds} seconds before retry") + await asyncio.sleep(self.interval_seconds) + + +def main(): + """Entry point for the script.""" + # Load environment variables + env_path = Path(__file__).parent.parent.parent / ".env" + if env_path.exists(): + load_dotenv(env_path) + logger.info(f"Loaded environment from {env_path}") + else: + logger.warning(f"No .env file found at {env_path}") + + try: + invoker = EndpointInvoker() + asyncio.run(invoker.run()) + except KeyboardInterrupt: + logger.info("Interrupted by user") + sys.exit(0) + except Exception as e: + logger.error(f"Fatal error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() From 
9b038c6bf8b9227eb57578877f640e1a4ec74746 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 17:24:19 +0530 Subject: [PATCH 48/64] minor fixes --- backend/app/crud/evaluations/batch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 4a67d7e34..67fd1f7c2 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -144,14 +144,14 @@ def start_evaluation_batch( langfuse=langfuse, dataset_name=eval_run.dataset_name ) + # Step 2: Build evaluation-specific JSONL + jsonl_data = build_evaluation_jsonl(dataset_items=dataset_items, config=config) + if not jsonl_data: raise ValueError( "Evaluation dataset did not produce any JSONL entries (missing questions?)." ) - # Step 2: Build evaluation-specific JSONL - jsonl_data = build_evaluation_jsonl(dataset_items=dataset_items, config=config) - # Step 3: Create batch provider provider = OpenAIBatchProvider(client=openai_client) From 622b4eb68f4f55e0c331b442df309102e859453d Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 18:04:52 +0530 Subject: [PATCH 49/64] cleanup cruds --- backend/app/api/routes/evaluation.py | 77 ++++++++++--------------- backend/app/crud/evaluations/dataset.py | 66 +++++++++++++++------ 2 files changed, 76 insertions(+), 67 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index e94f6b800..0f92fe608 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -5,7 +5,6 @@ from pathlib import Path from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile -from sqlalchemy.exc import IntegrityError from sqlmodel import Session from app.api.deps import get_current_user_org_project, get_db @@ -305,56 +304,38 @@ async def upload_dataset( ) # Step 4: Store metadata in database - try: - metadata = { - "original_items_count": 
original_items_count, - "total_items_count": total_items_count, - "duplication_factor": duplication_factor, - } - - dataset = create_evaluation_dataset( - session=_session, - name=dataset_name, - description=description, - dataset_metadata=metadata, - object_store_url=object_store_url, - langfuse_dataset_id=langfuse_dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, - ) - - logger.info( - f"Successfully created dataset record in database: id={dataset.id}, " - f"name={dataset_name}" - ) + metadata = { + "original_items_count": original_items_count, + "total_items_count": total_items_count, + "duplication_factor": duplication_factor, + } - # Return response - return DatasetUploadResponse( - dataset_id=dataset.id, - dataset_name=dataset_name, - total_items=total_items_count, - original_items=original_items_count, - duplication_factor=duplication_factor, - langfuse_dataset_id=langfuse_dataset_id, - object_store_url=object_store_url, - ) + dataset = create_evaluation_dataset( + session=_session, + name=dataset_name, + description=description, + dataset_metadata=metadata, + object_store_url=object_store_url, + langfuse_dataset_id=langfuse_dataset_id, + organization_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) - except IntegrityError as e: - logger.error( - f"Database integrity error creating dataset '{dataset_name}': {e}", - exc_info=True, - ) - raise HTTPException( - status_code=409, - detail=f"Dataset with name '{dataset_name}' already exists in this " - "organization and project. 
Please choose a different name.", - ) + logger.info( + f"Successfully created dataset record in database: id={dataset.id}, " + f"name={dataset_name}" + ) - except Exception as e: - logger.error(f"Failed to create dataset record in database: {e}", exc_info=True) - raise HTTPException( - status_code=500, detail=f"Failed to save dataset metadata: {e}" - ) + # Return response + return DatasetUploadResponse( + dataset_id=dataset.id, + dataset_name=dataset_name, + total_items=total_items_count, + original_items=original_items_count, + duplication_factor=duplication_factor, + langfuse_dataset_id=langfuse_dataset_id, + object_store_url=object_store_url, + ) @router.get("/evaluations/datasets/list", response_model=list[DatasetUploadResponse]) diff --git a/backend/app/crud/evaluations/dataset.py b/backend/app/crud/evaluations/dataset.py index f00175020..1645fdfc3 100644 --- a/backend/app/crud/evaluations/dataset.py +++ b/backend/app/crud/evaluations/dataset.py @@ -11,6 +11,8 @@ import logging from typing import Any +from fastapi import HTTPException +from sqlalchemy.exc import IntegrityError from sqlmodel import Session, select from app.core.cloud.storage import CloudStorage @@ -52,29 +54,55 @@ def create_evaluation_dataset( Returns: Created EvaluationDataset object + + Raises: + HTTPException: 409 if dataset with same name exists, 500 for other errors """ - dataset = EvaluationDataset( - name=name, - description=description, - dataset_metadata=dataset_metadata, - object_store_url=object_store_url, - langfuse_dataset_id=langfuse_dataset_id, - organization_id=organization_id, - project_id=project_id, - inserted_at=now(), - updated_at=now(), - ) + try: + dataset = EvaluationDataset( + name=name, + description=description, + dataset_metadata=dataset_metadata, + object_store_url=object_store_url, + langfuse_dataset_id=langfuse_dataset_id, + organization_id=organization_id, + project_id=project_id, + inserted_at=now(), + updated_at=now(), + ) - session.add(dataset) - 
session.commit() - session.refresh(dataset) + session.add(dataset) + session.commit() + session.refresh(dataset) - logger.info( - f"Created evaluation dataset: id={dataset.id}, name={name}, " - f"org_id={organization_id}, project_id={project_id}" - ) + logger.info( + f"Created evaluation dataset: id={dataset.id}, name={name}, " + f"org_id={organization_id}, project_id={project_id}" + ) - return dataset + return dataset + + except IntegrityError as e: + session.rollback() + logger.error( + f"Database integrity error creating dataset '{name}': {e}", + exc_info=True, + ) + raise HTTPException( + status_code=409, + detail=f"Dataset with name '{name}' already exists in this " + "organization and project. Please choose a different name.", + ) + + except Exception as e: + session.rollback() + logger.error( + f"Failed to create dataset record in database: {e}", + exc_info=True, + ) + raise HTTPException( + status_code=500, detail=f"Failed to save dataset metadata: {e}" + ) def get_dataset_by_id( From 4c61d7402ed65f947eaa02c17d99338709dc6ad3 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 18:17:11 +0530 Subject: [PATCH 50/64] removed celery beat --- backend/app/celery/celery_app.py | 12 -- .../app/celery/tasks/evaluation_polling.py | 115 ----------------- .../app/celery/tasks/evaluation_score_sync.py | 118 ------------------ 3 files changed, 245 deletions(-) delete mode 100644 backend/app/celery/tasks/evaluation_polling.py delete mode 100644 backend/app/celery/tasks/evaluation_score_sync.py diff --git a/backend/app/celery/celery_app.py b/backend/app/celery/celery_app.py index e223f804c..81fba8cb2 100644 --- a/backend/app/celery/celery_app.py +++ b/backend/app/celery/celery_app.py @@ -10,7 +10,6 @@ backend=settings.REDIS_URL, include=[ "app.celery.tasks.job_execution", - "app.celery.tasks.evaluation_score_sync", ], ) @@ -86,17 +85,6 @@ # Connection settings from environment broker_connection_retry_on_startup=True, 
broker_pool_limit=settings.CELERY_BROKER_POOL_LIMIT, - # Beat configuration - beat_schedule={ - # Process evaluation batches (polls provider status and processes results) - "process-evaluation-batches": { - "task": "process_evaluation_batches", - "schedule": 60.0, # Every 60 seconds - }, - # Future: Add similar tasks for other job types - # "process-classification-batches": {...} - # "process-embedding-batches": {...} - }, ) # Auto-discover tasks diff --git a/backend/app/celery/tasks/evaluation_polling.py b/backend/app/celery/tasks/evaluation_polling.py deleted file mode 100644 index 4a437da4b..000000000 --- a/backend/app/celery/tasks/evaluation_polling.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Celery tasks for evaluation batch polling. - -This module contains periodic tasks that poll OpenAI batch status -and process completed evaluations. -""" - -import asyncio -import logging - -from celery import shared_task -from sqlmodel import Session, select - -from app.core.db import get_engine -from app.crud.evaluations.processing import poll_all_pending_evaluations -from app.models import Organization - -logger = logging.getLogger(__name__) - - -@shared_task(name="poll_evaluation_batches", bind=True) -def poll_evaluation_batches_task(self): - """ - Periodic task to poll all pending evaluation batches. - - This task: - 1. Gets all organizations - 2. For each org, polls their pending evaluations - 3. 
Processes completed batches automatically - - Runs every 60 seconds (configured in celery_app.py beat_schedule) - """ - logger.info("[poll_evaluation_batches] Starting evaluation batch polling") - - try: - # Get database session - engine = get_engine() - with Session(engine) as session: - # Get all organizations - orgs = session.exec(select(Organization)).all() - - if not orgs: - logger.info("[poll_evaluation_batches] No organizations found") - return { - "status": "success", - "organizations_processed": 0, - "message": "No organizations to process", - } - - logger.info( - f"[poll_evaluation_batches] Found {len(orgs)} organizations to process" - ) - - results = [] - total_processed = 0 - total_failed = 0 - total_still_processing = 0 - - # Process each organization - for org in orgs: - try: - logger.info( - f"[poll_evaluation_batches] Processing org_id={org.id} ({org.name})" - ) - - # Poll all pending evaluations for this org - # Use asyncio.run since poll_all_pending_evaluations is async - summary = asyncio.run( - poll_all_pending_evaluations(session=session, org_id=org.id) - ) - - results.append( - { - "org_id": org.id, - "org_name": org.name, - "summary": summary, - } - ) - - total_processed += summary.get("processed", 0) - total_failed += summary.get("failed", 0) - total_still_processing += summary.get("still_processing", 0) - - except Exception as e: - logger.error( - f"[poll_evaluation_batches] Error processing org_id={org.id}: {e}", - exc_info=True, - ) - session.rollback() - results.append( - {"org_id": org.id, "org_name": org.name, "error": str(e)} - ) - - logger.info( - f"[poll_evaluation_batches] Completed: " - f"{total_processed} processed, {total_failed} failed, " - f"{total_still_processing} still processing" - ) - - return { - "status": "success", - "organizations_processed": len(orgs), - "total_processed": total_processed, - "total_failed": total_failed, - "total_still_processing": total_still_processing, - "results": results, - } - - except 
Exception as e: - logger.error( - f"[poll_evaluation_batches] Fatal error: {e}", - exc_info=True, - ) - # Retry the task after 5 minutes - raise self.retry(exc=e, countdown=300, max_retries=3) diff --git a/backend/app/celery/tasks/evaluation_score_sync.py b/backend/app/celery/tasks/evaluation_score_sync.py deleted file mode 100644 index d1b0373c8..000000000 --- a/backend/app/celery/tasks/evaluation_score_sync.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Celery tasks for evaluation-specific processing. - -This module contains periodic tasks that process completed evaluation batches, -parse results, create Langfuse traces, and calculate scores. -""" - -import asyncio -import logging - -from celery import shared_task -from sqlmodel import Session, select - -from app.core.db import get_engine -from app.crud.evaluations.processing import poll_all_pending_evaluations -from app.models import Organization - -logger = logging.getLogger(__name__) - - -@shared_task(name="process_evaluation_batches", bind=True) -def process_evaluation_batches_task(self): - """ - Periodic task to process completed evaluation batches. - - This task: - 1. Gets all organizations - 2. For each org, checks their pending evaluations - 3. Processes completed batches (parses results, creates Langfuse traces) - 4. Updates evaluation_run records with final status - - Runs every 60 seconds (configured in celery_app.py beat_schedule) - - Note: Generic batch_job status polling is handled by poll_batch_jobs task. - This task focuses on evaluation-specific result processing. 
- """ - logger.info("[process_evaluation_batches] Starting evaluation processing") - - try: - # Get database session - engine = get_engine() - with Session(engine) as session: - # Get all organizations - orgs = session.exec(select(Organization)).all() - - if not orgs: - logger.info("[process_evaluation_batches] No organizations found") - return { - "status": "success", - "organizations_processed": 0, - "message": "No organizations to process", - } - - logger.info( - f"[process_evaluation_batches] Found {len(orgs)} organizations to process" - ) - - results = [] - total_processed = 0 - total_failed = 0 - total_still_processing = 0 - - # Process each organization - for org in orgs: - try: - logger.info( - f"[process_evaluation_batches] Processing org_id={org.id} ({org.name})" - ) - - # Poll and process all pending evaluations for this org - # Use asyncio.run since poll_all_pending_evaluations is async - summary = asyncio.run( - poll_all_pending_evaluations(session=session, org_id=org.id) - ) - - results.append( - { - "org_id": org.id, - "org_name": org.name, - "summary": summary, - } - ) - - total_processed += summary.get("processed", 0) - total_failed += summary.get("failed", 0) - total_still_processing += summary.get("still_processing", 0) - - except Exception as e: - logger.error( - f"[process_evaluation_batches] Error processing org_id={org.id}: {e}", - exc_info=True, - ) - results.append( - {"org_id": org.id, "org_name": org.name, "error": str(e)} - ) - - logger.info( - f"[process_evaluation_batches] Completed: " - f"{total_processed} processed, {total_failed} failed, " - f"{total_still_processing} still processing" - ) - - return { - "status": "success", - "organizations_processed": len(orgs), - "total_processed": total_processed, - "total_failed": total_failed, - "total_still_processing": total_still_processing, - "results": results, - } - - except Exception as e: - logger.error( - f"[process_evaluation_batches] Fatal error: {e}", - exc_info=True, - ) - # Retry 
the task after 5 minutes - raise self.retry(exc=e, countdown=300, max_retries=3) From 4ec5971c0639c72cc10ca7434207f25bffe5ed71 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 18:51:08 +0530 Subject: [PATCH 51/64] cleanup evaluation run update and context runs --- backend/app/api/routes/evaluation.py | 104 +++++++++--------- backend/app/crud/evaluations/core.py | 53 ++++++++- backend/app/crud/evaluations/processing.py | 119 +++++++++++++-------- backend/app/models/batch_job.py | 5 +- backend/app/models/evaluation.py | 6 +- 5 files changed, 183 insertions(+), 104 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 0f92fe608..6582777c8 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -4,10 +4,9 @@ import re from pathlib import Path -from fastapi import APIRouter, Body, Depends, File, Form, HTTPException, UploadFile -from sqlmodel import Session +from fastapi import APIRouter, Body, File, Form, HTTPException, UploadFile -from app.api.deps import get_current_user_org_project, get_db +from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage from app.core.util import configure_langfuse, configure_openai from app.crud.assistants import get_assistant_by_id @@ -24,7 +23,6 @@ ) from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud -from app.models import UserProjectOrg from app.models.evaluation import ( DatasetUploadResponse, EvaluationRunPublic, @@ -94,6 +92,8 @@ def sanitize_dataset_name(name: str) -> str: @router.post("/evaluations/datasets", response_model=DatasetUploadResponse) async def upload_dataset( + _session: SessionDep, + auth_context: AuthContextDep, file: UploadFile = File( ..., description="CSV file with 'question' and 'answer' columns" ), @@ -105,8 +105,6 @@ async def upload_dataset( le=5, 
description="Number of times to duplicate each item (min: 1, max: 5)", ), - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> DatasetUploadResponse: """ Upload a CSV file containing Golden Q&A pairs. @@ -160,8 +158,8 @@ async def upload_dataset( logger.info( f"Uploading dataset: {dataset_name} with duplication factor: " - f"{duplication_factor}, org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id}" + f"{duplication_factor}, org_id={auth_context.organization.id}, " + f"project_id={auth_context.project.id}" ) # Security validation: Check file extension @@ -243,7 +241,7 @@ async def upload_dataset( object_store_url = None try: storage = get_cloud_storage( - session=_session, project_id=_current_user.project_id + session=_session, project_id=auth_context.project.id ) object_store_url = upload_csv_to_object_store( storage=storage, csv_content=csv_content, dataset_name=dataset_name @@ -269,8 +267,8 @@ async def upload_dataset( # Get Langfuse credentials langfuse_credentials = get_provider_credential( session=_session, - org_id=_current_user.organization_id, - project_id=_current_user.project_id, + org_id=auth_context.organization.id, + project_id=auth_context.project.id, provider="langfuse", ) if not langfuse_credentials: @@ -317,8 +315,8 @@ async def upload_dataset( dataset_metadata=metadata, object_store_url=object_store_url, langfuse_dataset_id=langfuse_dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) logger.info( @@ -340,10 +338,10 @@ async def upload_dataset( @router.get("/evaluations/datasets/list", response_model=list[DatasetUploadResponse]) async def list_datasets_endpoint( + _session: SessionDep, + auth_context: AuthContextDep, limit: int = 50, offset: int = 0, - _session: Session = Depends(get_db), - _current_user: UserProjectOrg 
= Depends(get_current_user_org_project), ) -> list[DatasetUploadResponse]: """ List all datasets for the current organization and project. @@ -361,8 +359,8 @@ async def list_datasets_endpoint( datasets = list_datasets( session=_session, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, limit=limit, offset=offset, ) @@ -390,8 +388,8 @@ async def list_datasets_endpoint( @router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) async def get_dataset( dataset_id: int, - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), + _session: SessionDep, + auth_context: AuthContextDep, ) -> DatasetUploadResponse: """ Get details of a specific dataset by ID. @@ -404,15 +402,15 @@ async def get_dataset( """ logger.info( f"Fetching dataset: id={dataset_id}, " - f"org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id}" + f"org_id={auth_context.organization.id}, " + f"project_id={auth_context.project.id}" ) dataset = get_dataset_by_id( session=_session, dataset_id=dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) if not dataset: @@ -434,8 +432,8 @@ async def get_dataset( @router.delete("/evaluations/datasets/{dataset_id}") async def delete_dataset( dataset_id: int, - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), + _session: SessionDep, + auth_context: AuthContextDep, ) -> dict: """ Delete a dataset by ID. 
@@ -452,15 +450,15 @@ async def delete_dataset( """ logger.info( f"Deleting dataset: id={dataset_id}, " - f"org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id}" + f"org_id={auth_context.organization.id}, " + f"project_id={auth_context.project.id}" ) success, message = delete_dataset_crud( session=_session, dataset_id=dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) if not success: @@ -476,6 +474,8 @@ async def delete_dataset( @router.post("/evaluations", response_model=EvaluationRunPublic) async def evaluate( + _session: SessionDep, + auth_context: AuthContextDep, dataset_id: int = Body(..., description="ID of the evaluation dataset"), experiment_name: str = Body( ..., description="Name for this evaluation experiment/run" @@ -485,8 +485,6 @@ async def evaluate( | None = Body( None, description="Optional assistant ID to fetch configuration from" ), - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), ) -> EvaluationRunPublic: """ Start an evaluation using OpenAI Batch API. 
@@ -551,7 +549,7 @@ async def evaluate( logger.info( f"Starting evaluation: experiment_name={experiment_name}, " f"dataset_id={dataset_id}, " - f"org_id={_current_user.organization_id}, " + f"org_id={auth_context.organization.id}, " f"assistant_id={assistant_id}, " f"config_keys={list(config.keys())}" ) @@ -560,8 +558,8 @@ async def evaluate( dataset = get_dataset_by_id( session=_session, dataset_id=dataset_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) if not dataset: @@ -582,14 +580,14 @@ async def evaluate( # Get credentials openai_credentials = get_provider_credential( session=_session, - org_id=_current_user.organization_id, - project_id=_current_user.project_id, + org_id=auth_context.organization.id, + project_id=auth_context.project.id, provider="openai", ) langfuse_credentials = get_provider_credential( session=_session, - org_id=_current_user.organization_id, - project_id=_current_user.project_id, + org_id=auth_context.organization.id, + project_id=auth_context.project.id, provider="langfuse", ) @@ -619,7 +617,7 @@ async def evaluate( assistant = get_assistant_by_id( session=_session, assistant_id=assistant_id, - project_id=_current_user.project_id, + project_id=auth_context.project.id, ) if not assistant: @@ -670,8 +668,8 @@ async def evaluate( dataset_name=dataset_name, dataset_id=dataset_id, config=config, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) # Start the batch evaluation @@ -703,8 +701,8 @@ async def evaluate( @router.get("/evaluations/list", response_model=list[EvaluationRunPublic]) async def list_evaluation_runs( - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), + _session: SessionDep, + auth_context: AuthContextDep, limit: 
int = 50, offset: int = 0, ) -> list[EvaluationRunPublic]: @@ -719,14 +717,14 @@ async def list_evaluation_runs( List of EvaluationRunPublic objects, ordered by most recent first """ logger.info( - f"Listing evaluation runs for org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id} (limit={limit}, offset={offset})" + f"Listing evaluation runs for org_id={auth_context.organization.id}, " + f"project_id={auth_context.project.id} (limit={limit}, offset={offset})" ) return list_evaluation_runs_crud( session=_session, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, limit=limit, offset=offset, ) @@ -735,8 +733,8 @@ async def list_evaluation_runs( @router.get("/evaluations/{evaluation_id}", response_model=EvaluationRunPublic) async def get_evaluation_run_status( evaluation_id: int, - _session: Session = Depends(get_db), - _current_user: UserProjectOrg = Depends(get_current_user_org_project), + _session: SessionDep, + auth_context: AuthContextDep, ) -> EvaluationRunPublic: """ Get the current status of a specific evaluation run. 
@@ -749,15 +747,15 @@ async def get_evaluation_run_status( """ logger.info( f"Fetching status for evaluation run {evaluation_id} " - f"(org_id={_current_user.organization_id}, " - f"project_id={_current_user.project_id})" + f"(org_id={auth_context.organization.id}, " + f"project_id={auth_context.project.id})" ) eval_run = get_evaluation_run_by_id( session=_session, evaluation_id=evaluation_id, - organization_id=_current_user.organization_id, - project_id=_current_user.project_id, + organization_id=auth_context.organization.id, + project_id=auth_context.project.id, ) if not eval_run: diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 66560c6fa..235aa50d8 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -2,10 +2,9 @@ import io import logging -from langfuse import Langfuse from sqlmodel import Session, select -from app.core.util import configure_langfuse, configure_openai, now +from app.core.util import configure_langfuse, now from app.crud.credentials import get_provider_credential from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import DatasetUploadResponse @@ -261,3 +260,53 @@ def get_evaluation_run_by_id( ) return eval_run + + +def update_evaluation_run( + session: Session, + eval_run: EvaluationRun, + status: str | None = None, + error_message: str | None = None, + object_store_url: str | None = None, + score: dict | None = None, + embedding_batch_job_id: int | None = None, +) -> EvaluationRun: + """ + Update an evaluation run with new values and persist to database. + + This helper function ensures consistency when updating evaluation runs + by always updating the timestamp and properly committing changes. 
+ + Args: + session: Database session + eval_run: EvaluationRun instance to update + status: New status value (optional) + error_message: New error message (optional) + object_store_url: New object store URL (optional) + score: New score dict (optional) + embedding_batch_job_id: New embedding batch job ID (optional) + + Returns: + Updated and refreshed EvaluationRun instance + """ + # Update provided fields + if status is not None: + eval_run.status = status + if error_message is not None: + eval_run.error_message = error_message + if object_store_url is not None: + eval_run.object_store_url = object_store_url + if score is not None: + eval_run.score = score + if embedding_batch_job_id is not None: + eval_run.embedding_batch_job_id = embedding_batch_job_id + + # Always update timestamp + eval_run.updated_at = now() + + # Persist to database + session.add(eval_run) + session.commit() + session.refresh(eval_run) + + return eval_run diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index d4da792d4..c21b99580 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -19,7 +19,7 @@ from sqlmodel import Session, select from app.core.batch.openai_provider import OpenAIBatchProvider -from app.core.util import configure_langfuse, configure_openai, now +from app.core.util import configure_langfuse, configure_openai from app.crud.batch_job import get_batch_job from app.crud.batch_operations import ( download_batch_results, @@ -27,6 +27,7 @@ ) from app.crud.credentials import get_provider_credential from app.crud.evaluations.batch import fetch_dataset_items +from app.crud.evaluations.core import update_evaluation_run from app.crud.evaluations.embeddings import ( calculate_average_similarity, parse_embedding_results, @@ -258,12 +259,12 @@ async def process_completed_evaluation( exc_info=True, ) # Don't fail the entire evaluation, just mark as completed without embeddings - 
eval_run.status = "completed" - eval_run.error_message = f"Embeddings failed: {str(e)}" - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + status="completed", + error_message=f"Embeddings failed: {str(e)}", + ) logger.info(f"{log_prefix} Processed evaluation: {len(results)} items") @@ -275,13 +276,12 @@ async def process_completed_evaluation( exc_info=True, ) # Mark as failed - eval_run.status = "failed" - eval_run.error_message = f"Processing failed: {str(e)}" - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) - return eval_run + return update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message=f"Processing failed: {str(e)}", + ) async def process_completed_embedding_batch( @@ -382,12 +382,11 @@ async def process_completed_embedding_batch( ) # Step 7: Mark evaluation as completed - eval_run.status = "completed" - eval_run.updated_at = now() - - session.add(eval_run) - session.commit() - session.refresh(eval_run) + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + status="completed", + ) logger.info( f"{log_prefix} Completed evaluation: " @@ -402,13 +401,12 @@ async def process_completed_embedding_batch( exc_info=True, ) # Mark as completed anyway, but with error message - eval_run.status = "completed" - eval_run.error_message = f"Embedding processing failed: {str(e)}" - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) - return eval_run + return update_evaluation_run( + session=session, + eval_run=eval_run, + status="completed", + error_message=f"Embedding processing failed: {str(e)}", + ) async def check_and_process_evaluation( @@ -491,14 +489,12 @@ async def check_and_process_evaluation( f"{embedding_batch_job.error_message}" ) # Mark as completed without embeddings - 
eval_run.status = "completed" - eval_run.error_message = ( - f"Embedding batch failed: {embedding_batch_job.error_message}" + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + status="completed", + error_message=f"Embedding batch failed: {embedding_batch_job.error_message}", ) - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) return { "run_id": eval_run.id, @@ -563,12 +559,12 @@ async def check_and_process_evaluation( # Mark evaluation as failed based on provider status error_msg = batch_job.error_message or f"Provider batch {provider_status}" - eval_run.status = "failed" - eval_run.error_message = error_msg - eval_run.updated_at = now() - session.add(eval_run) - session.commit() - session.refresh(eval_run) + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message=error_msg, + ) logger.error( f"{log_prefix} Batch {batch_job.provider_batch_id} failed: {error_msg}" @@ -599,11 +595,12 @@ async def check_and_process_evaluation( logger.error(f"{log_prefix} Error checking evaluation: {e}", exc_info=True) # Mark as failed - eval_run.status = "failed" - eval_run.error_message = f"Checking failed: {str(e)}" - eval_run.updated_at = now() - session.add(eval_run) - session.commit() + update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message=f"Checking failed: {str(e)}", + ) return { "run_id": eval_run.id, @@ -683,6 +680,14 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st ) # Mark all runs in this project as failed due to missing credentials for eval_run in project_runs: + # Persist failure status to database + update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message="Missing OpenAI or Langfuse credentials", + ) + all_results.append( { "run_id": eval_run.id, @@ -704,6 +709,14 @@ async def poll_all_pending_evaluations(session: Session, 
org_id: int) -> dict[st ) # Mark all runs in this project as failed due to client configuration for eval_run in project_runs: + # Persist failure status to database + update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message="Failed to configure API clients", + ) + all_results.append( { "run_id": eval_run.id, @@ -738,6 +751,14 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st f"Failed to check evaluation run {eval_run.id}: {e}", exc_info=True, ) + # Persist failure status to database + update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message=f"Check failed: {str(e)}", + ) + all_results.append( { "run_id": eval_run.id, @@ -752,6 +773,14 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st logger.error(f"Failed to process project {project_id}: {e}", exc_info=True) # Mark all runs in this project as failed for eval_run in project_runs: + # Persist failure status to database + update_evaluation_run( + session=session, + eval_run=eval_run, + status="failed", + error_message=f"Project processing failed: {str(e)}", + ) + all_results.append( { "run_id": eval_run.id, diff --git a/backend/app/models/batch_job.py b/backend/app/models/batch_job.py index 183dc8481..22477948c 100644 --- a/backend/app/models/batch_job.py +++ b/backend/app/models/batch_job.py @@ -64,10 +64,11 @@ class BatchJob(SQLModel, table=True): # Timestamps inserted_at: datetime = Field( - default_factory=now, description="The timestamp when the document was inserted" + default_factory=now, description="The timestamp when the batch job was started" ) updated_at: datetime = Field( - default_factory=now, description="The timestamp when the document was inserted" + default_factory=now, + description="The timestamp when the batch job was last updated", ) # Relationships diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 
a7b6f08af..883f664a5 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -204,10 +204,12 @@ class EvaluationRun(SQLModel, table=True): # Timestamps inserted_at: datetime = Field( - default_factory=now, description="The timestamp when the document was inserted" + default_factory=now, + description="The timestamp when the evaluation run was started", ) updated_at: datetime = Field( - default_factory=now, description="The timestamp when the document was inserted" + default_factory=now, + description="The timestamp when the evaluation run was last updated", ) # Relationships From 6f19f05822e55c91ce932df3315afb0ba050db52 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 19:59:33 +0530 Subject: [PATCH 52/64] cleanup logs --- backend/app/api/routes/cron.py | 1 - backend/app/api/routes/evaluation.py | 81 +++++++++++--------- backend/app/core/batch/openai_provider.py | 73 ++++++++---------- backend/app/core/batch/provider_interface.py | 16 ---- backend/app/crud/batch_job.py | 44 +++++++---- backend/app/crud/batch_operations.py | 55 ++++++++----- backend/app/crud/evaluations/batch.py | 24 ++++-- backend/app/crud/evaluations/dataset.py | 42 +++++----- backend/app/crud/evaluations/langfuse.py | 54 ++++++++----- backend/app/crud/evaluations/processing.py | 81 ++++++++++++-------- 10 files changed, 261 insertions(+), 210 deletions(-) diff --git a/backend/app/api/routes/cron.py b/backend/app/api/routes/cron.py index 1a13c6f76..f4426c8f1 100644 --- a/backend/app/api/routes/cron.py +++ b/backend/app/api/routes/cron.py @@ -5,7 +5,6 @@ from app.api.deps import get_current_active_superuser, get_db from app.crud.evaluations import process_all_pending_evaluations_sync -from app.models import User logger = logging.getLogger(__name__) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 6582777c8..908c54179 100644 --- a/backend/app/api/routes/evaluation.py +++ 
b/backend/app/api/routes/evaluation.py @@ -154,11 +154,13 @@ async def upload_dataset( raise HTTPException(status_code=422, detail=f"Invalid dataset name: {str(e)}") if original_name != dataset_name: - logger.info(f"Dataset name sanitized: '{original_name}' -> '{dataset_name}'") + logger.info( + f"[upload_dataset] Dataset name sanitized | '{original_name}' -> '{dataset_name}'" + ) logger.info( - f"Uploading dataset: {dataset_name} with duplication factor: " - f"{duplication_factor}, org_id={auth_context.organization.id}, " + f"[upload_dataset] Uploading dataset | dataset={dataset_name} | " + f"duplication_factor={duplication_factor} | org_id={auth_context.organization.id} | " f"project_id={auth_context.project.id}" ) @@ -229,12 +231,12 @@ async def upload_dataset( total_items_count = original_items_count * duplication_factor logger.info( - f"Parsed {original_items_count} items from CSV, " - f"will create {total_items_count} total items with duplication" + f"[upload_dataset] Parsed items from CSV | original={original_items_count} | " + f"total_with_duplication={total_items_count}" ) except Exception as e: - logger.error(f"Failed to parse CSV: {e}", exc_info=True) + logger.error(f"[upload_dataset] Failed to parse CSV | {e}", exc_info=True) raise HTTPException(status_code=422, detail=f"Invalid CSV file: {e}") # Step 2: Upload to object store (if credentials configured) @@ -248,15 +250,15 @@ async def upload_dataset( ) if object_store_url: logger.info( - f"Successfully uploaded CSV to object store: {object_store_url}" + f"[upload_dataset] Successfully uploaded CSV to object store | {object_store_url}" ) else: logger.info( - "Object store upload returned None, continuing without object store storage" + "[upload_dataset] Object store upload returned None | continuing without object store storage" ) except Exception as e: logger.warning( - f"Failed to upload CSV to object store (continuing without object store): {e}", + f"[upload_dataset] Failed to upload CSV to object 
store (continuing without object store) | {e}", exc_info=True, ) object_store_url = None @@ -291,12 +293,15 @@ async def upload_dataset( ) logger.info( - f"Successfully uploaded dataset to Langfuse: {dataset_name} " - f"(id={langfuse_dataset_id})" + f"[upload_dataset] Successfully uploaded dataset to Langfuse | " + f"dataset={dataset_name} | id={langfuse_dataset_id}" ) except Exception as e: - logger.error(f"Failed to upload dataset to Langfuse: {e}", exc_info=True) + logger.error( + f"[upload_dataset] Failed to upload dataset to Langfuse | {e}", + exc_info=True, + ) raise HTTPException( status_code=500, detail=f"Failed to upload dataset to Langfuse: {e}" ) @@ -320,8 +325,8 @@ async def upload_dataset( ) logger.info( - f"Successfully created dataset record in database: id={dataset.id}, " - f"name={dataset_name}" + f"[upload_dataset] Successfully created dataset record in database | " + f"id={dataset.id} | name={dataset_name}" ) # Return response @@ -401,8 +406,8 @@ async def get_dataset( DatasetUploadResponse with dataset details """ logger.info( - f"Fetching dataset: id={dataset_id}, " - f"org_id={auth_context.organization.id}, " + f"[get_dataset] Fetching dataset | id={dataset_id} | " + f"org_id={auth_context.organization.id} | " f"project_id={auth_context.project.id}" ) @@ -449,8 +454,8 @@ async def delete_dataset( Success message with deleted dataset details """ logger.info( - f"Deleting dataset: id={dataset_id}, " - f"org_id={auth_context.organization.id}, " + f"[delete_dataset] Deleting dataset | id={dataset_id} | " + f"org_id={auth_context.organization.id} | " f"project_id={auth_context.project.id}" ) @@ -468,7 +473,7 @@ async def delete_dataset( else: raise HTTPException(status_code=400, detail=message) - logger.info(f"Successfully deleted dataset: id={dataset_id}") + logger.info(f"[delete_dataset] Successfully deleted dataset | id={dataset_id}") return {"message": message, "dataset_id": dataset_id} @@ -547,10 +552,10 @@ async def evaluate( 
EvaluationRunPublic with batch details and status """ logger.info( - f"Starting evaluation: experiment_name={experiment_name}, " - f"dataset_id={dataset_id}, " - f"org_id={auth_context.organization.id}, " - f"assistant_id={assistant_id}, " + f"[evaluate] Starting evaluation | experiment_name={experiment_name} | " + f"dataset_id={dataset_id} | " + f"org_id={auth_context.organization.id} | " + f"assistant_id={assistant_id} | " f"config_keys={list(config.keys())}" ) @@ -570,8 +575,8 @@ async def evaluate( ) logger.info( - f"Found dataset: id={dataset.id}, name={dataset.name}, " - f"object_store_url={'present' if dataset.object_store_url else 'None'}, " + f"[evaluate] Found dataset | id={dataset.id} | name={dataset.name} | " + f"object_store_url={'present' if dataset.object_store_url else 'None'} | " f"langfuse_id={dataset.langfuse_dataset_id}" ) @@ -626,8 +631,8 @@ async def evaluate( ) logger.info( - f"Found assistant in DB: id={assistant.id}, " - f"model={assistant.model}, instructions=" + f"[evaluate] Found assistant in DB | id={assistant.id} | " + f"model={assistant.model} | instructions=" f"{assistant.instructions[:50] if assistant.instructions else 'None'}..." 
) @@ -651,9 +656,9 @@ async def evaluate( } ] - logger.info("Using config from assistant") + logger.info("[evaluate] Using config from assistant") else: - logger.info("Using provided config directly") + logger.info("[evaluate] Using provided config directly") # Validate that config has minimum required fields if not config.get("model"): raise HTTPException( @@ -683,15 +688,15 @@ async def evaluate( ) logger.info( - f"Evaluation started successfully: " - f"batch_job_id={eval_run.batch_job_id}, total_items={eval_run.total_items}" + f"[evaluate] Evaluation started successfully | " + f"batch_job_id={eval_run.batch_job_id} | total_items={eval_run.total_items}" ) return eval_run except Exception as e: logger.error( - f"Failed to start evaluation for run {eval_run.id}: {e}", + f"[evaluate] Failed to start evaluation | run_id={eval_run.id} | {e}", exc_info=True, ) # Error is already handled in start_evaluation_batch @@ -717,8 +722,9 @@ async def list_evaluation_runs( List of EvaluationRunPublic objects, ordered by most recent first """ logger.info( - f"Listing evaluation runs for org_id={auth_context.organization.id}, " - f"project_id={auth_context.project.id} (limit={limit}, offset={offset})" + f"[list_evaluation_runs] Listing evaluation runs | " + f"org_id={auth_context.organization.id} | " + f"project_id={auth_context.project.id} | limit={limit} | offset={offset}" ) return list_evaluation_runs_crud( @@ -746,9 +752,10 @@ async def get_evaluation_run_status( EvaluationRunPublic with current status and results if completed """ logger.info( - f"Fetching status for evaluation run {evaluation_id} " - f"(org_id={auth_context.organization.id}, " - f"project_id={auth_context.project.id})" + f"[get_evaluation_run_status] Fetching status for evaluation run | " + f"evaluation_id={evaluation_id} | " + f"org_id={auth_context.organization.id} | " + f"project_id={auth_context.project.id}" ) eval_run = get_evaluation_run_by_id( diff --git a/backend/app/core/batch/openai_provider.py 
b/backend/app/core/batch/openai_provider.py index 3e17fd696..eadaa6d70 100644 --- a/backend/app/core/batch/openai_provider.py +++ b/backend/app/core/batch/openai_provider.py @@ -51,7 +51,7 @@ def create_batch( completion_window = config.get("completion_window", "24h") logger.info( - f"Creating OpenAI batch with {len(jsonl_data)} items for endpoint {endpoint}" + f"[create_batch] Creating OpenAI batch | items={len(jsonl_data)} | endpoint={endpoint}" ) try: @@ -77,13 +77,13 @@ def create_batch( } logger.info( - f"Created OpenAI batch: {batch.id} (status={batch.status}, {len(jsonl_data)} items)" + f"[create_batch] Created OpenAI batch | batch_id={batch.id} | status={batch.status} | items={len(jsonl_data)}" ) return result except Exception as e: - logger.error(f"Failed to create OpenAI batch: {e}") + logger.error(f"[create_batch] Failed to create OpenAI batch | {e}") raise def get_batch_status(self, batch_id: str) -> dict[str, Any]: @@ -103,7 +103,9 @@ def get_batch_status(self, batch_id: str) -> dict[str, Any]: Raises: Exception: If status check fails """ - logger.info(f"Polling OpenAI batch status: {batch_id}") + logger.info( + f"[get_batch_status] Polling OpenAI batch status | batch_id={batch_id}" + ) try: batch = self.client.batches.retrieve(batch_id) @@ -127,14 +129,15 @@ def get_batch_status(self, batch_id: str) -> dict[str, Any]: result["error_message"] = error_msg logger.info( - f"OpenAI batch {batch_id} status: {batch.status} " - f"({batch.request_counts.completed}/{batch.request_counts.total} completed)" + f"[get_batch_status] OpenAI batch status | batch_id={batch_id} | status={batch.status} | completed={batch.request_counts.completed}/{batch.request_counts.total}" ) return result except Exception as e: - logger.error(f"Failed to poll OpenAI batch status for {batch_id}: {e}") + logger.error( + f"[get_batch_status] Failed to poll OpenAI batch status | batch_id={batch_id} | {e}" + ) raise def download_batch_results(self, output_file_id: str) -> list[dict[str, 
Any]]: @@ -153,7 +156,9 @@ def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]: Raises: Exception: If download or parsing fails """ - logger.info(f"Downloading OpenAI batch results: {output_file_id}") + logger.info( + f"[download_batch_results] Downloading OpenAI batch results | output_file_id={output_file_id}" + ) try: # Download file content @@ -168,43 +173,21 @@ def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]: result = json.loads(line) results.append(result) except json.JSONDecodeError as e: - logger.error(f"Line {line_num}: Failed to parse JSON: {e}") + logger.error( + f"[download_batch_results] Failed to parse JSON | line={line_num} | {e}" + ) continue logger.info( - f"Downloaded and parsed {len(results)} results from OpenAI batch output" + f"[download_batch_results] Downloaded and parsed results from OpenAI batch output | results={len(results)}" ) return results except Exception as e: - logger.error(f"Failed to download OpenAI batch results: {e}") - raise - - def cancel_batch(self, batch_id: str) -> bool: - """ - Cancel a running OpenAI batch job. 
- - Args: - batch_id: OpenAI batch ID - - Returns: - True if cancellation was successful or batch was already terminal - - Raises: - Exception: If cancellation fails - """ - logger.info(f"Cancelling OpenAI batch: {batch_id}") - - try: - batch = self.client.batches.cancel(batch_id) - - logger.info(f"OpenAI batch {batch_id} cancelled (status={batch.status})") - - return True - - except Exception as e: - logger.error(f"Failed to cancel OpenAI batch {batch_id}: {e}") + logger.error( + f"[download_batch_results] Failed to download OpenAI batch results | {e}" + ) raise def upload_file(self, content: str, purpose: str = "batch") -> str: @@ -221,7 +204,7 @@ def upload_file(self, content: str, purpose: str = "batch") -> str: Raises: Exception: If upload fails """ - logger.info(f"Uploading file to OpenAI ({len(content)} bytes)") + logger.info(f"[upload_file] Uploading file to OpenAI | bytes={len(content)}") try: file_response = self.client.files.create( @@ -229,12 +212,14 @@ def upload_file(self, content: str, purpose: str = "batch") -> str: purpose=purpose, ) - logger.info(f"Uploaded file to OpenAI: {file_response.id}") + logger.info( + f"[upload_file] Uploaded file to OpenAI | file_id={file_response.id}" + ) return file_response.id except Exception as e: - logger.error(f"Failed to upload file to OpenAI: {e}") + logger.error(f"[upload_file] Failed to upload file to OpenAI | {e}") raise def download_file(self, file_id: str) -> str: @@ -250,18 +235,20 @@ def download_file(self, file_id: str) -> str: Raises: Exception: If download fails """ - logger.info(f"Downloading file from OpenAI: {file_id}") + logger.info(f"[download_file] Downloading file from OpenAI | file_id={file_id}") try: file_content = self.client.files.content(file_id) content = file_content.read().decode("utf-8") logger.info( - f"Downloaded file from OpenAI: {file_id} ({len(content)} bytes)" + f"[download_file] Downloaded file from OpenAI | file_id={file_id} | bytes={len(content)}" ) return content except 
Exception as e: - logger.error(f"Failed to download file from OpenAI {file_id}: {e}") + logger.error( + f"[download_file] Failed to download file from OpenAI | file_id={file_id} | {e}" + ) raise diff --git a/backend/app/core/batch/provider_interface.py b/backend/app/core/batch/provider_interface.py index 953a57225..94e316e21 100644 --- a/backend/app/core/batch/provider_interface.py +++ b/backend/app/core/batch/provider_interface.py @@ -71,22 +71,6 @@ def download_batch_results(self, output_file_id: str) -> list[dict[str, Any]]: """ pass - @abstractmethod - def cancel_batch(self, batch_id: str) -> bool: - """ - Cancel a running batch job. - - Args: - batch_id: Provider's batch job ID - - Returns: - True if cancellation was successful or batch was already terminal - - Raises: - Exception: If cancellation fails - """ - pass - @abstractmethod def upload_file(self, content: str, purpose: str = "batch") -> str: """ diff --git a/backend/app/crud/batch_job.py b/backend/app/crud/batch_job.py index 186ff9474..c121a90cd 100644 --- a/backend/app/crud/batch_job.py +++ b/backend/app/crud/batch_job.py @@ -28,9 +28,10 @@ def create_batch_job( Exception: If creation fails """ logger.info( - f"Creating batch job: provider={batch_job_create.provider}, " - f"job_type={batch_job_create.job_type}, " - f"org_id={batch_job_create.organization_id}, " + f"[create_batch_job] Creating batch job | " + f"provider={batch_job_create.provider} | " + f"job_type={batch_job_create.job_type} | " + f"org_id={batch_job_create.organization_id} | " f"project_id={batch_job_create.project_id}" ) @@ -43,12 +44,14 @@ def create_batch_job( session.commit() session.refresh(batch_job) - logger.info(f"Created batch job: id={batch_job.id}") + logger.info(f"[create_batch_job] Created batch job | id={batch_job.id}") return batch_job except Exception as e: - logger.error(f"Failed to create batch job: {e}", exc_info=True) + logger.error( + f"[create_batch_job] Failed to create batch job | {e}", exc_info=True + ) 
session.rollback() raise @@ -89,7 +92,7 @@ def update_batch_job( Raises: Exception: If update fails """ - logger.info(f"Updating batch job: id={batch_job.id}") + logger.info(f"[update_batch_job] Updating batch job | id={batch_job.id}") try: # Update fields if provided @@ -104,12 +107,15 @@ def update_batch_job( session.commit() session.refresh(batch_job) - logger.info(f"Updated batch job: id={batch_job.id}") + logger.info(f"[update_batch_job] Updated batch job | id={batch_job.id}") return batch_job except Exception as e: - logger.error(f"Failed to update batch job {batch_job.id}: {e}", exc_info=True) + logger.error( + f"[update_batch_job] Failed to update batch job | id={batch_job.id} | {e}", + exc_info=True, + ) session.rollback() raise @@ -136,7 +142,9 @@ def get_batch_jobs_by_ids( statement = select(BatchJob).where(BatchJob.id.in_(batch_job_ids)) results = session.exec(statement).all() - logger.info(f"Found {len(results)} batch jobs for {len(batch_job_ids)} IDs") + logger.info( + f"[get_batch_jobs_by_ids] Found batch jobs | found={len(results)} | requested={len(batch_job_ids)}" + ) return list(results) @@ -175,9 +183,12 @@ def get_batches_by_type( results = session.exec(statement).all() logger.info( - f"Found {len(results)} batch jobs " - f"(job_type={job_type}, org_id={organization_id}, " - f"project_id={project_id}, provider_status={provider_status})" + f"[get_batches_by_type] Found batch jobs | " + f"count={len(results)} | " + f"job_type={job_type} | " + f"org_id={organization_id} | " + f"project_id={project_id} | " + f"provider_status={provider_status}" ) return list(results) @@ -194,15 +205,18 @@ def delete_batch_job(session: Session, batch_job: BatchJob) -> None: Raises: Exception: If deletion fails """ - logger.info(f"Deleting batch job: id={batch_job.id}") + logger.info(f"[delete_batch_job] Deleting batch job | id={batch_job.id}") try: session.delete(batch_job) session.commit() - logger.info(f"Deleted batch job: id={batch_job.id}") + 
logger.info(f"[delete_batch_job] Deleted batch job | id={batch_job.id}") except Exception as e: - logger.error(f"Failed to delete batch job {batch_job.id}: {e}", exc_info=True) + logger.error( + f"[delete_batch_job] Failed to delete batch job | id={batch_job.id} | {e}", + exc_info=True, + ) session.rollback() raise diff --git a/backend/app/crud/batch_operations.py b/backend/app/crud/batch_operations.py index b0806efaf..11aa74543 100644 --- a/backend/app/crud/batch_operations.py +++ b/backend/app/crud/batch_operations.py @@ -52,8 +52,8 @@ def start_batch_job( Exception: If batch creation fails """ logger.info( - f"Starting {provider_name} batch job: job_type={job_type}, " - f"org_id={organization_id}, project_id={project_id}, " + f"[start_batch_job] Starting batch job | provider={provider_name} | " + f"job_type={job_type} | org_id={organization_id} | project_id={project_id} | " f"items={len(jsonl_data)}" ) @@ -71,7 +71,9 @@ def start_batch_job( try: # Step 2: Call provider to create batch - logger.info(f"Creating batch with {provider_name} provider...") + logger.info( + f"[start_batch_job] Creating batch with provider | provider={provider_name}" + ) batch_result = provider.create_batch(jsonl_data=jsonl_data, config=config) # Step 3: Update batch_job with provider IDs @@ -87,14 +89,16 @@ def start_batch_job( ) logger.info( - f"Successfully started batch job: id={batch_job.id}, " + f"[start_batch_job] Successfully started batch job | id={batch_job.id} | " f"provider_batch_id={batch_job.provider_batch_id}" ) return batch_job except Exception as e: - logger.error(f"Failed to start batch job: {e}", exc_info=True) + logger.error( + f"[start_batch_job] Failed to start batch job | {e}", exc_info=True + ) # Store error in batch_job (parent table will handle status) batch_job_update = BatchJobUpdate( @@ -125,7 +129,7 @@ def poll_batch_status( Exception: If polling fails """ logger.info( - f"Polling batch status: id={batch_job.id}, " + f"[poll_batch_status] Polling batch 
status | id={batch_job.id} | " f"provider_batch_id={batch_job.provider_batch_id}" ) @@ -154,14 +158,16 @@ def poll_batch_status( ) logger.info( - f"Updated batch_job {batch_job.id} status: " + f"[poll_batch_status] Updated batch_job status | id={batch_job.id} | " f"{batch_job.provider_status} -> {provider_status}" ) return status_result except Exception as e: - logger.error(f"Failed to poll batch status: {e}", exc_info=True) + logger.error( + f"[poll_batch_status] Failed to poll batch status | {e}", exc_info=True + ) raise @@ -188,19 +194,24 @@ def download_batch_results( ) logger.info( - f"Downloading batch results: id={batch_job.id}, " + f"[download_batch_results] Downloading batch results | id={batch_job.id} | " f"output_file_id={batch_job.provider_output_file_id}" ) try: results = provider.download_batch_results(batch_job.provider_output_file_id) - logger.info(f"Downloaded {len(results)} results for batch job {batch_job.id}") + logger.info( + f"[download_batch_results] Downloaded results | batch_job_id={batch_job.id} | results={len(results)}" + ) return results except Exception as e: - logger.error(f"Failed to download batch results: {e}", exc_info=True) + logger.error( + f"[download_batch_results] Failed to download batch results | {e}", + exc_info=True, + ) raise @@ -227,7 +238,9 @@ def process_completed_batch( Raises: Exception: If processing fails """ - logger.info(f"Processing completed batch: id={batch_job.id}") + logger.info( + f"[process_completed_batch] Processing completed batch | id={batch_job.id}" + ) try: # Download results @@ -241,12 +254,12 @@ def process_completed_batch( session=session, batch_job=batch_job, results=results ) logger.info( - f"Uploaded batch results to object store: {object_store_url}" + f"[process_completed_batch] Uploaded batch results to object store | {object_store_url}" ) except Exception as store_error: logger.warning( - f"Object store upload failed (credentials may not be configured): {store_error}. 
" - f"Continuing without object store storage.", + f"[process_completed_batch] Object store upload failed (credentials may not be configured) | " + f"{store_error} | Continuing without object store storage", exc_info=True, ) @@ -260,7 +273,10 @@ def process_completed_batch( return results, object_store_url except Exception as e: - logger.error(f"Failed to process completed batch: {e}", exc_info=True) + logger.error( + f"[process_completed_batch] Failed to process completed batch | {e}", + exc_info=True, + ) raise @@ -283,7 +299,9 @@ def upload_batch_results_to_object_store( Raises: Exception: If upload fails """ - logger.info(f"Uploading batch results to object store for batch_job {batch_job.id}") + logger.info( + f"[upload_batch_results_to_object_store] Uploading batch results to object store | batch_job_id={batch_job.id}" + ) try: # Get cloud storage instance @@ -306,7 +324,8 @@ def upload_batch_results_to_object_store( except Exception as e: logger.error( - f"Failed to upload batch results to object store: {e}", exc_info=True + f"[upload_batch_results_to_object_store] Failed to upload batch results to object store | {e}", + exc_info=True, ) raise diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 67fd1f7c2..8f3ef092d 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -38,7 +38,9 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, try: dataset = langfuse.get_dataset(dataset_name) except Exception as e: - logger.error(f"Failed to fetch dataset '{dataset_name}': {e}") + logger.error( + f"[fetch_dataset_items] Failed to fetch dataset | dataset={dataset_name} | {e}" + ) raise ValueError(f"Dataset '{dataset_name}' not found: {e}") if not dataset.items: @@ -92,7 +94,9 @@ def build_evaluation_jsonl( # Extract question from input question = item["input"].get("question", "") if not question: - logger.warning(f"Skipping item {item['id']} - no 
question found") + logger.warning( + f"[build_evaluation_jsonl] Skipping item - no question found | item_id={item['id']}" + ) continue # Build the batch request object for Responses API @@ -139,7 +143,9 @@ def start_evaluation_batch( """ try: # Step 1: Fetch dataset items from Langfuse - logger.info(f"Starting evaluation batch for run '{eval_run.run_name}'") + logger.info( + f"[start_evaluation_batch] Starting evaluation batch | run={eval_run.run_name}" + ) dataset_items = fetch_dataset_items( langfuse=langfuse, dataset_name=eval_run.dataset_name ) @@ -186,15 +192,19 @@ def start_evaluation_batch( session.refresh(eval_run) logger.info( - f"Successfully started evaluation batch: batch_job_id={batch_job.id}, " - f"provider_batch_id={batch_job.provider_batch_id} " - f"for run '{eval_run.run_name}' with {batch_job.total_items} items" + f"[start_evaluation_batch] Successfully started evaluation batch | " + f"batch_job_id={batch_job.id} | " + f"provider_batch_id={batch_job.provider_batch_id} | " + f"run={eval_run.run_name} | items={batch_job.total_items}" ) return eval_run except Exception as e: - logger.error(f"Failed to start evaluation batch: {e}", exc_info=True) + logger.error( + f"[start_evaluation_batch] Failed to start evaluation batch | {e}", + exc_info=True, + ) eval_run.status = "failed" eval_run.error_message = str(e) session.add(eval_run) diff --git a/backend/app/crud/evaluations/dataset.py b/backend/app/crud/evaluations/dataset.py index 1645fdfc3..7efa03d46 100644 --- a/backend/app/crud/evaluations/dataset.py +++ b/backend/app/crud/evaluations/dataset.py @@ -76,8 +76,7 @@ def create_evaluation_dataset( session.refresh(dataset) logger.info( - f"Created evaluation dataset: id={dataset.id}, name={name}, " - f"org_id={organization_id}, project_id={project_id}" + f"[create_evaluation_dataset] Created evaluation dataset | id={dataset.id} | name={name} | org_id={organization_id} | project_id={project_id}" ) return dataset @@ -85,7 +84,7 @@ def 
create_evaluation_dataset( except IntegrityError as e: session.rollback() logger.error( - f"Database integrity error creating dataset '{name}': {e}", + f"[create_evaluation_dataset] Database integrity error creating dataset | name={name} | {e}", exc_info=True, ) raise HTTPException( @@ -97,7 +96,7 @@ def create_evaluation_dataset( except Exception as e: session.rollback() logger.error( - f"Failed to create dataset record in database: {e}", + f"[create_evaluation_dataset] Failed to create dataset record in database | {e}", exc_info=True, ) raise HTTPException( @@ -131,13 +130,11 @@ def get_dataset_by_id( if dataset: logger.info( - f"Found dataset: id={dataset_id}, name={dataset.name}, " - f"org_id={organization_id}, project_id={project_id}" + f"[get_dataset_by_id] Found dataset | id={dataset_id} | name={dataset.name} | org_id={organization_id} | project_id={project_id}" ) else: logger.warning( - f"Dataset not found or not accessible: id={dataset_id}, " - f"org_id={organization_id}, project_id={project_id}" + f"[get_dataset_by_id] Dataset not found or not accessible | id={dataset_id} | org_id={organization_id} | project_id={project_id}" ) return dataset @@ -169,8 +166,7 @@ def get_dataset_by_name( if dataset: logger.info( - f"Found dataset by name: name={name}, id={dataset.id}, " - f"org_id={organization_id}, project_id={project_id}" + f"[get_dataset_by_name] Found dataset by name | name={name} | id={dataset.id} | org_id={organization_id} | project_id={project_id}" ) return dataset @@ -208,8 +204,7 @@ def list_datasets( datasets = session.exec(statement).all() logger.info( - f"Listed {len(datasets)} datasets for org_id={organization_id}, " - f"project_id={project_id} (limit={limit}, offset={offset})" + f"[list_datasets] Listed datasets | count={len(datasets)} | org_id={organization_id} | project_id={project_id} | limit={limit} | offset={offset}" ) return list(datasets) @@ -275,16 +270,18 @@ def download_csv_from_object_store( raise ValueError("object_store_url cannot 
be None or empty") try: - logger.info(f"Downloading CSV from object store: {object_store_url}") + logger.info( + f"[download_csv_from_object_store] Downloading CSV from object store | {object_store_url}" + ) body = storage.stream(object_store_url) csv_content = body.read() logger.info( - f"Successfully downloaded CSV from object store: {len(csv_content)} bytes" + f"[download_csv_from_object_store] Successfully downloaded CSV from object store | bytes={len(csv_content)}" ) return csv_content except Exception as e: logger.error( - f"Failed to download CSV from object store: {object_store_url}: {e}", + f"[download_csv_from_object_store] Failed to download CSV from object store | {object_store_url} | {e}", exc_info=True, ) raise @@ -315,11 +312,12 @@ def update_dataset_langfuse_id( session.add(dataset) session.commit() logger.info( - f"Updated langfuse_dataset_id for dataset {dataset_id}: " - f"{langfuse_dataset_id}" + f"[update_dataset_langfuse_id] Updated langfuse_dataset_id | dataset_id={dataset_id} | langfuse_dataset_id={langfuse_dataset_id}" ) else: - logger.warning(f"Dataset {dataset_id} not found for langfuse_id update") + logger.warning( + f"[update_dataset_langfuse_id] Dataset not found for langfuse_id update | dataset_id={dataset_id}" + ) def delete_dataset( @@ -372,8 +370,7 @@ def delete_dataset( session.commit() logger.info( - f"Deleted dataset: id={dataset_id}, name={dataset.name}, " - f"org_id={organization_id}, project_id={project_id}" + f"[delete_dataset] Deleted dataset | id={dataset_id} | name={dataset.name} | org_id={organization_id} | project_id={project_id}" ) return ( @@ -383,5 +380,8 @@ def delete_dataset( except Exception as e: session.rollback() - logger.error(f"Failed to delete dataset {dataset_id}: {e}", exc_info=True) + logger.error( + f"[delete_dataset] Failed to delete dataset | dataset_id={dataset_id} | {e}", + exc_info=True, + ) return (False, f"Failed to delete dataset: {e}") diff --git a/backend/app/crud/evaluations/langfuse.py 
b/backend/app/crud/evaluations/langfuse.py index 6c91d4252..feddd9817 100644 --- a/backend/app/crud/evaluations/langfuse.py +++ b/backend/app/crud/evaluations/langfuse.py @@ -52,8 +52,8 @@ def create_langfuse_dataset_run( Exception: If Langfuse operations fail """ logger.info( - f"Creating Langfuse dataset run '{run_name}' for dataset '{dataset_name}' " - f"with {len(results)} items" + f"[create_langfuse_dataset_run] Creating Langfuse dataset run | " + f"run_name={run_name} | dataset={dataset_name} | items={len(results)}" ) try: @@ -72,7 +72,10 @@ def create_langfuse_dataset_run( dataset_item = dataset_items_map.get(item_id) if not dataset_item: - logger.warning(f"Dataset item '{item_id}' not found, skipping") + logger.warning( + f"[create_langfuse_dataset_run] Dataset item not found, skipping | " + f"item_id={item_id}" + ) continue try: @@ -90,21 +93,25 @@ def create_langfuse_dataset_run( except Exception as e: logger.error( - f"Failed to create trace for item {item_id}: {e}", exc_info=True + f"[create_langfuse_dataset_run] Failed to create trace | " + f"item_id={item_id} | {e}", + exc_info=True, ) continue langfuse.flush() logger.info( - f"Created Langfuse dataset run '{run_name}' with " - f"{len(trace_id_mapping)} traces" + f"[create_langfuse_dataset_run] Created Langfuse dataset run | " + f"run_name={run_name} | traces={len(trace_id_mapping)}" ) return trace_id_mapping except Exception as e: logger.error( - f"Failed to create Langfuse dataset run '{run_name}': {e}", exc_info=True + f"[create_langfuse_dataset_run] Failed to create Langfuse dataset run | " + f"run_name={run_name} | {e}", + exc_info=True, ) raise @@ -140,7 +147,9 @@ def update_traces_with_cosine_scores( cosine_score = score_item.get("cosine_similarity") if not trace_id: - logger.warning("Score item missing trace_id, skipping") + logger.warning( + "[update_traces_with_cosine_scores] Score item missing trace_id, skipping" + ) continue try: @@ -155,7 +164,8 @@ def update_traces_with_cosine_scores( ) 
except Exception as e: logger.error( - f"Failed to add score for trace {trace_id}: {e}", + f"[update_traces_with_cosine_scores] Failed to add score | " + f"trace_id={trace_id} | {e}", exc_info=True, ) @@ -191,8 +201,8 @@ def upload_dataset_to_langfuse_from_csv( import io logger.info( - f"Uploading dataset '{dataset_name}' to Langfuse from CSV " - f"(duplication_factor={duplication_factor})" + f"[upload_dataset_to_langfuse_from_csv] Uploading dataset to Langfuse from CSV | " + f"dataset={dataset_name} | duplication_factor={duplication_factor}" ) try: @@ -218,7 +228,9 @@ def upload_dataset_to_langfuse_from_csv( answer = row.get("answer", "").strip() if not question or not answer: - logger.warning(f"Skipping row with empty question or answer: {row}") + logger.warning( + f"[upload_dataset_to_langfuse_from_csv] Skipping row with empty question or answer | {row}" + ) continue original_items.append({"question": question, "answer": answer}) @@ -227,9 +239,9 @@ def upload_dataset_to_langfuse_from_csv( raise ValueError("No valid items found in CSV file") logger.info( - f"Parsed {len(original_items)} items from CSV. " - f"Will duplicate {duplication_factor}x for a total of " - f"{len(original_items) * duplication_factor} items." + f"[upload_dataset_to_langfuse_from_csv] Parsed items from CSV | " + f"original={len(original_items)} | duplication_factor={duplication_factor} | " + f"total={len(original_items) * duplication_factor}" ) # Create or get dataset in Langfuse @@ -254,8 +266,8 @@ def upload_dataset_to_langfuse_from_csv( total_uploaded += 1 except Exception as e: logger.error( - f"Failed to upload item (duplicate {duplicate_num + 1}): " - f"{item['question'][:50]}... Error: {e}" + f"[upload_dataset_to_langfuse_from_csv] Failed to upload item | " + f"duplicate={duplicate_num + 1} | question={item['question'][:50]}... 
| {e}" ) # Flush to ensure all items are uploaded @@ -264,14 +276,16 @@ def upload_dataset_to_langfuse_from_csv( langfuse_dataset_id = dataset.id if hasattr(dataset, "id") else None logger.info( - f"Successfully uploaded {total_uploaded} items to Langfuse dataset " - f"'{dataset_name}' (id={langfuse_dataset_id})" + f"[upload_dataset_to_langfuse_from_csv] Successfully uploaded items to Langfuse dataset | " + f"items={total_uploaded} | dataset={dataset_name} | id={langfuse_dataset_id}" ) return langfuse_dataset_id, total_uploaded except Exception as e: logger.error( - f"Failed to upload dataset '{dataset_name}' to Langfuse: {e}", exc_info=True + f"[upload_dataset_to_langfuse_from_csv] Failed to upload dataset to Langfuse | " + f"dataset={dataset_name} | {e}", + exc_info=True, ) raise diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index c21b99580..f46e88e32 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -77,13 +77,17 @@ def parse_evaluation_output( # Extract custom_id (which is our dataset item ID) item_id = response.get("custom_id") if not item_id: - logger.warning(f"Line {line_num}: No custom_id found, skipping") + logger.warning( + f"[parse_evaluation_output] No custom_id found, skipping | line={line_num}" + ) continue # Get original dataset item dataset_item = dataset_map.get(item_id) if not dataset_item: - logger.warning(f"Line {line_num}: No dataset item found for {item_id}") + logger.warning( + f"[parse_evaluation_output] No dataset item found | line={line_num} | item_id={item_id}" + ) continue # Extract the response body @@ -92,7 +96,9 @@ def parse_evaluation_output( # Handle errors in batch processing if response.get("error"): error_msg = response["error"].get("message", "Unknown error") - logger.error(f"Item {item_id} had error: {error_msg}") + logger.error( + f"[parse_evaluation_output] Item had error | item_id={item_id} | {error_msg}" + ) 
generated_output = f"ERROR: {error_msg}" else: # Extract text from output (can be string, list, or complex structure) @@ -128,7 +134,7 @@ def parse_evaluation_output( # output was not a string and not a list generated_output = "" logger.warning( - f"Item {item_id}: Unexpected output type: {type(output)}" + f"[parse_evaluation_output] Unexpected output type | item_id={item_id} | type={type(output)}" ) # Extract question and ground truth from dataset item @@ -145,11 +151,13 @@ def parse_evaluation_output( ) except Exception as e: - logger.error(f"Line {line_num}: Unexpected error: {e}") + logger.error( + f"[parse_evaluation_output] Unexpected error | line={line_num} | {e}" + ) continue logger.info( - f"Parsed {len(results)} evaluation results from {len(raw_results)} output lines" + f"[parse_evaluation_output] Parsed evaluation results | results={len(results)} | output_lines={len(raw_results)}" ) return results @@ -182,7 +190,9 @@ async def process_completed_evaluation( Exception: If processing fails """ log_prefix = f"[org={eval_run.organization_id}][project={eval_run.project_id}][eval={eval_run.id}]" - logger.info(f"{log_prefix} Processing completed evaluation") + logger.info( + f"[process_completed_evaluation] {log_prefix} Processing completed evaluation" + ) try: # Step 1: Get batch_job @@ -197,7 +207,7 @@ async def process_completed_evaluation( # Step 2: Create provider and download results logger.info( - f"{log_prefix} Downloading batch results for batch_job {batch_job.id}" + f"[process_completed_evaluation] {log_prefix} Downloading batch results | batch_job_id={batch_job.id}" ) provider = OpenAIBatchProvider(client=openai_client) raw_results = download_batch_results(provider=provider, batch_job=batch_job) @@ -209,11 +219,13 @@ async def process_completed_evaluation( session=session, batch_job=batch_job, results=raw_results ) except Exception as store_error: - logger.warning(f"{log_prefix} Object store upload failed: {store_error}") + logger.warning( + 
f"[process_completed_evaluation] {log_prefix} Object store upload failed | {store_error}" + ) # Step 3: Fetch dataset items (needed for matching ground truth) logger.info( - f"{log_prefix} Fetching dataset items for '{eval_run.dataset_name}'" + f"[process_completed_evaluation] {log_prefix} Fetching dataset items | dataset={eval_run.dataset_name}" ) dataset_items = fetch_dataset_items( langfuse=langfuse, dataset_name=eval_run.dataset_name @@ -255,7 +267,7 @@ async def process_completed_evaluation( except Exception as e: logger.error( - f"{log_prefix} Failed to start embedding batch: {e}", + f"[process_completed_evaluation] {log_prefix} Failed to start embedding batch | {e}", exc_info=True, ) # Don't fail the entire evaluation, just mark as completed without embeddings @@ -266,13 +278,15 @@ async def process_completed_evaluation( error_message=f"Embeddings failed: {str(e)}", ) - logger.info(f"{log_prefix} Processed evaluation: {len(results)} items") + logger.info( + f"[process_completed_evaluation] {log_prefix} Processed evaluation | items={len(results)}" + ) return eval_run except Exception as e: logger.error( - f"{log_prefix} Failed to process completed evaluation: {e}", + f"[process_completed_evaluation] {log_prefix} Failed to process completed evaluation | {e}", exc_info=True, ) # Mark as failed @@ -315,7 +329,9 @@ async def process_completed_embedding_batch( Exception: If processing fails """ log_prefix = f"[org={eval_run.organization_id}][project={eval_run.project_id}][eval={eval_run.id}]" - logger.info(f"{log_prefix} Processing completed embedding batch") + logger.info( + f"[process_completed_embedding_batch] {log_prefix} Processing completed embedding batch" + ) try: # Step 1: Get embedding_batch_job @@ -365,7 +381,7 @@ async def process_completed_embedding_batch( # Step 6: Update Langfuse traces with cosine similarity scores logger.info( - f"{log_prefix} Updating Langfuse traces with cosine similarity scores" + f"[process_completed_embedding_batch] 
{log_prefix} Updating Langfuse traces with cosine similarity scores" ) per_item_scores = similarity_stats.get("per_item_scores", []) if per_item_scores: @@ -377,7 +393,7 @@ async def process_completed_embedding_batch( except Exception as e: # Log error but don't fail the evaluation logger.error( - f"{log_prefix} Failed to update Langfuse traces with scores: {e}", + f"[process_completed_embedding_batch] {log_prefix} Failed to update Langfuse traces with scores | {e}", exc_info=True, ) @@ -389,15 +405,14 @@ async def process_completed_embedding_batch( ) logger.info( - f"{log_prefix} Completed evaluation: " - f"avg_similarity={similarity_stats['cosine_similarity_avg']:.3f}" + f"[process_completed_embedding_batch] {log_prefix} Completed evaluation | avg_similarity={similarity_stats['cosine_similarity_avg']:.3f}" ) return eval_run except Exception as e: logger.error( - f"{log_prefix} Failed to process completed embedding batch: {e}", + f"[process_completed_embedding_batch] {log_prefix} Failed to process completed embedding batch | {e}", exc_info=True, ) # Mark as completed anyway, but with error message @@ -464,7 +479,7 @@ async def check_and_process_evaluation( if embedding_status == "completed": logger.info( - f"{log_prefix} Processing embedding batch {embedding_batch_job.provider_batch_id}" + f"[check_and_process_evaluation] {log_prefix} Processing embedding batch | provider_batch_id={embedding_batch_job.provider_batch_id}" ) await process_completed_embedding_batch( @@ -485,8 +500,7 @@ async def check_and_process_evaluation( elif embedding_status in ["failed", "expired", "cancelled"]: logger.error( - f"{log_prefix} Embedding batch {embedding_batch_job.provider_batch_id} failed: " - f"{embedding_batch_job.error_message}" + f"[check_and_process_evaluation] {log_prefix} Embedding batch failed | provider_batch_id={embedding_batch_job.provider_batch_id} | {embedding_batch_job.error_message}" ) # Mark as completed without embeddings eval_run = update_evaluation_run( @@ 
-567,7 +581,7 @@ async def check_and_process_evaluation( ) logger.error( - f"{log_prefix} Batch {batch_job.provider_batch_id} failed: {error_msg}" + f"[check_and_process_evaluation] {log_prefix} Batch failed | provider_batch_id={batch_job.provider_batch_id} | {error_msg}" ) return { @@ -592,7 +606,10 @@ async def check_and_process_evaluation( } except Exception as e: - logger.error(f"{log_prefix} Error checking evaluation: {e}", exc_info=True) + logger.error( + f"[check_and_process_evaluation] {log_prefix} Error checking evaluation | {e}", + exc_info=True, + ) # Mark as failed update_evaluation_run( @@ -675,8 +692,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st if not openai_credentials or not langfuse_credentials: logger.error( - f"Missing credentials for org_id={org_id}, project_id={project_id}: " - f"openai={bool(openai_credentials)}, langfuse={bool(langfuse_credentials)}" + f"[poll_all_pending_evaluations] Missing credentials | org_id={org_id} | project_id={project_id} | openai={bool(openai_credentials)} | langfuse={bool(langfuse_credentials)}" ) # Mark all runs in this project as failed due to missing credentials for eval_run in project_runs: @@ -705,7 +721,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st if not openai_success or not langfuse_success: logger.error( - f"Failed to configure clients for org_id={org_id}, project_id={project_id}" + f"[poll_all_pending_evaluations] Failed to configure clients | org_id={org_id} | project_id={project_id}" ) # Mark all runs in this project as failed due to client configuration for eval_run in project_runs: @@ -748,7 +764,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st except Exception as e: logger.error( - f"Failed to check evaluation run {eval_run.id}: {e}", + f"[poll_all_pending_evaluations] Failed to check evaluation run | run_id={eval_run.id} | {e}", exc_info=True, ) # Persist failure status to database 
@@ -770,7 +786,10 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st total_failed_count += 1 except Exception as e: - logger.error(f"Failed to process project {project_id}: {e}", exc_info=True) + logger.error( + f"[poll_all_pending_evaluations] Failed to process project | project_id={project_id} | {e}", + exc_info=True, + ) # Mark all runs in this project as failed for eval_run in project_runs: # Persist failure status to database @@ -800,9 +819,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st } logger.info( - f"Polling summary for org_id={org_id}: " - f"{total_processed_count} processed, {total_failed_count} failed, " - f"{total_still_processing_count} still processing" + f"[poll_all_pending_evaluations] Polling summary | org_id={org_id} | processed={total_processed_count} | failed={total_failed_count} | still_processing={total_still_processing_count}" ) return summary From 4a649d37d7ebef658035eb47a8344bea3e8fcf15 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 20:10:57 +0530 Subject: [PATCH 53/64] using response id --- backend/app/crud/evaluations/langfuse.py | 16 +++++++++++----- backend/app/crud/evaluations/processing.py | 7 ++++++- backend/app/models/evaluation.py | 2 +- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/backend/app/crud/evaluations/langfuse.py b/backend/app/crud/evaluations/langfuse.py index feddd9817..8117210c8 100644 --- a/backend/app/crud/evaluations/langfuse.py +++ b/backend/app/crud/evaluations/langfuse.py @@ -40,7 +40,8 @@ def create_langfuse_dataset_run( "item_id": "item_123", "question": "What is 2+2?", "generated_output": "4", - "ground_truth": "4" + "ground_truth": "4", + "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8" }, ... 
] @@ -69,6 +70,7 @@ def create_langfuse_dataset_run( question = result["question"] generated_output = result["generated_output"] ground_truth = result["ground_truth"] + response_id = result.get("response_id") dataset_item = dataset_items_map.get(item_id) if not dataset_item: @@ -80,14 +82,18 @@ def create_langfuse_dataset_run( try: with dataset_item.observe(run_name=run_name) as trace_id: + metadata = { + "ground_truth": ground_truth, + "item_id": item_id, + } + if response_id: + metadata["response_id"] = response_id + langfuse.trace( id=trace_id, input={"question": question}, output={"answer": generated_output}, - metadata={ - "ground_truth": ground_truth, - "item_id": item_id, - }, + metadata=metadata, ) trace_id_mapping[item_id] = trace_id diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index f46e88e32..9b5573db8 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -62,7 +62,8 @@ def parse_evaluation_output( "item_id": "item_123", "question": "What is 2+2?", "generated_output": "4", - "ground_truth": "4" + "ground_truth": "4", + "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8" }, ... 
] @@ -93,6 +94,9 @@ def parse_evaluation_output( # Extract the response body response_body = response.get("response", {}).get("body", {}) + # Extract response ID from response.body.id + response_id = response_body.get("id") + # Handle errors in batch processing if response.get("error"): error_msg = response["error"].get("message", "Unknown error") @@ -147,6 +151,7 @@ def parse_evaluation_output( "question": question, "generated_output": generated_output, "ground_truth": ground_truth, + "response_id": response_id, } ) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 883f664a5..1f28ad02f 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -44,7 +44,7 @@ class EvaluationResult(BaseModel): input: str = Field(..., description="The input question/prompt used for evaluation") output: str = Field(..., description="The actual output from the assistant") expected: str = Field(..., description="The expected output from the dataset") - thread_id: str | None = Field(None, description="ID of the OpenAI") + response_id: str | None = Field(None, description="ID from the batch response body") class Experiment(BaseModel): From 9fc12a6ac91c955aa50187880363fa4b70b60cd8 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 20:46:25 +0530 Subject: [PATCH 54/64] type checking for clean code --- backend/app/models/batch_job.py | 14 +++++++------- backend/app/models/evaluation.py | 25 ++++++++++++------------- backend/app/models/organization.py | 19 ++++--------------- backend/app/models/project.py | 22 +++++++++++----------- 4 files changed, 34 insertions(+), 46 deletions(-) diff --git a/backend/app/models/batch_job.py b/backend/app/models/batch_job.py index 22477948c..3ef07f7f1 100644 --- a/backend/app/models/batch_job.py +++ b/backend/app/models/batch_job.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional from sqlalchemy 
import Column from sqlalchemy.dialects.postgresql import JSONB @@ -7,6 +7,10 @@ from app.core.util import now +if TYPE_CHECKING: + from .organization import Organization + from .project import Project + class BatchJob(SQLModel, table=True): """Batch job table for tracking async LLM batch operations.""" @@ -72,12 +76,8 @@ class BatchJob(SQLModel, table=True): ) # Relationships - organization: Optional["Organization"] = Relationship( # noqa: F821 - back_populates="batch_jobs" - ) - project: Optional["Project"] = Relationship( - back_populates="batch_jobs" - ) # noqa: F821 + organization: Optional["Organization"] = Relationship() + project: Optional["Project"] = Relationship() class BatchJobCreate(SQLModel): diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 1f28ad02f..57a83d35d 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional from pydantic import BaseModel, Field from sqlalchemy import JSON, Column, Text, UniqueConstraint @@ -8,6 +8,11 @@ from app.core.util import now +if TYPE_CHECKING: + from .batch_job import BatchJob + from .organization import Organization + from .project import Project + class DatasetItem(BaseModel): """Model for a single dataset item (Q&A pair).""" @@ -116,12 +121,8 @@ class EvaluationDataset(SQLModel, table=True): updated_at: datetime = SQLField(default_factory=now, nullable=False) # Relationships - project: "Project" = Relationship( - back_populates="evaluation_datasets" - ) # noqa: F821 - organization: "Organization" = Relationship( - back_populates="evaluation_datasets" - ) # noqa: F821 + project: "Project" = Relationship() + organization: "Organization" = Relationship() evaluation_runs: list["EvaluationRun"] = Relationship( back_populates="evaluation_dataset" ) @@ -213,17 +214,15 @@ class EvaluationRun(SQLModel, table=True): ) # Relationships 
- project: "Project" = Relationship(back_populates="evaluation_runs") # noqa: F821 - organization: "Organization" = Relationship( - back_populates="evaluation_runs" - ) # noqa: F821 + project: "Project" = Relationship() + organization: "Organization" = Relationship() evaluation_dataset: "EvaluationDataset" = Relationship( back_populates="evaluation_runs" ) - batch_job: Optional["BatchJob"] = Relationship( # noqa: F821 + batch_job: Optional["BatchJob"] = Relationship( sa_relationship_kwargs={"foreign_keys": "[EvaluationRun.batch_job_id]"} ) - embedding_batch_job: Optional["BatchJob"] = Relationship( # noqa: F821 + embedding_batch_job: Optional["BatchJob"] = Relationship( sa_relationship_kwargs={ "foreign_keys": "[EvaluationRun.embedding_batch_job_id]" } diff --git a/backend/app/models/organization.py b/backend/app/models/organization.py index 09a1f9af2..db660891a 100644 --- a/backend/app/models/organization.py +++ b/backend/app/models/organization.py @@ -1,18 +1,16 @@ from datetime import datetime -from typing import List, TYPE_CHECKING +from typing import TYPE_CHECKING + from sqlmodel import Field, Relationship, SQLModel from app.core.util import now if TYPE_CHECKING: - from .credentials import Credential - from .project import Project - from .api_key import APIKey from .assistants import Assistant from .collection import Collection + from .credentials import Credential from .openai_conversation import OpenAIConversation - from .batch_job import BatchJob - from .evaluation import EvaluationRun, EvaluationDataset + from .project import Project # Shared properties for an Organization @@ -54,15 +52,6 @@ class Organization(OrganizationBase, table=True): openai_conversations: list["OpenAIConversation"] = Relationship( back_populates="organization", cascade_delete=True ) - evaluation_runs: list["EvaluationRun"] = Relationship( - back_populates="organization", cascade_delete=True - ) - evaluation_datasets: list["EvaluationDataset"] = Relationship( - 
back_populates="organization", cascade_delete=True - ) - batch_jobs: list["BatchJob"] = Relationship( - back_populates="organization", cascade_delete=True - ) # Properties to return via API diff --git a/backend/app/models/project.py b/backend/app/models/project.py index ae43b1e5d..c0d8a87ac 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -1,10 +1,19 @@ -from uuid import UUID, uuid4 from datetime import datetime -from typing import Optional, List +from typing import TYPE_CHECKING, Optional +from uuid import UUID, uuid4 + from sqlmodel import Field, Relationship, SQLModel, UniqueConstraint from app.core.util import now +if TYPE_CHECKING: + from .assistants import Assistant + from .collection import Collection + from .credentials import Credential + from .fine_tuning import Fine_Tuning + from .openai_conversation import OpenAIConversation + from .organization import Organization + # Shared properties for a Project class ProjectBase(SQLModel): @@ -55,15 +64,6 @@ class Project(ProjectBase, table=True): openai_conversations: list["OpenAIConversation"] = Relationship( back_populates="project", cascade_delete=True ) - evaluation_runs: list["EvaluationRun"] = Relationship( - back_populates="project", cascade_delete=True - ) - evaluation_datasets: list["EvaluationDataset"] = Relationship( - back_populates="project", cascade_delete=True - ) - batch_jobs: list["BatchJob"] = Relationship( - back_populates="project", cascade_delete=True - ) # Properties to return via API From a5c8a03acd880c3174c991fa198d8e9ac8cbf13a Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Tue, 4 Nov 2025 23:09:24 +0530 Subject: [PATCH 55/64] cleaner documentation --- .../api/docs/evaluation/create_evaluation.md | 80 ++++++++ .../app/api/docs/evaluation/delete_dataset.md | 18 ++ .../app/api/docs/evaluation/get_dataset.md | 22 ++ .../app/api/docs/evaluation/get_evaluation.md | 32 +++ .../app/api/docs/evaluation/list_datasets.md | 19 ++ 
.../api/docs/evaluation/list_evaluations.md | 25 +++ .../app/api/docs/evaluation/upload_dataset.md | 42 ++++ backend/app/api/routes/evaluation.py | 193 ++++-------------- 8 files changed, 273 insertions(+), 158 deletions(-) create mode 100644 backend/app/api/docs/evaluation/create_evaluation.md create mode 100644 backend/app/api/docs/evaluation/delete_dataset.md create mode 100644 backend/app/api/docs/evaluation/get_dataset.md create mode 100644 backend/app/api/docs/evaluation/get_evaluation.md create mode 100644 backend/app/api/docs/evaluation/list_datasets.md create mode 100644 backend/app/api/docs/evaluation/list_evaluations.md create mode 100644 backend/app/api/docs/evaluation/upload_dataset.md diff --git a/backend/app/api/docs/evaluation/create_evaluation.md b/backend/app/api/docs/evaluation/create_evaluation.md new file mode 100644 index 000000000..313ad0079 --- /dev/null +++ b/backend/app/api/docs/evaluation/create_evaluation.md @@ -0,0 +1,80 @@ +Start an evaluation using OpenAI Batch API. + +This endpoint: +1. Fetches the dataset from database and validates it has Langfuse dataset ID +2. Creates an EvaluationRun record in the database +3. Fetches dataset items from Langfuse +4. Builds JSONL for batch processing (config is used as-is) +5. Creates a batch job via the generic batch infrastructure +6. Returns the evaluation run details with batch_job_id + +The batch will be processed asynchronously by Celery Beat (every 60s). +Use GET /evaluations/{evaluation_id} to check progress. + +## Request Body + +- **dataset_id** (required): ID of the evaluation dataset (from /evaluations/datasets) +- **experiment_name** (required): Name for this evaluation experiment/run +- **config** (optional): Configuration dict that will be used as-is in JSONL generation. 
Can include any OpenAI Responses API parameters like: + - model: str (e.g., "gpt-4o", "gpt-5") + - instructions: str + - tools: list (e.g., [{"type": "file_search", "vector_store_ids": [...]}]) + - reasoning: dict (e.g., {"effort": "low"}) + - text: dict (e.g., {"verbosity": "low"}) + - temperature: float + - include: list (e.g., ["file_search_call.results"]) + - Note: "input" will be added automatically from the dataset +- **assistant_id** (optional): Assistant ID to fetch configuration from. If provided, configuration will be fetched from the assistant in the database. Config can be passed as empty dict {} when using assistant_id. + +## Example with config + +```json +{ + "dataset_id": 123, + "experiment_name": "test_run", + "config": { + "model": "gpt-4.1", + "instructions": "You are a helpful FAQ assistant.", + "tools": [ + { + "type": "file_search", + "vector_store_ids": ["vs_12345"], + "max_num_results": 3 + } + ], + "include": ["file_search_call.results"] + } +} +``` + +## Example with assistant_id + +```json +{ + "dataset_id": 123, + "experiment_name": "test_run", + "config": {}, + "assistant_id": "asst_xyz" +} +``` + +## Returns + +EvaluationRunPublic with batch details and status: +- id: Evaluation run ID +- run_name: Name of the evaluation run +- dataset_name: Name of the dataset used +- dataset_id: ID of the dataset used +- config: Configuration used for the evaluation +- batch_job_id: ID of the batch job processing this evaluation +- status: Current status (pending, running, completed, failed) +- total_items: Total number of items being evaluated +- completed_items: Number of items completed so far +- results: Evaluation results (when completed) +- error_message: Error message if failed + +## Error Responses + +- **404**: Dataset or assistant not found or not accessible +- **400**: Missing required credentials (OpenAI or Langfuse), dataset missing Langfuse ID, or config missing required fields +- **500**: Failed to configure API clients or start batch 
evaluation diff --git a/backend/app/api/docs/evaluation/delete_dataset.md b/backend/app/api/docs/evaluation/delete_dataset.md new file mode 100644 index 000000000..461c30fce --- /dev/null +++ b/backend/app/api/docs/evaluation/delete_dataset.md @@ -0,0 +1,18 @@ +Delete a dataset by ID. + +This will remove the dataset record from the database. The CSV file in object store (if exists) will remain for audit purposes, but the dataset will no longer be accessible for creating new evaluations. + +## Path Parameters + +- **dataset_id**: ID of the dataset to delete + +## Returns + +Success message with deleted dataset details: +- message: Confirmation message +- dataset_id: ID of the deleted dataset + +## Error Responses + +- **404**: Dataset not found or not accessible to your organization/project +- **400**: Dataset cannot be deleted (e.g., has active evaluation runs) diff --git a/backend/app/api/docs/evaluation/get_dataset.md b/backend/app/api/docs/evaluation/get_dataset.md new file mode 100644 index 000000000..02e1e73aa --- /dev/null +++ b/backend/app/api/docs/evaluation/get_dataset.md @@ -0,0 +1,22 @@ +Get details of a specific dataset by ID. + +Retrieves comprehensive information about a dataset including metadata, object store URL, and Langfuse integration details. 
+ +## Path Parameters + +- **dataset_id**: ID of the dataset to retrieve + +## Returns + +DatasetUploadResponse with dataset details: +- dataset_id: Unique identifier for the dataset +- dataset_name: Name of the dataset (sanitized) +- total_items: Total number of items including duplication +- original_items: Number of original items before duplication +- duplication_factor: Factor by which items were duplicated +- langfuse_dataset_id: ID of the dataset in Langfuse +- object_store_url: URL to the CSV file in object storage + +## Error Responses + +- **404**: Dataset not found or not accessible to your organization/project diff --git a/backend/app/api/docs/evaluation/get_evaluation.md b/backend/app/api/docs/evaluation/get_evaluation.md new file mode 100644 index 000000000..509e27640 --- /dev/null +++ b/backend/app/api/docs/evaluation/get_evaluation.md @@ -0,0 +1,32 @@ +Get the current status of a specific evaluation run. + +Retrieves comprehensive information about an evaluation run including its current processing status, results (if completed), and error details (if failed). + +## Path Parameters + +- **evaluation_id**: ID of the evaluation run + +## Returns + +EvaluationRunPublic with current status and results: +- id: Evaluation run ID +- run_name: Name of the evaluation run +- dataset_name: Name of the dataset used +- dataset_id: ID of the dataset used +- config: Configuration used for the evaluation +- batch_job_id: ID of the batch job processing this evaluation +- status: Current status (pending, running, completed, failed) +- total_items: Total number of items being evaluated +- completed_items: Number of items completed so far +- results: Evaluation results (when completed) +- error_message: Error message if failed +- created_at: Timestamp when the evaluation was created +- updated_at: Timestamp when the evaluation was last updated + +## Usage + +Use this endpoint to poll for evaluation progress. 
The evaluation is processed asynchronously by Celery Beat (every 60s), so you should poll periodically to check if the status has changed to "completed" or "failed". + +## Error Responses + +- **404**: Evaluation run not found or not accessible to this organization/project diff --git a/backend/app/api/docs/evaluation/list_datasets.md b/backend/app/api/docs/evaluation/list_datasets.md new file mode 100644 index 000000000..bd5576efc --- /dev/null +++ b/backend/app/api/docs/evaluation/list_datasets.md @@ -0,0 +1,19 @@ +List all datasets for the current organization and project. + +Returns a paginated list of dataset records ordered by most recent first. + +## Query Parameters + +- **limit**: Maximum number of datasets to return (default 50, max 100) +- **offset**: Number of datasets to skip for pagination (default 0) + +## Returns + +List of DatasetUploadResponse objects, each containing: +- dataset_id: Unique identifier for the dataset +- dataset_name: Name of the dataset (sanitized) +- total_items: Total number of items including duplication +- original_items: Number of original items before duplication +- duplication_factor: Factor by which items were duplicated +- langfuse_dataset_id: ID of the dataset in Langfuse +- object_store_url: URL to the CSV file in object storage diff --git a/backend/app/api/docs/evaluation/list_evaluations.md b/backend/app/api/docs/evaluation/list_evaluations.md new file mode 100644 index 000000000..64c667726 --- /dev/null +++ b/backend/app/api/docs/evaluation/list_evaluations.md @@ -0,0 +1,25 @@ +List all evaluation runs for the current organization and project. + +Returns a paginated list of evaluation runs ordered by most recent first. Each evaluation run represents a batch processing job evaluating a dataset against a specific configuration. 
+ +## Query Parameters + +- **limit**: Maximum number of runs to return (default 50) +- **offset**: Number of runs to skip (for pagination, default 0) + +## Returns + +List of EvaluationRunPublic objects, each containing: +- id: Evaluation run ID +- run_name: Name of the evaluation run +- dataset_name: Name of the dataset used +- dataset_id: ID of the dataset used +- config: Configuration used for the evaluation +- batch_job_id: ID of the batch job processing this evaluation +- status: Current status (pending, running, completed, failed) +- total_items: Total number of items being evaluated +- completed_items: Number of items completed so far +- results: Evaluation results (when completed) +- error_message: Error message if failed +- created_at: Timestamp when the evaluation was created +- updated_at: Timestamp when the evaluation was last updated diff --git a/backend/app/api/docs/evaluation/upload_dataset.md b/backend/app/api/docs/evaluation/upload_dataset.md new file mode 100644 index 000000000..b73902860 --- /dev/null +++ b/backend/app/api/docs/evaluation/upload_dataset.md @@ -0,0 +1,42 @@ +Upload a CSV file containing Golden Q&A pairs. + +This endpoint: +1. Sanitizes the dataset name (removes spaces, special characters) +2. Validates and parses the CSV file +3. Uploads CSV to object store (if credentials configured) +4. Uploads dataset to Langfuse (for immediate use) +5. Stores metadata in database + +## Dataset Name + +- Will be sanitized for Langfuse compatibility +- Spaces replaced with underscores +- Special characters removed +- Converted to lowercase +- Example: "My Dataset 01!" 
becomes "my_dataset_01" + +## CSV Format + +- Must contain 'question' and 'answer' columns +- Can have additional columns (will be ignored) +- Missing values in 'question' or 'answer' rows will be skipped + +## Duplication Factor + +- Minimum: 1 (no duplication) +- Maximum: 5 +- Default: 5 +- Each item in the dataset will be duplicated this many times +- Used to ensure statistical significance in evaluation results + +## Example CSV + +``` +question,answer +"What is the capital of France?","Paris" +"What is 2+2?","4" +``` + +## Returns + +DatasetUploadResponse with dataset_id, object_store_url, and Langfuse details (dataset_name in response will be the sanitized version) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 908c54179..3449d7e30 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -27,6 +27,7 @@ DatasetUploadResponse, EvaluationRunPublic, ) +from app.utils import load_description logger = logging.getLogger(__name__) @@ -90,7 +91,11 @@ def sanitize_dataset_name(name: str) -> str: return sanitized -@router.post("/evaluations/datasets", response_model=DatasetUploadResponse) +@router.post( + "/evaluations/datasets", + description=load_description("evaluation/upload_dataset.md"), + response_model=DatasetUploadResponse, +) async def upload_dataset( _session: SessionDep, auth_context: AuthContextDep, @@ -106,46 +111,6 @@ async def upload_dataset( description="Number of times to duplicate each item (min: 1, max: 5)", ), ) -> DatasetUploadResponse: - """ - Upload a CSV file containing Golden Q&A pairs. - - This endpoint: - 1. Sanitizes the dataset name (removes spaces, special characters) - 2. Validates and parses the CSV file - 3. Uploads CSV to object store (if credentials configured) - 4. Uploads dataset to Langfuse (for immediate use) - 5. 
Stores metadata in database - - Dataset Name: - - Will be sanitized for Langfuse compatibility - - Spaces replaced with underscores - - Special characters removed - - Converted to lowercase - - Example: "My Dataset 01!" becomes "my_dataset_01" - - CSV Format: - - Must contain 'question' and 'answer' columns - - Can have additional columns (will be ignored) - - Missing values in 'question' or 'answer' rows will be skipped - - Duplication Factor: - - Minimum: 1 (no duplication) - - Maximum: 5 - - Default: 5 - - Each item in the dataset will be duplicated this many times - - Used to ensure statistical significance in evaluation results - - Example CSV: - ``` - question,answer - "What is the capital of France?","Paris" - "What is 2+2?","4" - ``` - - Returns: - DatasetUploadResponse with dataset_id, object_store_url, and Langfuse details - (dataset_name in response will be the sanitized version) - """ # Sanitize dataset name for Langfuse compatibility original_name = dataset_name try: @@ -341,23 +306,17 @@ async def upload_dataset( ) -@router.get("/evaluations/datasets/list", response_model=list[DatasetUploadResponse]) +@router.get( + "/evaluations/datasets/list", + description=load_description("evaluation/list_datasets.md"), + response_model=list[DatasetUploadResponse], +) async def list_datasets_endpoint( _session: SessionDep, auth_context: AuthContextDep, limit: int = 50, offset: int = 0, ) -> list[DatasetUploadResponse]: - """ - List all datasets for the current organization and project. 
- - Args: - limit: Maximum number of datasets to return (default 50, max 100) - offset: Number of datasets to skip for pagination (default 0) - - Returns: - List of DatasetUploadResponse objects, ordered by most recent first - """ # Enforce maximum limit if limit > 100: limit = 100 @@ -390,21 +349,16 @@ async def list_datasets_endpoint( return response -@router.get("/evaluations/datasets/{dataset_id}", response_model=DatasetUploadResponse) +@router.get( + "/evaluations/datasets/{dataset_id}", + description=load_description("evaluation/get_dataset.md"), + response_model=DatasetUploadResponse, +) async def get_dataset( dataset_id: int, _session: SessionDep, auth_context: AuthContextDep, ) -> DatasetUploadResponse: - """ - Get details of a specific dataset by ID. - - Args: - dataset_id: ID of the dataset to retrieve - - Returns: - DatasetUploadResponse with dataset details - """ logger.info( f"[get_dataset] Fetching dataset | id={dataset_id} | " f"org_id={auth_context.organization.id} | " @@ -434,25 +388,15 @@ async def get_dataset( ) -@router.delete("/evaluations/datasets/{dataset_id}") +@router.delete( + "/evaluations/datasets/{dataset_id}", + description=load_description("evaluation/delete_dataset.md"), +) async def delete_dataset( dataset_id: int, _session: SessionDep, auth_context: AuthContextDep, ) -> dict: - """ - Delete a dataset by ID. - - This will remove the dataset record from the database. The CSV file in object store - (if exists) will remain for audit purposes, but the dataset will no longer - be accessible for creating new evaluations. 
- - Args: - dataset_id: ID of the dataset to delete - - Returns: - Success message with deleted dataset details - """ logger.info( f"[delete_dataset] Deleting dataset | id={dataset_id} | " f"org_id={auth_context.organization.id} | " @@ -477,7 +421,11 @@ async def delete_dataset( return {"message": message, "dataset_id": dataset_id} -@router.post("/evaluations", response_model=EvaluationRunPublic) +@router.post( + "/evaluations", + description=load_description("evaluation/create_evaluation.md"), + response_model=EvaluationRunPublic, +) async def evaluate( _session: SessionDep, auth_context: AuthContextDep, @@ -491,66 +439,6 @@ async def evaluate( None, description="Optional assistant ID to fetch configuration from" ), ) -> EvaluationRunPublic: - """ - Start an evaluation using OpenAI Batch API. - - This endpoint: - 1. Fetches the dataset from database and validates it has Langfuse dataset ID - 2. Creates an EvaluationRun record in the database - 3. Fetches dataset items from Langfuse - 4. Builds JSONL for batch processing (config is used as-is) - 5. Creates a batch job via the generic batch infrastructure - 6. Returns the evaluation run details with batch_job_id - - The batch will be processed asynchronously by Celery Beat (every 60s). - Use GET /evaluations/{evaluation_id} to check progress. - - Args: - dataset_id: ID of the evaluation dataset (from /evaluations/datasets) - experiment_name: Name for this evaluation experiment/run - config: Configuration dict that will be used as-is in JSONL generation. - Can include any OpenAI Responses API parameters like: - - model: str (e.g., "gpt-4o", "gpt-5") - - instructions: str - - tools: list (e.g., [{"type": "file_search", "vector_store_ids": [...]}]) - - reasoning: dict (e.g., {"effort": "low"}) - - text: dict (e.g., {"verbosity": "low"}) - - temperature: float - - include: list (e.g., ["file_search_call.results"]) - Note: "input" will be added automatically from the dataset - assistant_id: Optional assistant ID. 
If provided, configuration will be - fetched from the assistant in the database. Config can be passed as - empty dict {} when using assistant_id. - - Example with config: - { - "dataset_id": 123, - "experiment_name": "test_run", - "config": { - "model": "gpt-4.1", - "instructions": "You are a helpful FAQ assistant.", - "tools": [ - { - "type": "file_search", - "vector_store_ids": ["vs_12345"], - "max_num_results": 3 - } - ], - "include": ["file_search_call.results"] - } - } - - Example with assistant_id: - { - "dataset_id": 123, - "experiment_name": "test_run", - "config": {}, - "assistant_id": "asst_xyz" - } - - Returns: - EvaluationRunPublic with batch details and status - """ logger.info( f"[evaluate] Starting evaluation | experiment_name={experiment_name} | " f"dataset_id={dataset_id} | " @@ -704,23 +592,17 @@ async def evaluate( return eval_run -@router.get("/evaluations/list", response_model=list[EvaluationRunPublic]) +@router.get( + "/evaluations/list", + description=load_description("evaluation/list_evaluations.md"), + response_model=list[EvaluationRunPublic], +) async def list_evaluation_runs( _session: SessionDep, auth_context: AuthContextDep, limit: int = 50, offset: int = 0, ) -> list[EvaluationRunPublic]: - """ - List all evaluation runs for the current organization. 
- - Args: - limit: Maximum number of runs to return (default 50) - offset: Number of runs to skip (for pagination) - - Returns: - List of EvaluationRunPublic objects, ordered by most recent first - """ logger.info( f"[list_evaluation_runs] Listing evaluation runs | " f"org_id={auth_context.organization.id} | " @@ -736,21 +618,16 @@ async def list_evaluation_runs( ) -@router.get("/evaluations/{evaluation_id}", response_model=EvaluationRunPublic) +@router.get( + "/evaluations/{evaluation_id}", + description=load_description("evaluation/get_evaluation.md"), + response_model=EvaluationRunPublic, +) async def get_evaluation_run_status( evaluation_id: int, _session: SessionDep, auth_context: AuthContextDep, ) -> EvaluationRunPublic: - """ - Get the current status of a specific evaluation run. - - Args: - evaluation_id: ID of the evaluation run - - Returns: - EvaluationRunPublic with current status and results if completed - """ logger.info( f"[get_evaluation_run_status] Fetching status for evaluation run | " f"evaluation_id={evaluation_id} | " From cba41c99c76a842905aa80f7ddbb86d2d378b4fe Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 5 Nov 2025 00:31:45 +0530 Subject: [PATCH 56/64] added indexes --- ...5747495bd7c_create_evaluation_run_table.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py index 681e881ae..2d7e9b014 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py @@ -106,6 +106,18 @@ def upgrade(): op.create_index( op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False ) + op.create_index( + "idx_batch_job_status_org", + "batch_job", + ["provider_status", "organization_id"], + unique=False, + ) + op.create_index( + "idx_batch_job_status_project", + 
"batch_job", + ["provider_status", "project_id"], + unique=False, + ) # Create evaluation_dataset table op.create_table( @@ -202,10 +214,24 @@ def upgrade(): op.create_index( op.f("ix_evaluation_run_run_name"), "evaluation_run", ["run_name"], unique=False ) + op.create_index( + "idx_eval_run_status_org", + "evaluation_run", + ["status", "organization_id"], + unique=False, + ) + op.create_index( + "idx_eval_run_status_project", + "evaluation_run", + ["status", "project_id"], + unique=False, + ) def downgrade(): # Drop evaluation_run table first (has foreign keys to batch_job and evaluation_dataset) + op.drop_index("idx_eval_run_status_project", table_name="evaluation_run") + op.drop_index("idx_eval_run_status_org", table_name="evaluation_run") op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run") op.drop_table("evaluation_run") @@ -214,6 +240,8 @@ def downgrade(): op.drop_table("evaluation_dataset") # Drop batch_job table + op.drop_index("idx_batch_job_status_project", table_name="batch_job") + op.drop_index("idx_batch_job_status_org", table_name="batch_job") op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job") op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job") op.drop_index(op.f("ix_batch_job_job_type"), table_name="batch_job") From 8d32883dd9df7a566f449e4d166f02550830b6e0 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 5 Nov 2025 22:12:44 +0530 Subject: [PATCH 57/64] removing unnecessary asyncs --- backend/app/api/routes/evaluation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 3449d7e30..18064af3d 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -311,7 +311,7 @@ async def upload_dataset( description=load_description("evaluation/list_datasets.md"), response_model=list[DatasetUploadResponse], ) -async def list_datasets_endpoint( 
+def list_datasets_endpoint( _session: SessionDep, auth_context: AuthContextDep, limit: int = 50, @@ -354,7 +354,7 @@ async def list_datasets_endpoint( description=load_description("evaluation/get_dataset.md"), response_model=DatasetUploadResponse, ) -async def get_dataset( +def get_dataset( dataset_id: int, _session: SessionDep, auth_context: AuthContextDep, @@ -392,7 +392,7 @@ async def get_dataset( "/evaluations/datasets/{dataset_id}", description=load_description("evaluation/delete_dataset.md"), ) -async def delete_dataset( +def delete_dataset( dataset_id: int, _session: SessionDep, auth_context: AuthContextDep, @@ -426,7 +426,7 @@ async def delete_dataset( description=load_description("evaluation/create_evaluation.md"), response_model=EvaluationRunPublic, ) -async def evaluate( +def evaluate( _session: SessionDep, auth_context: AuthContextDep, dataset_id: int = Body(..., description="ID of the evaluation dataset"), @@ -597,7 +597,7 @@ async def evaluate( description=load_description("evaluation/list_evaluations.md"), response_model=list[EvaluationRunPublic], ) -async def list_evaluation_runs( +def list_evaluation_runs( _session: SessionDep, auth_context: AuthContextDep, limit: int = 50, @@ -623,7 +623,7 @@ async def list_evaluation_runs( description=load_description("evaluation/get_evaluation.md"), response_model=EvaluationRunPublic, ) -async def get_evaluation_run_status( +def get_evaluation_run_status( evaluation_id: int, _session: SessionDep, auth_context: AuthContextDep, From 24a958e563547841a9cb043f3360ddf85872d551 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 5 Nov 2025 22:36:08 +0530 Subject: [PATCH 58/64] using get_langfuse_client instead --- backend/app/api/routes/evaluation.py | 39 ++--------- backend/app/api/routes/llm.py | 2 +- backend/app/crud/evaluations/core.py | 27 ++++---- backend/app/crud/evaluations/processing.py | 66 +++++-------------- .../app/tests/api/routes/test_evaluation.py | 40 +++++------ backend/app/utils.py | 33 
++++++---- 6 files changed, 78 insertions(+), 129 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 18064af3d..6d091b31a 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -8,9 +8,7 @@ from app.api.deps import AuthContextDep, SessionDep from app.core.cloud import get_cloud_storage -from app.core.util import configure_langfuse, configure_openai from app.crud.assistants import get_assistant_by_id -from app.crud.credentials import get_provider_credential from app.crud.evaluations import ( create_evaluation_dataset, create_evaluation_run, @@ -27,7 +25,7 @@ DatasetUploadResponse, EvaluationRunPublic, ) -from app.utils import load_description +from app.utils import get_langfuse_client, get_openai_client, load_description logger = logging.getLogger(__name__) @@ -231,23 +229,12 @@ async def upload_dataset( # Step 3: Upload to Langfuse langfuse_dataset_id = None try: - # Get Langfuse credentials - langfuse_credentials = get_provider_credential( + # Get Langfuse client + langfuse = get_langfuse_client( session=_session, org_id=auth_context.organization.id, project_id=auth_context.project.id, - provider="langfuse", ) - if not langfuse_credentials: - raise HTTPException( - status_code=400, detail="Langfuse credentials not configured" - ) - - langfuse, langfuse_success = configure_langfuse(langfuse_credentials) - if not langfuse_success: - raise HTTPException( - status_code=500, detail="Failed to configure Langfuse client" - ) # Upload to Langfuse langfuse_dataset_id, _ = upload_dataset_to_langfuse_from_csv( @@ -470,32 +457,18 @@ def evaluate( dataset_name = dataset.name - # Get credentials - openai_credentials = get_provider_credential( + # Get API clients + openai_client = get_openai_client( session=_session, org_id=auth_context.organization.id, project_id=auth_context.project.id, - provider="openai", ) - langfuse_credentials = get_provider_credential( + langfuse = 
get_langfuse_client( session=_session, org_id=auth_context.organization.id, project_id=auth_context.project.id, - provider="langfuse", ) - if not openai_credentials or not langfuse_credentials: - raise HTTPException( - status_code=400, detail="OpenAI or Langfuse credentials not configured" - ) - - # Configure clients - openai_client, openai_success = configure_openai(openai_credentials) - langfuse, langfuse_success = configure_langfuse(langfuse_credentials) - - if not openai_success or not langfuse_success: - raise HTTPException(status_code=500, detail="Failed to configure API clients") - # Validate dataset has Langfuse ID (should have been set during dataset creation) if not dataset.langfuse_dataset_id: raise HTTPException( diff --git a/backend/app/api/routes/llm.py b/backend/app/api/routes/llm.py index 4eed7c1bc..26c9ee423 100644 --- a/backend/app/api/routes/llm.py +++ b/backend/app/api/routes/llm.py @@ -35,7 +35,7 @@ def llm_callback_notification(body: APIResponse[LLMCallResponse]): response_model=APIResponse[Message], callbacks=llm_callback_router.routes, ) -async def llm_call( +def llm_call( _current_user: AuthContextDep, _session: SessionDep, request: LLMCallRequest ): """ diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 235aa50d8..70ddb0766 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -2,12 +2,14 @@ import io import logging +from fastapi import HTTPException from sqlmodel import Session, select -from app.core.util import configure_langfuse, now +from app.core.util import now from app.crud.credentials import get_provider_credential from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import DatasetUploadResponse +from app.utils import get_langfuse_client logger = logging.getLogger(__name__) @@ -35,20 +37,15 @@ async def upload_dataset_to_langfuse( Tuple of (success, dataset_response, error_message) """ try: - # Get Langfuse credentials - 
langfuse_credentials = get_provider_credential( - session=_session, - org_id=_current_user.organization_id, - project_id=_current_user.project_id, - provider="langfuse", - ) - if not langfuse_credentials: - return False, None, "LANGFUSE keys not configured for this organization." - - # Configure Langfuse - langfuse, success = configure_langfuse(langfuse_credentials) - if not success: - return False, None, "Failed to configure Langfuse client." + # Get Langfuse client + try: + langfuse = get_langfuse_client( + session=_session, + org_id=_current_user.organization_id, + project_id=_current_user.project_id, + ) + except HTTPException as http_exc: + return False, None, http_exc.detail # Parse CSV content csv_text = csv_content.decode("utf-8") diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 9b5573db8..6b986b9cb 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -14,12 +14,12 @@ from collections import defaultdict from typing import Any +from fastapi import HTTPException from langfuse import Langfuse from openai import OpenAI from sqlmodel import Session, select from app.core.batch.openai_provider import OpenAIBatchProvider -from app.core.util import configure_langfuse, configure_openai from app.crud.batch_job import get_batch_job from app.crud.batch_operations import ( download_batch_results, @@ -38,6 +38,7 @@ update_traces_with_cosine_scores, ) from app.models import EvaluationRun +from app.utils import get_langfuse_client, get_openai_client logger = logging.getLogger(__name__) @@ -681,61 +682,30 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st for project_id, project_runs in evaluations_by_project.items(): try: - # Get credentials for this project - openai_credentials = get_provider_credential( - session=session, - org_id=org_id, - project_id=project_id, - provider="openai", - ) - langfuse_credentials = 
get_provider_credential( - session=session, - org_id=org_id, - project_id=project_id, - provider="langfuse", - ) - - if not openai_credentials or not langfuse_credentials: - logger.error( - f"[poll_all_pending_evaluations] Missing credentials | org_id={org_id} | project_id={project_id} | openai={bool(openai_credentials)} | langfuse={bool(langfuse_credentials)}" + # Get API clients for this project + try: + openai_client = get_openai_client( + session=session, + org_id=org_id, + project_id=project_id, ) - # Mark all runs in this project as failed due to missing credentials - for eval_run in project_runs: - # Persist failure status to database - update_evaluation_run( - session=session, - eval_run=eval_run, - status="failed", - error_message="Missing OpenAI or Langfuse credentials", - ) - - all_results.append( - { - "run_id": eval_run.id, - "run_name": eval_run.run_name, - "action": "failed", - "error": "Missing OpenAI or Langfuse credentials", - } - ) - total_failed_count += 1 - continue - - # Configure clients - openai_client, openai_success = configure_openai(openai_credentials) - langfuse, langfuse_success = configure_langfuse(langfuse_credentials) - - if not openai_success or not langfuse_success: + langfuse = get_langfuse_client( + session=session, + org_id=org_id, + project_id=project_id, + ) + except HTTPException as http_exc: logger.error( - f"[poll_all_pending_evaluations] Failed to configure clients | org_id={org_id} | project_id={project_id}" + f"[poll_all_pending_evaluations] Failed to get API clients | org_id={org_id} | project_id={project_id} | error={http_exc.detail}" ) - # Mark all runs in this project as failed due to client configuration + # Mark all runs in this project as failed due to client configuration error for eval_run in project_runs: # Persist failure status to database update_evaluation_run( session=session, eval_run=eval_run, status="failed", - error_message="Failed to configure API clients", + error_message=http_exc.detail, ) 
all_results.append( @@ -743,7 +713,7 @@ async def poll_all_pending_evaluations(session: Session, org_id: int) -> dict[st "run_id": eval_run.id, "run_name": eval_run.run_name, "action": "failed", - "error": "Failed to configure API clients", + "error": http_exc.detail, } ) total_failed_count += 1 diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index fbf39794a..5538ab146 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -1,5 +1,5 @@ import io -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest from sqlmodel import select @@ -57,8 +57,8 @@ def test_upload_dataset_valid_csv( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, @@ -66,8 +66,8 @@ def test_upload_dataset_valid_csv( # Mock object store upload mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" - # Mock Langfuse configuration - mock_configure_langfuse.return_value = (None, True) + # Mock Langfuse client + mock_get_langfuse_client.return_value = Mock() # Mock Langfuse upload mock_langfuse_upload.return_value = ("test_dataset_id", 9) @@ -141,15 +141,15 @@ def test_upload_dataset_empty_rows( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): # Mock object store and Langfuse uploads mock_store_upload.return_value = 
"s3://bucket/datasets/test_dataset.csv" - mock_configure_langfuse.return_value = (None, True) + mock_get_langfuse_client.return_value = Mock() mock_langfuse_upload.return_value = ("test_dataset_id", 4) filename, file_obj = create_csv_file(csv_with_empty_rows) @@ -185,14 +185,14 @@ def test_upload_with_default_duplication( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" - mock_configure_langfuse.return_value = (None, True) + mock_get_langfuse_client.return_value = Mock() mock_langfuse_upload.return_value = ("test_dataset_id", 15) filename, file_obj = create_csv_file(valid_csv_content) @@ -224,14 +224,14 @@ def test_upload_with_custom_duplication( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" - mock_configure_langfuse.return_value = (None, True) + mock_get_langfuse_client.return_value = Mock() mock_langfuse_upload.return_value = ("test_dataset_id", 12) filename, file_obj = create_csv_file(valid_csv_content) @@ -263,14 +263,14 @@ def test_upload_with_description( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( 
"app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" - mock_configure_langfuse.return_value = (None, True) + mock_get_langfuse_client.return_value = Mock() mock_langfuse_upload.return_value = ("test_dataset_id", 9) filename, file_obj = create_csv_file(valid_csv_content) @@ -353,14 +353,14 @@ def test_upload_with_duplication_factor_boundary_minimum( "app.api.routes.evaluation.upload_csv_to_object_store" ) as mock_store_upload, patch( - "app.api.routes.evaluation.configure_langfuse" - ) as mock_configure_langfuse, + "app.api.routes.evaluation.get_langfuse_client" + ) as mock_get_langfuse_client, patch( "app.api.routes.evaluation.upload_dataset_to_langfuse_from_csv" ) as mock_langfuse_upload, ): mock_store_upload.return_value = "s3://bucket/datasets/test_dataset.csv" - mock_configure_langfuse.return_value = (None, True) + mock_get_langfuse_client.return_value = Mock() mock_langfuse_upload.return_value = ("test_dataset_id", 3) filename, file_obj = create_csv_file(valid_csv_content) diff --git a/backend/app/utils.py b/backend/app/utils.py index 360054dc8..094c36829 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -206,9 +206,7 @@ def get_openai_client(session: Session, org_id: int, project_id: int) -> OpenAI: ) -def get_langfuse_client( - session: Session, org_id: int, project_id: int -) -> Langfuse | None: +def get_langfuse_client(session: Session, org_id: int, project_id: int) -> Langfuse: """ Fetch Langfuse credentials for the current org/project and return a configured client. 
""" @@ -219,21 +217,32 @@ def get_langfuse_client( project_id=project_id, ) - has_credentials = ( - credentials - and "public_key" in credentials - and "secret_key" in credentials - and "host" in credentials - ) + if not credentials or not all( + key in credentials for key in ["public_key", "secret_key", "host"] + ): + logger.error( + f"[get_langfuse_client] Langfuse credentials not found or incomplete. | project_id: {project_id}" + ) + raise HTTPException( + status_code=400, + detail="Langfuse credentials not configured for this organization/project.", + ) - if has_credentials: + try: return Langfuse( public_key=credentials["public_key"], secret_key=credentials["secret_key"], host=credentials["host"], ) - - return None + except Exception as e: + logger.error( + f"[get_langfuse_client] Failed to configure Langfuse client. | project_id: {project_id} | error: {str(e)}", + exc_info=True, + ) + raise HTTPException( + status_code=500, + detail=f"Failed to configure Langfuse client: {str(e)}", + ) def handle_openai_error(e: openai.OpenAIError) -> str: From 34700c52cc5a64b5910abe3166ad361599ea9202 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 5 Nov 2025 22:49:30 +0530 Subject: [PATCH 59/64] update migration head --- ...=> 6fe772038a5a_create_evaluation_run_table.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename backend/app/alembic/versions/{d5747495bd7c_create_evaluation_run_table.py => 6fe772038a5a_create_evaluation_run_table.py} (97%) diff --git a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py b/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py similarity index 97% rename from backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py rename to backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py index 2d7e9b014..0ea65c7ec 100644 --- a/backend/app/alembic/versions/d5747495bd7c_create_evaluation_run_table.py +++ 
b/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py @@ -1,18 +1,18 @@ """create_evaluation_run_table, batch_job_table, and evaluation_dataset_table -Revision ID: d5747495bd7c -Revises: e7c68e43ce6f -Create Date: 2025-10-14 12:42:15.464302 +Revision ID: 6fe772038a5a +Revises: 219033c644de +Create Date: 2025-11-05 22:47:18.266070 """ +from alembic import op import sqlalchemy as sa import sqlmodel.sql.sqltypes -from alembic import op -from sqlalchemy.dialects import postgresql + # revision identifiers, used by Alembic. -revision = "d5747495bd7c" -down_revision = "e7c68e43ce6f" +revision = "6fe772038a5a" +down_revision = "219033c644de" branch_labels = None depends_on = None From 97554036f59b709e48bd1cddd86b04fd5fdfc9cb Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 5 Nov 2025 23:47:30 +0530 Subject: [PATCH 60/64] refactoring and cleanups --- ...fe772038a5a_create_evaluation_run_table.py | 1 + backend/app/core/batch/__init__.py | 2 +- .../batch/{provider_interface.py => base.py} | 0 .../batch/{openai_provider.py => openai.py} | 2 +- backend/app/crud/batch_operations.py | 166 +++--------------- backend/app/crud/evaluations/batch.py | 2 +- backend/app/crud/evaluations/embeddings.py | 2 +- backend/app/crud/evaluations/processing.py | 8 +- docker-compose.yml | 17 -- 9 files changed, 34 insertions(+), 166 deletions(-) rename backend/app/core/batch/{provider_interface.py => base.py} (100%) rename backend/app/core/batch/{openai_provider.py => openai.py} (99%) diff --git a/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py b/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py index 0ea65c7ec..c9fd595aa 100644 --- a/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py +++ b/backend/app/alembic/versions/6fe772038a5a_create_evaluation_run_table.py @@ -7,6 +7,7 @@ """ from alembic import op import sqlalchemy as sa +from sqlalchemy.dialects import postgresql import sqlmodel.sql.sqltypes 
diff --git a/backend/app/core/batch/__init__.py b/backend/app/core/batch/__init__.py index 73ee2fe93..9f7cd88d5 100644 --- a/backend/app/core/batch/__init__.py +++ b/backend/app/core/batch/__init__.py @@ -1,5 +1,5 @@ """Batch processing infrastructure for LLM providers.""" -from .provider_interface import BatchProvider +from .base import BatchProvider __all__ = ["BatchProvider"] diff --git a/backend/app/core/batch/provider_interface.py b/backend/app/core/batch/base.py similarity index 100% rename from backend/app/core/batch/provider_interface.py rename to backend/app/core/batch/base.py diff --git a/backend/app/core/batch/openai_provider.py b/backend/app/core/batch/openai.py similarity index 99% rename from backend/app/core/batch/openai_provider.py rename to backend/app/core/batch/openai.py index eadaa6d70..8bb4abe6a 100644 --- a/backend/app/core/batch/openai_provider.py +++ b/backend/app/core/batch/openai.py @@ -6,7 +6,7 @@ from openai import OpenAI -from .provider_interface import BatchProvider +from .base import BatchProvider logger = logging.getLogger(__name__) diff --git a/backend/app/crud/batch_operations.py b/backend/app/crud/batch_operations.py index 11aa74543..f2bb332e8 100644 --- a/backend/app/crud/batch_operations.py +++ b/backend/app/crud/batch_operations.py @@ -5,7 +5,7 @@ from sqlmodel import Session -from app.core.batch.provider_interface import BatchProvider +from app.core.batch.base import BatchProvider from app.core.cloud import get_cloud_storage from app.core.storage_utils import upload_jsonl_to_object_store as shared_upload_jsonl from app.crud.batch_job import ( @@ -30,34 +30,17 @@ def start_batch_job( """ Create and start a batch job with the specified provider. - This orchestrates the complete batch creation workflow: - 1. Create batch_job record in DB with status='pending' - 2. Call provider to upload data and create batch - 3. 
Update batch_job with provider IDs and status='processing' - - Args: - session: Database session - provider: BatchProvider instance (e.g., OpenAIBatchProvider) - provider_name: Provider name (e.g., "openai", "anthropic") - job_type: Job type (e.g., "evaluation", "classification") - organization_id: Organization ID - project_id: Project ID - jsonl_data: List of dictionaries representing JSONL lines - config: Complete batch configuration including provider-specific params + Creates a batch_job record, calls the provider to create the batch, + and updates the record with provider IDs. Returns: - BatchJob object with provider IDs populated - - Raises: - Exception: If batch creation fails + BatchJob with provider IDs populated """ logger.info( - f"[start_batch_job] Starting batch job | provider={provider_name} | " - f"job_type={job_type} | org_id={organization_id} | project_id={project_id} | " - f"items={len(jsonl_data)}" + f"[start_batch_job] Starting | provider={provider_name} | type={job_type} | " + f"org={organization_id} | project={project_id} | items={len(jsonl_data)}" ) - # Step 1: Create batch_job record batch_job_create = BatchJobCreate( provider=provider_name, job_type=job_type, @@ -70,13 +53,8 @@ def start_batch_job( batch_job = create_batch_job(session=session, batch_job_create=batch_job_create) try: - # Step 2: Call provider to create batch - logger.info( - f"[start_batch_job] Creating batch with provider | provider={provider_name}" - ) batch_result = provider.create_batch(jsonl_data=jsonl_data, config=config) - # Step 3: Update batch_job with provider IDs batch_job_update = BatchJobUpdate( provider_batch_id=batch_result["provider_batch_id"], provider_file_id=batch_result["provider_file_id"], @@ -89,18 +67,15 @@ def start_batch_job( ) logger.info( - f"[start_batch_job] Successfully started batch job | id={batch_job.id} | " + f"[start_batch_job] Success | id={batch_job.id} | " f"provider_batch_id={batch_job.provider_batch_id}" ) return batch_job except 
Exception as e: - logger.error( - f"[start_batch_job] Failed to start batch job | {e}", exc_info=True - ) + logger.error(f"[start_batch_job] Failed | {e}", exc_info=True) - # Store error in batch_job (parent table will handle status) batch_job_update = BatchJobUpdate( error_message=f"Batch creation failed: {str(e)}" ) @@ -114,41 +89,24 @@ def start_batch_job( def poll_batch_status( session: Session, provider: BatchProvider, batch_job: BatchJob ) -> dict[str, Any]: - """ - Poll provider for batch status and update database. - - Args: - session: Database session - provider: BatchProvider instance - batch_job: BatchJob object - - Returns: - Dictionary with status information from provider - - Raises: - Exception: If polling fails - """ + """Poll provider for batch status and update database.""" logger.info( - f"[poll_batch_status] Polling batch status | id={batch_job.id} | " + f"[poll_batch_status] Polling | id={batch_job.id} | " f"provider_batch_id={batch_job.provider_batch_id}" ) try: - # Poll provider for status status_result = provider.get_batch_status(batch_job.provider_batch_id) - # Update batch_job if status changed provider_status = status_result["provider_status"] if provider_status != batch_job.provider_status: update_data = {"provider_status": provider_status} - # Update output file ID if available if status_result.get("provider_output_file_id"): update_data["provider_output_file_id"] = status_result[ "provider_output_file_id" ] - # Update error message if failed if status_result.get("error_message"): update_data["error_message"] = status_result["error_message"] @@ -158,43 +116,28 @@ def poll_batch_status( ) logger.info( - f"[poll_batch_status] Updated batch_job status | id={batch_job.id} | " + f"[poll_batch_status] Updated | id={batch_job.id} | " f"{batch_job.provider_status} -> {provider_status}" ) return status_result except Exception as e: - logger.error( - f"[poll_batch_status] Failed to poll batch status | {e}", exc_info=True - ) + 
logger.error(f"[poll_batch_status] Failed | {e}", exc_info=True) raise def download_batch_results( provider: BatchProvider, batch_job: BatchJob ) -> list[dict[str, Any]]: - """ - Download raw batch results from provider. - - Args: - provider: BatchProvider instance - batch_job: BatchJob object (must have provider_output_file_id) - - Returns: - List of result dictionaries from provider - - Raises: - ValueError: If output_file_id not available - Exception: If download fails - """ + """Download raw batch results from provider.""" if not batch_job.provider_output_file_id: raise ValueError( f"Batch job {batch_job.id} does not have provider_output_file_id" ) logger.info( - f"[download_batch_results] Downloading batch results | id={batch_job.id} | " + f"[download_batch_results] Downloading | id={batch_job.id} | " f"output_file_id={batch_job.provider_output_file_id}" ) @@ -202,16 +145,14 @@ def download_batch_results( results = provider.download_batch_results(batch_job.provider_output_file_id) logger.info( - f"[download_batch_results] Downloaded results | batch_job_id={batch_job.id} | results={len(results)}" + f"[download_batch_results] Downloaded | batch_job_id={batch_job.id} | " + f"results={len(results)}" ) return results except Exception as e: - logger.error( - f"[download_batch_results] Failed to download batch results | {e}", - exc_info=True, - ) + logger.error(f"[download_batch_results] Failed | {e}", exc_info=True) raise @@ -224,29 +165,14 @@ def process_completed_batch( """ Process a completed batch: download results and optionally upload to object store. 
- Args: - session: Database session - provider: BatchProvider instance - batch_job: BatchJob object - upload_to_object_store: Whether to upload raw results to object store - Returns: Tuple of (results, object_store_url) - - results: List of result dictionaries - - object_store_url: Object store URL if uploaded, None otherwise - - Raises: - Exception: If processing fails """ - logger.info( - f"[process_completed_batch] Processing completed batch | id={batch_job.id}" - ) + logger.info(f"[process_completed_batch] Processing | id={batch_job.id}") try: - # Download results results = download_batch_results(provider=provider, batch_job=batch_job) - # Upload to object store if requested object_store_url = None if upload_to_object_store: try: @@ -254,16 +180,15 @@ def process_completed_batch( session=session, batch_job=batch_job, results=results ) logger.info( - f"[process_completed_batch] Uploaded batch results to object store | {object_store_url}" + f"[process_completed_batch] Uploaded to object store | {object_store_url}" ) except Exception as store_error: logger.warning( - f"[process_completed_batch] Object store upload failed (credentials may not be configured) | " - f"{store_error} | Continuing without object store storage", + f"[process_completed_batch] Object store upload failed " + f"(credentials may not be configured) | {store_error}", exc_info=True, ) - # Update batch_job with object store URL if object_store_url: batch_job_update = BatchJobUpdate(raw_output_url=object_store_url) update_batch_job( @@ -273,46 +198,24 @@ def process_completed_batch( return results, object_store_url except Exception as e: - logger.error( - f"[process_completed_batch] Failed to process completed batch | {e}", - exc_info=True, - ) + logger.error(f"[process_completed_batch] Failed | {e}", exc_info=True) raise def upload_batch_results_to_object_store( session: Session, batch_job: BatchJob, results: list[dict[str, Any]] ) -> str | None: - """ - Upload batch results to object store. 
- - This function uses the shared storage utility for consistent upload behavior. - - Args: - session: Database session (for getting cloud storage) - batch_job: BatchJob object - results: List of result dictionaries - - Returns: - Object store URL if successful, None if failed - - Raises: - Exception: If upload fails - """ + """Upload batch results to object store.""" logger.info( - f"[upload_batch_results_to_object_store] Uploading batch results to object store | batch_job_id={batch_job.id}" + f"[upload_batch_results_to_object_store] Uploading | batch_job_id={batch_job.id}" ) try: - # Get cloud storage instance storage = get_cloud_storage(session=session, project_id=batch_job.project_id) - # Define subdirectory and filename - # Format: {job_type}/batch-{id}/results.jsonl subdirectory = f"{batch_job.job_type}/batch-{batch_job.id}" filename = "results.jsonl" - # Use shared utility for upload object_store_url = shared_upload_jsonl( storage=storage, results=results, @@ -324,25 +227,6 @@ def upload_batch_results_to_object_store( except Exception as e: logger.error( - f"[upload_batch_results_to_object_store] Failed to upload batch results to object store | {e}", - exc_info=True, + f"[upload_batch_results_to_object_store] Failed | {e}", exc_info=True ) raise - - -# Backward compatibility alias -upload_batch_results_to_s3 = upload_batch_results_to_object_store - - -# NOTE: Batch-level polling has been removed from this module. -# Polling should be done at the parent table level (e.g., evaluation_run) -# because only the parent knows when its business logic is complete. -# -# For example: -# - poll_all_pending_evaluations() in evaluation_processing.py -# - poll_all_pending_classifications() in classification_processing.py (future) -# -# Each parent-specific polling function should: -# 1. Query parent table for status="processing" -# 2. Poll batch_job.provider_status via poll_batch_status() -# 3. 
Update parent table status based on business logic diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 8f3ef092d..7e8b69043 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -14,7 +14,7 @@ from openai import OpenAI from sqlmodel import Session -from app.core.batch.openai_provider import OpenAIBatchProvider +from app.core.batch.openai import OpenAIBatchProvider from app.crud.batch_operations import start_batch_job from app.models import EvaluationRun diff --git a/backend/app/crud/evaluations/embeddings.py b/backend/app/crud/evaluations/embeddings.py index 77ea39251..70e374211 100644 --- a/backend/app/crud/evaluations/embeddings.py +++ b/backend/app/crud/evaluations/embeddings.py @@ -15,7 +15,7 @@ from openai import OpenAI from sqlmodel import Session -from app.core.batch.openai_provider import OpenAIBatchProvider +from app.core.batch.openai import OpenAIBatchProvider from app.core.util import now from app.crud.batch_operations import start_batch_job from app.models import EvaluationRun diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 6b986b9cb..48c9f43d9 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -19,7 +19,7 @@ from openai import OpenAI from sqlmodel import Session, select -from app.core.batch.openai_provider import OpenAIBatchProvider +from app.core.batch.openai import OpenAIBatchProvider from app.crud.batch_job import get_batch_job from app.crud.batch_operations import ( download_batch_results, @@ -405,9 +405,7 @@ async def process_completed_embedding_batch( # Step 7: Mark evaluation as completed eval_run = update_evaluation_run( - session=session, - eval_run=eval_run, - status="completed", + session=session, eval_run=eval_run, status="completed", score=eval_run.score ) logger.info( @@ -474,6 +472,8 @@ async def check_and_process_evaluation( if 
embedding_batch_job: # Poll embedding batch status provider = OpenAIBatchProvider(client=openai_client) + + # Local import to avoid circular dependency with batch_operations from app.crud.batch_operations import poll_batch_status poll_batch_status( diff --git a/docker-compose.yml b/docker-compose.yml index 5e7ed313d..10fc0d914 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -152,23 +152,6 @@ services: RABBITMQ_HOST: rabbitmq command: ["uv", "run", "celery", "-A", "app.celery.celery_app", "worker", "--loglevel=info"] - celery_beat: - image: "${DOCKER_IMAGE_BACKEND?Variable not set}:${TAG:-latest}" - container_name: celery-beat - restart: always - build: - context: ./backend - depends_on: - backend: - condition: service_healthy - env_file: - - .env - environment: - POSTGRES_SERVER: db - REDIS_HOST: redis - RABBITMQ_HOST: rabbitmq - command: ["uv", "run", "celery", "-A", "app.celery.celery_app", "beat", "--loglevel=info"] - celery_flower: image: "${DOCKER_IMAGE_BACKEND?Variable not set}:${TAG:-latest}" container_name: celery-flower From 2b3829345d149c0fe28bb05e92174111941fefcc Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 6 Nov 2025 00:00:01 +0530 Subject: [PATCH 61/64] cleanup cron --- backend/app/api/routes/cron.py | 10 ++++++---- backend/app/crud/evaluations/core.py | 1 - backend/app/crud/evaluations/processing.py | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/app/api/routes/cron.py b/backend/app/api/routes/cron.py index f4426c8f1..a9e7b66ed 100644 --- a/backend/app/api/routes/cron.py +++ b/backend/app/api/routes/cron.py @@ -1,10 +1,12 @@ import logging +from app.api.permissions import Permission, require_permission from fastapi import APIRouter, Depends from sqlmodel import Session -from app.api.deps import get_current_active_superuser, get_db +from app.api.deps import SessionDep, AuthContextDep from app.crud.evaluations import process_all_pending_evaluations_sync +from app.models import User logger = 
logging.getLogger(__name__) @@ -13,11 +15,11 @@ @router.get( "/cron/evaluations", - include_in_schema=True, - dependencies=[Depends(get_current_active_superuser)], + include_in_schema=False, + dependencies=[Depends(require_permission(Permission.SUPERUSER))], ) def evaluation_cron_job( - session: Session = Depends(get_db), + session: SessionDep, ) -> dict: """ Cron job endpoint for periodic evaluation tasks. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 70ddb0766..a964f26b9 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -6,7 +6,6 @@ from sqlmodel import Session, select from app.core.util import now -from app.crud.credentials import get_provider_credential from app.models import EvaluationRun, UserProjectOrg from app.models.evaluation import DatasetUploadResponse from app.utils import get_langfuse_client diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py index 48c9f43d9..50698d00c 100644 --- a/backend/app/crud/evaluations/processing.py +++ b/backend/app/crud/evaluations/processing.py @@ -25,7 +25,6 @@ download_batch_results, upload_batch_results_to_object_store, ) -from app.crud.credentials import get_provider_credential from app.crud.evaluations.batch import fetch_dataset_items from app.crud.evaluations.core import update_evaluation_run from app.crud.evaluations.embeddings import ( From dce502b1efb1a0364b76248c4d7cb4b0472d905a Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 6 Nov 2025 12:40:14 +0530 Subject: [PATCH 62/64] moving to env for cron --- .env.example | 6 ++++++ scripts/python/invoke-cron.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index a6df893df..7c2cd9f02 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,12 @@ FIRST_SUPERUSER=superuser@example.com FIRST_SUPERUSER_PASSWORD=changethis EMAIL_TEST_USER="test@example.com" +# API 
Base URL for cron scripts (defaults to http://localhost:8000 if not set) +API_BASE_URL=http://localhost:8000 + +# Cron interval in minutes (defaults to 5 minutes if not set) +CRON_INTERVAL_MINUTES=5 + # Postgres POSTGRES_SERVER=localhost POSTGRES_PORT=5432 diff --git a/scripts/python/invoke-cron.py b/scripts/python/invoke-cron.py index 306f2711c..cbc42a82b 100644 --- a/scripts/python/invoke-cron.py +++ b/scripts/python/invoke-cron.py @@ -15,8 +15,6 @@ from dotenv import load_dotenv # Configuration -INTERVAL_MINUTES = 1 # How often to invoke the endpoint -BASE_URL = "http://localhost:8000" # Base URL of the API ENDPOINT = "/api/v1/cron/evaluations" # Endpoint to invoke REQUEST_TIMEOUT = 30 # Timeout for requests in seconds @@ -32,9 +30,14 @@ class EndpointInvoker: """Handles periodic endpoint invocation with authentication.""" def __init__(self): - self.base_url = BASE_URL.rstrip("/") + # Load BASE_URL from environment with default fallback + base_url = os.getenv("API_BASE_URL", "http://localhost:8000") + self.base_url = base_url.rstrip("/") self.endpoint = ENDPOINT - self.interval_seconds = INTERVAL_MINUTES * 60 + + # Load interval from environment with default of 5 minutes + self.interval_minutes = int(os.getenv("CRON_INTERVAL_MINUTES", "5")) + self.interval_seconds = self.interval_minutes * 60 self.access_token = None self.token_expiry = None @@ -127,8 +130,9 @@ async def invoke_endpoint(self, client: httpx.AsyncClient) -> dict: async def run(self): """Main loop to invoke endpoint periodically.""" + logger.info(f"Using API Base URL: {self.base_url}") logger.info( - f"Starting cron job - invoking {self.endpoint} every {INTERVAL_MINUTES} minutes" + f"Starting cron job - invoking {self.endpoint} every {self.interval_minutes} minutes" ) # Use async context manager to ensure proper cleanup From 8ad6982c6e4650a4ffb82d5cf4f1e1c0b0d2f22c Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 6 Nov 2025 12:42:17 +0530 Subject: [PATCH 63/64] formatting code --- 
backend/app/crud/evaluations/cron.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/app/crud/evaluations/cron.py b/backend/app/crud/evaluations/cron.py index a670a547f..ca6bd2af2 100644 --- a/backend/app/crud/evaluations/cron.py +++ b/backend/app/crud/evaluations/cron.py @@ -85,7 +85,9 @@ async def process_all_pending_evaluations(session: Session) -> dict[str, Any]: ) # Poll all pending evaluations for this org - summary = await poll_all_pending_evaluations(session=session, org_id=org.id) + summary = await poll_all_pending_evaluations( + session=session, org_id=org.id + ) results.append( { @@ -105,7 +107,9 @@ async def process_all_pending_evaluations(session: Session) -> dict[str, Any]: exc_info=True, ) session.rollback() - results.append({"org_id": org.id, "org_name": org.name, "error": str(e)}) + results.append( + {"org_id": org.id, "org_name": org.name, "error": str(e)} + ) total_failed += 1 logger.info( From c08d6267e3c9758add6ee7820cc0fc2c797f4cb9 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 6 Nov 2025 12:54:50 +0530 Subject: [PATCH 64/64] updated endpoints --- backend/app/api/routes/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/api/routes/evaluation.py b/backend/app/api/routes/evaluation.py index 6d091b31a..a62048d13 100644 --- a/backend/app/api/routes/evaluation.py +++ b/backend/app/api/routes/evaluation.py @@ -294,7 +294,7 @@ async def upload_dataset( @router.get( - "/evaluations/datasets/list", + "/evaluations/datasets", description=load_description("evaluation/list_datasets.md"), response_model=list[DatasetUploadResponse], ) @@ -566,7 +566,7 @@ def evaluate( @router.get( - "/evaluations/list", + "/evaluations", description=load_description("evaluation/list_evaluations.md"), response_model=list[EvaluationRunPublic], )