From c939d3a569a0b14688dd517bbb42f58b33a9b821 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Wed, 17 Sep 2025 18:51:20 +0530
Subject: [PATCH 01/18] experimenting with Claude

---
 CLAUDE.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..f8193593
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,105 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Kaapi is an AI platform built with FastAPI (backend) and PostgreSQL (database), containerized with Docker. The platform provides AI capabilities including OpenAI assistants, fine-tuning, document processing, and collection management.
+
+## Key Commands
+
+### Development
+
+```bash
+# Start development environment with auto-reload
+source .venv/bin/activate
+fastapi run --reload app/main.py
+
+# Run backend tests
+uv run bash scripts/tests-start.sh
+
+# Seed data
+uv run python -m app.seed_data.seed_data
+
+# Run pre-commit
+uv run pre-commit run --all-files
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Run linting and type checking
+cd backend && bash scripts/lint.sh
+
+# Generate a new migration
+alembic revision --autogenerate -m 'Add new meta'
+```
+
+### Testing
+
+```bash
+# Run backend tests
+uv run bash scripts/tests-start.sh
+```
+
+## Architecture
+
+### Backend Structure
+
+The backend follows a layered architecture:
+
+- **API Layer** (`backend/app/api/`): FastAPI routes organized by domain
+  - Authentication (`login.py`)
+  - Core resources: `users.py`, `organizations.py`, `projects.py`
+  - AI features: `assistants.py`, `fine_tuning.py`, `openai_conversation.py`
+  - Document management: `documents.py`, `collections.py`, `doc_transformation_job.py`
+
+- **Models** (`backend/app/models/`): SQLModel entities representing database tables
+  - User system: User, Organization, Project, ProjectUser
+  - AI components: Assistant, Thread, Message, FineTuning
+  - Document system: Document, Collection, DocumentCollection, DocTransformationJob
+
+- **CRUD Operations** (`backend/app/crud/`): Database operations for each model
+
+- **Core Services** (`backend/app/core/`):
+  - `providers.py`: OpenAI client management
+  - `finetune/`: Fine-tuning pipeline (preprocessing, evaluation)
+  - `doctransform/`: Document transformation services
+  - `cloud/storage.py`: S3 storage integration
+  - `langfuse/`: Observability and tracing
+
+### Database
+
+PostgreSQL with Alembic migrations. Key relationships:
+- Organizations contain Projects
+- Projects have Users (many-to-many via ProjectUser)
+- Projects contain Collections and Documents
+- Documents can belong to Collections (many-to-many)
+- Projects have Assistants, Threads, and FineTuning jobs
+
+### Authentication & Security
+
+- JWT-based authentication
+- API key support for programmatic access
+- Role-based access control (User, Admin, Super Admin)
+- Organization and project-level permissions
+
+## Environment Configuration
+
+Critical environment variables:
+- `SECRET_KEY`: JWT signing key
+- `POSTGRES_*`: Database connection
+- `LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY`: OpenAI API key
+- `AWS_S3_BUCKET_PREFIX`: S3 storage configuration
+- `LANGFUSE_*`: Observability configuration
+
+## Testing Strategy
+
+- Unit tests in `backend/app/tests/`
+- Test fixtures use factory pattern
+- Mock external services (OpenAI, S3) using `moto` and `openai_responses`
+- Coverage reports generated automatically
+
+## Code Standards
+
+- Python 3.11+ with type hints
+- Pre-commit hooks configured for consistency

From 1482a0c141acc84f88b21bf6b9392f0be765893f Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Thu, 18 Sep 2025 16:03:31 +0530
Subject: [PATCH 02/18] first stab with Claude

---
 backend/app/api/routes/fine_tuning.py | 66 ++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index 9d053393..c04a23a6 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -1,17 +1,19 @@
 from typing import Optional
 import logging
 import time
-from uuid import UUID
+from uuid import UUID, uuid4
+from pathlib import Path
 
 import openai
 from sqlmodel import Session
-from fastapi import APIRouter, HTTPException, BackgroundTasks
+from fastapi import APIRouter, HTTPException, BackgroundTasks, File, Form, UploadFile
 
 from app.models import (
     FineTuningJobCreate,
     FineTuningJobPublic,
     FineTuningUpdate,
     FineTuningStatus,
+    Document,
 )
 from app.core.cloud import get_cloud_storage
 from app.crud.document import DocumentCrud
@@ -179,12 +181,40 @@
     description=load_description("fine_tuning/create.md"),
     response_model=APIResponse,
 )
-def fine_tune_from_CSV(
+async def fine_tune_from_CSV(
     session: SessionDep,
     current_user: CurrentUserOrgProject,
-    request: FineTuningJobCreate,
     background_tasks: BackgroundTasks,
+    file: UploadFile = File(..., description="CSV file to use for fine-tuning"),
+    base_model: str = Form(
+        ..., description="Base model for fine-tuning (e.g., gpt-3.5-turbo)"
+    ),
+    split_ratio: str = Form(
+        ..., description="Comma-separated split ratios (e.g., '0.8' or '0.7,0.8,0.9')"
+    ),
+    system_prompt: str = Form(..., description="System prompt for the fine-tuning job"),
 ):
+    # Validate and parse split ratios
+    try:
+        split_ratios = [float(r.strip()) for r in split_ratio.split(",")]
+        for ratio in split_ratios:
+            if not (0 < ratio < 1):
+                raise ValueError(
+                    f"Invalid split_ratio: {ratio}. Must be between 0 and 1."
+                )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    # Validate system prompt
+    if not system_prompt.strip():
+        raise HTTPException(
+            status_code=400, detail="System prompt must be a non-empty string"
+        )
+
+    # Validate file is CSV
+    if not file.filename.lower().endswith(".csv"):
+        raise HTTPException(status_code=400, detail="File must be a CSV file")
+
     client = get_openai_client(  # Used here only to validate the user's OpenAI key;
         # the actual client is re-initialized separately inside the background task
         session,
@@ -192,9 +222,31 @@
         current_user.organization_id,
         current_user.project_id,
     )
 
+    # Upload the file to storage and create document
+    storage = get_cloud_storage(session=session, project_id=current_user.project_id)
+    document_id = uuid4()
+    object_store_url = storage.put(file, Path(str(document_id)))
+
+    # Create document in database
+    document_crud = DocumentCrud(session, current_user.project_id)
+    document = Document(
+        id=document_id,
+        fname=file.filename,
+        object_store_url=str(object_store_url),
+    )
+    created_document = document_crud.update(document)
+
+    # Create FineTuningJobCreate request object
+    request = FineTuningJobCreate(
+        document_id=created_document.id,
+        base_model=base_model,
+        split_ratio=split_ratios,
+        system_prompt=system_prompt.strip(),
+    )
+
     results = []
-    for ratio in request.split_ratio:
+    for ratio in split_ratios:
         job, created = create_fine_tuning_job(
             session=session,
             request=request,
@@ -237,7 +289,9 @@
         else f"Started {created_count} job(s); {total - created_count} active fine-tuning job(s) already exists."
     )
 
-    return APIResponse.success_response({"message": message, "jobs": job_infos})
+    return APIResponse.success_response(
+        {"message": message, "document_id": str(created_document.id), "jobs": job_infos}
+    )
 
 
 @router.get(

From c3364d5b0dc81ef7e4c1aa1fe91346f2e1de3cd5 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Thu, 18 Sep 2025 16:03:41 +0530
Subject: [PATCH 03/18] first stab with Claude

---
 CLAUDE.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index f8193593..2dc7be8a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -27,9 +27,6 @@ uv run pre-commit run --all-files
 # Activate virtual environment
 source .venv/bin/activate
 
-# Run linting and type checking
-cd backend && bash scripts/lint.sh
-
 # Generate a new migration
 alembic revision --autogenerate -m 'Add new meta'
 ```
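PATCH 02 above switches the create endpoint from a JSON body to multipart form data, one fine-tuning job per comma-separated split ratio. A minimal client-side sketch of the new call shape — the base URL and API-key header name here are assumptions for illustration, not taken from this series:

```python
import requests

API_BASE = "http://localhost:8000/api/v1"  # assumed local dev server
HEADERS = {"X-API-KEY": "your-project-api-key"}  # header name assumed

with open("training_data.csv", "rb") as f:
    response = requests.post(
        f"{API_BASE}/fine_tuning/fine_tune",
        headers=HEADERS,
        # Multipart upload: the CSV goes in `files`, scalars in `data`
        files={"file": ("training_data.csv", f, "text/csv")},
        data={
            "base_model": "gpt-4",
            "split_ratio": "0.5,0.7,0.9",  # one job per ratio
            "system_prompt": "you are a model able to classify",
        },
        timeout=60,
    )

response.raise_for_status()
payload = response.json()
print(payload["data"]["message"], payload["data"]["jobs"])
```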
From ebcd9a0ede5852bee87bb2f9f22adaa770a60826 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Mon, 22 Sep 2025 22:53:16 +0530
Subject: [PATCH 04/18] adding logic to trigger evaluation automatically when
 status changes from running to completed

---
 backend/app/api/routes/fine_tuning.py   | 68 +++++++++++++++++++++----
 backend/app/api/routes/responses.py     | 21 +-------
 backend/app/api/routes/threads.py       |  9 +---
 backend/app/core/finetune/evaluation.py |  2 +-
 backend/app/utils.py                    |  8 +++
 5 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index c04a23a6..e2c3b5fb 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -14,19 +14,30 @@
     FineTuningUpdate,
     FineTuningStatus,
     Document,
+    ModelEvaluationBase,
+    ModelEvaluationStatus,
 )
 from app.core.cloud import get_cloud_storage
 from app.crud.document import DocumentCrud
-from app.utils import get_openai_client, APIResponse, mask_string, load_description
+from app.utils import (
+    get_openai_client,
+    APIResponse,
+    mask_string,
+    load_description,
+    handle_openai_error,
+)
 from app.crud import (
     create_fine_tuning_job,
     fetch_by_id,
     update_finetune_job,
     fetch_by_document_id,
+    create_model_evaluation,
+    fetch_active_model_evals,
 )
 from app.core.db import engine
 from app.api.deps import CurrentUserOrgProject, SessionDep
 from app.core.finetune.preprocessing import DataPreprocessor
+from app.api.routes.model_evaluation import run_model_evaluation
 
 logger = logging.getLogger(__name__)
@@ -43,13 +54,6 @@
 }
 
 
-def handle_openai_error(e: openai.OpenAIError) -> str:
-    """Extract error message from OpenAI error."""
-    if isinstance(e.body, dict) and "message" in e.body:
-        return e.body["message"]
-    return str(e)
-
-
 def process_fine_tuning_job(
     job_id: int,
     ratio: float,
@@ -300,7 +304,10 @@
     response_model=APIResponse[FineTuningJobPublic],
 )
 def refresh_fine_tune_status(
-    fine_tuning_id: int, session: SessionDep, current_user: CurrentUserOrgProject
+    fine_tuning_id: int,
+    background_tasks: BackgroundTasks,
+    session: SessionDep,
+    current_user: CurrentUserOrgProject,
 ):
     project_id = current_user.project_id
     job = fetch_by_id(session, fine_tuning_id, project_id)
@@ -336,12 +343,55 @@
         error_message=openai_error_msg,
     )
 
+    # Check if status is changing from running to completed
+    is_newly_completed = (
+        job.status == FineTuningStatus.running
+        and update_payload.status == FineTuningStatus.completed
+    )
+
     if (
         job.status != update_payload.status
         or job.fine_tuned_model != update_payload.fine_tuned_model
     ):
         job = update_finetune_job(session=session, job=job, update=update_payload)
 
+    # If the job just completed, automatically trigger evaluation
+    if is_newly_completed:
+        logger.info(
+            f"[refresh_fine_tune_status] Fine-tuning job completed, triggering evaluation | "
+            f"fine_tuning_id={fine_tuning_id}, project_id={project_id}"
+        )
+
+        # Check if there's already an active evaluation for this job
+        active_evaluations = fetch_active_model_evals(
+            session, fine_tuning_id, project_id
+        )
+
+        if not active_evaluations:
+            # Create a new evaluation
+            model_eval = create_model_evaluation(
+                session=session,
+                request=ModelEvaluationBase(fine_tuning_id=fine_tuning_id),
+                project_id=project_id,
+                organization_id=current_user.organization_id,
+                status=ModelEvaluationStatus.pending,
+            )
+
+            # Queue the evaluation task
+            background_tasks.add_task(
+                run_model_evaluation, model_eval.id, current_user
+            )
+
+            logger.info(
+                f"[refresh_fine_tune_status] Created and queued evaluation | "
+                f"eval_id={model_eval.id}, fine_tuning_id={fine_tuning_id}, project_id={project_id}"
+            )
+        else:
+            logger.info(
+                f"[refresh_fine_tune_status] Skipping evaluation creation - active evaluation exists | "
+                f"fine_tuning_id={fine_tuning_id}, project_id={project_id}"
+            )
+
     job = job.model_copy(
         update={
             "train_data_file_url": storage.get_signed_url(job.train_data_s3_object)
diff --git a/backend/app/api/routes/responses.py b/backend/app/api/routes/responses.py
index 94e5f19d..d4e2389c 100644
--- a/backend/app/api/routes/responses.py
+++ b/backend/app/api/routes/responses.py
@@ -18,32 +18,13 @@
     get_conversation_by_ancestor_id,
 )
 from app.models import UserProjectOrg, OpenAIConversationCreate, OpenAIConversation
-from app.utils import APIResponse, mask_string
+from app.utils import APIResponse, mask_string, handle_openai_error
 from app.core.langfuse.langfuse import LangfuseTracer
 
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["responses"])
 
 
-def handle_openai_error(e: openai.OpenAIError) -> str:
-    """Extract error message from OpenAI error."""
-    # Try to get error message from different possible attributes
-    if hasattr(e, "body") and isinstance(e.body, dict) and "message" in e.body:
-        return e.body["message"]
-    elif hasattr(e, "message"):
-        return e.message
-    elif hasattr(e, "response") and hasattr(e.response, "json"):
-        try:
-            error_data = e.response.json()
-            if isinstance(error_data, dict) and "error" in error_data:
-                error_info = error_data["error"]
-                if isinstance(error_info, dict) and "message" in error_info:
-                    return error_info["message"]
-        except:
-            pass
-    return str(e)
-
-
 class ResponsesAPIRequest(BaseModel):
     assistant_id: str
     question: str
diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py
index 95630bfb..be7e0578 100644
--- a/backend/app/api/routes/threads.py
+++ b/backend/app/api/routes/threads.py
@@ -13,7 +13,7 @@
 from app.core import logging, settings
 from app.models import UserOrganization, OpenAIThreadCreate, UserProjectOrg
 from app.crud import upsert_thread_result, get_thread_result
-from app.utils import APIResponse, mask_string
+from app.utils import APIResponse, mask_string, handle_openai_error
 from app.crud.credentials import get_provider_credential
 from app.core.util import configure_openai
 from app.core.langfuse.langfuse import LangfuseTracer
@@ -49,13 +49,6 @@ def send_callback(callback_url: str, data: dict):
         return False
 
 
-def handle_openai_error(e: openai.OpenAIError) -> str:
-    """Extract error message from OpenAI error."""
-    if isinstance(e.body, dict) and "message" in e.body:
-        return e.body["message"]
-    return str(e)
-
-
 def validate_thread(client: OpenAI, thread_id: str) -> tuple[bool, str]:
     """Validate if a thread exists and has no active runs."""
     if not thread_id:
diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py
index 527087eb..4a85e85c 100644
--- a/backend/app/core/finetune/evaluation.py
+++ b/backend/app/core/finetune/evaluation.py
@@ -11,7 +11,7 @@
     matthews_corrcoef,
 )
 from app.core.cloud import AmazonCloudStorage
-from app.api.routes.fine_tuning import handle_openai_error
+from app.utils import handle_openai_error
 from app.core.finetune.preprocessing import DataPreprocessor
 
 
diff --git a/backend/app/utils.py b/backend/app/utils.py
index 1c03839a..8f96fd95 100644
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@@ -7,6 +7,7 @@
 
 import jwt
 import emails
+import openai
 from jinja2 import Template
 from jwt.exceptions import InvalidTokenError
 from fastapi import HTTPException
@@ -48,6 +49,13 @@ def failure_response(
         return cls(success=False, data=None, error=error_message, metadata=metadata)
 
 
+def handle_openai_error(e: openai.OpenAIError) -> str:
+    """Extract error message from OpenAI error."""
+    if hasattr(e, "body") and isinstance(e.body, dict) and "message" in e.body:
+        return e.body["message"]
+    return str(e)
+
+
 @dataclass
 class EmailData:
     html_content: str
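The guard added in PATCH 04 fires evaluation only on the running → completed edge, so repeated refreshes of an already-completed job stay idempotent. A standalone sketch of that edge check — enum values mirrored from `app/models/fine_tuning.py` as of this patch, helper name illustrative:

```python
from enum import Enum


class FineTuningStatus(str, Enum):
    # Mirrors app.models.fine_tuning.FineTuningStatus at this point in the series
    pending = "pending"
    running = "running"
    completed = "completed"
    failed = "failed"


def is_newly_completed(old: FineTuningStatus, new: FineTuningStatus) -> bool:
    """True only on the running -> completed transition."""
    return old == FineTuningStatus.running and new == FineTuningStatus.completed


# Evaluation fires once, on the edge; refreshing a finished job is a no-op.
assert is_newly_completed(FineTuningStatus.running, FineTuningStatus.completed)
assert not is_newly_completed(FineTuningStatus.completed, FineTuningStatus.completed)
assert not is_newly_completed(FineTuningStatus.pending, FineTuningStatus.running)
```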
From 00b415f5bacef5707928115a230183b39be3beb1 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 10:12:04 +0530
Subject: [PATCH 05/18] updating testcases

---
 .../app/tests/api/routes/test_fine_tuning.py | 121 +++++++++++-------
 1 file changed, 73 insertions(+), 48 deletions(-)

diff --git a/backend/app/tests/api/routes/test_fine_tuning.py b/backend/app/tests/api/routes/test_fine_tuning.py
index 5582b73f..9162044a 100644
--- a/backend/app/tests/api/routes/test_fine_tuning.py
+++ b/backend/app/tests/api/routes/test_fine_tuning.py
@@ -1,10 +1,30 @@
+import os
+import io
 import pytest
-
+from moto import mock_aws
 from unittest.mock import patch, MagicMock
+import boto3
 
 from app.tests.utils.test_data import create_test_fine_tuning_jobs
 from app.tests.utils.utils import get_document
-from app.models import Fine_Tuning
+from app.models import (
+    Fine_Tuning,
+    FineTuningStatus,
+    ModelEvaluation,
+    ModelEvaluationStatus,
+)
+from app.core.config import settings
+
+
+@pytest.fixture(scope="function")
+def aws_credentials():
+    """Set up AWS credentials for moto."""
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    os.environ["AWS_S3_BUCKET_PREFIX"] = "test-bucket"
 
 
 def create_file_mock(file_type):
@@ -22,73 +42,78 @@ def _side_effect(file=None, purpose=None):
 
     return _side_effect
 
 
-@pytest.mark.usefixtures("client", "db", "user_api_key_header")
-@patch("app.api.routes.fine_tuning.DataPreprocessor")
-@patch("app.api.routes.fine_tuning.get_openai_client")
+@pytest.mark.usefixtures("client", "db", "user_api_key_header", "aws_credentials")
 class TestCreateFineTuningJobAPI:
+    @mock_aws
     def test_finetune_from_csv_multiple_split_ratio(
         self,
-        mock_get_openai_client,
-        mock_preprocessor_cls,
         client,
         db,
         user_api_key_header,
     ):
-        document = get_document(db, "dalgo_sample.json")
+        # Setup S3 bucket for moto
+        s3 = boto3.client("s3", region_name="us-east-1")
+        s3.create_bucket(Bucket="test-bucket")
 
+        # Create a test CSV file content
+        csv_content = "prompt,label\ntest1,label1\ntest2,label2\ntest3,label3"
+
+        # Setup test files for preprocessing
         for path in ["/tmp/train.jsonl", "/tmp/test.jsonl"]:
             with open(path, "w") as f:
-                f.write("{}")
-
-        mock_preprocessor = MagicMock()
-        mock_preprocessor.process.return_value = {
-            "train_jsonl_temp_filepath": "/tmp/train.jsonl",
-            "train_csv_s3_object": "s3://bucket/train.csv",
-            "test_csv_s3_object": "s3://bucket/test.csv",
-        }
-        mock_preprocessor.cleanup = MagicMock()
-        mock_preprocessor_cls.return_value = mock_preprocessor
-
-        mock_openai = MagicMock()
-        mock_openai.files.create.side_effect = create_file_mock("fine-tune")
-        mock_openai.fine_tuning.jobs.create.side_effect = [
-            MagicMock(id=f"ft_mock_job_{i}", status="running") for i in range(1, 4)
-        ]
-        mock_get_openai_client.return_value = mock_openai
-
-        body = {
-            "document_id": str(document.id),
-            "base_model": "gpt-4",
-            "split_ratio": [0.5, 0.7, 0.9],
-            "system_prompt": "you are a model able to classify",
-        }
-
-        with patch("app.api.routes.fine_tuning.Session") as SessionMock:
-            SessionMock.return_value.__enter__.return_value = db
-            SessionMock.return_value.__exit__.return_value = None
-
-            response = client.post(
-                "/api/v1/fine_tuning/fine_tune",
-                json=body,
-                headers=user_api_key_header,
-            )
+                f.write('{"prompt": "test", "completion": "label"}')
+
+        with patch(
+            "app.api.routes.fine_tuning.get_cloud_storage"
+        ) as mock_get_cloud_storage:
+            with patch(
+                "app.api.routes.fine_tuning.get_openai_client"
+            ) as mock_get_openai_client:
+                with patch(
+                    "app.api.routes.fine_tuning.process_fine_tuning_job"
+                ) as mock_process_job:
+                    # Mock cloud storage
+                    mock_storage = MagicMock()
+                    mock_storage.put.return_value = "s3://test-bucket/test.csv"
+                    mock_get_cloud_storage.return_value = mock_storage
+
+                    # Mock OpenAI client (for validation only)
+                    mock_openai = MagicMock()
+                    mock_get_openai_client.return_value = mock_openai
+
+                    # Create file upload data
+                    csv_file = io.BytesIO(csv_content.encode())
+                    response = client.post(
+                        "/api/v1/fine_tuning/fine_tune",
+                        files={"file": ("test.csv", csv_file, "text/csv")},
+                        data={
+                            "base_model": "gpt-4",
+                            "split_ratio": "0.5,0.7,0.9",
+                            "system_prompt": "you are a model able to classify",
+                        },
+                        headers=user_api_key_header,
+                    )
 
         assert response.status_code == 200
         json_data = response.json()
         assert json_data["success"] is True
         assert json_data["data"]["message"] == "Fine-tuning job(s) started."
         assert json_data["metadata"] is None
+        assert "document_id" in json_data["data"]
+        assert "jobs" in json_data["data"]
+        assert len(json_data["data"]["jobs"]) == 3
+
+        # Verify that the background task was called for each split ratio
+        assert mock_process_job.call_count == 3
 
         jobs = db.query(Fine_Tuning).all()
         assert len(jobs) == 3
 
-        for i, job in enumerate(jobs, start=1):
+        for job in jobs:
             db.refresh(job)
-            assert job.status == "running"
-            assert job.provider_job_id == f"ft_mock_job_{i}"
-            assert job.training_file_id is not None
-            assert job.train_data_s3_object == "s3://bucket/train.csv"
-            assert job.test_data_s3_object == "s3://bucket/test.csv"
+            assert (
+                job.status == "pending"
+            )  # Since background processing is mocked, status remains pending
             assert job.split_ratio in [0.5, 0.7, 0.9]

From a2ef0050f3018fe0d8fa58f3f27d2cb2bdf24da3 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 11:54:43 +0530
Subject: [PATCH 06/18] added more testcases

---
 .../app/tests/api/routes/test_fine_tuning.py | 264 ++++++++++++++++++
 1 file changed, 264 insertions(+)

diff --git a/backend/app/tests/api/routes/test_fine_tuning.py b/backend/app/tests/api/routes/test_fine_tuning.py
index 9162044a..23a96157 100644
--- a/backend/app/tests/api/routes/test_fine_tuning.py
+++ b/backend/app/tests/api/routes/test_fine_tuning.py
@@ -203,3 +203,267 @@ def test_fetch_jobs_document(self, client, db, user_api_key_header):
         for job in json_data["data"]:
             assert job["document_id"] == str(document.id)
             assert job["status"] == "pending"
+
+
+@pytest.mark.usefixtures("client", "db", "user_api_key_header")
+@patch("app.api.routes.fine_tuning.get_openai_client")
+@patch("app.api.routes.fine_tuning.get_cloud_storage")
+@patch("app.api.routes.fine_tuning.run_model_evaluation")
+class TestAutoEvaluationTrigger:
+    """Test cases for automatic evaluation triggering when fine-tuning completes."""
+
+    def test_successful_auto_evaluation_trigger(
+        self,
+        mock_run_model_evaluation,
+        mock_get_cloud_storage,
+        mock_get_openai_client,
+        client,
+        db,
+        user_api_key_header,
+    ):
+        """Test that evaluation is automatically triggered when job status changes from running to completed."""
+        # Setup: Create a fine-tuning job with running status
+        jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
+        job = jobs[0]
+        job.status = FineTuningStatus.running
+        job.provider_job_id = "ft_mock_job_123"
+        # Add required fields for model evaluation
+        job.test_data_s3_object = "test-bucket/test-data.csv"
+        job.system_prompt = "You are a helpful assistant"
+        db.add(job)
+        db.commit()
+        db.refresh(job)
+
+        # Mock cloud storage
+        mock_storage = MagicMock()
+        mock_storage.get_signed_url.return_value = (
+            "https://test.s3.amazonaws.com/signed-url"
+        )
+        mock_get_cloud_storage.return_value = mock_storage
+
+        # Mock OpenAI response indicating job completion
+        mock_openai_job = MagicMock(
+            status="succeeded",
+            fine_tuned_model="ft:gpt-4:custom-model:12345",
+            error=None,
+        )
+        mock_openai = MagicMock()
+        mock_openai.fine_tuning.jobs.retrieve.return_value = mock_openai_job
+        mock_get_openai_client.return_value = mock_openai
+
+        # Action: Refresh the fine-tuning job status
+        response = client.get(
+            f"/api/v1/fine_tuning/{job.id}/refresh", headers=user_api_key_header
+        )
+
+        # Verify response
+        assert response.status_code == 200
+        json_data = response.json()
+        assert json_data["data"]["status"] == "completed"
+        assert json_data["data"]["fine_tuned_model"] == "ft:gpt-4:custom-model:12345"
+
+        # Verify that model evaluation was triggered
+        mock_run_model_evaluation.assert_called_once()
+        call_args = mock_run_model_evaluation.call_args[0]
+        eval_id = call_args[0]
+
+        # Verify evaluation was created in database
+        model_eval = (
+            db.query(ModelEvaluation).filter(ModelEvaluation.id == eval_id).first()
+        )
+        assert model_eval is not None
+        assert model_eval.fine_tuning_id == job.id
+        assert model_eval.status == ModelEvaluationStatus.pending
+
+    def test_skip_evaluation_when_already_exists(
+        self,
+        mock_run_model_evaluation,
+        mock_get_cloud_storage,
+        mock_get_openai_client,
+        client,
+        db,
+        user_api_key_header,
+    ):
+        """Test that evaluation is skipped when an active evaluation already exists."""
+        # Setup: Create a fine-tuning job with running status
+        jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
+        job = jobs[0]
+        job.status = FineTuningStatus.running
+        job.provider_job_id = "ft_mock_job_123"
+        # Add required fields for model evaluation
+        job.test_data_s3_object = "test-bucket/test-data.csv"
+        job.system_prompt = "You are a helpful assistant"
+        db.add(job)
+        db.commit()
+
+        # Create an existing active evaluation
+        existing_eval = ModelEvaluation(
+            fine_tuning_id=job.id,
+            status=ModelEvaluationStatus.pending,
+            project_id=job.project_id,
+            organization_id=job.organization_id,
+            document_id=job.document_id,
+            fine_tuned_model="ft:gpt-4:test-model:123",
+            test_data_s3_object="test-bucket/test-data.csv",
+            base_model="gpt-4",
+            split_ratio=0.7,
+            system_prompt="You are a helpful assistant",
+        )
+        db.add(existing_eval)
+        db.commit()
+
+        # Mock cloud storage
+        mock_storage = MagicMock()
+        mock_storage.get_signed_url.return_value = (
+            "https://test.s3.amazonaws.com/signed-url"
+        )
+        mock_get_cloud_storage.return_value = mock_storage
+
+        # Mock OpenAI response indicating job completion
+        mock_openai_job = MagicMock(
+            status="succeeded",
+            fine_tuned_model="ft:gpt-4:custom-model:12345",
+            error=None,
+        )
+        mock_openai = MagicMock()
+        mock_openai.fine_tuning.jobs.retrieve.return_value = mock_openai_job
+        mock_get_openai_client.return_value = mock_openai
+
+        # Action: Refresh the fine-tuning job status
+        response = client.get(
+            f"/api/v1/fine_tuning/{job.id}/refresh", headers=user_api_key_header
+        )
+
+        # Verify response
+        assert response.status_code == 200
+        json_data = response.json()
+        assert json_data["data"]["status"] == "completed"
+
+        # Verify that no new evaluation was triggered
+        mock_run_model_evaluation.assert_not_called()
+
+        # Verify only one evaluation exists in database
+        evaluations = (
+            db.query(ModelEvaluation)
+            .filter(ModelEvaluation.fine_tuning_id == job.id)
+            .all()
+        )
+        assert len(evaluations) == 1
+        assert evaluations[0].id == existing_eval.id
+
+    def test_evaluation_not_triggered_for_non_completion_status_changes(
+        self,
+        mock_run_model_evaluation,
+        mock_get_cloud_storage,
+        mock_get_openai_client,
+        client,
+        db,
+        user_api_key_header,
+    ):
+        """Test that evaluation is not triggered for status changes other than to completed."""
+        # Test Case 1: pending to running
+        jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
+        job = jobs[0]
+        job.status = FineTuningStatus.pending
+        job.provider_job_id = "ft_mock_job_123"
+        db.add(job)
+        db.commit()
+
+        # Mock cloud storage
+        mock_storage = MagicMock()
+        mock_storage.get_signed_url.return_value = (
+            "https://test.s3.amazonaws.com/signed-url"
+        )
+        mock_get_cloud_storage.return_value = mock_storage
+
+        mock_openai_job = MagicMock(
+            status="running",
+            fine_tuned_model=None,
+            error=None,
+        )
+        mock_openai = MagicMock()
+        mock_openai.fine_tuning.jobs.retrieve.return_value = mock_openai_job
+        mock_get_openai_client.return_value = mock_openai
+
+        response = client.get(
+            f"/api/v1/fine_tuning/{job.id}/refresh", headers=user_api_key_header
+        )
+
+        assert response.status_code == 200
+        json_data = response.json()
+        assert json_data["data"]["status"] == "running"
+        mock_run_model_evaluation.assert_not_called()
+
+        # Test Case 2: running to failed
+        job.status = FineTuningStatus.running
+        db.add(job)
+        db.commit()
+
+        mock_openai_job.status = "failed"
+        mock_openai_job.error = MagicMock(message="Training failed")
+
+        response = client.get(
+            f"/api/v1/fine_tuning/{job.id}/refresh", headers=user_api_key_header
+        )
+
+        assert response.status_code == 200
+        json_data = response.json()
+        assert json_data["data"]["status"] == "failed"
+        mock_run_model_evaluation.assert_not_called()
+
+    def test_evaluation_not_triggered_for_already_completed_jobs(
+        self,
+        mock_run_model_evaluation,
+        mock_get_cloud_storage,
+        mock_get_openai_client,
+        client,
+        db,
+        user_api_key_header,
+    ):
+        """Test that evaluation is not triggered when refreshing an already completed job."""
+        # Setup: Create a fine-tuning job that's already completed
+        jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
+        job = jobs[0]
+        job.status = FineTuningStatus.completed
+        job.provider_job_id = "ft_mock_job_123"
+        job.fine_tuned_model = "ft:gpt-4:custom-model:12345"
+        db.add(job)
+        db.commit()
+
+        # Mock cloud storage
+        mock_storage = MagicMock()
+        mock_storage.get_signed_url.return_value = (
+            "https://test.s3.amazonaws.com/signed-url"
+        )
+        mock_get_cloud_storage.return_value = mock_storage
+
+        # Mock OpenAI response (job remains succeeded)
+        mock_openai_job = MagicMock(
+            status="succeeded",
+            fine_tuned_model="ft:gpt-4:custom-model:12345",
+            error=None,
+        )
+        mock_openai = MagicMock()
+        mock_openai.fine_tuning.jobs.retrieve.return_value = mock_openai_job
+        mock_get_openai_client.return_value = mock_openai
+
+        # Action: Refresh the fine-tuning job status
+        response = client.get(
+            f"/api/v1/fine_tuning/{job.id}/refresh", headers=user_api_key_header
+        )
+
+        # Verify response
+        assert response.status_code == 200
+        json_data = response.json()
+        assert json_data["data"]["status"] == "completed"
+
+        # Verify that no evaluation was triggered (since it wasn't newly completed)
+        mock_run_model_evaluation.assert_not_called()
+
+        # Verify no evaluations exist in database for this job
+        evaluations = (
+            db.query(ModelEvaluation)
+            .filter(ModelEvaluation.fine_tuning_id == job.id)
+            .all()
+        )
+        assert len(evaluations) == 0
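The tests in PATCH 05 and PATCH 06 lean on moto's in-process S3 fake, so no real AWS calls ever leave the suite. Outside pytest, the same pattern reduces to a few lines — a sketch assuming moto >= 5 (which exposes the unified `mock_aws` decorator); the fake credentials mirror the `aws_credentials` fixture above:

```python
import os

import boto3
from moto import mock_aws

# Fake credentials keep boto3 happy; moto never contacts AWS.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing")


@mock_aws
def s3_roundtrip_demo() -> bytes:
    # Everything inside the decorated scope hits moto's in-memory S3.
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="test-bucket")
    s3.put_object(Bucket="test-bucket", Key="test.csv", Body=b"prompt,label\n")
    return s3.get_object(Bucket="test-bucket", Key="test.csv")["Body"].read()


assert s3_roundtrip_demo().startswith(b"prompt")
```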
From f8e28e9028f5eb3547dafd200388e0fa8856f3c5 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 12:26:22 +0530
Subject: [PATCH 07/18] added cancelled status in enum

---
 backend/app/api/routes/fine_tuning.py            |  3 ++-
 backend/app/models/fine_tuning.py                |  1 +
 backend/app/tests/api/routes/test_fine_tuning.py | 12 ++++++------
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index e2c3b5fb..670252c9 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -51,6 +51,7 @@
     "running": FineTuningStatus.running,
     "succeeded": FineTuningStatus.completed,
     "failed": FineTuningStatus.failed,
+    "cancelled": FineTuningStatus.cancelled,
 }
 
 
@@ -191,7 +192,7 @@ async def fine_tune_from_CSV(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(..., description="CSV file to use for fine-tuning"),
     base_model: str = Form(
-        ..., description="Base model for fine-tuning (e.g., gpt-3.5-turbo)"
+        ..., description="Base model for fine-tuning (e.g., gpt-4.1-2025-04-14)"
     ),
     split_ratio: str = Form(
         ..., description="Comma-separated split ratios (e.g., '0.8' or '0.7,0.8,0.9')"
diff --git a/backend/app/models/fine_tuning.py b/backend/app/models/fine_tuning.py
index a3b0e866..4e326ee5 100644
--- a/backend/app/models/fine_tuning.py
+++ b/backend/app/models/fine_tuning.py
@@ -15,6 +15,7 @@ class FineTuningStatus(str, Enum):
     running = "running"
     completed = "completed"
     failed = "failed"
+    cancelled = "cancelled"
 
 
 class FineTuningJobBase(SQLModel):
diff --git a/backend/app/tests/api/routes/test_fine_tuning.py b/backend/app/tests/api/routes/test_fine_tuning.py
index 23a96157..321bb740 100644
--- a/backend/app/tests/api/routes/test_fine_tuning.py
+++ b/backend/app/tests/api/routes/test_fine_tuning.py
@@ -125,7 +125,7 @@ def test_retrieve_fine_tuning_job(
     ):
         jobs, _ = create_test_fine_tuning_jobs(db, [0.3])
         job = jobs[0]
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         db.flush()
 
         mock_openai_job = MagicMock(
@@ -154,7 +154,7 @@ def test_retrieve_fine_tuning_job_failed(
     ):
         jobs, _ = create_test_fine_tuning_jobs(db, [0.3])
         job = jobs[0]
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         db.flush()
 
         mock_openai_job = MagicMock(
@@ -226,7 +226,7 @@ def test_successful_auto_evaluation_trigger(
         jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
         job = jobs[0]
         job.status = FineTuningStatus.running
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         # Add required fields for model evaluation
         job.test_data_s3_object = "test-bucket/test-data.csv"
         job.system_prompt = "You are a helpful assistant"
@@ -289,7 +289,7 @@ def test_skip_evaluation_when_already_exists(
         jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
         job = jobs[0]
         job.status = FineTuningStatus.running
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         # Add required fields for model evaluation
         job.test_data_s3_object = "test-bucket/test-data.csv"
         job.system_prompt = "You are a helpful assistant"
@@ -365,7 +365,7 @@ def test_evaluation_not_triggered_for_non_completion_status_changes(
         jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
         job = jobs[0]
         job.status = FineTuningStatus.pending
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         db.add(job)
         db.commit()
 
@@ -425,7 +425,7 @@ def test_evaluation_not_triggered_for_already_completed_jobs(
         jobs, _ = create_test_fine_tuning_jobs(db, [0.7])
         job = jobs[0]
         job.status = FineTuningStatus.completed
-        job.provider_job_id = "ft_mock_job_123"
+        job.provider_job_id = "ftjob-mock_job_123"
         job.fine_tuned_model = "ft:gpt-4:custom-model:12345"
         db.add(job)
         db.commit()

From 397807dddffc8a3198490029688b665f452aaab4 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 14:12:09 +0530
Subject: [PATCH 08/18] cleanups

---
 .../app/tests/api/routes/test_fine_tuning.py | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/backend/app/tests/api/routes/test_fine_tuning.py b/backend/app/tests/api/routes/test_fine_tuning.py
index 321bb740..34ba4b34 100644
--- a/backend/app/tests/api/routes/test_fine_tuning.py
+++ b/backend/app/tests/api/routes/test_fine_tuning.py
@@ -1,4 +1,3 @@
-import os
 import io
 import pytest
 from moto import mock_aws
@@ -16,17 +15,6 @@
 from app.core.config import settings
 
 
-@pytest.fixture(scope="function")
-def aws_credentials():
-    """Set up AWS credentials for moto."""
-    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
-    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
-    os.environ["AWS_SECURITY_TOKEN"] = "testing"
-    os.environ["AWS_SESSION_TOKEN"] = "testing"
-    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
-    os.environ["AWS_S3_BUCKET_PREFIX"] = "test-bucket"
-
-
 def create_file_mock(file_type):
     counter = {"train": 0, "test": 0}
 
@@ -42,7 +30,7 @@ def _side_effect(file=None, purpose=None):
     return _side_effect
 
 
-@pytest.mark.usefixtures("client", "db", "user_api_key_header", "aws_credentials")
+@pytest.mark.usefixtures("client", "db", "user_api_key_header")
 class TestCreateFineTuningJobAPI:
     @mock_aws
     def test_finetune_from_csv_multiple_split_ratio(
@@ -52,8 +40,17 @@ def test_finetune_from_csv_multiple_split_ratio(
         user_api_key_header,
     ):
         # Setup S3 bucket for moto
-        s3 = boto3.client("s3", region_name="us-east-1")
-        s3.create_bucket(Bucket="test-bucket")
+        s3 = boto3.client("s3", region_name=settings.AWS_DEFAULT_REGION)
+        bucket_name = settings.AWS_S3_BUCKET_PREFIX
+        if settings.AWS_DEFAULT_REGION == "us-east-1":
+            s3.create_bucket(Bucket=bucket_name)
+        else:
+            s3.create_bucket(
+                Bucket=bucket_name,
+                CreateBucketConfiguration={
+                    "LocationConstraint": settings.AWS_DEFAULT_REGION
+                },
+            )
 
         # Create a test CSV file content
         csv_content = "prompt,label\ntest1,label1\ntest2,label2\ntest3,label3"
@@ -74,7 +71,9 @@ def test_finetune_from_csv_multiple_split_ratio(
                 ) as mock_process_job:
                     # Mock cloud storage
                     mock_storage = MagicMock()
-                    mock_storage.put.return_value = "s3://test-bucket/test.csv"
+                    mock_storage.put.return_value = (
+                        f"s3://{settings.AWS_S3_BUCKET_PREFIX}/test.csv"
+                    )
                     mock_get_cloud_storage.return_value = mock_storage
 
                     # Mock OpenAI client (for validation only)
@@ -228,7 +227,7 @@ def test_successful_auto_evaluation_trigger(
         job.status = FineTuningStatus.running
         job.provider_job_id = "ftjob-mock_job_123"
         # Add required fields for model evaluation
-        job.test_data_s3_object = "test-bucket/test-data.csv"
+        job.test_data_s3_object = f"{settings.AWS_S3_BUCKET_PREFIX}/test-data.csv"
         job.system_prompt = "You are a helpful assistant"
         db.add(job)
         db.commit()
@@ -291,7 +290,7 @@ def test_skip_evaluation_when_already_exists(
         job.status = FineTuningStatus.running
         job.provider_job_id = "ftjob-mock_job_123"
         # Add required fields for model evaluation
-        job.test_data_s3_object = "test-bucket/test-data.csv"
+        job.test_data_s3_object = f"{settings.AWS_S3_BUCKET_PREFIX}/test-data.csv"
         job.system_prompt = "You are a helpful assistant"
         db.add(job)
         db.commit()
@@ -304,7 +303,7 @@ def test_skip_evaluation_when_already_exists(
             organization_id=job.organization_id,
             document_id=job.document_id,
             fine_tuned_model="ft:gpt-4:test-model:123",
-            test_data_s3_object="test-bucket/test-data.csv",
+            test_data_s3_object=f"{settings.AWS_S3_BUCKET_PREFIX}/test-data.csv",
             base_model="gpt-4",
             split_ratio=0.7,
             system_prompt="You are a helpful assistant",
From ca862cf94be2aa57ace2e8eba250cab3e997f19e Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 14:22:50 +0530
Subject: [PATCH 09/18] update CLAUDE.md

---
 CLAUDE.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 2dc7be8a..df2ca38b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -33,6 +33,8 @@ alembic revision --autogenerate -m 'Add new meta'
 
 ### Testing
 
+We also use `.env.test` to keep environment variables separate for the test environment; test cases can rely on it.
+
 ```bash
 # Run backend tests
 uv run bash scripts/tests-start.sh

From 375eb5e2a2bf3a4082c7723742205d12a684e891 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Tue, 23 Sep 2025 19:20:15 +0530
Subject: [PATCH 10/18] CodeRabbit suggestion

---
 backend/app/api/routes/fine_tuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index 670252c9..00b48b91 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -217,7 +217,7 @@ async def fine_tune_from_CSV(
     )
 
     # Validate file is CSV
-    if not file.filename.lower().endswith(".csv"):
+    if not file.filename.lower().endswith(".csv") and file.content_type != "text/csv":
         raise HTTPException(status_code=400, detail="File must be a CSV file")
 
     client = get_openai_client(  # Used here only to validate the user's OpenAI key;

From 38dcf456b1c51106d79d8133f37a20fb400136b1 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Thu, 25 Sep 2025 11:42:15 +0530
Subject: [PATCH 11/18] reverting unnecessary changes

---
 backend/app/api/routes/responses.py     | 21 ++++++++++++++++++++-
 backend/app/api/routes/threads.py       |  9 ++++++++-
 backend/app/core/finetune/evaluation.py |  3 +--
 backend/app/utils.py                    |  8 --------
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/backend/app/api/routes/responses.py b/backend/app/api/routes/responses.py
index d4e2389c..94e5f19d 100644
--- a/backend/app/api/routes/responses.py
+++ b/backend/app/api/routes/responses.py
@@ -18,13 +18,32 @@
     get_conversation_by_ancestor_id,
 )
 from app.models import UserProjectOrg, OpenAIConversationCreate, OpenAIConversation
-from app.utils import APIResponse, mask_string, handle_openai_error
+from app.utils import APIResponse, mask_string
 from app.core.langfuse.langfuse import LangfuseTracer
 
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["responses"])
 
 
+def handle_openai_error(e: openai.OpenAIError) -> str:
+    """Extract error message from OpenAI error."""
+    # Try to get error message from different possible attributes
+    if hasattr(e, "body") and isinstance(e.body, dict) and "message" in e.body:
+        return e.body["message"]
+    elif hasattr(e, "message"):
+        return e.message
+    elif hasattr(e, "response") and hasattr(e.response, "json"):
+        try:
+            error_data = e.response.json()
+            if isinstance(error_data, dict) and "error" in error_data:
+                error_info = error_data["error"]
+                if isinstance(error_info, dict) and "message" in error_info:
+                    return error_info["message"]
+        except:
+            pass
+    return str(e)
+
+
 class ResponsesAPIRequest(BaseModel):
     assistant_id: str
     question: str
diff --git a/backend/app/api/routes/threads.py b/backend/app/api/routes/threads.py
index be7e0578..95630bfb 100644
--- a/backend/app/api/routes/threads.py
+++ b/backend/app/api/routes/threads.py
@@ -13,7 +13,7 @@
 from app.core import logging, settings
 from app.models import UserOrganization, OpenAIThreadCreate, UserProjectOrg
 from app.crud import upsert_thread_result, get_thread_result
-from app.utils import APIResponse, mask_string, handle_openai_error
+from app.utils import APIResponse, mask_string
 from app.crud.credentials import get_provider_credential
 from app.core.util import configure_openai
 from app.core.langfuse.langfuse import LangfuseTracer
@@ -49,6 +49,13 @@ def send_callback(callback_url: str, data: dict):
         return False
 
 
+def handle_openai_error(e: openai.OpenAIError) -> str:
+    """Extract error message from OpenAI error."""
+    if isinstance(e.body, dict) and "message" in e.body:
+        return e.body["message"]
+    return str(e)
+
+
 def validate_thread(client: OpenAI, thread_id: str) -> tuple[bool, str]:
     """Validate if a thread exists and has no active runs."""
     if not thread_id:
diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py
index 4a85e85c..4acc62e6 100644
--- a/backend/app/core/finetune/evaluation.py
+++ b/backend/app/core/finetune/evaluation.py
@@ -11,7 +11,6 @@
     matthews_corrcoef,
 )
 from app.core.cloud import AmazonCloudStorage
-from app.utils import handle_openai_error
 from app.core.finetune.preprocessing import DataPreprocessor
 
 
@@ -151,7 +150,7 @@ def generate_predictions(self) -> tuple[list[str], str]:
                     break
 
             except openai.OpenAIError as e:
-                error_msg = handle_openai_error(e)
+                error_msg = str(e)
                 logger.error(
                     f"[generate_predictions] OpenAI API error at prompt {idx}/{total_prompts}: {error_msg}"
                 )
diff --git a/backend/app/utils.py b/backend/app/utils.py
index 8f96fd95..1c03839a 100644
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@@ -7,7 +7,6 @@
 
 import jwt
 import emails
-import openai
 from jinja2 import Template
 from jwt.exceptions import InvalidTokenError
 from fastapi import HTTPException
@@ -49,13 +48,6 @@ def failure_response(
         return cls(success=False, data=None, error=error_message, metadata=metadata)
 
 
-def handle_openai_error(e: openai.OpenAIError) -> str:
-    """Extract error message from OpenAI error."""
-    if hasattr(e, "body") and isinstance(e.body, dict) and "message" in e.body:
-        return e.body["message"]
-    return str(e)
-
-
 @dataclass
 class EmailData:
     html_content: str

From 8a1b496c86602dccf5b12c08814392898382616a Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Thu, 25 Sep 2025 11:44:13 +0530
Subject: [PATCH 12/18] CodeRabbit suggestions

---
 backend/app/api/routes/fine_tuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index 00b48b91..32a3a6e6 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -220,7 +220,7 @@ async def fine_tune_from_CSV(
     if not file.filename.lower().endswith(".csv") and file.content_type != "text/csv":
         raise HTTPException(status_code=400, detail="File must be a CSV file")
 
-    client = get_openai_client(  # Used here only to validate the user's OpenAI key;
+    get_openai_client(  # Used here only to validate the user's OpenAI key;
         # the actual client is re-initialized separately inside the background task
         session,
         current_user.organization_id,
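A note on PATCH 10's tightened CSV check: it is written as a negative condition, and by De Morgan's law it is equivalent to accepting a file when either signal looks like CSV. A sketch of the positive form — the helper name is illustrative, not from the codebase:

```python
def looks_like_csv(filename: str | None, content_type: str | None) -> bool:
    """Accept when either the extension or the declared MIME type says CSV."""
    name_ok = bool(filename) and filename.lower().endswith(".csv")
    type_ok = content_type == "text/csv"
    return name_ok or type_ok


# Equivalent to PATCH 10's: reject iff (not name_ok) and (not type_ok)
assert looks_like_csv("data.csv", "application/octet-stream")
assert looks_like_csv("upload.bin", "text/csv")
assert not looks_like_csv("upload.bin", "application/json")
```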
From 9e8d046edfdd7e02f4b406a415bd0a2599e32c2b Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Thu, 25 Sep 2025 12:24:45 +0530
Subject: [PATCH 13/18] remove import

---
 backend/app/api/routes/fine_tuning.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index 32a3a6e6..f5ff511f 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -19,13 +19,7 @@
 )
 from app.core.cloud import get_cloud_storage
 from app.crud.document import DocumentCrud
-from app.utils import (
-    get_openai_client,
-    APIResponse,
-    mask_string,
-    load_description,
-    handle_openai_error,
-)
+from app.utils import get_openai_client, APIResponse, mask_string, load_description
 from app.crud import (
     create_fine_tuning_job,
     fetch_by_id,

From 724497b02e479ea7ae9a2d035c09ec53983dd7fa Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Mon, 6 Oct 2025 10:27:16 +0530
Subject: [PATCH 14/18] merging endpoints

---
 CLAUDE.md | 104 ------------------------------------------------------
 1 file changed, 104 deletions(-)
 delete mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index df2ca38b..00000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,104 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Project Overview
-
-Kaapi is an AI platform built with FastAPI (backend) and PostgreSQL (database), containerized with Docker. The platform provides AI capabilities including OpenAI assistants, fine-tuning, document processing, and collection management.
-
-## Key Commands
-
-### Development
-
-```bash
-# Start development environment with auto-reload
-source .venv/bin/activate
-fastapi run --reload app/main.py
-
-# Run backend tests
-uv run bash scripts/tests-start.sh
-
-# Seed data
-uv run python -m app.seed_data.seed_data
-
-# Run pre-commit
-uv run pre-commit run --all-files
-
-# Activate virtual environment
-source .venv/bin/activate
-
-# Generate a new migration
-alembic revision --autogenerate -m 'Add new meta'
-```
-
-### Testing
-
-We also use `.env.test` to keep environment variables separate for the test environment; test cases can rely on it.
-
-```bash
-# Run backend tests
-uv run bash scripts/tests-start.sh
-```
-
-## Architecture
-
-### Backend Structure
-
-The backend follows a layered architecture:
-
-- **API Layer** (`backend/app/api/`): FastAPI routes organized by domain
-  - Authentication (`login.py`)
-  - Core resources: `users.py`, `organizations.py`, `projects.py`
-  - AI features: `assistants.py`, `fine_tuning.py`, `openai_conversation.py`
-  - Document management: `documents.py`, `collections.py`, `doc_transformation_job.py`
-
-- **Models** (`backend/app/models/`): SQLModel entities representing database tables
-  - User system: User, Organization, Project, ProjectUser
-  - AI components: Assistant, Thread, Message, FineTuning
-  - Document system: Document, Collection, DocumentCollection, DocTransformationJob
-
-- **CRUD Operations** (`backend/app/crud/`): Database operations for each model
-
-- **Core Services** (`backend/app/core/`):
-  - `providers.py`: OpenAI client management
-  - `finetune/`: Fine-tuning pipeline (preprocessing, evaluation)
-  - `doctransform/`: Document transformation services
-  - `cloud/storage.py`: S3 storage integration
-  - `langfuse/`: Observability and tracing
-
-### Database
-
-PostgreSQL with Alembic migrations. Key relationships:
-- Organizations contain Projects
-- Projects have Users (many-to-many via ProjectUser)
-- Projects contain Collections and Documents
-- Documents can belong to Collections (many-to-many)
-- Projects have Assistants, Threads, and FineTuning jobs
-
-### Authentication & Security
-
-- JWT-based authentication
-- API key support for programmatic access
-- Role-based access control (User, Admin, Super Admin)
-- Organization and project-level permissions
-
-## Environment Configuration
-
-Critical environment variables:
-- `SECRET_KEY`: JWT signing key
-- `POSTGRES_*`: Database connection
-- `LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY`: OpenAI API key
-- `AWS_S3_BUCKET_PREFIX`: S3 storage configuration
-- `LANGFUSE_*`: Observability configuration
-
-## Testing Strategy
-
-- Unit tests in `backend/app/tests/`
-- Test fixtures use factory pattern
-- Mock external services (OpenAI, S3) using `moto` and `openai_responses`
-- Coverage reports generated automatically
-
-## Code Standards
-
-- Python 3.11+ with type hints
-- Pre-commit hooks configured for consistency
From b6a7073f9f8497dc8983cf11c72150d2a20888b9 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Mon, 6 Oct 2025 11:29:19 +0530
Subject: [PATCH 15/18] following PEP8 standards

---
 backend/app/core/finetune/evaluation.py | 79 ++++++++++++++++---------
 1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py
index 4acc62e6..560a4c75 100644
--- a/backend/app/core/finetune/evaluation.py
+++ b/backend/app/core/finetune/evaluation.py
@@ -1,18 +1,17 @@
 import difflib
-import time
 import logging
+import time
+import uuid
 from typing import Set
 
 import openai
 import pandas as pd
 from openai import OpenAI
-import uuid
-from sklearn.metrics import (
-    matthews_corrcoef,
-)
+from sklearn.metrics import matthews_corrcoef
+
 from app.core.cloud import AmazonCloudStorage
 from app.core.finetune.preprocessing import DataPreprocessor
-
+from app.utils import handle_openai_error
 
 logger = logging.getLogger(__name__)
@@ -50,7 +49,8 @@ def load_labels_and_prompts(self) -> None:
         - 'label'
         """
         logger.info(
-            f"[ModelEvaluator.load_labels_and_prompts] Loading CSV from: {self.test_data_s3_object}"
+            f"[ModelEvaluator.load_labels_and_prompts] Loading CSV from: "
+            f"{self.test_data_s3_object}"
         )
         file_obj = self.storage.stream(self.test_data_s3_object)
         try:
@@ -65,11 +65,13 @@ def load_labels_and_prompts(self) -> None:
 
             if not query_col or not label_col:
                 logger.error(
-                    "[ModelEvaluator.load_labels_and_prompts] CSV must contain a 'label' column "
-                    f"and one of: {possible_query_columns}"
+                    "[ModelEvaluator.load_labels_and_prompts] CSV must "
+                    "contain a 'label' column and one of: "
+                    f"{possible_query_columns}"
                 )
                 raise ValueError(
-                    f"CSV must contain a 'label' column and one of: {possible_query_columns}"
+                    f"CSV must contain a 'label' column and one of: "
+                    f"{possible_query_columns}"
                 )
 
             prompts = df[query_col].astype(str).tolist()
@@ -84,12 +86,15 @@ def load_labels_and_prompts(self) -> None:
 
             logger.info(
                 "[ModelEvaluator.load_labels_and_prompts] "
-                f"Loaded {len(self.prompts)} prompts and {len(self.y_true)} labels; "
-                f"query_col={query_col}, label_col={label_col}, allowed_labels={self.allowed_labels}"
+                f"Loaded {len(self.prompts)} prompts and "
+                f"{len(self.y_true)} labels; "
+                f"query_col={query_col}, label_col={label_col}, "
+                f"allowed_labels={self.allowed_labels}"
             )
         except Exception as e:
             logger.error(
-                f"[ModelEvaluator.load_labels_and_prompts] Failed to load/parse test CSV: {e}",
+                f"[ModelEvaluator.load_labels_and_prompts] "
+                f"Failed to load/parse test CSV: {e}",
                 exc_info=True,
             )
             raise
@@ -110,13 +115,15 @@ def normalize_prediction(self, text: str) -> str:
             return closest[0]
 
         logger.warning(
-            f"[normalize_prediction] No close match found for '{t}'. Using default label '{next(iter(self.allowed_labels))}'."
+            f"[normalize_prediction] No close match found for '{t}'. "
+            f"Using default label '{next(iter(self.allowed_labels))}'."
        )
         return next(iter(self.allowed_labels))
 
     def generate_predictions(self) -> tuple[list[str], str]:
         logger.info(
-            f"[generate_predictions] Generating predictions for {len(self.prompts)} prompts."
+            f"[generate_predictions] Generating predictions for "
+            f"{len(self.prompts)} prompts."
         )
         start_preds = time.time()
         predictions = []
@@ -127,7 +134,9 @@ def generate_predictions(self) -> tuple[list[str], str]:
             while attempt < self.retries:
                 start_time = time.time()
                 logger.info(
-                    f"[generate_predictions] Processing prompt {idx}/{total_prompts} (Attempt {attempt + 1}/{self.retries})"
+                    f"[generate_predictions] Processing prompt "
+                    f"{idx}/{total_prompts} "
+                    f"(Attempt {attempt + 1}/{self.retries})"
                 )
 
                 try:
@@ -140,7 +149,8 @@ def generate_predictions(self) -> tuple[list[str], str]:
                     elapsed_time = time.time() - start_time
                     if elapsed_time > self.max_latency:
                         logger.warning(
-                            f"[generate_predictions] Timeout exceeded for prompt {idx}/{total_prompts}. Retrying..."
+                            f"[generate_predictions] Timeout exceeded for "
+                            f"prompt {idx}/{total_prompts}. Retrying..."
                         )
                         continue
 
@@ -150,25 +160,31 @@ def generate_predictions(self) -> tuple[list[str], str]:
                     break
 
             except openai.OpenAIError as e:
-                error_msg = str(e)
+                error_msg = handle_openai_error(e)
                 logger.error(
-                    f"[generate_predictions] OpenAI API error at prompt {idx}/{total_prompts}: {error_msg}"
+                    f"[generate_predictions] OpenAI API error at prompt "
+                    f"{idx}/{total_prompts}: {error_msg}"
                 )
                 attempt += 1
                 if attempt == self.retries:
                     predictions.append("openai_error")
                     logger.error(
-                        f"[generate_predictions] Maximum retries reached for prompt {idx}/{total_prompts}. Appending 'openai_error'."
+                        f"[generate_predictions] Maximum retries reached "
+                        f"for prompt {idx}/{total_prompts}. "
+                        f"Appending 'openai_error'."
                    )
                 else:
                     logger.info(
-                        f"[generate_predictions] Retrying prompt {idx}/{total_prompts} after OpenAI error ({attempt}/{self.retries})."
+                        f"[generate_predictions] Retrying prompt "
+                        f"{idx}/{total_prompts} after OpenAI error "
+                        f"({attempt}/{self.retries})."
                     )
 
         total_elapsed = time.time() - start_preds
         logger.info(
-            f"[generate_predictions] Finished {total_prompts} prompts in {total_elapsed:.2f}s | "
-            f"Generated {len(predictions)} predictions."
+            f"[generate_predictions] Finished {total_prompts} prompts in "
+            f"{total_elapsed:.2f}s | Generated {len(predictions)} "
+            f"predictions."
         )
 
         prediction_data = pd.DataFrame(
@@ -187,7 +203,8 @@ def generate_predictions(self) -> tuple[list[str], str]:
         self.prediction_data_s3_object = prediction_data_s3_object
 
         logger.info(
-            f"[generate_predictions] Predictions CSV uploaded to S3 | url={prediction_data_s3_object}"
+            f"[generate_predictions] Predictions CSV uploaded to S3 | "
+            f"url={prediction_data_s3_object}"
         )
 
         return predictions, prediction_data_s3_object
@@ -196,11 +213,13 @@ def evaluate(self) -> dict:
         """Evaluate using the predictions CSV previously uploaded to S3."""
         if not getattr(self, "prediction_data_s3_object", None):
             raise RuntimeError(
-                "[evaluate] predictions_s3_object not set. Call generate_predictions() first."
+                "[evaluate] predictions_s3_object not set. "
+                "Call generate_predictions() first."
             )
 
         logger.info(
-            f"[evaluate] Streaming predictions CSV from: {self.prediction_data_s3_object}"
+            f"[evaluate] Streaming predictions CSV from: "
+            f"{self.prediction_data_s3_object}"
         )
         prediction_obj = self.storage.stream(self.prediction_data_s3_object)
         try:
@@ -210,7 +229,8 @@ def evaluate(self) -> dict:
 
         if "true_label" not in df.columns or "prediction" not in df.columns:
             raise ValueError(
-                "[evaluate] prediction data CSV must contain 'true_label' and 'prediction' columns."
+                "[evaluate] prediction data CSV must contain 'true_label' "
+                "and 'prediction' columns."
             )
 
         y_true = df["true_label"].astype(str).str.strip().str.lower().tolist()
@@ -225,7 +245,10 @@ def evaluate(self) -> dict:
             raise
 
     def run(self) -> dict:
-        """Run the full evaluation process: load data, generate predictions, evaluate results."""
+        """Run the full evaluation process.
+
+        Load data, generate predictions, and evaluate results.
+        """
         try:
             self.load_labels_and_prompts()
             predictions, prediction_data_s3_object = self.generate_predictions()

From c47b254cede639d0bcff4778314a60ed5a112ceb Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Wed, 8 Oct 2025 12:04:42 +0530
Subject: [PATCH 16/18] removed redundant checks

---
 backend/app/api/routes/fine_tuning.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index f5ff511f..401a761b 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -193,22 +193,11 @@ async def fine_tune_from_CSV(
     ),
     system_prompt: str = Form(..., description="System prompt for the fine-tuning job"),
 ):
-    # Validate and parse split ratios
+    # Parse split ratios (validation happens in FineTuningJobCreate model)
     try:
         split_ratios = [float(r.strip()) for r in split_ratio.split(",")]
-        for ratio in split_ratios:
-            if not (0 < ratio < 1):
-                raise ValueError(
-                    f"Invalid split_ratio: {ratio}. Must be between 0 and 1."
-                )
     except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-    # Validate system prompt
-    if not system_prompt.strip():
-        raise HTTPException(
-            status_code=400, detail="System prompt must be a non-empty string"
-        )
+        raise HTTPException(status_code=400, detail=f"Invalid split_ratio format: {e}")
 
     # Validate file is CSV
     if not file.filename.lower().endswith(".csv") and file.content_type != "text/csv":
         raise HTTPException(status_code=400, detail="File must be a CSV file")

From 0979fd84692c893022225cd4b719286e214771e5 Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Fri, 10 Oct 2025 09:40:17 +0530
Subject: [PATCH 17/18] added as todo

---
 backend/app/api/routes/fine_tuning.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/backend/app/api/routes/fine_tuning.py b/backend/app/api/routes/fine_tuning.py
index 401a761b..66baa3ad 100644
--- a/backend/app/api/routes/fine_tuning.py
+++ b/backend/app/api/routes/fine_tuning.py
@@ -185,15 +185,11 @@ async def fine_tune_from_CSV(
     current_user: CurrentUserOrgProject,
     background_tasks: BackgroundTasks,
     file: UploadFile = File(..., description="CSV file to use for fine-tuning"),
-    base_model: str = Form(
-        ..., description="Base model for fine-tuning (e.g., gpt-4.1-2025-04-14)"
-    ),
-    split_ratio: str = Form(
-        ..., description="Comma-separated split ratios (e.g., '0.8' or '0.7,0.8,0.9')"
-    ),
-    system_prompt: str = Form(..., description="System prompt for the fine-tuning job"),
+    base_model: str = Form(...),
+    split_ratio: str = Form(...),
+    system_prompt: str = Form(...),
 ):
-    # Parse split ratios (validation happens in FineTuningJobCreate model)
+    # Parse split ratios
     try:
         split_ratios = [float(r.strip()) for r in split_ratio.split(",")]
     except ValueError as e:
@@ -211,6 +207,7 @@ async def fine_tune_from_CSV(
     )
 
     # Upload the file to storage and create document
+    # ToDo: create a helper function and then use it rather than doing things in router
     storage = get_cloud_storage(session=session, project_id=current_user.project_id)
     document_id = uuid4()
     object_store_url = storage.put(file, Path(str(document_id)))
@@ -277,9 +274,7 @@ async def fine_tune_from_CSV(
         else f"Started {created_count} job(s); {total - created_count} active fine-tuning job(s) already exists."
     )
 
-    return APIResponse.success_response(
-        {"message": message, "document_id": str(created_document.id), "jobs": job_infos}
-    )
+    return APIResponse.success_response({"message": message, "jobs": job_infos})

From 8801b39c4840ef6d0a0e8c19fe477fe759b4387a Mon Sep 17 00:00:00 2001
From: AkhileshNegi
Date: Fri, 10 Oct 2025 09:52:52 +0530
Subject: [PATCH 18/18] updated the testcase

---
 backend/app/tests/api/routes/test_fine_tuning.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/app/tests/api/routes/test_fine_tuning.py b/backend/app/tests/api/routes/test_fine_tuning.py
index 34ba4b34..abe00680 100644
--- a/backend/app/tests/api/routes/test_fine_tuning.py
+++ b/backend/app/tests/api/routes/test_fine_tuning.py
@@ -98,7 +98,6 @@ def test_finetune_from_csv_multiple_split_ratio(
         assert json_data["success"] is True
         assert json_data["data"]["message"] == "Fine-tuning job(s) started."
         assert json_data["metadata"] is None
-        assert "document_id" in json_data["data"]
         assert "jobs" in json_data["data"]
         assert len(json_data["data"]["jobs"]) == 3