From 07267eafc592f6ccdb6eae808187c8840b7f0641 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Sun, 10 Aug 2025 21:41:02 +0530 Subject: [PATCH 01/10] db models and alembic script --- .../e317d05f49e4_add_evaluation_table.py | 72 ++++++++++++++ backend/app/models/__init__.py | 9 ++ backend/app/models/fine_tuning.py | 1 + backend/app/models/model_evaluation.py | 98 +++++++++++++++++++ backend/app/models/project.py | 3 + 5 files changed, 183 insertions(+) create mode 100644 backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py create mode 100644 backend/app/models/model_evaluation.py diff --git a/backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py b/backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py new file mode 100644 index 00000000..9c47f3bf --- /dev/null +++ b/backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py @@ -0,0 +1,72 @@ +"""add evaluation table + +Revision ID: e317d05f49e4 +Revises: db9b5413d3ce +Create Date: 2025-08-10 21:36:07.863951 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "e317d05f49e4" +down_revision = "db9b5413d3ce" +branch_labels = None +depends_on = None + +modelevaluation_status_enum = postgresql.ENUM( + "pending", + "running", + "completed", + "failed", + name="modelevaluationstatus", + create_type=False, +) + + +def upgrade(): + modelevaluation_status_enum.create(op.get_bind(), checkfirst=True) + + op.create_table( + "model_evaluation", + sa.Column("id", sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.Column("fine_tuning_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ["fine_tuning_id"], ["fine_tuning.id"], ondelete="CASCADE" + ), + sa.Column("document_id", sa.Uuid(), nullable=False), + sa.ForeignKeyConstraint(["document_id"], ["document.id"]), + sa.Column("model_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column( + "testing_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False + ), + sa.Column("base_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("split_ratio", sa.Float(), nullable=False), + sa.Column("system_prompt", sa.Text(), nullable=False), + sa.Column("metric", postgresql.JSON(astext_type=sa.Text()), nullable=False), + sa.Column("score", postgresql.JSON(astext_type=sa.Text()), nullable=True), + sa.Column( + "status", + modelevaluation_status_enum, + nullable=False, + server_default="pending", + ), + sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column("project_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"), + sa.Column("organization_id", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.Column("is_deleted", sa.Boolean(), nullable=False), + sa.Column("inserted_at", sa.DateTime(), nullable=False), + sa.Column("updated_at", sa.DateTime(), nullable=False), + sa.Column("deleted_at", sa.DateTime(), nullable=True), + ) + + +def downgrade(): + op.drop_table("model_evaluation") diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index d4291be6..2f4de433 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -71,3 +71,12 @@ OpenAIConversationBase, OpenAIConversationCreate, ) + +from .model_evaluation import ( + Model_Evaluation, + ModelEvaluationBase, + 
ModelEvaluationCreate, + ModelEvaluationPublic, + ModelEvaluationStatus, + ModelEvaluationUpdate, +) diff --git a/backend/app/models/fine_tuning.py b/backend/app/models/fine_tuning.py index 813628af..737c2cd5 100644 --- a/backend/app/models/fine_tuning.py +++ b/backend/app/models/fine_tuning.py @@ -81,6 +81,7 @@ class Fine_Tuning(FineTuningJobBase, table=True): deleted_at: datetime | None = Field(default=None, nullable=True) project: "Project" = Relationship(back_populates="fine_tuning") + model_evaluation: "Model_Evaluation" = Relationship(back_populates="fine-tuning") class FineTuningUpdate(SQLModel): diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py new file mode 100644 index 00000000..eb687438 --- /dev/null +++ b/backend/app/models/model_evaluation.py @@ -0,0 +1,98 @@ +from typing import Optional +from uuid import UUID +from enum import Enum +from datetime import datetime + +from sqlmodel import SQLModel, Field, Relationship +from sqlalchemy import Column, Text +from sqlalchemy.dialects.postgresql import JSON + +from app.core.util import now + + +class ModelEvaluationStatus(str, Enum): + pending = "pending" + running = "running" + completed = "completed" + failed = "failed" + + +class ModelEvaluationBase(SQLModel): + fine_tuning_id: int = Field( + foreign_key="fine_tuning.id", + nullable=False, + ondelete="CASCADE", + ) + + +class ModelEvaluationCreate(SQLModel): + fine_tuning_ids: list[int] + + +class Model_Evaluation(ModelEvaluationBase, table=True): + """Database model for keeping a record of model evaluation""" + + id: int = Field(primary_key=True) + + document_id: UUID = Field( + foreign_key="document.id", + nullable=False, + ) + model_name: str = Field(description="fine tuned model name from OpenAI") + testing_file_id: str = Field( + description="File ID of the testing file uploaded to OpenAI" + ) + base_model: str = Field(nullable=False, description="Base model for fine-tuning") + split_ratio: float = Field( + nullable=False, description="the ratio the dataset was divided in" + ) + system_prompt: str = Field(sa_column=Column(Text, nullable=False)) + metric: list[str] = Field( + sa_column=Column(JSON, nullable=False), + description="List of metrics used for evaluation (e.g., ['mcc', 'accuracy'])", + ) + score: Optional[dict[str, float]] = Field( + sa_column=Column(JSON, nullable=True), + description="Evaluation scores per metric (e.g., {'mcc': 0.85})", + ) + status: ModelEvaluationStatus = ( + Field(default=ModelEvaluationStatus.pending, description="Evaluation status"), + ) + error_message: str | None = Field( + default=None, description="error message for when something failed" + ) + project_id: int = Field( + foreign_key="project.id", nullable=False, ondelete="CASCADE" + ) + organization_id: int = Field( + foreign_key="organization.id", nullable=False, ondelete="CASCADE" + ) + is_deleted: bool = Field(default=False, nullable=False) + + inserted_at: datetime = Field(default_factory=now, nullable=False) + updated_at: datetime = Field(default_factory=now, nullable=False) + deleted_at: datetime | None = Field(default=None, nullable=True) + + project: "Project" = Relationship(back_populates="model_evaluation") + fine_tuning: "Fine_Tuning" = Relationship(back_populates="model_evaluation") + + +class ModelEvaluationUpdate(SQLModel): + metric: Optional[list[str]] = None + score: Optional[dict[str, float]] = None + status: Optional[ModelEvaluationStatus] = None + error_message: Optional[str] = None + + +class 
ModelEvaluationPublic(ModelEvaluationBase): + """Public response model for evaluation result.""" + + id: int + document_id: UUID + model_name: str + score: dict[str, float] | None = None + status: ModelEvaluationStatus + is_best_model: bool | None = None + inserted_at: datetime + updated_at: datetime + deleted_at: datetime | None = None diff --git a/backend/app/models/project.py b/backend/app/models/project.py index a5eb3a3b..bf998fad 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -52,6 +52,9 @@ class Project(ProjectBase, table=True): fine_tuning: list["Fine_Tuning"] = Relationship( back_populates="project", cascade_delete=True ) + model_evalutaion: list["Model_Evaluation"] = Relationship( + back_populates="project", cascade_delete=True + ) openai_conversations: list["OpenAIConversation"] = Relationship( back_populates="project", cascade_delete=True ) From 35f514721cf3c3b390a5c60bd4c365b7759f8a97 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Sun, 10 Aug 2025 21:54:52 +0530 Subject: [PATCH 02/10] small fix on db model --- backend/app/models/fine_tuning.py | 2 +- backend/app/models/project.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/models/fine_tuning.py b/backend/app/models/fine_tuning.py index 737c2cd5..b035c289 100644 --- a/backend/app/models/fine_tuning.py +++ b/backend/app/models/fine_tuning.py @@ -81,7 +81,7 @@ class Fine_Tuning(FineTuningJobBase, table=True): deleted_at: datetime | None = Field(default=None, nullable=True) project: "Project" = Relationship(back_populates="fine_tuning") - model_evaluation: "Model_Evaluation" = Relationship(back_populates="fine-tuning") + model_evaluation: "Model_Evaluation" = Relationship(back_populates="fine_tuning") class FineTuningUpdate(SQLModel): diff --git a/backend/app/models/project.py b/backend/app/models/project.py index bf998fad..755b6045 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -52,7 +52,7 @@ class Project(ProjectBase, table=True): fine_tuning: list["Fine_Tuning"] = Relationship( back_populates="project", cascade_delete=True ) - model_evalutaion: list["Model_Evaluation"] = Relationship( + model_evaluation: list["Model_Evaluation"] = Relationship( back_populates="project", cascade_delete=True ) openai_conversations: list["OpenAIConversation"] = Relationship( From f8a1e3027a80dfe7e2fa7d1e68b8d6066ad3e0e4 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 11 Aug 2025 12:47:13 +0530 Subject: [PATCH 03/10] routes, crud and core --- backend/app/api/main.py | 2 + backend/app/api/routes/model_evaluation.py | 183 ++++++++++++++++++ backend/app/core/finetune/evaluation.py | 213 +++++++++++++++++++++ backend/app/crud/__init__.py | 9 + backend/app/crud/model_evaluation.py | 199 +++++++++++++++++++ 5 files changed, 606 insertions(+) create mode 100644 backend/app/api/routes/model_evaluation.py create mode 100644 backend/app/core/finetune/evaluation.py create mode 100644 backend/app/crud/model_evaluation.py diff --git a/backend/app/api/main.py b/backend/app/api/main.py index 85ce1cf6..4f22a1cc 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -18,6 +18,7 @@ onboarding, credentials, fine_tuning, + model_evaluation, ) from app.core.config import settings @@ -38,6 +39,7 @@ api_router.include_router(users.router) api_router.include_router(utils.router) api_router.include_router(fine_tuning.router) +api_router.include_router(model_evaluation.router) if settings.ENVIRONMENT == "local": diff --git 
a/backend/app/api/routes/model_evaluation.py b/backend/app/api/routes/model_evaluation.py new file mode 100644 index 00000000..0f2a74c5 --- /dev/null +++ b/backend/app/api/routes/model_evaluation.py @@ -0,0 +1,183 @@ +import logging +import time +from uuid import UUID + +from fastapi import APIRouter, HTTPException, BackgroundTasks +from sqlmodel import Session +from openai import OpenAI + +from app.crud import ( + fetch_by_id, + create_model_evaluation, + fetch_by_eval_id, + fetch_active_model_evals, + fetch_eval_by_doc_id, + update_model_eval, + fetch_top_model_by_doc_id, +) +from app.models import ( + ModelEvaluationBase, + ModelEvaluationCreate, + ModelEvaluationStatus, + ModelEvaluationUpdate, + ModelEvaluationPublic, +) +from app.core.finetune.evaluation import ModelEvaluator +from app.utils import get_openai_client, APIResponse +from app.api.deps import CurrentUserOrgProject, SessionDep + + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/model_evaluation", tags=["model_evaluation"]) + + +metric = ["mcc", "f1", "accuracy"] + + +def run_model_evaluation( + eval_id: int, + session: Session, + current_user: CurrentUserOrgProject, + client: OpenAI, +): + start_time = time.time() + + logger.info( + f"[run_model_evaluation] Starting evaluation | eval ID={eval_id}, project_id={current_user.project_id}" + ) + + model_eval = fetch_by_eval_id(session, eval_id, current_user.project_id) + update_model_eval( + session=session, + model_eval=model_eval, + update=ModelEvaluationUpdate(status=ModelEvaluationStatus.running), + ) + + try: + evaluator = ModelEvaluator( + model_name=model_eval.model_name, + testing_file_id=model_eval.testing_file_id, + system_prompt=model_eval.system_prompt, + client=client, + ) + result = evaluator.run() + end_time = time.time() + elapsed_time = end_time - start_time + + logger.info( + f"[run_model_evaluation] Evaluation completed successfully | eval ID={eval_id}, " + f"model_name={model_eval.model_name}, project_id={current_user.project_id}. " + f"Elapsed time: {elapsed_time:.2f} seconds" + ) + + update_data = ModelEvaluationUpdate( + score=result, + metric=list(result.keys()), + status=ModelEvaluationStatus.completed, + ) + update_model_eval( + session=session, + model_eval=model_eval, + update=update_data, + ) + except Exception as e: + end_time = time.time() + elapsed_time = end_time - start_time + + logger.error( + f"[run_model_evaluation] Evaluation failed | eval ID={eval_id}, project_id={current_user.project_id}: " + f"{str(e)}. 
Elapsed time: {elapsed_time:.2f} seconds" + ) + + update_model_eval( + session=session, + model_eval=model_eval, + update=ModelEvaluationUpdate( + status=ModelEvaluationStatus.failed, + error_message="failed during background job processing", + ), + ) + + +@router.post("/evaluate-model/", response_model=APIResponse) +def evaluate_model( + request: ModelEvaluationCreate, + background_tasks: BackgroundTasks, + session: SessionDep, + current_user: CurrentUserOrgProject, +): + client = get_openai_client( + session, current_user.organization_id, current_user.project_id + ) + + if not request.fine_tuning_ids: + logger.error( + f"[evaluate_model] No fine tuning IDs provided | project_id:{current_user.project_id}" + ) + raise HTTPException(status_code=400, detail="No fine-tuned job IDs provided") + + evals: list[ModelEvaluationPublic] = [] + + for job_id in request.fine_tuning_ids: + fine_tune = fetch_by_id(session, job_id, current_user.project_id) + active_evals = fetch_active_model_evals( + session, job_id, current_user.project_id + ) + + if active_evals: + logger.info( + f"[evaluate_model] Skipping creation for {job_id}. Active evaluation exists, project_id:{current_user.project_id}" + ) + evals.extend( + ModelEvaluationPublic.model_validate(ev) for ev in active_evals + ) + continue + + model_eval = create_model_evaluation( + session=session, + request=ModelEvaluationBase(fine_tuning_id=fine_tune.id), + project_id=current_user.project_id, + organization_id=current_user.organization_id, + metric=metric, + status=ModelEvaluationStatus.pending, + ) + + evals.append(ModelEvaluationPublic.model_validate(model_eval)) + + logger.info( + f"[evaluate_model] Created evaluation for fine_tuning_id {job_id} with eval ID={model_eval.id}, project_id:{current_user.project_id}" + ) + + background_tasks.add_task( + run_model_evaluation, model_eval.id, session, current_user, client + ) + + return APIResponse.success_response( + {"message": "Model evaluation(s) started successfully", "data": evals} + ) + + +@router.get( + "/{document_id}/top_model", response_model=APIResponse[ModelEvaluationPublic] +) +def get_top_model_by_doc_id( + document_id: UUID, session: SessionDep, current_user: CurrentUserOrgProject +): + logger.info( + f"[get_top_model_by_doc_id]Fetching top model for document_id: {document_id}, project_id: {current_user.project_id}" + ) + top_model = fetch_top_model_by_doc_id(session, document_id, current_user.project_id) + + return APIResponse.success_response(top_model) + + +@router.get("/{document_id}", response_model=APIResponse[list[ModelEvaluationPublic]]) +def get_evals_by_doc_id( + document_id: UUID, session: SessionDep, current_user: CurrentUserOrgProject +): + logger.info( + f"[get_evals_by_doc_id]Fetching evaluations for document_id: {document_id}, project_id: {current_user.project_id}" + ) + evaluations = fetch_eval_by_doc_id(session, document_id, current_user.project_id) + return APIResponse.success_response(evaluations) diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py new file mode 100644 index 00000000..e8441509 --- /dev/null +++ b/backend/app/core/finetune/evaluation.py @@ -0,0 +1,213 @@ +import json +import difflib +import time +import logging +from typing import List, Tuple, Set + +import openai +from openai import OpenAI +from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score +from app.api.routes.fine_tuning import handle_openai_error + + +logger = logging.getLogger(__name__) + + +class ModelEvaluator: + max_latency = 90 + 
retries = 3 + normalization_cutoff = 0.7 + + def __init__( + self, + model_name: str, + testing_file_id: str, + system_prompt: str, + client: openai.OpenAI, + ): + self.model_name = model_name + self.testing_file_id = testing_file_id + self.system_instruction = system_prompt + self.client = client + + self.allowed_labels: Set[str] = set() + self.y_true: List[str] = [] + self.prompts: List[str] = [] + + logger.info(f"ModelEvaluator initialized with model: {model_name}") + + def load_labels_and_prompts(self) -> None: + """Loads labels and prompts directly from OpenAI file content using the file ID.""" + logger.info( + f"[load_labels_and_prompts] Loading labels and prompts from file ID: {self.testing_file_id}" + ) + try: + response = self.client.files.content(self.testing_file_id) + file_bytes = response.read() + lines = file_bytes.decode("utf-8").splitlines() + + for ln, line in enumerate(lines, 1): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + msgs = obj.get("messages", []) + if not isinstance(msgs, list) or not msgs: + logger.error( + f"[load_labels_and_prompts] Line {ln}: 'messages' missing or invalid" + ) + raise ValueError(f"Line {ln}: 'messages' missing or invalid") + + user_msgs = [ + m for m in msgs if m.get("role") == "user" and "content" in m + ] + model_msgs = [ + m + for m in msgs + if m.get("role") == "assistant" and "content" in m + ] + if not user_msgs or not model_msgs: + logger.error( + f"[load_labels_and_prompts] Line {ln}: missing user or assistant message" + ) + raise ValueError( + f"Line {ln}: missing user or assistant message" + ) + + prompt = user_msgs[-1]["content"] + label = model_msgs[-1]["content"].strip().lower() + self.prompts.append(prompt) + self.y_true.append(label) + self.allowed_labels.add(label) + + except Exception as e: + logger.error( + f"[load_labels_and_prompts] Error processing line {ln}: {str(e)}" + ) + raise + + logger.info( + f"[load_labels_and_prompts] Loaded {len(self.prompts)} prompts and {len(self.y_true)} labels." + ) + + except Exception as e: + logger.error( + f"[load_labels_and_prompts] Failed to load file content: {str(e)}" + ) + raise + + def normalize_prediction(self, text: str) -> str: + logger.debug(f"[normalize_prediction] Normalizing prediction: {text}") + t = (text or "").strip().lower() + + if t in self.allowed_labels: + return t + + closest = difflib.get_close_matches( + t, self.allowed_labels, n=1, cutoff=self.normalization_cutoff + ) + if closest: + return closest[0] + + logger.warning( + f"[normalize_prediction] No close match found for '{t}'. Using default label '{next(iter(self.allowed_labels))}'." + ) + return next(iter(self.allowed_labels)) + + def generate_predictions(self) -> List[str]: + logger.info( + f"[generate_predictions] Generating predictions for {len(self.prompts)} prompts." 
+ ) + predictions = [] + total_prompts = len(self.prompts) + + for idx, prompt in enumerate(self.prompts, 1): + attempt = 0 + while attempt < self.retries: + start_time = time.time() + logger.info( + f"[generate_predictions] Processing prompt {idx}/{total_prompts} (Attempt {attempt + 1}/{self.retries})" + ) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=[ + {"role": "system", "content": self.system_instruction}, + {"role": "user", "content": prompt}, + ], + temperature=0, + max_tokens=3, + ) + + elapsed_time = time.time() - start_time + if elapsed_time > self.max_latency: + logger.warning( + f"[generate_predictions] Timeout exceeded for prompt {idx}/{total_prompts}. Retrying..." + ) + continue + + raw = response.choices[0].message.content or "" + prediction = self.normalize_prediction(raw) + predictions.append(prediction) + break + + except openai.OpenAIError as e: + error_msg = str(e) + logger.error( + f"[generate_predictions] OpenAI API error at prompt {idx}/{total_prompts}: {error_msg}" + ) + attempt += 1 + if attempt == self.retries: + predictions.append( + "openai_error" + ) # Placeholder for failed predictions + logger.error( + f"[generate_predictions] Maximum retries reached for prompt {idx}/{total_prompts}. Appending 'openai_error'." + ) + else: + logger.info( + f"[generate_predictions] Retrying prompt {idx}/{total_prompts} after OpenAI error ({attempt}/{self.retries})." + ) + + logger.info( + f"[generate_predictions] Generated predictions for {len(predictions)} prompts." + ) + return predictions + + def evaluate(self, y_pred: List[str]) -> dict: + """Evaluate the predictions against the true labels.""" + logger.info(f"[evaluate] Starting evaluation with {len(y_pred)} predictions.") + + try: + mcc_score = round(matthews_corrcoef(self.y_true, y_pred), 4) + accuracy = round(accuracy_score(self.y_true, y_pred), 4) + f1_query = round( + f1_score(self.y_true, y_pred, pos_label="query", average="binary"), 4 + ) + + logger.info( + f"[evaluate] Evaluation completed. 
MCC: {mcc_score}, Accuracy: {accuracy}, F1 Query: {f1_query}" + ) + + return { + "mcc": mcc_score, + "accuracy": accuracy, + "f1_query": f1_query, + } + except Exception as e: + logger.error(f"[evaluate] Error during evaluation: {str(e)}") + raise + + def run(self) -> dict: + """Run the full evaluation process: load data, generate predictions, evaluate results.""" + try: + self.load_labels_and_prompts() + predictions = self.generate_predictions() + evaluation_results = self.evaluate(predictions) + logger.info("[evaluate] Model evaluation completed successfully.") + return evaluation_results + except Exception as e: + logger.error(f"[evaluate] Error in running ModelEvaluator: {str(e)}") + raise diff --git a/backend/app/crud/__init__.py b/backend/app/crud/__init__.py index a7893260..810e153c 100644 --- a/backend/app/crud/__init__.py +++ b/backend/app/crud/__init__.py @@ -73,3 +73,12 @@ update_finetune_job, fetch_active_jobs_by_document_id, ) + +from .model_evaluation import ( + create_model_evaluation, + fetch_active_model_evals, + fetch_by_eval_id, + fetch_eval_by_doc_id, + fetch_top_model_by_doc_id, + update_model_eval, +) diff --git a/backend/app/crud/model_evaluation.py b/backend/app/crud/model_evaluation.py new file mode 100644 index 00000000..319982d2 --- /dev/null +++ b/backend/app/crud/model_evaluation.py @@ -0,0 +1,199 @@ +from typing import Optional +import logging +from uuid import UUID + +from fastapi import HTTPException +from sqlmodel import Session, select +from sqlalchemy import func + +from app.crud import fetch_by_id +from app.models import ( + Model_Evaluation, + ModelEvaluationStatus, + ModelEvaluationBase, + ModelEvaluationUpdate, +) +from app.core.util import now + + +logger = logging.getLogger(__name__) + + +def create_model_evaluation( + session: Session, + request: ModelEvaluationBase, + project_id: int, + organization_id: int, + metric: list[str], + status: ModelEvaluationStatus = ModelEvaluationStatus.pending, +) -> Model_Evaluation: + fine_tune = fetch_by_id(session, request.fine_tuning_id, project_id) + + if fine_tune.fine_tuned_model is None: + logger.error( + f"[create_model_evaluation] No fine tuned model found for the given fine tuning ID | fine_tuning_id={request.fine_tuning_id}, project_id={project_id}" + ) + raise HTTPException(404, "Fine tuned model not found") + + base_data = { + "fine_tuning_id": request.fine_tuning_id, + "metric": metric, + "system_prompt": fine_tune.system_prompt, + "base_model": fine_tune.base_model, + "split_ratio": fine_tune.split_ratio, + "model_name": fine_tune.fine_tuned_model, + "document_id": fine_tune.document_id, + "testing_file_id": fine_tune.testing_file_id, + "project_id": project_id, + "organization_id": organization_id, + "status": status, + } + + model_eval = Model_Evaluation(**base_data) + model_eval.updated_at = now() + + session.add(model_eval) + session.commit() + session.refresh(model_eval) + + logger.info( + f"[Create_fine_tuning_job]Created new model evaluation from job ID={fine_tune.id}, project_id={project_id}" + ) + return model_eval + + +def fetch_by_eval_id( + session: Session, eval_id: int, project_id: int +) -> Model_Evaluation: + model_eval = session.exec( + select(Model_Evaluation).where( + Model_Evaluation.id == eval_id, Model_Evaluation.project_id == project_id + ) + ).one_or_none() + + if model_eval is None: + logger.error( + f"[fetch_by_id]Model evaluation not found for eval_id={eval_id}, project_id={project_id}" + ) + raise HTTPException(status_code=404, detail="model eval not found") + + 
logger.info( + f"[fetch_by_id]Fetched model evaluation for eval ID={model_eval.id}, project_id={project_id}" + ) + return model_eval + + +def fetch_eval_by_doc_id( + session: Session, + document_id: UUID, + project_id: int, +) -> list[Model_Evaluation]: + query = ( + select(Model_Evaluation) + .where( + Model_Evaluation.document_id == document_id, + Model_Evaluation.project_id == project_id, + ) + .order_by(Model_Evaluation.updated_at.desc()) + ) + + model_evals = session.exec(query).all() + + if not model_evals: + logger.error( + f"[fetch_eval_by_doc_id]Model evaluation not found for document_id={document_id}, project_id={project_id}" + ) + raise HTTPException(status_code=404, detail="Model evaluation not found") + + logger.info( + f"[fetch_eval_by_doc_id]Found {len(model_evals)} model evaluation(s) for document_id={document_id}, " + f"project_id={project_id}, sorted by MCC" + ) + + return model_evals + + +def fetch_top_model_by_doc_id( + session: Session, document_id: UUID, project_id: int +) -> Model_Evaluation: + query = ( + select(Model_Evaluation) + .where( + Model_Evaluation.document_id == document_id, + Model_Evaluation.project_id == project_id, + ) + .order_by(Model_Evaluation.updated_at.desc()) + ) + + model_evals = session.exec(query).all() + + top_model = None + highest_mcc = -float("inf") + + for model_eval in model_evals: + if model_eval.score is not None: + mcc = model_eval.score.get("mcc", None) + if mcc is not None and mcc > highest_mcc: + highest_mcc = mcc + top_model = model_eval + + if not top_model: + logger.error( + f"[fetch_top_model_by_doc_id]No model evaluation found with populated score for document_id={document_id}, project_id={project_id}" + ) + raise HTTPException(status_code=404, detail="No top model found") + + logger.info( + f"[fetch_top_model_by_doc_id]Found top model evaluation for document_id={document_id}, " + f"project_id={project_id}, sorted by MCC" + ) + + return top_model + + +def fetch_active_model_evals( + session: Session, + fine_tuning_id: int, + project_id: int, +) -> list["Model_Evaluation"]: + """ + Return all ACTIVE model evaluations for the given document & project. + Active = status != failed AND is_deleted is false. 
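+    Evaluations are looked up by fine_tuning_id within the given project.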
+ """ + stmt = ( + select(Model_Evaluation) + .where( + Model_Evaluation.fine_tuning_id == fine_tuning_id, + Model_Evaluation.project_id == project_id, + Model_Evaluation.is_deleted.is_(False), + Model_Evaluation.status != "failed", + ) + .order_by(Model_Evaluation.inserted_at.desc()) + ) + + return session.exec(stmt).all() + + +def update_model_eval( + session: Session, model_eval: Model_Evaluation, update: ModelEvaluationUpdate +) -> Model_Evaluation: + if model_eval is None: + raise HTTPException(status_code=404, detail="Model evaluation not found") + + logger.info( + f"[update_model_eval] Updating model evaluation ID={model_eval.id} with status={update.status}" + ) + + for key, value in update.dict(exclude_unset=True).items(): + setattr(model_eval, key, value) + + model_eval.updated_at = now() + + session.add(model_eval) + session.commit() + session.refresh(model_eval) + + logger.info( + f"[update_model_eval] Successfully updated model evaluation ID={model_eval.id}" + ) + return model_eval From b0c22fd06b15235b248837d92d0ad3a8354e7847 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 11 Aug 2025 12:48:09 +0530 Subject: [PATCH 04/10] test cases and small fixes --- backend/app/models/model_evaluation.py | 2 + .../tests/api/routes/test_model_evaluation.py | 148 ++++++++++++++++++ .../app/tests/crud/test_model_evaluation.py | 138 ++++++++++++++++ backend/app/tests/utils/test_data.py | 54 ++++++- 4 files changed, 340 insertions(+), 2 deletions(-) create mode 100644 backend/app/tests/api/routes/test_model_evaluation.py create mode 100644 backend/app/tests/crud/test_model_evaluation.py diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index eb687438..96146bc7 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -90,6 +90,8 @@ class ModelEvaluationPublic(ModelEvaluationBase): id: int document_id: UUID model_name: str + split_ratio: float + base_model: str score: dict[str, float] | None = None status: ModelEvaluationStatus is_best_model: bool | None = None diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py new file mode 100644 index 00000000..d264710b --- /dev/null +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -0,0 +1,148 @@ +import pytest +from unittest.mock import MagicMock, patch +from app.models import Model_Evaluation +from app.crud import fetch_by_eval_id +from app.tests.utils.test_data import ( + create_test_finetuning_job_with_extra_fields, + create_test_model_evaluation, +) + + +@patch("app.api.routes.model_evaluation.ModelEvaluator") +def test_evaluate_model( + mock_ModelEvaluator, client, db, user_api_key_header, user_api_key +): + fine_tuned, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) + + mock_evaluator = MagicMock() + mock_evaluator.run.return_value = {"mcc": 0.8, "accuracy": 0.9} + mock_ModelEvaluator.return_value = mock_evaluator + + body = {"fine_tuning_ids": [fine_tuned[0].id]} + + response = client.post( + "/api/v1/model_evaluation/evaluate-model/", + json=body, + headers=user_api_key_header, + ) + + assert response.status_code == 200 + json_data = response.json() + + assert json_data["data"]["message"] == "Model evaluation(s) started successfully" + + evaluations = [eval for eval in json_data["data"].get("data", []) if eval] + assert len(evaluations) == 1 + + assert evaluations[0]["status"] == "pending" + + mock_evaluator.run.assert_called_with() + assert 
mock_evaluator.run.call_count == 1 + + updated_model_eval = fetch_by_eval_id( + db, evaluations[0]["id"], user_api_key.project_id + ) + + assert updated_model_eval.score == {"mcc": 0.8, "accuracy": 0.9} + + assert updated_model_eval.fine_tuning_id == fine_tuned[0].id + assert updated_model_eval.model_name == fine_tuned[0].fine_tuned_model + assert updated_model_eval.testing_file_id == fine_tuned[0].testing_file_id + + +@patch("app.api.routes.model_evaluation.ModelEvaluator") +def test_run_model_evaluation_evaluator_run_failure( + mock_ModelEvaluator, client, db, user_api_key_header, user_api_key +): + fine_tuned, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) + fine_tune = fine_tuned[0] + + mock_evaluator = MagicMock() + mock_evaluator.run.side_effect = Exception("Evaluator failed") + mock_ModelEvaluator.return_value = mock_evaluator + + response = client.post( + "/api/v1/model_evaluation/evaluate-model/", + json={"fine_tuning_ids": [fine_tune.id]}, + headers=user_api_key_header, + ) + + json_data = response.json() + model_eval_id = json_data["data"]["data"][0]["id"] + + updated_model_eval = fetch_by_eval_id(db, model_eval_id, user_api_key.project_id) + assert updated_model_eval.status == "failed" + assert updated_model_eval.error_message == "failed during background job processing" + + +def test_evaluate_model_finetuning_not_found(client, db, user_api_key_header): + invalid_fine_tune_id = 9999 + + body = {"fine_tuning_ids": [invalid_fine_tune_id]} + + response = client.post( + "/api/v1/model_evaluation/evaluate-model/", + json=body, + headers=user_api_key_header, + ) + + assert response.status_code == 404 + json_data = response.json() + assert json_data["error"] == f"Job not found" + + +def test_top_model_by_doc(client, db, user_api_key_header): + model_evals = create_test_model_evaluation(db) + model_eval = model_evals[0] + + model_eval.score = {"mcc": 0.85, "accuracy": 0.9} + db.flush() + + response = client.get( + f"/api/v1/model_evaluation/{model_eval.document_id}/top_model", + headers=user_api_key_header, + ) + + assert response.status_code == 200 + json_data = response.json() + + assert json_data["data"]["score"] == {"mcc": 0.85, "accuracy": 0.9} + assert json_data["data"]["model_name"] == model_eval.model_name + assert json_data["data"]["document_id"] == str(model_eval.document_id) + + assert json_data["data"]["id"] == model_eval.id + + +def test_get_top_model_by_doc_id_no_score(client, db, user_api_key_header): + model_evals = create_test_model_evaluation(db) + + document_id = model_evals[0].document_id + + response = client.get( + f"/api/v1/model_evaluation/{document_id}/top_model", headers=user_api_key_header + ) + + assert response.status_code == 404 + + json_data = response.json() + assert json_data["error"] == "No top model found" + + +def test_get_evals_by_doc_id(client, db, user_api_key_header): + model_evals = create_test_model_evaluation(db) + document_id = model_evals[0].document_id + + response = client.get( + f"/api/v1/model_evaluation/{document_id}", headers=user_api_key_header + ) + + assert response.status_code == 200 + json_data = response.json() + + assert json_data["success"] is True + assert json_data["data"] is not None + assert len(json_data["data"]) == 2 + + evaluations = json_data["data"] + assert all(eval["document_id"] == str(document_id) for eval in evaluations) + assert all(eval["status"] == "pending" for eval in evaluations) diff --git a/backend/app/tests/crud/test_model_evaluation.py b/backend/app/tests/crud/test_model_evaluation.py new 
file mode 100644 index 00000000..aeb908bd --- /dev/null +++ b/backend/app/tests/crud/test_model_evaluation.py @@ -0,0 +1,138 @@ +import pytest +from uuid import UUID +from sqlmodel import Session +from fastapi import HTTPException + +from app.tests.utils.utils import get_document, get_project +from app.tests.utils.test_data import ( + create_test_model_evaluation, + create_test_finetuning_job_with_extra_fields, +) +from app.models import ModelEvaluationBase, ModelEvaluationUpdate, ModelEvaluationStatus +from app.crud import ( + create_model_evaluation, + fetch_by_eval_id, + fetch_eval_by_doc_id, + fetch_top_model_by_doc_id, + fetch_active_model_evals, + update_model_eval, +) + + +def test_create_model_evaluation(db: Session): + project = get_project(db, "Dalgo") + document = get_document(db) + + fine_tune_jobs, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) + fine_tune = fine_tune_jobs[0] + + job_request = ModelEvaluationBase( + fine_tuning_id=fine_tune.id, + metric=["mcc", "f1", "accuracy"], + system_prompt=fine_tune.system_prompt, + base_model=fine_tune.base_model, + model_name=fine_tune.fine_tuned_model, + document_id=fine_tune.document_id, + testing_file_id=fine_tune.testing_file_id, + status="pending", + ) + + created_eval = create_model_evaluation( + session=db, + request=job_request, + project_id=project.id, + organization_id=project.organization_id, + ) + + assert created_eval.id is not None + assert created_eval.status == "pending" + assert created_eval.document_id == fine_tune.document_id + assert created_eval.model_name == fine_tune.fine_tuned_model + assert created_eval.testing_file_id == fine_tune.testing_file_id + + +def test_fetch_by_eval_id_success(db: Session): + model_evals = create_test_model_evaluation(db) + model_eval = model_evals[0] + result = fetch_by_eval_id( + db, eval_id=model_eval.id, project_id=model_eval.project_id + ) + assert result.id == model_eval.id + + +def test_fetch_by_eval_id_not_found(db: Session): + with pytest.raises(HTTPException) as exc: + fetch_by_eval_id(db, eval_id=9999, project_id=1) + assert exc.value.status_code == 404 + + +def test_fetch_eval_by_doc_id_success(db: Session): + model_evals = create_test_model_evaluation(db) + doc_id = model_evals[0].document_id + + result = fetch_eval_by_doc_id( + db, document_id=doc_id, project_id=model_evals[0].project_id + ) + assert len(result) > 0 + + +def test_fetch_eval_by_doc_id_not_found(db: Session): + valid_uuid = UUID("c5d479e2-66a5-40b8-aa76-4a2290b6d1f3") + with pytest.raises(HTTPException) as exc: + fetch_eval_by_doc_id(db, document_id=valid_uuid, project_id=1) + assert exc.value.status_code == 404 + + +def test_fetch_top_model_by_doc_id_success(db: Session): + model_evals = create_test_model_evaluation(db) + model_eval = model_evals[0] + model_eval.score = {"mcc": 0.8, "accuracy": 0.9} + db.flush() + + doc_id = model_eval.document_id + + result = fetch_top_model_by_doc_id( + db, document_id=doc_id, project_id=model_evals[0].project_id + ) + assert result.id == model_eval.id + + +def test_fetch_top_model_by_doc_id_not_found(db: Session): + valid_uuid = UUID("c5d479e2-66a5-40b8-aa76-4a2290b6d1f3") + with pytest.raises(HTTPException) as exc: + fetch_top_model_by_doc_id(db, document_id=valid_uuid, project_id=1) + assert exc.value.status_code == 404 + + +def test_fetch_active_model_evals(db: Session): + model_evals = create_test_model_evaluation(db) + active_evals = fetch_active_model_evals( + db, + fine_tuning_id=model_evals[0].fine_tuning_id, + project_id=model_evals[0].project_id, 
+ ) + assert len(active_evals) > 0 + assert all(eval.status != "failed" for eval in active_evals) + + +def test_update_model_eval_success(db: Session): + model_evals = create_test_model_evaluation(db) + model_eval = model_evals[0] + + update = ModelEvaluationUpdate(status="completed") + updated_eval = update_model_eval(db, model_eval=model_eval, update=update) + + assert updated_eval.status == "completed" + assert updated_eval.updated_at is not None + + +def test_update_model_eval_not_found(db: Session): + with pytest.raises(HTTPException) as exc: + update_model_eval( + db, + model_eval=None, + update=ModelEvaluationUpdate(status=ModelEvaluationStatus.completed), + ) + + assert exc.value.status_code == 404 + assert exc.value.detail == "Model evaluation not found" diff --git a/backend/app/tests/utils/test_data.py b/backend/app/tests/utils/test_data.py index d5cafc8f..6727312e 100644 --- a/backend/app/tests/utils/test_data.py +++ b/backend/app/tests/utils/test_data.py @@ -4,13 +4,15 @@ Organization, Project, APIKey, - Document, Credential, OrganizationCreate, ProjectCreate, CredsCreate, FineTuningJobCreate, Fine_Tuning, + Model_Evaluation, + ModelEvaluationBase, + ModelEvaluationStatus, ) from app.crud import ( create_organization, @@ -18,6 +20,7 @@ create_api_key, set_creds_for_org, create_fine_tuning_job, + create_model_evaluation, ) from app.core.providers import Provider from app.tests.utils.user import create_random_user @@ -136,7 +139,7 @@ def create_test_fine_tuning_jobs( job_request = FineTuningJobCreate( document_id=document.id, base_model="gpt-4", - split_ratio=[0.5], + split_ratio=[ratio], system_prompt="str", ) job, created = create_fine_tuning_job( @@ -151,3 +154,50 @@ def create_test_fine_tuning_jobs( any_created = True return jobs, any_created + + +def create_test_finetuning_job_with_extra_fields( + db: Session, + ratios: list[float], +) -> tuple[list[Fine_Tuning], bool]: + jobs, _ = create_test_fine_tuning_jobs(db, [0.5, 0.7]) + + if jobs: + for job in jobs: + job.testing_file_id = "testing_file_id_example" + job.fine_tuned_model = "fine_tuned_model_name" + + return jobs, True + + +def create_test_model_evaluation(db) -> list[Model_Evaluation]: + fine_tune_jobs, any_created = create_test_finetuning_job_with_extra_fields( + db, [0.5, 0.7] + ) + + model_evaluations = [] + + for fine_tune in fine_tune_jobs: + request = ModelEvaluationBase( + fine_tuning_id=fine_tune.id, + system_prompt=fine_tune.system_prompt, + base_model=fine_tune.base_model, + model_name=fine_tune.fine_tuned_model, + document_id=fine_tune.document_id, + testing_file_id=fine_tune.testing_file_id + if fine_tune.testing_file_id + else None, + metric=["mcc", "f1", "accuracy"], + ) + + model_eval = create_model_evaluation( + session=db, + request=request, + project_id=fine_tune.project_id, + organization_id=fine_tune.organization_id, + status=ModelEvaluationStatus.pending, + ) + + model_evaluations.append(model_eval) + + return model_evaluations From 57f13178c6d593be3da3252cfaea3c0b99b154ec Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 11 Aug 2025 13:04:07 +0530 Subject: [PATCH 05/10] removing unused imports and other small fixes --- backend/app/core/finetune/evaluation.py | 14 +++++++------- backend/app/crud/model_evaluation.py | 2 -- .../app/tests/api/routes/test_model_evaluation.py | 3 +-- backend/app/tests/crud/test_model_evaluation.py | 10 ++++++---- backend/app/tests/utils/test_data.py | 4 +++- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/backend/app/core/finetune/evaluation.py 
b/backend/app/core/finetune/evaluation.py index e8441509..49cda937 100644 --- a/backend/app/core/finetune/evaluation.py +++ b/backend/app/core/finetune/evaluation.py @@ -2,7 +2,7 @@ import difflib import time import logging -from typing import List, Tuple, Set +from typing import Set import openai from openai import OpenAI @@ -23,7 +23,7 @@ def __init__( model_name: str, testing_file_id: str, system_prompt: str, - client: openai.OpenAI, + client: OpenAI, ): self.model_name = model_name self.testing_file_id = testing_file_id @@ -31,8 +31,8 @@ def __init__( self.client = client self.allowed_labels: Set[str] = set() - self.y_true: List[str] = [] - self.prompts: List[str] = [] + self.y_true: list[str] = [] + self.prompts: list[str] = [] logger.info(f"ModelEvaluator initialized with model: {model_name}") @@ -115,7 +115,7 @@ def normalize_prediction(self, text: str) -> str: ) return next(iter(self.allowed_labels)) - def generate_predictions(self) -> List[str]: + def generate_predictions(self) -> list[str]: logger.info( f"[generate_predictions] Generating predictions for {len(self.prompts)} prompts." ) @@ -154,7 +154,7 @@ def generate_predictions(self) -> List[str]: break except openai.OpenAIError as e: - error_msg = str(e) + error_msg = handle_openai_error(e) logger.error( f"[generate_predictions] OpenAI API error at prompt {idx}/{total_prompts}: {error_msg}" ) @@ -176,7 +176,7 @@ def generate_predictions(self) -> List[str]: ) return predictions - def evaluate(self, y_pred: List[str]) -> dict: + def evaluate(self, y_pred: list[str]) -> dict: """Evaluate the predictions against the true labels.""" logger.info(f"[evaluate] Starting evaluation with {len(y_pred)} predictions.") diff --git a/backend/app/crud/model_evaluation.py b/backend/app/crud/model_evaluation.py index 319982d2..9102f19c 100644 --- a/backend/app/crud/model_evaluation.py +++ b/backend/app/crud/model_evaluation.py @@ -1,10 +1,8 @@ -from typing import Optional import logging from uuid import UUID from fastapi import HTTPException from sqlmodel import Session, select -from sqlalchemy import func from app.crud import fetch_by_id from app.models import ( diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py index d264710b..a060bb40 100644 --- a/backend/app/tests/api/routes/test_model_evaluation.py +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -1,6 +1,5 @@ -import pytest from unittest.mock import MagicMock, patch -from app.models import Model_Evaluation + from app.crud import fetch_by_eval_id from app.tests.utils.test_data import ( create_test_finetuning_job_with_extra_fields, diff --git a/backend/app/tests/crud/test_model_evaluation.py b/backend/app/tests/crud/test_model_evaluation.py index aeb908bd..366b9a20 100644 --- a/backend/app/tests/crud/test_model_evaluation.py +++ b/backend/app/tests/crud/test_model_evaluation.py @@ -1,9 +1,10 @@ -import pytest from uuid import UUID + +import pytest from sqlmodel import Session from fastapi import HTTPException -from app.tests.utils.utils import get_document, get_project +from app.tests.utils.utils import get_project from app.tests.utils.test_data import ( create_test_model_evaluation, create_test_finetuning_job_with_extra_fields, @@ -21,14 +22,14 @@ def test_create_model_evaluation(db: Session): project = get_project(db, "Dalgo") - document = get_document(db) fine_tune_jobs, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) fine_tune = fine_tune_jobs[0] + metric = ["mcc", "f1", "accuracy"] + 
job_request = ModelEvaluationBase( fine_tuning_id=fine_tune.id, - metric=["mcc", "f1", "accuracy"], system_prompt=fine_tune.system_prompt, base_model=fine_tune.base_model, model_name=fine_tune.fine_tuned_model, @@ -42,6 +43,7 @@ def test_create_model_evaluation(db: Session): request=job_request, project_id=project.id, organization_id=project.organization_id, + metric=metric, ) assert created_eval.id is not None diff --git a/backend/app/tests/utils/test_data.py b/backend/app/tests/utils/test_data.py index 6727312e..a3cd712f 100644 --- a/backend/app/tests/utils/test_data.py +++ b/backend/app/tests/utils/test_data.py @@ -177,6 +177,8 @@ def create_test_model_evaluation(db) -> list[Model_Evaluation]: model_evaluations = [] + metric = ["mcc", "f1", "accuracy"] + for fine_tune in fine_tune_jobs: request = ModelEvaluationBase( fine_tuning_id=fine_tune.id, @@ -187,7 +189,6 @@ def create_test_model_evaluation(db) -> list[Model_Evaluation]: testing_file_id=fine_tune.testing_file_id if fine_tune.testing_file_id else None, - metric=["mcc", "f1", "accuracy"], ) model_eval = create_model_evaluation( @@ -195,6 +196,7 @@ def create_test_model_evaluation(db) -> list[Model_Evaluation]: request=request, project_id=fine_tune.project_id, organization_id=fine_tune.organization_id, + metric=metric, status=ModelEvaluationStatus.pending, ) From ac242290f6049a4b3e79c565345d894834fbdfe8 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Mon, 11 Aug 2025 22:52:53 +0530 Subject: [PATCH 06/10] router name uniformity --- backend/app/api/routes/model_evaluation.py | 8 ++++---- backend/app/tests/api/routes/test_model_evaluation.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/backend/app/api/routes/model_evaluation.py b/backend/app/api/routes/model_evaluation.py index 0f2a74c5..d89dd759 100644 --- a/backend/app/api/routes/model_evaluation.py +++ b/backend/app/api/routes/model_evaluation.py @@ -100,8 +100,8 @@ def run_model_evaluation( ) -@router.post("/evaluate-model/", response_model=APIResponse) -def evaluate_model( +@router.post("/evaluate_models/", response_model=APIResponse) +def evaluate_models( request: ModelEvaluationCreate, background_tasks: BackgroundTasks, session: SessionDep, @@ -120,7 +120,7 @@ def evaluate_model( evals: list[ModelEvaluationPublic] = [] for job_id in request.fine_tuning_ids: - fine_tune = fetch_by_id(session, job_id, current_user.project_id) + fine_tuning_job = fetch_by_id(session, job_id, current_user.project_id) active_evals = fetch_active_model_evals( session, job_id, current_user.project_id ) @@ -136,7 +136,7 @@ def evaluate_model( model_eval = create_model_evaluation( session=session, - request=ModelEvaluationBase(fine_tuning_id=fine_tune.id), + request=ModelEvaluationBase(fine_tuning_id=fine_tuning_job.id), project_id=current_user.project_id, organization_id=current_user.organization_id, metric=metric, diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py index a060bb40..fd5e94c6 100644 --- a/backend/app/tests/api/routes/test_model_evaluation.py +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -20,7 +20,7 @@ def test_evaluate_model( body = {"fine_tuning_ids": [fine_tuned[0].id]} response = client.post( - "/api/v1/model_evaluation/evaluate-model/", + "/api/v1/model_evaluation/evaluate_models/", json=body, headers=user_api_key_header, ) @@ -61,12 +61,13 @@ def test_run_model_evaluation_evaluator_run_failure( mock_ModelEvaluator.return_value = mock_evaluator response = client.post( - 
"/api/v1/model_evaluation/evaluate-model/", + "/api/v1/model_evaluation/evaluate_models/", json={"fine_tuning_ids": [fine_tune.id]}, headers=user_api_key_header, ) json_data = response.json() + print("jonnn", json_data) model_eval_id = json_data["data"]["data"][0]["id"] updated_model_eval = fetch_by_eval_id(db, model_eval_id, user_api_key.project_id) @@ -80,7 +81,7 @@ def test_evaluate_model_finetuning_not_found(client, db, user_api_key_header): body = {"fine_tuning_ids": [invalid_fine_tune_id]} response = client.post( - "/api/v1/model_evaluation/evaluate-model/", + "/api/v1/model_evaluation/evaluate_models/", json=body, headers=user_api_key_header, ) From 7ed1e0ad665fd02c2c6fb9042b10f063e4a0e300 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 13 Aug 2025 01:38:15 +0530 Subject: [PATCH 07/10] pr review fixes --- ...317d05f49e4_add_model_evaluation_table.py} | 3 +- backend/app/api/routes/model_evaluation.py | 133 ++++++++++-------- backend/app/core/finetune/evaluation.py | 33 ++++- backend/app/crud/model_evaluation.py | 75 +++++----- backend/app/models/__init__.py | 2 +- backend/app/models/fine_tuning.py | 2 +- backend/app/models/model_evaluation.py | 16 ++- backend/app/models/project.py | 2 +- .../app/tests/crud/test_model_evaluation.py | 25 ++-- backend/app/tests/utils/test_data.py | 7 +- 10 files changed, 167 insertions(+), 131 deletions(-) rename backend/app/alembic/versions/{e317d05f49e4_add_evaluation_table.py => e317d05f49e4_add_model_evaluation_table.py} (95%) diff --git a/backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py b/backend/app/alembic/versions/e317d05f49e4_add_model_evaluation_table.py similarity index 95% rename from backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py rename to backend/app/alembic/versions/e317d05f49e4_add_model_evaluation_table.py index 9c47f3bf..cb9b9172 100644 --- a/backend/app/alembic/versions/e317d05f49e4_add_evaluation_table.py +++ b/backend/app/alembic/versions/e317d05f49e4_add_model_evaluation_table.py @@ -1,4 +1,4 @@ -"""add evaluation table +"""add model evaluation table Revision ID: e317d05f49e4 Revises: db9b5413d3ce @@ -46,7 +46,6 @@ def upgrade(): sa.Column("base_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False), sa.Column("split_ratio", sa.Float(), nullable=False), sa.Column("system_prompt", sa.Text(), nullable=False), - sa.Column("metric", postgresql.JSON(astext_type=sa.Text()), nullable=False), sa.Column("score", postgresql.JSON(astext_type=sa.Text()), nullable=True), sa.Column( "status", diff --git a/backend/app/api/routes/model_evaluation.py b/backend/app/api/routes/model_evaluation.py index d89dd759..23859f65 100644 --- a/backend/app/api/routes/model_evaluation.py +++ b/backend/app/api/routes/model_evaluation.py @@ -9,7 +9,6 @@ from app.crud import ( fetch_by_id, create_model_evaluation, - fetch_by_eval_id, fetch_active_model_evals, fetch_eval_by_doc_id, update_model_eval, @@ -22,6 +21,7 @@ ModelEvaluationUpdate, ModelEvaluationPublic, ) +from app.core.db import engine from app.core.finetune.evaluation import ModelEvaluator from app.utils import get_openai_client, APIResponse from app.api.deps import CurrentUserOrgProject, SessionDep @@ -32,72 +32,61 @@ router = APIRouter(prefix="/model_evaluation", tags=["model_evaluation"]) -metric = ["mcc", "f1", "accuracy"] - - def run_model_evaluation( eval_id: int, - session: Session, current_user: CurrentUserOrgProject, client: OpenAI, ): start_time = time.time() - logger.info( - f"[run_model_evaluation] Starting evaluation | eval ID={eval_id}, 
project_id={current_user.project_id}" - ) - - model_eval = fetch_by_eval_id(session, eval_id, current_user.project_id) - update_model_eval( - session=session, - model_eval=model_eval, - update=ModelEvaluationUpdate(status=ModelEvaluationStatus.running), + f"[run_model_evaluation] Starting | eval_id={eval_id}, project_id={current_user.project_id}" ) - try: - evaluator = ModelEvaluator( - model_name=model_eval.model_name, - testing_file_id=model_eval.testing_file_id, - system_prompt=model_eval.system_prompt, - client=client, - ) - result = evaluator.run() - end_time = time.time() - elapsed_time = end_time - start_time - - logger.info( - f"[run_model_evaluation] Evaluation completed successfully | eval ID={eval_id}, " - f"model_name={model_eval.model_name}, project_id={current_user.project_id}. " - f"Elapsed time: {elapsed_time:.2f} seconds" - ) + with Session(engine) as db: + try: + model_eval = update_model_eval( + session=db, + eval_id=eval_id, + project_id=current_user.project_id, + update=ModelEvaluationUpdate(status=ModelEvaluationStatus.running), + ) - update_data = ModelEvaluationUpdate( - score=result, - metric=list(result.keys()), - status=ModelEvaluationStatus.completed, - ) - update_model_eval( - session=session, - model_eval=model_eval, - update=update_data, - ) - except Exception as e: - end_time = time.time() - elapsed_time = end_time - start_time + evaluator = ModelEvaluator( + model_name=model_eval.model_name, + testing_file_id=model_eval.testing_file_id, + system_prompt=model_eval.system_prompt, + client=client, + ) + result = evaluator.run() + + update_model_eval( + session=db, + eval_id=eval_id, + project_id=current_user.project_id, + update=ModelEvaluationUpdate( + score=result, status=ModelEvaluationStatus.completed + ), + ) - logger.error( - f"[run_model_evaluation] Evaluation failed | eval ID={eval_id}, project_id={current_user.project_id}: " - f"{str(e)}. Elapsed time: {elapsed_time:.2f} seconds" - ) + elapsed = time.time() - start_time + logger.info( + f"[run_model_evaluation] Completed | eval_id={eval_id}, project_id={current_user.project_id}, elapsed={elapsed:.2f}s" + ) - update_model_eval( - session=session, - model_eval=model_eval, - update=ModelEvaluationUpdate( - status=ModelEvaluationStatus.failed, - error_message="failed during background job processing", - ), - ) + except Exception as e: + logger.error( + f"[run_model_evaluation] Failed | eval_id={eval_id}, project_id={current_user.project_id}: {e}" + ) + db.rollback() + update_model_eval( + session=db, + eval_id=eval_id, + project_id=current_user.project_id, + update=ModelEvaluationUpdate( + status=ModelEvaluationStatus.failed, + error_message="failed during background job processing", + ), + ) @router.post("/evaluate_models/", response_model=APIResponse) @@ -107,6 +96,19 @@ def evaluate_models( session: SessionDep, current_user: CurrentUserOrgProject, ): + """ + Start evaluations for one or more fine-tuning jobs. + + Request:{ fine_tuning_ids: list[int] } (one or many). + + Process: + For each ID, it fetches the fine-tuned model and its testing file from fine tuning table, + then queues a background task that runs predictions on the test set + and computes evaluation scores. + + Response: + APIResponse with the created/active evaluation records and a success message. 
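+
+    Example request body (illustrative; the IDs are placeholders for fine-tuning
+    job IDs in the current project):
+
+        {"fine_tuning_ids": [12, 34]}
+
+    Jobs that already have an active (non-failed, non-deleted) evaluation are
+    skipped; their existing evaluation records are returned instead.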
+ """ client = get_openai_client( session, current_user.organization_id, current_user.project_id ) @@ -139,7 +141,6 @@ def evaluate_models( request=ModelEvaluationBase(fine_tuning_id=fine_tuning_job.id), project_id=current_user.project_id, organization_id=current_user.organization_id, - metric=metric, status=ModelEvaluationStatus.pending, ) @@ -150,7 +151,7 @@ def evaluate_models( ) background_tasks.add_task( - run_model_evaluation, model_eval.id, session, current_user, client + run_model_evaluation, model_eval.id, current_user, client ) return APIResponse.success_response( @@ -159,16 +160,23 @@ def evaluate_models( @router.get( - "/{document_id}/top_model", response_model=APIResponse[ModelEvaluationPublic] + "/{document_id}/top_model", + response_model=APIResponse[ModelEvaluationPublic], ) def get_top_model_by_doc_id( - document_id: UUID, session: SessionDep, current_user: CurrentUserOrgProject + document_id: UUID, + session: SessionDep, + current_user: CurrentUserOrgProject, ): + """ + Return the top model trained on the given document_id, ranked by + Matthews correlation coefficient (MCC) across all evaluations. + """ logger.info( - f"[get_top_model_by_doc_id]Fetching top model for document_id: {document_id}, project_id: {current_user.project_id}" + f"[get_top_model_by_doc_id] Fetching top model for document_id={document_id}, " + f"project_id={current_user.project_id}" ) top_model = fetch_top_model_by_doc_id(session, document_id, current_user.project_id) - return APIResponse.success_response(top_model) @@ -176,6 +184,9 @@ def get_top_model_by_doc_id( def get_evals_by_doc_id( document_id: UUID, session: SessionDep, current_user: CurrentUserOrgProject ): + """ + Return all model evaluations for the given document_id within the current project. + """ logger.info( f"[get_evals_by_doc_id]Fetching evaluations for document_id: {document_id}, project_id: {current_user.project_id}" ) diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py index 49cda937..53b2e4e0 100644 --- a/backend/app/core/finetune/evaluation.py +++ b/backend/app/core/finetune/evaluation.py @@ -37,7 +37,25 @@ def __init__( logger.info(f"ModelEvaluator initialized with model: {model_name}") def load_labels_and_prompts(self) -> None: - """Loads labels and prompts directly from OpenAI file content using the file ID.""" + """ + Loads labels and prompts directly from OpenAI NDJSON file content using the testing file ID. 
+ + Example data format: + { + "messages": [ + {"role": "system", "content": "You are an assistant that is good at categorizing if what user is saying is a query or non-query"}, + {"role": "user", "content": "what is the colour of the apple"}, + {"role": "assistant", "content": "query"} + ] + } + { + "messages": [ + {"role": "system", "content": "You are an assistant that is good at categorizing if what user is saying is a query or non-query"}, + {"role": "user", "content": "i like apples"}, + {"role": "assistant", "content": "non-query"} + ] + } + """ logger.info( f"[load_labels_and_prompts] Loading labels and prompts from file ID: {self.testing_file_id}" ) @@ -75,8 +93,9 @@ def load_labels_and_prompts(self) -> None: f"Line {ln}: missing user or assistant message" ) - prompt = user_msgs[-1]["content"] - label = model_msgs[-1]["content"].strip().lower() + prompt = user_msgs[0]["content"] + label = (model_msgs[0]["content"] or "").strip().lower() + self.prompts.append(prompt) self.y_true.append(label) self.allowed_labels.add(label) @@ -119,6 +138,7 @@ def generate_predictions(self) -> list[str]: logger.info( f"[generate_predictions] Generating predictions for {len(self.prompts)} prompts." ) + start_preds = time.time() predictions = [] total_prompts = len(self.prompts) @@ -160,9 +180,7 @@ def generate_predictions(self) -> list[str]: ) attempt += 1 if attempt == self.retries: - predictions.append( - "openai_error" - ) # Placeholder for failed predictions + predictions.append("openai_error") logger.error( f"[generate_predictions] Maximum retries reached for prompt {idx}/{total_prompts}. Appending 'openai_error'." ) @@ -171,8 +189,9 @@ def generate_predictions(self) -> list[str]: f"[generate_predictions] Retrying prompt {idx}/{total_prompts} after OpenAI error ({attempt}/{self.retries})." ) + total_elapsed = time.time() - start_preds logger.info( - f"[generate_predictions] Generated predictions for {len(predictions)} prompts." + f"[generate_predictions] Finished {total_prompts} prompts in {total_elapsed:.2f}s | Generated {len(predictions)} predictions." 
) return predictions diff --git a/backend/app/crud/model_evaluation.py b/backend/app/crud/model_evaluation.py index 9102f19c..9f3ddd48 100644 --- a/backend/app/crud/model_evaluation.py +++ b/backend/app/crud/model_evaluation.py @@ -6,7 +6,7 @@ from app.crud import fetch_by_id from app.models import ( - Model_Evaluation, + ModelEvaluation, ModelEvaluationStatus, ModelEvaluationBase, ModelEvaluationUpdate, @@ -22,12 +22,11 @@ def create_model_evaluation( request: ModelEvaluationBase, project_id: int, organization_id: int, - metric: list[str], status: ModelEvaluationStatus = ModelEvaluationStatus.pending, -) -> Model_Evaluation: - fine_tune = fetch_by_id(session, request.fine_tuning_id, project_id) +) -> ModelEvaluation: + fine_tuning_job = fetch_by_id(session, request.fine_tuning_id, project_id) - if fine_tune.fine_tuned_model is None: + if fine_tuning_job.fine_tuned_model is None: logger.error( f"[create_model_evaluation] No fine tuned model found for the given fine tuning ID | fine_tuning_id={request.fine_tuning_id}, project_id={project_id}" ) @@ -35,19 +34,18 @@ def create_model_evaluation( base_data = { "fine_tuning_id": request.fine_tuning_id, - "metric": metric, - "system_prompt": fine_tune.system_prompt, - "base_model": fine_tune.base_model, - "split_ratio": fine_tune.split_ratio, - "model_name": fine_tune.fine_tuned_model, - "document_id": fine_tune.document_id, - "testing_file_id": fine_tune.testing_file_id, + "system_prompt": fine_tuning_job.system_prompt, + "base_model": fine_tuning_job.base_model, + "split_ratio": fine_tuning_job.split_ratio, + "model_name": fine_tuning_job.fine_tuned_model, + "document_id": fine_tuning_job.document_id, + "testing_file_id": fine_tuning_job.testing_file_id, "project_id": project_id, "organization_id": organization_id, "status": status, } - model_eval = Model_Evaluation(**base_data) + model_eval = ModelEvaluation(**base_data) model_eval.updated_at = now() session.add(model_eval) @@ -55,17 +53,17 @@ def create_model_evaluation( session.refresh(model_eval) logger.info( - f"[Create_fine_tuning_job]Created new model evaluation from job ID={fine_tune.id}, project_id={project_id}" + f"[Create_fine_tuning_job]Created new model evaluation from fine tuning job ID={fine_tuning_job.id}, project_id={project_id}" ) return model_eval def fetch_by_eval_id( session: Session, eval_id: int, project_id: int -) -> Model_Evaluation: +) -> ModelEvaluation: model_eval = session.exec( - select(Model_Evaluation).where( - Model_Evaluation.id == eval_id, Model_Evaluation.project_id == project_id + select(ModelEvaluation).where( + ModelEvaluation.id == eval_id, ModelEvaluation.project_id == project_id ) ).one_or_none() @@ -73,7 +71,7 @@ def fetch_by_eval_id( logger.error( f"[fetch_by_id]Model evaluation not found for eval_id={eval_id}, project_id={project_id}" ) - raise HTTPException(status_code=404, detail="model eval not found") + raise HTTPException(status_code=404, detail="Model evaluation not found") logger.info( f"[fetch_by_id]Fetched model evaluation for eval ID={model_eval.id}, project_id={project_id}" @@ -85,14 +83,14 @@ def fetch_eval_by_doc_id( session: Session, document_id: UUID, project_id: int, -) -> list[Model_Evaluation]: +) -> list[ModelEvaluation]: query = ( - select(Model_Evaluation) + select(ModelEvaluation) .where( - Model_Evaluation.document_id == document_id, - Model_Evaluation.project_id == project_id, + ModelEvaluation.document_id == document_id, + ModelEvaluation.project_id == project_id, ) - .order_by(Model_Evaluation.updated_at.desc()) + 
.order_by(ModelEvaluation.updated_at.desc()) ) model_evals = session.exec(query).all() @@ -105,7 +103,7 @@ def fetch_eval_by_doc_id( logger.info( f"[fetch_eval_by_doc_id]Found {len(model_evals)} model evaluation(s) for document_id={document_id}, " - f"project_id={project_id}, sorted by MCC" + f"project_id={project_id}" ) return model_evals @@ -113,14 +111,14 @@ def fetch_eval_by_doc_id( def fetch_top_model_by_doc_id( session: Session, document_id: UUID, project_id: int -) -> Model_Evaluation: +) -> ModelEvaluation: query = ( - select(Model_Evaluation) + select(ModelEvaluation) .where( - Model_Evaluation.document_id == document_id, - Model_Evaluation.project_id == project_id, + ModelEvaluation.document_id == document_id, + ModelEvaluation.project_id == project_id, ) - .order_by(Model_Evaluation.updated_at.desc()) + .order_by(ModelEvaluation.updated_at.desc()) ) model_evals = session.exec(query).all() @@ -153,28 +151,29 @@ def fetch_active_model_evals( session: Session, fine_tuning_id: int, project_id: int, -) -> list["Model_Evaluation"]: +) -> list["ModelEvaluation"]: """ Return all ACTIVE model evaluations for the given document & project. Active = status != failed AND is_deleted is false. """ stmt = ( - select(Model_Evaluation) + select(ModelEvaluation) .where( - Model_Evaluation.fine_tuning_id == fine_tuning_id, - Model_Evaluation.project_id == project_id, - Model_Evaluation.is_deleted.is_(False), - Model_Evaluation.status != "failed", + ModelEvaluation.fine_tuning_id == fine_tuning_id, + ModelEvaluation.project_id == project_id, + ModelEvaluation.is_deleted.is_(False), + ModelEvaluation.status != "failed", ) - .order_by(Model_Evaluation.inserted_at.desc()) + .order_by(ModelEvaluation.inserted_at.desc()) ) return session.exec(stmt).all() def update_model_eval( - session: Session, model_eval: Model_Evaluation, update: ModelEvaluationUpdate -) -> Model_Evaluation: + session: Session, eval_id: int, project_id: int, update: ModelEvaluationUpdate +) -> ModelEvaluation: + model_eval = fetch_by_eval_id(session, eval_id, project_id) if model_eval is None: raise HTTPException(status_code=404, detail="Model evaluation not found") diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 2f4de433..34f7c047 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -73,7 +73,7 @@ ) from .model_evaluation import ( - Model_Evaluation, + ModelEvaluation, ModelEvaluationBase, ModelEvaluationCreate, ModelEvaluationPublic, diff --git a/backend/app/models/fine_tuning.py b/backend/app/models/fine_tuning.py index b035c289..7e2a0f2a 100644 --- a/backend/app/models/fine_tuning.py +++ b/backend/app/models/fine_tuning.py @@ -81,7 +81,7 @@ class Fine_Tuning(FineTuningJobBase, table=True): deleted_at: datetime | None = Field(default=None, nullable=True) project: "Project" = Relationship(back_populates="fine_tuning") - model_evaluation: "Model_Evaluation" = Relationship(back_populates="fine_tuning") + model_evaluation: "ModelEvaluation" = Relationship(back_populates="fine_tuning") class FineTuningUpdate(SQLModel): diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 96146bc7..96f55a2d 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -6,6 +6,7 @@ from sqlmodel import SQLModel, Field, Relationship from sqlalchemy import Column, Text from sqlalchemy.dialects.postgresql import JSON +from pydantic import field_validator from app.core.util import now @@ -28,10 +29,17 @@ class 
ModelEvaluationBase(SQLModel): class ModelEvaluationCreate(SQLModel): fine_tuning_ids: list[int] + @field_validator("fine_tuning_ids") + @classmethod + def dedupe_ids(cls, v: list[int]) -> list[int]: + return list(dict.fromkeys(v)) -class Model_Evaluation(ModelEvaluationBase, table=True): + +class ModelEvaluation(ModelEvaluationBase, table=True): """Database model for keeping a record of model evaluation""" + __tablename__ = "model_evaluation" + id: int = Field(primary_key=True) document_id: UUID = Field( @@ -47,10 +55,6 @@ class Model_Evaluation(ModelEvaluationBase, table=True): nullable=False, description="the ratio the dataset was divided in" ) system_prompt: str = Field(sa_column=Column(Text, nullable=False)) - metric: list[str] = Field( - sa_column=Column(JSON, nullable=False), - description="List of metrics used for evaluation (e.g., ['mcc', 'accuracy'])", - ) score: Optional[dict[str, float]] = Field( sa_column=Column(JSON, nullable=True), description="Evaluation scores per metric (e.g., {'mcc': 0.85})", @@ -78,7 +82,6 @@ class Model_Evaluation(ModelEvaluationBase, table=True): class ModelEvaluationUpdate(SQLModel): - metric: Optional[list[str]] = None score: Optional[dict[str, float]] = None status: Optional[ModelEvaluationStatus] = None error_message: Optional[str] = None @@ -94,7 +97,6 @@ class ModelEvaluationPublic(ModelEvaluationBase): base_model: str score: dict[str, float] | None = None status: ModelEvaluationStatus - is_best_model: bool | None = None inserted_at: datetime updated_at: datetime deleted_at: datetime | None = None diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 755b6045..84f368c4 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -52,7 +52,7 @@ class Project(ProjectBase, table=True): fine_tuning: list["Fine_Tuning"] = Relationship( back_populates="project", cascade_delete=True ) - model_evaluation: list["Model_Evaluation"] = Relationship( + model_evaluation: list["ModelEvaluation"] = Relationship( back_populates="project", cascade_delete=True ) openai_conversations: list["OpenAIConversation"] = Relationship( diff --git a/backend/app/tests/crud/test_model_evaluation.py b/backend/app/tests/crud/test_model_evaluation.py index 366b9a20..d10abf12 100644 --- a/backend/app/tests/crud/test_model_evaluation.py +++ b/backend/app/tests/crud/test_model_evaluation.py @@ -4,12 +4,17 @@ from sqlmodel import Session from fastapi import HTTPException -from app.tests.utils.utils import get_project +from app.tests.utils.utils import get_project, get_non_existent_id from app.tests.utils.test_data import ( create_test_model_evaluation, create_test_finetuning_job_with_extra_fields, ) -from app.models import ModelEvaluationBase, ModelEvaluationUpdate, ModelEvaluationStatus +from app.models import ( + ModelEvaluation, + ModelEvaluationBase, + ModelEvaluationUpdate, + ModelEvaluationStatus, +) from app.crud import ( create_model_evaluation, fetch_by_eval_id, @@ -26,8 +31,6 @@ def test_create_model_evaluation(db: Session): fine_tune_jobs, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) fine_tune = fine_tune_jobs[0] - metric = ["mcc", "f1", "accuracy"] - job_request = ModelEvaluationBase( fine_tuning_id=fine_tune.id, system_prompt=fine_tune.system_prompt, @@ -43,7 +46,6 @@ def test_create_model_evaluation(db: Session): request=job_request, project_id=project.id, organization_id=project.organization_id, - metric=metric, ) assert created_eval.id is not None @@ -64,7 +66,9 @@ def test_fetch_by_eval_id_success(db: 
Session): def test_fetch_by_eval_id_not_found(db: Session): with pytest.raises(HTTPException) as exc: - fetch_by_eval_id(db, eval_id=9999, project_id=1) + fetch_by_eval_id( + db, eval_id=get_non_existent_id(db, ModelEvaluation), project_id=1 + ) assert exc.value.status_code == 404 @@ -122,17 +126,22 @@ def test_update_model_eval_success(db: Session): model_eval = model_evals[0] update = ModelEvaluationUpdate(status="completed") - updated_eval = update_model_eval(db, model_eval=model_eval, update=update) + updated_eval = update_model_eval( + db, eval_id=model_eval.id, project_id=model_eval.project_id, update=update + ) assert updated_eval.status == "completed" assert updated_eval.updated_at is not None def test_update_model_eval_not_found(db: Session): + project = get_project(db) + with pytest.raises(HTTPException) as exc: update_model_eval( db, - model_eval=None, + eval_id=get_non_existent_id(db, ModelEvaluation), + project_id=project.id, update=ModelEvaluationUpdate(status=ModelEvaluationStatus.completed), ) diff --git a/backend/app/tests/utils/test_data.py b/backend/app/tests/utils/test_data.py index a3cd712f..72a2800e 100644 --- a/backend/app/tests/utils/test_data.py +++ b/backend/app/tests/utils/test_data.py @@ -10,7 +10,7 @@ CredsCreate, FineTuningJobCreate, Fine_Tuning, - Model_Evaluation, + ModelEvaluation, ModelEvaluationBase, ModelEvaluationStatus, ) @@ -170,15 +170,13 @@ def create_test_finetuning_job_with_extra_fields( return jobs, True -def create_test_model_evaluation(db) -> list[Model_Evaluation]: +def create_test_model_evaluation(db) -> list[ModelEvaluation]: fine_tune_jobs, any_created = create_test_finetuning_job_with_extra_fields( db, [0.5, 0.7] ) model_evaluations = [] - metric = ["mcc", "f1", "accuracy"] - for fine_tune in fine_tune_jobs: request = ModelEvaluationBase( fine_tuning_id=fine_tune.id, @@ -196,7 +194,6 @@ def create_test_model_evaluation(db) -> list[Model_Evaluation]: request=request, project_id=fine_tune.project_id, organization_id=fine_tune.organization_id, - metric=metric, status=ModelEvaluationStatus.pending, ) From 629c4bfd629a01cb18144dcb602acaaf724b5ee7 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 13 Aug 2025 17:26:08 +0530 Subject: [PATCH 08/10] test case update --- backend/app/crud/model_evaluation.py | 2 +- .../tests/api/routes/test_model_evaluation.py | 72 ++++--------------- 2 files changed, 13 insertions(+), 61 deletions(-) diff --git a/backend/app/crud/model_evaluation.py b/backend/app/crud/model_evaluation.py index 9f3ddd48..96b91477 100644 --- a/backend/app/crud/model_evaluation.py +++ b/backend/app/crud/model_evaluation.py @@ -26,7 +26,7 @@ def create_model_evaluation( ) -> ModelEvaluation: fine_tuning_job = fetch_by_id(session, request.fine_tuning_id, project_id) - if fine_tuning_job.fine_tuned_model is None: + if fine_tuning_job.fine_tuned_model and fine_tuning_job.testing_file_id is None: logger.error( f"[create_model_evaluation] No fine tuned model found for the given fine tuning ID | fine_tuning_id={request.fine_tuning_id}, project_id={project_id}" ) diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py index fd5e94c6..8265ed59 100644 --- a/backend/app/tests/api/routes/test_model_evaluation.py +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -1,81 +1,33 @@ -from unittest.mock import MagicMock, patch +from unittest.mock import patch -from app.crud import fetch_by_eval_id from app.tests.utils.test_data import ( 
create_test_finetuning_job_with_extra_fields, create_test_model_evaluation, ) -@patch("app.api.routes.model_evaluation.ModelEvaluator") -def test_evaluate_model( - mock_ModelEvaluator, client, db, user_api_key_header, user_api_key -): +@patch("app.api.routes.model_evaluation.run_model_evaluation") +def test_evaluate_model(mock_run_eval, client, db, user_api_key_header): fine_tuned, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) - - mock_evaluator = MagicMock() - mock_evaluator.run.return_value = {"mcc": 0.8, "accuracy": 0.9} - mock_ModelEvaluator.return_value = mock_evaluator - body = {"fine_tuning_ids": [fine_tuned[0].id]} - response = client.post( + resp = client.post( "/api/v1/model_evaluation/evaluate_models/", json=body, headers=user_api_key_header, ) + assert resp.status_code == 200, resp.text - assert response.status_code == 200 - json_data = response.json() - - assert json_data["data"]["message"] == "Model evaluation(s) started successfully" - - evaluations = [eval for eval in json_data["data"].get("data", []) if eval] - assert len(evaluations) == 1 - - assert evaluations[0]["status"] == "pending" - - mock_evaluator.run.assert_called_with() - assert mock_evaluator.run.call_count == 1 - - updated_model_eval = fetch_by_eval_id( - db, evaluations[0]["id"], user_api_key.project_id - ) - - assert updated_model_eval.score == {"mcc": 0.8, "accuracy": 0.9} - - assert updated_model_eval.fine_tuning_id == fine_tuned[0].id - assert updated_model_eval.model_name == fine_tuned[0].fine_tuned_model - assert updated_model_eval.testing_file_id == fine_tuned[0].testing_file_id - - -@patch("app.api.routes.model_evaluation.ModelEvaluator") -def test_run_model_evaluation_evaluator_run_failure( - mock_ModelEvaluator, client, db, user_api_key_header, user_api_key -): - fine_tuned, _ = create_test_finetuning_job_with_extra_fields(db, [0.5]) - fine_tune = fine_tuned[0] - - mock_evaluator = MagicMock() - mock_evaluator.run.side_effect = Exception("Evaluator failed") - mock_ModelEvaluator.return_value = mock_evaluator - - response = client.post( - "/api/v1/model_evaluation/evaluate_models/", - json={"fine_tuning_ids": [fine_tune.id]}, - headers=user_api_key_header, - ) - - json_data = response.json() - print("jonnn", json_data) - model_eval_id = json_data["data"]["data"][0]["id"] + j = resp.json() + evals = j["data"]["data"] + assert len(evals) == 1 + assert evals[0]["status"] == "pending" - updated_model_eval = fetch_by_eval_id(db, model_eval_id, user_api_key.project_id) - assert updated_model_eval.status == "failed" - assert updated_model_eval.error_message == "failed during background job processing" + mock_run_eval.assert_called_once() + assert mock_run_eval.call_args[0][0] == evals[0]["id"] -def test_evaluate_model_finetuning_not_found(client, db, user_api_key_header): +def test_evaluate_model_finetuning_not_found(client, user_api_key_header): invalid_fine_tune_id = 9999 body = {"fine_tuning_ids": [invalid_fine_tune_id]} From 2b122f5d0a57dc70160b7d2102fc7d4e88fe27f1 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Wed, 13 Aug 2025 22:27:48 +0530 Subject: [PATCH 09/10] adding false positive and false negetive --- backend/app/core/finetune/evaluation.py | 29 +++++++++++++------ .../tests/api/routes/test_model_evaluation.py | 10 +++++-- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py index 53b2e4e0..1cbaacff 100644 --- a/backend/app/core/finetune/evaluation.py +++ 
b/backend/app/core/finetune/evaluation.py @@ -6,7 +6,12 @@ import openai from openai import OpenAI -from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score +from sklearn.metrics import ( + matthews_corrcoef, + accuracy_score, + f1_score, + confusion_matrix, +) from app.api.routes.fine_tuning import handle_openai_error @@ -197,26 +202,32 @@ def generate_predictions(self) -> list[str]: def evaluate(self, y_pred: list[str]) -> dict: """Evaluate the predictions against the true labels.""" + logger.info(f"[evaluate] Starting evaluation with {len(y_pred)} predictions.") try: mcc_score = round(matthews_corrcoef(self.y_true, y_pred), 4) - accuracy = round(accuracy_score(self.y_true, y_pred), 4) - f1_query = round( - f1_score(self.y_true, y_pred, pos_label="query", average="binary"), 4 - ) + + y_true_bin = [1 if y == "query" else 0 for y in self.y_true] + y_pred_bin = [1 if y == "query" else 0 for y in y_pred] + tn, fp, fn, tp = confusion_matrix( + y_true_bin, y_pred_bin, labels=[0, 1] + ).ravel() + + fpr = round(fp / (fp + tn), 4) if (fp + tn) else 0.0 + fnr = round(fn / (fn + tp), 4) if (fn + tp) else 0.0 logger.info( - f"[evaluate] Evaluation completed. MCC: {mcc_score}, Accuracy: {accuracy}, F1 Query: {f1_query}" + f"[evaluate] Evaluation completed. MCC: {mcc_score} FPR: {fpr}, FNR: {fnr}" ) return { "mcc": mcc_score, - "accuracy": accuracy, - "f1_query": f1_query, + "false_positive_rate": fpr, + "false_negetive_rate": fnr, } except Exception as e: - logger.error(f"[evaluate] Error during evaluation: {str(e)}") + logger.error(f"[evaluate] Evaluation failed: {e}") raise def run(self) -> dict: diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py index 8265ed59..6decc7a3 100644 --- a/backend/app/tests/api/routes/test_model_evaluation.py +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -47,7 +47,10 @@ def test_top_model_by_doc(client, db, user_api_key_header): model_evals = create_test_model_evaluation(db) model_eval = model_evals[0] - model_eval.score = {"mcc": 0.85, "accuracy": 0.9} + model_eval.score = { + "mcc": 0.85, + "false_positive_rate": 0.9, + } db.flush() response = client.get( @@ -58,7 +61,10 @@ def test_top_model_by_doc(client, db, user_api_key_header): assert response.status_code == 200 json_data = response.json() - assert json_data["data"]["score"] == {"mcc": 0.85, "accuracy": 0.9} + assert json_data["data"]["score"] == { + "mcc": 0.85, + "false_positive_rate": 0.9, + } assert json_data["data"]["model_name"] == model_eval.model_name assert json_data["data"]["document_id"] == str(model_eval.document_id) From 9487ea3bc4069026f999d0a29b2f2f08e39b6294 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 19 Aug 2025 00:23:25 +0530 Subject: [PATCH 10/10] removing model eval field from project, changed chat completion to responses API --- backend/app/core/finetune/evaluation.py | 28 ++++--------------- backend/app/models/model_evaluation.py | 2 +- backend/app/models/project.py | 3 -- .../tests/api/routes/test_model_evaluation.py | 2 -- .../app/tests/crud/test_model_evaluation.py | 2 +- 5 files changed, 7 insertions(+), 30 deletions(-) diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py index 1cbaacff..fb83c133 100644 --- a/backend/app/core/finetune/evaluation.py +++ b/backend/app/core/finetune/evaluation.py @@ -156,14 +156,10 @@ def generate_predictions(self) -> list[str]: ) try: - response = self.client.chat.completions.create( + response = 
self.client.responses.create( model=self.model_name, - messages=[ - {"role": "system", "content": self.system_instruction}, - {"role": "user", "content": prompt}, - ], - temperature=0, - max_tokens=3, + instructions=self.system_instruction, + input=prompt, ) elapsed_time = time.time() - start_time @@ -173,7 +169,7 @@ def generate_predictions(self) -> list[str]: ) continue - raw = response.choices[0].message.content or "" + raw = response.output_text or "" prediction = self.normalize_prediction(raw) predictions.append(prediction) break @@ -208,24 +204,10 @@ def evaluate(self, y_pred: list[str]) -> dict: try: mcc_score = round(matthews_corrcoef(self.y_true, y_pred), 4) - y_true_bin = [1 if y == "query" else 0 for y in self.y_true] - y_pred_bin = [1 if y == "query" else 0 for y in y_pred] - tn, fp, fn, tp = confusion_matrix( - y_true_bin, y_pred_bin, labels=[0, 1] - ).ravel() - - fpr = round(fp / (fp + tn), 4) if (fp + tn) else 0.0 - fnr = round(fn / (fn + tp), 4) if (fn + tp) else 0.0 - - logger.info( - f"[evaluate] Evaluation completed. MCC: {mcc_score} FPR: {fpr}, FNR: {fnr}" - ) - return { "mcc": mcc_score, - "false_positive_rate": fpr, - "false_negetive_rate": fnr, } + except Exception as e: logger.error(f"[evaluate] Evaluation failed: {e}") raise diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 96f55a2d..9251d2fb 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -77,7 +77,7 @@ class ModelEvaluation(ModelEvaluationBase, table=True): updated_at: datetime = Field(default_factory=now, nullable=False) deleted_at: datetime | None = Field(default=None, nullable=True) - project: "Project" = Relationship(back_populates="model_evaluation") + project: "Project" = Relationship() fine_tuning: "Fine_Tuning" = Relationship(back_populates="model_evaluation") diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 84f368c4..a5eb3a3b 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -52,9 +52,6 @@ class Project(ProjectBase, table=True): fine_tuning: list["Fine_Tuning"] = Relationship( back_populates="project", cascade_delete=True ) - model_evaluation: list["ModelEvaluation"] = Relationship( - back_populates="project", cascade_delete=True - ) openai_conversations: list["OpenAIConversation"] = Relationship( back_populates="project", cascade_delete=True ) diff --git a/backend/app/tests/api/routes/test_model_evaluation.py b/backend/app/tests/api/routes/test_model_evaluation.py index 6decc7a3..9b969e47 100644 --- a/backend/app/tests/api/routes/test_model_evaluation.py +++ b/backend/app/tests/api/routes/test_model_evaluation.py @@ -49,7 +49,6 @@ def test_top_model_by_doc(client, db, user_api_key_header): model_eval.score = { "mcc": 0.85, - "false_positive_rate": 0.9, } db.flush() @@ -63,7 +62,6 @@ def test_top_model_by_doc(client, db, user_api_key_header): assert json_data["data"]["score"] == { "mcc": 0.85, - "false_positive_rate": 0.9, } assert json_data["data"]["model_name"] == model_eval.model_name assert json_data["data"]["document_id"] == str(model_eval.document_id) diff --git a/backend/app/tests/crud/test_model_evaluation.py b/backend/app/tests/crud/test_model_evaluation.py index d10abf12..2b71ce4f 100644 --- a/backend/app/tests/crud/test_model_evaluation.py +++ b/backend/app/tests/crud/test_model_evaluation.py @@ -92,7 +92,7 @@ def test_fetch_eval_by_doc_id_not_found(db: Session): def test_fetch_top_model_by_doc_id_success(db: Session): model_evals 
= create_test_model_evaluation(db) model_eval = model_evals[0] - model_eval.score = {"mcc": 0.8, "accuracy": 0.9} + model_eval.score = {"mcc": 0.8} db.flush() doc_id = model_eval.document_id
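
For reference, the scoring logic introduced in PATCH 09/10 and trimmed back in PATCH 10/10 is easier to follow outside the diff context. The sketch below is illustrative only: the helper name score_predictions is not part of the patches, and it assumes the binary "query"/"non-query" labels shown in the NDJSON example in load_labels_and_prompts(). It shows the MCC that the final revision keeps, plus the false positive/negative rates that the interim revision derived from the confusion matrix (the interim patch spells the key "false_negetive_rate"; the corrected spelling is used here).

    # Minimal sketch of the evaluation metrics, assuming "query"/"non-query" labels.
    from sklearn.metrics import matthews_corrcoef, confusion_matrix

    def score_predictions(y_true: list[str], y_pred: list[str]) -> dict:
        # MCC is computed directly on the string labels, as in the final evaluate().
        mcc = round(matthews_corrcoef(y_true, y_pred), 4)

        # The interim revision binarized labels ("query" -> 1, anything else -> 0,
        # which also covers the "openai_error" placeholder) and unravelled the
        # confusion matrix into tn, fp, fn, tp to get the rate metrics.
        y_true_bin = [1 if y == "query" else 0 for y in y_true]
        y_pred_bin = [1 if y == "query" else 0 for y in y_pred]
        tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
        fpr = round(fp / (fp + tn), 4) if (fp + tn) else 0.0
        fnr = round(fn / (fn + tp), 4) if (fn + tp) else 0.0
        return {"mcc": mcc, "false_positive_rate": fpr, "false_negative_rate": fnr}

    # Example: one missed "query" out of four samples.
    # Prints roughly {'mcc': 0.5774, 'false_positive_rate': 0.0, 'false_negative_rate': 0.5}.
    print(score_predictions(
        ["query", "non-query", "query", "non-query"],
        ["query", "non-query", "non-query", "non-query"],
    ))

Computing MCC directly on the string labels matches what update_model_eval ultimately stores in the score JSON column ({"mcc": ...}); the binarized confusion-matrix step only becomes relevant if the rate metrics are reinstated later.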