@@ -0,0 +1,71 @@
"""add model evaluation table

Revision ID: e317d05f49e4
Revises: db9b5413d3ce
Create Date: 2025-08-10 21:36:07.863951

"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "e317d05f49e4"
down_revision = "db9b5413d3ce"
branch_labels = None
depends_on = None

modelevaluation_status_enum = postgresql.ENUM(
"pending",
"running",
"completed",
"failed",
name="modelevaluationstatus",
create_type=False,
)


def upgrade():
modelevaluation_status_enum.create(op.get_bind(), checkfirst=True)

op.create_table(
"model_evaluation",
sa.Column("id", sa.Integer(), nullable=False),
sa.PrimaryKeyConstraint("id"),
sa.Column("fine_tuning_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(
["fine_tuning_id"], ["fine_tuning.id"], ondelete="CASCADE"
),
sa.Column("document_id", sa.Uuid(), nullable=False),
sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
sa.Column("model_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
sa.Column(
"testing_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False
),
sa.Column("base_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
sa.Column("split_ratio", sa.Float(), nullable=False),
sa.Column("system_prompt", sa.Text(), nullable=False),
sa.Column("score", postgresql.JSON(astext_type=sa.Text()), nullable=True),
sa.Column(
"status",
modelevaluation_status_enum,
nullable=False,
server_default="pending",
),
sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
sa.Column("project_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
sa.Column("organization_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(
["organization_id"], ["organization.id"], ondelete="CASCADE"
),
sa.Column("is_deleted", sa.Boolean(), nullable=False),
sa.Column("inserted_at", sa.DateTime(), nullable=False),
sa.Column("updated_at", sa.DateTime(), nullable=False),
sa.Column("deleted_at", sa.DateTime(), nullable=True),
)


def downgrade():
    op.drop_table("model_evaluation")
    # Drop the enum type created in upgrade() so the downgrade is symmetric.
    modelevaluation_status_enum.drop(op.get_bind(), checkfirst=True)
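For orientation, here is a minimal sketch of the SQLModel table definition this migration backs. Field names, types, and enum values are taken from the columns above; the class name, base classes, and defaults are assumptions, and the real definition in app.models may differ.

from datetime import datetime
from enum import Enum
from uuid import UUID

from sqlalchemy import JSON, Column
from sqlmodel import Field, SQLModel


class ModelEvaluationStatus(str, Enum):
    pending = "pending"
    running = "running"
    completed = "completed"
    failed = "failed"


class ModelEvaluation(SQLModel, table=True):
    # Sketch only: mirrors the migration's columns; the real model in
    # app.models (relationships, mixins, defaults) may differ.
    __tablename__ = "model_evaluation"

    id: int | None = Field(default=None, primary_key=True)
    fine_tuning_id: int = Field(foreign_key="fine_tuning.id")
    document_id: UUID = Field(foreign_key="document.id")
    model_name: str
    testing_file_id: str
    base_model: str
    split_ratio: float
    system_prompt: str  # stored as TEXT in the migration
    score: dict | None = Field(default=None, sa_column=Column(JSON))
    status: ModelEvaluationStatus = ModelEvaluationStatus.pending
    error_message: str | None = None
    project_id: int = Field(foreign_key="project.id")
    organization_id: int = Field(foreign_key="organization.id")
    is_deleted: bool = False
    inserted_at: datetime
    updated_at: datetime
    deleted_at: datetime | None = None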
2 changes: 2 additions & 0 deletions backend/app/api/main.py
@@ -18,6 +18,7 @@
onboarding,
credentials,
fine_tuning,
model_evaluation,
)
from app.core.config import settings

@@ -38,6 +39,7 @@
api_router.include_router(users.router)
api_router.include_router(utils.router)
api_router.include_router(fine_tuning.router)
api_router.include_router(model_evaluation.router)


if settings.ENVIRONMENT == "local":
194 changes: 194 additions & 0 deletions backend/app/api/routes/model_evaluation.py
@@ -0,0 +1,194 @@
import logging
import time
from uuid import UUID

from fastapi import APIRouter, HTTPException, BackgroundTasks
from sqlmodel import Session
from openai import OpenAI

from app.crud import (
fetch_by_id,
create_model_evaluation,
fetch_active_model_evals,
fetch_eval_by_doc_id,
update_model_eval,
fetch_top_model_by_doc_id,
)
from app.models import (
ModelEvaluationBase,
ModelEvaluationCreate,
ModelEvaluationStatus,
ModelEvaluationUpdate,
ModelEvaluationPublic,
)
from app.core.db import engine
from app.core.finetune.evaluation import ModelEvaluator
from app.utils import get_openai_client, APIResponse
from app.api.deps import CurrentUserOrgProject, SessionDep


logger = logging.getLogger(__name__)

router = APIRouter(prefix="/model_evaluation", tags=["model_evaluation"])


def run_model_evaluation(
eval_id: int,
current_user: CurrentUserOrgProject,
client: OpenAI,
):
start_time = time.time()
logger.info(
f"[run_model_evaluation] Starting | eval_id={eval_id}, project_id={current_user.project_id}"
)

with Session(engine) as db:
try:
model_eval = update_model_eval(
session=db,
eval_id=eval_id,
project_id=current_user.project_id,
update=ModelEvaluationUpdate(status=ModelEvaluationStatus.running),
)

evaluator = ModelEvaluator(
model_name=model_eval.model_name,
testing_file_id=model_eval.testing_file_id,
system_prompt=model_eval.system_prompt,
client=client,
)
result = evaluator.run()

update_model_eval(
session=db,
eval_id=eval_id,
project_id=current_user.project_id,
update=ModelEvaluationUpdate(
score=result, status=ModelEvaluationStatus.completed
),
)

elapsed = time.time() - start_time
logger.info(
f"[run_model_evaluation] Completed | eval_id={eval_id}, project_id={current_user.project_id}, elapsed={elapsed:.2f}s"
)

except Exception as e:
logger.error(
f"[run_model_evaluation] Failed | eval_id={eval_id}, project_id={current_user.project_id}: {e}"
)
db.rollback()
update_model_eval(
session=db,
eval_id=eval_id,
project_id=current_user.project_id,
update=ModelEvaluationUpdate(
status=ModelEvaluationStatus.failed,
error_message="failed during background job processing",
),
)


@router.post("/evaluate_models/", response_model=APIResponse)
def evaluate_models(
request: ModelEvaluationCreate,
background_tasks: BackgroundTasks,
session: SessionDep,
current_user: CurrentUserOrgProject,
):
"""
Start evaluations for one or more fine-tuning jobs.

    Request: { fine_tuning_ids: list[int] } (one or many).

    Process:
    For each ID, fetch the fine-tuned model and its testing file from the
    fine_tuning table, then queue a background task that runs predictions
    on the test set and computes evaluation scores.

Response:
APIResponse with the created/active evaluation records and a success message.
"""
client = get_openai_client(
session, current_user.organization_id, current_user.project_id
)

if not request.fine_tuning_ids:
logger.error(
f"[evaluate_model] No fine tuning IDs provided | project_id:{current_user.project_id}"
)
raise HTTPException(status_code=400, detail="No fine-tuned job IDs provided")

evals: list[ModelEvaluationPublic] = []

for job_id in request.fine_tuning_ids:
fine_tuning_job = fetch_by_id(session, job_id, current_user.project_id)
active_evals = fetch_active_model_evals(
session, job_id, current_user.project_id
)

if active_evals:
logger.info(
f"[evaluate_model] Skipping creation for {job_id}. Active evaluation exists, project_id:{current_user.project_id}"
)
evals.extend(
ModelEvaluationPublic.model_validate(ev) for ev in active_evals
)
continue

model_eval = create_model_evaluation(
session=session,
request=ModelEvaluationBase(fine_tuning_id=fine_tuning_job.id),
project_id=current_user.project_id,
organization_id=current_user.organization_id,
status=ModelEvaluationStatus.pending,
)

evals.append(ModelEvaluationPublic.model_validate(model_eval))

logger.info(
f"[evaluate_model] Created evaluation for fine_tuning_id {job_id} with eval ID={model_eval.id}, project_id:{current_user.project_id}"
)

background_tasks.add_task(
run_model_evaluation, model_eval.id, current_user, client
)

return APIResponse.success_response(
{"message": "Model evaluation(s) started successfully", "data": evals}
)


@router.get(
"/{document_id}/top_model",
response_model=APIResponse[ModelEvaluationPublic],
)
def get_top_model_by_doc_id(
document_id: UUID,
session: SessionDep,
current_user: CurrentUserOrgProject,
):
"""
Return the top model trained on the given document_id, ranked by
Matthews correlation coefficient (MCC) across all evaluations.
"""
logger.info(
f"[get_top_model_by_doc_id] Fetching top model for document_id={document_id}, "
f"project_id={current_user.project_id}"
)
top_model = fetch_top_model_by_doc_id(session, document_id, current_user.project_id)
return APIResponse.success_response(top_model)


@router.get("/{document_id}", response_model=APIResponse[list[ModelEvaluationPublic]])
def get_evals_by_doc_id(
document_id: UUID, session: SessionDep, current_user: CurrentUserOrgProject
):
"""
Return all model evaluations for the given document_id within the current project.
"""
logger.info(
f"[get_evals_by_doc_id]Fetching evaluations for document_id: {document_id}, project_id: {current_user.project_id}"
)
evaluations = fetch_eval_by_doc_id(session, document_id, current_user.project_id)
return APIResponse.success_response(evaluations)
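For reference, a quick usage sketch of the three endpoints added in this file, assuming the router is mounted under an /api/v1 prefix and authenticated with a bearer token (both are assumptions here, not confirmed by this diff; adjust to the project's actual settings and auth scheme).

import requests

# Assumed base URL and auth header; the real prefix and auth come from
# the project's configuration and are not part of this PR.
BASE_URL = "http://localhost:8000/api/v1/model_evaluation"
headers = {"Authorization": "Bearer <token>"}

# Queue evaluations for two fine-tuning jobs.
resp = requests.post(
    f"{BASE_URL}/evaluate_models/",
    json={"fine_tuning_ids": [1, 2]},
    headers=headers,
)
print(resp.json())  # APIResponse with the created/active evaluation records

# List all evaluations for a document, then fetch its top model by MCC.
doc_id = "11111111-1111-1111-1111-111111111111"  # placeholder document UUID
print(requests.get(f"{BASE_URL}/{doc_id}", headers=headers).json())
print(requests.get(f"{BASE_URL}/{doc_id}/top_model", headers=headers).json())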