From 480f18136ab5d0dc9e7586b74f266f4a7c953dfc Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 26 Aug 2025 15:47:48 +0530 Subject: [PATCH 1/4] adding new columns to model eval table --- ...d9e7a6_add_additional_columns_to_model_.py | 92 +++++++++++++++++++ backend/app/models/model_evaluation.py | 24 ++++- 2 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py diff --git a/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py new file mode 100644 index 00000000..b78edc9b --- /dev/null +++ b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py @@ -0,0 +1,92 @@ +"""add additional columns to model evaluation table + +Revision ID: 06a8bed9e7a6 +Revises: 72896bcc94da +Create Date: 2025-08-25 22:36:58.959768 + +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel.sql.sqltypes +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "06a8bed9e7a6" +down_revision = "72896bcc94da" +branch_labels = None +depends_on = None + +modelevaluation_status_enum = postgresql.ENUM( + "pending", + "running", + "completed", + "failed", + name="modelevaluationstatus", + create_type=False, +) + + +def upgrade(): + op.add_column( + "model_evaluation", + sa.Column("test_data_url", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + ) + op.add_column( + "model_evaluation", + sa.Column("batch_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.add_column( + "model_evaluation", + sa.Column("output_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.add_column( + "model_evaluation", + sa.Column("batch_status", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + ) + op.add_column( + "model_evaluation", + sa.Column( + "prediction_data_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True + ), + ) + sa.Column( + "eval_status", + modelevaluation_status_enum, + nullable=False, + server_default="pending", + ) + op.alter_column( + "model_evaluation", "testing_file_id", existing_type=sa.VARCHAR(), nullable=True + ) + op.drop_column("model_evaluation", "status") + + +def downgrade(): + op.add_column( + "model_evaluation", + sa.Column( + "status", + postgresql.ENUM( + "pending", + "running", + "completed", + "failed", + name="modelevaluationstatus", + ), + server_default=sa.text("'pending'::modelevaluationstatus"), + autoincrement=False, + nullable=False, + ), + ) + op.alter_column( + "model_evaluation", + "testing_file_id", + existing_type=sa.VARCHAR(), + nullable=False, + ) + op.drop_column("model_evaluation", "eval_status") + op.drop_column("model_evaluation", "prediction_data_url") + op.drop_column("model_evaluation", "batch_status") + op.drop_column("model_evaluation", "output_file_id") + op.drop_column("model_evaluation", "batch_id") + op.drop_column("model_evaluation", "test_data_url") diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 9251d2fb..9e9c693e 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -47,19 +47,33 @@ class ModelEvaluation(ModelEvaluationBase, table=True): nullable=False, ) model_name: str = Field(description="fine tuned model name from OpenAI") - testing_file_id: str = Field( - description="File ID of the testing file uploaded to OpenAI" - ) + test_data_url: str = Field(description="S3 url of the testing data stored ins S3") base_model: str = Field(nullable=False, description="Base model for fine-tuning") split_ratio: float = Field( nullable=False, description="the ratio the dataset was divided in" ) system_prompt: str = Field(sa_column=Column(Text, nullable=False)) + testing_file_id: Optional[str] = Field( + description="File ID of the testing file uploaded to OpenAI for Batch processing" + ) score: Optional[dict[str, float]] = Field( sa_column=Column(JSON, nullable=True), description="Evaluation scores per metric (e.g., {'mcc': 0.85})", ) - status: ModelEvaluationStatus = ( + batch_id: Optional[str] = Field( + description="ID of the batch job created for processing test data queries" + ) + output_file_id: Optional[str] = Field( + description="ID of the output file generated by OpenAI Batch processing" + ) + batch_status: Optional[str] = Field( + description="Current status of the batch job on OpenAI's side" + ) + prediction_data_url: str | None = Field( + default=None, + description="S3 URL where the prediction data generated by the fine-tuned model is stored", + ) + eval_status: ModelEvaluationStatus = ( Field(default=ModelEvaluationStatus.pending, description="Evaluation status"), ) error_message: str | None = Field( @@ -85,6 +99,7 @@ class ModelEvaluationUpdate(SQLModel): score: Optional[dict[str, float]] = None status: Optional[ModelEvaluationStatus] = None error_message: Optional[str] = None + prediction_data_url: Optional[str] = None class ModelEvaluationPublic(ModelEvaluationBase): @@ -95,6 +110,7 @@ class ModelEvaluationPublic(ModelEvaluationBase): model_name: str split_ratio: float base_model: str + prediction_data_url: str | None score: dict[str, float] | None = None status: ModelEvaluationStatus inserted_at: datetime From 8f4d817050cf67a88a7b6d650cb57214c24619af Mon Sep 17 00:00:00 2001 From: nishika26 Date: Tue, 26 Aug 2025 17:13:26 +0530 Subject: [PATCH 2/4] small fixes --- ...ed9e7a6_add_additional_columns_to_model_eval.py} | 13 ++++++++----- backend/app/models/model_evaluation.py | 13 +++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) rename backend/app/alembic/versions/{06a8bed9e7a6_add_additional_columns_to_model_.py => 06a8bed9e7a6_add_additional_columns_to_model_eval.py} (91%) diff --git a/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py similarity index 91% rename from backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py rename to backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py index b78edc9b..7e0c911b 100644 --- a/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_.py +++ b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py @@ -49,11 +49,14 @@ def upgrade(): "prediction_data_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), ) - sa.Column( - "eval_status", - modelevaluation_status_enum, - nullable=False, - server_default="pending", + op.add_column( + "model_evaluation", + sa.Column( + "eval_status", + modelevaluation_status_enum, + nullable=False, + server_default="pending", + ), ) op.alter_column( "model_evaluation", "testing_file_id", existing_type=sa.VARCHAR(), nullable=True diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 9e9c693e..8bf590bf 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -53,20 +53,20 @@ class ModelEvaluation(ModelEvaluationBase, table=True): nullable=False, description="the ratio the dataset was divided in" ) system_prompt: str = Field(sa_column=Column(Text, nullable=False)) - testing_file_id: Optional[str] = Field( + testing_file_id: str | None = Field( description="File ID of the testing file uploaded to OpenAI for Batch processing" ) - score: Optional[dict[str, float]] = Field( + score: dict[str, float] | None = Field( sa_column=Column(JSON, nullable=True), description="Evaluation scores per metric (e.g., {'mcc': 0.85})", ) - batch_id: Optional[str] = Field( + batch_id: str | None = Field( description="ID of the batch job created for processing test data queries" ) - output_file_id: Optional[str] = Field( + output_file_id: str | None = Field( description="ID of the output file generated by OpenAI Batch processing" ) - batch_status: Optional[str] = Field( + batch_status: str | None = Field( description="Current status of the batch job on OpenAI's side" ) prediction_data_url: str | None = Field( @@ -112,7 +112,8 @@ class ModelEvaluationPublic(ModelEvaluationBase): base_model: str prediction_data_url: str | None score: dict[str, float] | None = None - status: ModelEvaluationStatus + eval_status: ModelEvaluationStatus + inserted_at: datetime updated_at: datetime deleted_at: datetime | None = None From a162a8273bc0d94b135a50da2c91c6268eedbdb3 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 28 Aug 2025 17:58:39 +0530 Subject: [PATCH 3/4] test data and prediction data changes --- ...a6_add_additional_columns_to_model_eval.py | 65 +------ backend/app/api/routes/model_evaluation.py | 21 +- backend/app/core/finetune/evaluation.py | 182 ++++++++++-------- backend/app/core/finetune/preprocessing.py | 12 +- backend/app/crud/model_evaluation.py | 6 +- backend/app/models/model_evaluation.py | 26 +-- 6 files changed, 138 insertions(+), 174 deletions(-) diff --git a/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py index 7e0c911b..f00052df 100644 --- a/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py +++ b/backend/app/alembic/versions/06a8bed9e7a6_add_additional_columns_to_model_eval.py @@ -16,80 +16,27 @@ branch_labels = None depends_on = None -modelevaluation_status_enum = postgresql.ENUM( - "pending", - "running", - "completed", - "failed", - name="modelevaluationstatus", - create_type=False, -) - def upgrade(): - op.add_column( - "model_evaluation", - sa.Column("test_data_url", sqlmodel.sql.sqltypes.AutoString(), nullable=False), - ) - op.add_column( - "model_evaluation", - sa.Column("batch_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) - op.add_column( - "model_evaluation", - sa.Column("output_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) - op.add_column( - "model_evaluation", - sa.Column("batch_status", sqlmodel.sql.sqltypes.AutoString(), nullable=True), - ) + op.drop_column("model_evaluation", "testing_file_id") op.add_column( "model_evaluation", sa.Column( - "prediction_data_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True + "test_data_s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=False ), ) op.add_column( "model_evaluation", sa.Column( - "eval_status", - modelevaluation_status_enum, - nullable=False, - server_default="pending", + "prediction_data_s3_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), ) - op.alter_column( - "model_evaluation", "testing_file_id", existing_type=sa.VARCHAR(), nullable=True - ) - op.drop_column("model_evaluation", "status") def downgrade(): + op.drop_column("model_evaluation", "prediction_data_s3_url") + op.drop_column("model_evaluation", "test_data_s3_url") op.add_column( "model_evaluation", - sa.Column( - "status", - postgresql.ENUM( - "pending", - "running", - "completed", - "failed", - name="modelevaluationstatus", - ), - server_default=sa.text("'pending'::modelevaluationstatus"), - autoincrement=False, - nullable=False, - ), - ) - op.alter_column( - "model_evaluation", - "testing_file_id", - existing_type=sa.VARCHAR(), - nullable=False, + sa.Column("testing_file_id", sa.VARCHAR(), autoincrement=False, nullable=True), ) - op.drop_column("model_evaluation", "eval_status") - op.drop_column("model_evaluation", "prediction_data_url") - op.drop_column("model_evaluation", "batch_status") - op.drop_column("model_evaluation", "output_file_id") - op.drop_column("model_evaluation", "batch_id") - op.drop_column("model_evaluation", "test_data_url") diff --git a/backend/app/api/routes/model_evaluation.py b/backend/app/api/routes/model_evaluation.py index 23859f65..3394a344 100644 --- a/backend/app/api/routes/model_evaluation.py +++ b/backend/app/api/routes/model_evaluation.py @@ -22,6 +22,7 @@ ModelEvaluationPublic, ) from app.core.db import engine +from app.core.cloud import AmazonCloudStorage from app.core.finetune.evaluation import ModelEvaluator from app.utils import get_openai_client, APIResponse from app.api.deps import CurrentUserOrgProject, SessionDep @@ -35,7 +36,6 @@ def run_model_evaluation( eval_id: int, current_user: CurrentUserOrgProject, - client: OpenAI, ): start_time = time.time() logger.info( @@ -43,6 +43,11 @@ def run_model_evaluation( ) with Session(engine) as db: + client = get_openai_client( + db, current_user.organization_id, current_user.project_id + ) + storage = AmazonCloudStorage(current_user) + try: model_eval = update_model_eval( session=db, @@ -53,9 +58,10 @@ def run_model_evaluation( evaluator = ModelEvaluator( model_name=model_eval.model_name, - testing_file_id=model_eval.testing_file_id, + test_data_s3_url=model_eval.test_data_s3_url, system_prompt=model_eval.system_prompt, client=client, + storage=storage, ) result = evaluator.run() @@ -64,7 +70,9 @@ def run_model_evaluation( eval_id=eval_id, project_id=current_user.project_id, update=ModelEvaluationUpdate( - score=result, status=ModelEvaluationStatus.completed + score=result["evaluation_score"], + prediction_data_s3_url=result["prediction_data_s3_url"], + status=ModelEvaluationStatus.completed, ), ) @@ -111,7 +119,8 @@ def evaluate_models( """ client = get_openai_client( session, current_user.organization_id, current_user.project_id - ) + ) # keeping this here for checking if the user's validated OpenAI key is present or not, + # even though the client will be initialized separately inside the background task if not request.fine_tuning_ids: logger.error( @@ -150,9 +159,7 @@ def evaluate_models( f"[evaluate_model] Created evaluation for fine_tuning_id {job_id} with eval ID={model_eval.id}, project_id:{current_user.project_id}" ) - background_tasks.add_task( - run_model_evaluation, model_eval.id, current_user, client - ) + background_tasks.add_task(run_model_evaluation, model_eval.id, current_user) return APIResponse.success_response( {"message": "Model evaluation(s) started successfully", "data": evals} diff --git a/backend/app/core/finetune/evaluation.py b/backend/app/core/finetune/evaluation.py index fb83c133..1f34fddb 100644 --- a/backend/app/core/finetune/evaluation.py +++ b/backend/app/core/finetune/evaluation.py @@ -1,18 +1,18 @@ -import json import difflib import time import logging from typing import Set import openai +import pandas as pd from openai import OpenAI +import uuid from sklearn.metrics import ( matthews_corrcoef, - accuracy_score, - f1_score, - confusion_matrix, ) +from app.core.cloud import AmazonCloudStorage from app.api.routes.fine_tuning import handle_openai_error +from app.core.finetune.preprocessing import DataPreprocessor logger = logging.getLogger(__name__) @@ -26,12 +26,14 @@ class ModelEvaluator: def __init__( self, model_name: str, - testing_file_id: str, + test_data_s3_url: str, + storage: AmazonCloudStorage, system_prompt: str, client: OpenAI, ): self.model_name = model_name - self.testing_file_id = testing_file_id + self.test_data_s3_url = test_data_s3_url + self.storage = storage self.system_instruction = system_prompt self.client = client @@ -43,83 +45,57 @@ def __init__( def load_labels_and_prompts(self) -> None: """ - Loads labels and prompts directly from OpenAI NDJSON file content using the testing file ID. - - Example data format: - { - "messages": [ - {"role": "system", "content": "You are an assistant that is good at categorizing if what user is saying is a query or non-query"}, - {"role": "user", "content": "what is the colour of the apple"}, - {"role": "assistant", "content": "query"} - ] - } - { - "messages": [ - {"role": "system", "content": "You are an assistant that is good at categorizing if what user is saying is a query or non-query"}, - {"role": "user", "content": "i like apples"}, - {"role": "assistant", "content": "non-query"} - ] - } + Load prompts (X) and labels (y) from an S3-hosted CSV via storage.stream. + Expects: + - one of: 'query' | 'question' | 'message' + - 'label' """ logger.info( - f"[load_labels_and_prompts] Loading labels and prompts from file ID: {self.testing_file_id}" + f"[ModelEvaluator.load_labels_and_prompts] Loading CSV from: {self.test_data_s3_url}" ) + file_obj = self.storage.stream(self.test_data_s3_url) try: - response = self.client.files.content(self.testing_file_id) - file_bytes = response.read() - lines = file_bytes.decode("utf-8").splitlines() - - for ln, line in enumerate(lines, 1): - line = line.strip() - if not line: - continue - try: - obj = json.loads(line) - msgs = obj.get("messages", []) - if not isinstance(msgs, list) or not msgs: - logger.error( - f"[load_labels_and_prompts] Line {ln}: 'messages' missing or invalid" - ) - raise ValueError(f"Line {ln}: 'messages' missing or invalid") - - user_msgs = [ - m for m in msgs if m.get("role") == "user" and "content" in m - ] - model_msgs = [ - m - for m in msgs - if m.get("role") == "assistant" and "content" in m - ] - if not user_msgs or not model_msgs: - logger.error( - f"[load_labels_and_prompts] Line {ln}: missing user or assistant message" - ) - raise ValueError( - f"Line {ln}: missing user or assistant message" - ) + df = pd.read_csv(file_obj) + df.columns = [c.strip().lower() for c in df.columns] + + possible_query_columns = ["query", "question", "message"] + query_col = next( + (c for c in possible_query_columns if c in df.columns), None + ) + label_col = "label" if "label" in df.columns else None - prompt = user_msgs[0]["content"] - label = (model_msgs[0]["content"] or "").strip().lower() + if not query_col or not label_col: + logger.error( + "[ModelEvaluator.load_labels_and_prompts] CSV must contain a 'label' column " + f"and one of: {possible_query_columns}" + ) + raise ValueError( + f"CSV must contain a 'label' column and one of: {possible_query_columns}" + ) - self.prompts.append(prompt) - self.y_true.append(label) - self.allowed_labels.add(label) + prompts = df[query_col].astype(str).tolist() + labels = df[label_col].astype(str).str.strip().str.lower().tolist() - except Exception as e: - logger.error( - f"[load_labels_and_prompts] Error processing line {ln}: {str(e)}" - ) - raise + self.prompts = prompts + self.y_true = labels + self.allowed_labels = set(labels) + + self.query_col = query_col + self.label_col = label_col logger.info( - f"[load_labels_and_prompts] Loaded {len(self.prompts)} prompts and {len(self.y_true)} labels." + "[ModelEvaluator.load_labels_and_prompts] " + f"Loaded {len(self.prompts)} prompts and {len(self.y_true)} labels; " + f"query_col={query_col}, label_col={label_col}, allowed_labels={self.allowed_labels}" ) - except Exception as e: logger.error( - f"[load_labels_and_prompts] Failed to load file content: {str(e)}" + f"[ModelEvaluator.load_labels_and_prompts] Failed to load/parse test CSV: {e}", + exc_info=True, ) raise + finally: + file_obj.close() def normalize_prediction(self, text: str) -> str: logger.debug(f"[normalize_prediction] Normalizing prediction: {text}") @@ -139,7 +115,7 @@ def normalize_prediction(self, text: str) -> str: ) return next(iter(self.allowed_labels)) - def generate_predictions(self) -> list[str]: + def generate_predictions(self) -> tuple[list[str], str]: logger.info( f"[generate_predictions] Generating predictions for {len(self.prompts)} prompts." ) @@ -192,34 +168,74 @@ def generate_predictions(self) -> list[str]: total_elapsed = time.time() - start_preds logger.info( - f"[generate_predictions] Finished {total_prompts} prompts in {total_elapsed:.2f}s | Generated {len(predictions)} predictions." + f"[generate_predictions] Finished {total_prompts} prompts in {total_elapsed:.2f}s | " + f"Generated {len(predictions)} predictions." + ) + + prediction_data = pd.DataFrame( + { + "prompt": self.prompts, + "true_label": self.y_true, + "prediction": predictions, + } + ) + + unique_id = uuid.uuid4().hex + filename = f"predictions_{self.model_name}_{unique_id}.csv" + prediction_data_s3_url = DataPreprocessor.upload_csv_to_s3( + self.storage, prediction_data, filename + ) + self.prediction_data_s3_url = prediction_data_s3_url + + logger.info( + f"[generate_predictions] Predictions CSV uploaded to S3 | url={prediction_data_s3_url}" ) - return predictions - def evaluate(self, y_pred: list[str]) -> dict: - """Evaluate the predictions against the true labels.""" + return predictions, prediction_data_s3_url - logger.info(f"[evaluate] Starting evaluation with {len(y_pred)} predictions.") + def evaluate(self) -> dict: + """Evaluate using the predictions CSV previously uploaded to S3.""" + if not getattr(self, "prediction_data_s3_url", None): + raise RuntimeError( + "[evaluate] predictions_s3_url not set. Call generate_predictions() first." + ) + logger.info( + f"[evaluate] Streaming predictions CSV from: {self.prediction_data_s3_url}" + ) + prediction_obj = self.storage.stream(self.prediction_data_s3_url) try: - mcc_score = round(matthews_corrcoef(self.y_true, y_pred), 4) + df = pd.read_csv(prediction_obj) + finally: + prediction_obj.close() - return { - "mcc": mcc_score, - } + if "true_label" not in df.columns or "prediction" not in df.columns: + raise ValueError( + "[evaluate] prediction data CSV must contain 'true_label' and 'prediction' columns." + ) + y_true = df["true_label"].astype(str).str.strip().str.lower().tolist() + y_pred = df["prediction"].astype(str).str.strip().str.lower().tolist() + + try: + mcc_score = round(matthews_corrcoef(y_true, y_pred), 4) + logger.info(f"[evaluate] Computed MCC={mcc_score}") + return {"mcc_score": mcc_score} except Exception as e: - logger.error(f"[evaluate] Evaluation failed: {e}") + logger.error(f"[evaluate] Evaluation failed: {e}", exc_info=True) raise def run(self) -> dict: """Run the full evaluation process: load data, generate predictions, evaluate results.""" try: self.load_labels_and_prompts() - predictions = self.generate_predictions() - evaluation_results = self.evaluate(predictions) + predictions, prediction_data_s3_url = self.generate_predictions() + evaluation_results = self.evaluate() logger.info("[evaluate] Model evaluation completed successfully.") - return evaluation_results + return { + "evaluation_score": evaluation_results, + "prediction_data_s3_url": prediction_data_s3_url, + } except Exception as e: logger.error(f"[evaluate] Error in running ModelEvaluator: {str(e)}") raise diff --git a/backend/app/core/finetune/preprocessing.py b/backend/app/core/finetune/preprocessing.py index 12120258..28fe9b23 100644 --- a/backend/app/core/finetune/preprocessing.py +++ b/backend/app/core/finetune/preprocessing.py @@ -27,7 +27,11 @@ def __init__(self, document, storage, split_ratio: float, system_message: str): self.system_message = {"role": "system", "content": system_message.strip()} - def upload_csv_to_s3(self, df, filename: str) -> str: + @staticmethod + def upload_csv_to_s3(storage, df, filename: str) -> str: + """ + Uploads a DataFrame as CSV to S3 using the provided storage instance. + """ logger.info( f"[upload_csv_to_s3] Preparing to upload '{filename}' to s3 | rows={len(df)}, cols={len(df.columns)}" ) @@ -43,7 +47,7 @@ def upload_csv_to_s3(self, df, filename: str) -> str: ) try: - dest = self.storage.put(upload, basename=Path("datasets") / filename) + dest = storage.put(upload, basename=Path("datasets") / filename) logger.info( f"[upload_csv_to_s3] Upload successful | filename='{filename}', s3_url='{dest}'" ) @@ -139,8 +143,8 @@ def process(self): test_csv_name = f"test_split_{test_percentage}_{unique_id}.csv" train_jsonl_name = f"train_data_{train_percentage}_{unique_id}.jsonl" - train_csv_url = self.upload_csv_to_s3(train_data, train_csv_name) - test_csv_url = self.upload_csv_to_s3(test_data, test_csv_name) + train_csv_url = self.upload_csv_to_s3(self.storage, train_data, train_csv_name) + test_csv_url = self.upload_csv_to_s3(self.storage, test_data, test_csv_name) train_jsonl_path = self._save_to_jsonl(train_jsonl, train_jsonl_name) diff --git a/backend/app/crud/model_evaluation.py b/backend/app/crud/model_evaluation.py index 96b91477..7a1e2b3b 100644 --- a/backend/app/crud/model_evaluation.py +++ b/backend/app/crud/model_evaluation.py @@ -26,9 +26,9 @@ def create_model_evaluation( ) -> ModelEvaluation: fine_tuning_job = fetch_by_id(session, request.fine_tuning_id, project_id) - if fine_tuning_job.fine_tuned_model and fine_tuning_job.testing_file_id is None: + if fine_tuning_job.fine_tuned_model and fine_tuning_job.test_data_s3_url is None: logger.error( - f"[create_model_evaluation] No fine tuned model found for the given fine tuning ID | fine_tuning_id={request.fine_tuning_id}, project_id={project_id}" + f"[create_model_evaluation] No fine tuned model or test data found for the given fine tuning ID | fine_tuning_id={request.fine_tuning_id}, project_id={project_id}" ) raise HTTPException(404, "Fine tuned model not found") @@ -39,7 +39,7 @@ def create_model_evaluation( "split_ratio": fine_tuning_job.split_ratio, "model_name": fine_tuning_job.fine_tuned_model, "document_id": fine_tuning_job.document_id, - "testing_file_id": fine_tuning_job.testing_file_id, + "test_data_s3_url": fine_tuning_job.test_data_s3_url, "project_id": project_id, "organization_id": organization_id, "status": status, diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 8bf590bf..138adc13 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -47,33 +47,23 @@ class ModelEvaluation(ModelEvaluationBase, table=True): nullable=False, ) model_name: str = Field(description="fine tuned model name from OpenAI") - test_data_url: str = Field(description="S3 url of the testing data stored ins S3") + test_data_s3_url: str = Field( + description="S3 url of the testing data stored ins S3" + ) base_model: str = Field(nullable=False, description="Base model for fine-tuning") split_ratio: float = Field( nullable=False, description="the ratio the dataset was divided in" ) system_prompt: str = Field(sa_column=Column(Text, nullable=False)) - testing_file_id: str | None = Field( - description="File ID of the testing file uploaded to OpenAI for Batch processing" - ) score: dict[str, float] | None = Field( sa_column=Column(JSON, nullable=True), description="Evaluation scores per metric (e.g., {'mcc': 0.85})", ) - batch_id: str | None = Field( - description="ID of the batch job created for processing test data queries" - ) - output_file_id: str | None = Field( - description="ID of the output file generated by OpenAI Batch processing" - ) - batch_status: str | None = Field( - description="Current status of the batch job on OpenAI's side" - ) - prediction_data_url: str | None = Field( + prediction_data_s3_url: str | None = Field( default=None, description="S3 URL where the prediction data generated by the fine-tuned model is stored", ) - eval_status: ModelEvaluationStatus = ( + status: ModelEvaluationStatus = ( Field(default=ModelEvaluationStatus.pending, description="Evaluation status"), ) error_message: str | None = Field( @@ -99,7 +89,7 @@ class ModelEvaluationUpdate(SQLModel): score: Optional[dict[str, float]] = None status: Optional[ModelEvaluationStatus] = None error_message: Optional[str] = None - prediction_data_url: Optional[str] = None + prediction_data_s3_url: Optional[str] = None class ModelEvaluationPublic(ModelEvaluationBase): @@ -110,9 +100,9 @@ class ModelEvaluationPublic(ModelEvaluationBase): model_name: str split_ratio: float base_model: str - prediction_data_url: str | None + prediction_data_s3_url: str | None score: dict[str, float] | None = None - eval_status: ModelEvaluationStatus + status: ModelEvaluationStatus inserted_at: datetime updated_at: datetime From afbf7ae8f49bf51e242b4736d51bc16179fa2b8f Mon Sep 17 00:00:00 2001 From: nishika26 Date: Thu, 28 Aug 2025 18:10:56 +0530 Subject: [PATCH 4/4] test cases --- backend/app/models/model_evaluation.py | 4 +--- backend/app/tests/crud/test_model_evaluation.py | 4 ++-- backend/app/tests/utils/test_data.py | 8 +++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/backend/app/models/model_evaluation.py b/backend/app/models/model_evaluation.py index 138adc13..da24cdcd 100644 --- a/backend/app/models/model_evaluation.py +++ b/backend/app/models/model_evaluation.py @@ -47,9 +47,7 @@ class ModelEvaluation(ModelEvaluationBase, table=True): nullable=False, ) model_name: str = Field(description="fine tuned model name from OpenAI") - test_data_s3_url: str = Field( - description="S3 url of the testing data stored ins S3" - ) + test_data_s3_url: str = Field(description="S3 url of the testing data stored in S3") base_model: str = Field(nullable=False, description="Base model for fine-tuning") split_ratio: float = Field( nullable=False, description="the ratio the dataset was divided in" diff --git a/backend/app/tests/crud/test_model_evaluation.py b/backend/app/tests/crud/test_model_evaluation.py index 2b71ce4f..0eaa72f7 100644 --- a/backend/app/tests/crud/test_model_evaluation.py +++ b/backend/app/tests/crud/test_model_evaluation.py @@ -37,7 +37,7 @@ def test_create_model_evaluation(db: Session): base_model=fine_tune.base_model, model_name=fine_tune.fine_tuned_model, document_id=fine_tune.document_id, - testing_file_id=fine_tune.testing_file_id, + test_data_s3_url=fine_tune.test_data_s3_url, status="pending", ) @@ -52,7 +52,7 @@ def test_create_model_evaluation(db: Session): assert created_eval.status == "pending" assert created_eval.document_id == fine_tune.document_id assert created_eval.model_name == fine_tune.fine_tuned_model - assert created_eval.testing_file_id == fine_tune.testing_file_id + assert created_eval.test_data_s3_url == fine_tune.test_data_s3_url def test_fetch_by_eval_id_success(db: Session): diff --git a/backend/app/tests/utils/test_data.py b/backend/app/tests/utils/test_data.py index 680a2a87..edce345e 100644 --- a/backend/app/tests/utils/test_data.py +++ b/backend/app/tests/utils/test_data.py @@ -164,11 +164,11 @@ def create_test_finetuning_job_with_extra_fields( db: Session, ratios: list[float], ) -> tuple[list[Fine_Tuning], bool]: - jobs, _ = create_test_fine_tuning_jobs(db, [0.5, 0.7]) + jobs, _ = create_test_fine_tuning_jobs(db, ratios) if jobs: for job in jobs: - job.testing_file_id = "testing_file_id_example" + job.test_data_s3_url = "test_data_s3_url_example" job.fine_tuned_model = "fine_tuned_model_name" return jobs, True @@ -186,9 +186,7 @@ def create_test_model_evaluation(db) -> list[ModelEvaluation]: base_model=fine_tune.base_model, model_name=fine_tune.fine_tuned_model, document_id=fine_tune.document_id, - testing_file_id=fine_tune.testing_file_id - if fine_tune.testing_file_id - else None, + test_data_s3_url=fine_tune.test_data_s3_url, ) model_eval = create_model_evaluation(