-
Notifications
You must be signed in to change notification settings - Fork 7
Evaluation #405
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Evaluation #405
Changes from all commits
62d159e
706c5dc
d882160
168f521
fa509ad
38ea1a9
be299a3
46fd438
6198542
74ab6d1
cfc2dbd
675e220
0633c54
c130b02
7878d2a
5466229
2ac8163
17c767c
f99ae27
9bc96b2
2200c27
ae3c779
6acde66
34082d6
2a915fa
b2c1b46
1e278fc
4cb5d56
877ba04
6979e33
26ee6f0
c76ef50
5bfc8d8
3912d9f
6012d5c
d289794
e98dae6
cc2df27
ebafe8b
a21709f
e15c5f2
ed0da58
f573e70
11663da
5988f80
22361fe
d9704e3
0fd0842
cd757bd
ea5d000
f2ec2a5
f7ca621
e74ea09
4f4cea1
9b038c6
622b4eb
4c61d74
4ec5971
6f19f05
4a649d3
9fc12a6
a5c8a03
cba41c9
9dffd06
8d32883
24a958e
34700c5
9755403
2b38293
dce502b
8ad6982
c08d626
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,249 @@ | ||||||
"""create_evaluation_run_table, batch_job_table, and evaluation_dataset_table

Revision ID: 6fe772038a5a
Revises: 219033c644de
Create Date: 2025-11-05 22:47:18.266070

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
import sqlmodel.sql.sqltypes


# revision identifiers, used by Alembic.
revision = "6fe772038a5a"
down_revision = "219033c644de"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the batch_job, evaluation_dataset, and evaluation_run tables.

    batch_job and evaluation_dataset are created first because
    evaluation_run references both: batch_job via the batch_job_id and
    embedding_batch_job_id foreign keys, and evaluation_dataset via
    dataset_id. All tables are scoped to an organization and project
    with ON DELETE CASCADE.
    """
    # Create batch_job table first (as evaluation_run will reference it)
    op.create_table(
        "batch_job",
        # Integer PK on PostgreSQL autoincrements (SERIAL) by default.
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column(
            "provider",
            sa.String(),
            nullable=False,
            comment="LLM provider name (e.g., 'openai', 'anthropic')",
        ),
        sa.Column(
            "job_type",
            sa.String(),
            nullable=False,
            comment="Type of batch job (e.g., 'evaluation', 'classification', 'embedding')",
        ),
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
            comment="Complete batch configuration",
        ),
        sa.Column(
            "provider_batch_id",
            sa.String(),
            nullable=True,
            comment="Provider's batch job ID",
        ),
        sa.Column(
            "provider_file_id",
            sa.String(),
            nullable=True,
            comment="Provider's input file ID",
        ),
        sa.Column(
            "provider_output_file_id",
            sa.String(),
            nullable=True,
            comment="Provider's output file ID",
        ),
        sa.Column(
            "provider_status",
            sa.String(),
            nullable=True,
            comment="Provider-specific status (e.g., OpenAI: validating, in_progress, completed, failed)",
        ),
        sa.Column(
            "raw_output_url",
            sa.String(),
            nullable=True,
            comment="S3 URL of raw batch output file",
        ),
        sa.Column(
            "total_items",
            sa.Integer(),
            nullable=False,
            server_default=sa.text("0"),
            comment="Total number of items in the batch",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error message if batch failed",
        ),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_batch_job_job_type"), "batch_job", ["job_type"], unique=False
    )
    op.create_index(
        op.f("ix_batch_job_organization_id"),
        "batch_job",
        ["organization_id"],
        unique=False,
    )
    op.create_index(
        op.f("ix_batch_job_project_id"), "batch_job", ["project_id"], unique=False
    )
    # Composite indexes for the common "jobs in status X for tenant Y" queries.
    op.create_index(
        "idx_batch_job_status_org",
        "batch_job",
        ["provider_status", "organization_id"],
        unique=False,
    )
    op.create_index(
        "idx_batch_job_status_project",
        "batch_job",
        ["provider_status", "project_id"],
        unique=False,
    )

    # Create evaluation_dataset table
    op.create_table(
        "evaluation_dataset",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column(
            "dataset_metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        sa.Column(
            "object_store_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column(
            "langfuse_dataset_id",
            sqlmodel.sql.sqltypes.AutoString(),
            nullable=True,
        ),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
        # Dataset names are unique per (organization, project), not globally.
        sa.UniqueConstraint(
            "name",
            "organization_id",
            "project_id",
            name="uq_evaluation_dataset_name_org_project",
        ),
    )
    op.create_index(
        op.f("ix_evaluation_dataset_name"),
        "evaluation_dataset",
        ["name"],
        unique=False,
    )

    # Create evaluation_run table with all columns and foreign key references
    op.create_table(
        "evaluation_run",
        sa.Column("run_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("dataset_name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("config", sa.JSON(), nullable=False),
        sa.Column("batch_job_id", sa.Integer(), nullable=True),
        sa.Column(
            "embedding_batch_job_id",
            sa.Integer(),
            nullable=True,
            comment="Reference to the batch_job for embedding-based similarity scoring",
        ),
        sa.Column("dataset_id", sa.Integer(), nullable=False),
        sa.Column("status", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column(
            "object_store_url", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column("total_items", sa.Integer(), nullable=False),
        sa.Column("score", sa.JSON(), nullable=True),
        sa.Column("error_message", sa.Text(), nullable=True),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        # SET NULL so deleting a batch job keeps the run's historical record.
        sa.ForeignKeyConstraint(
            ["batch_job_id"],
            ["batch_job.id"],
            ondelete="SET NULL",
        ),
        sa.ForeignKeyConstraint(
            ["embedding_batch_job_id"],
            ["batch_job.id"],
            name="fk_evaluation_run_embedding_batch_job_id",
            ondelete="SET NULL",
        ),
        sa.ForeignKeyConstraint(
            ["dataset_id"],
            ["evaluation_dataset.id"],
            name="fk_evaluation_run_dataset_id",
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_evaluation_run_run_name"), "evaluation_run", ["run_name"], unique=False
    )
    op.create_index(
        "idx_eval_run_status_org",
        "evaluation_run",
        ["status", "organization_id"],
        unique=False,
    )
    op.create_index(
        "idx_eval_run_status_project",
        "evaluation_run",
        ["status", "project_id"],
        unique=False,
    )
|
|
||||||
|
|
||||||
def downgrade() -> None:
    """Drop the evaluation_run, evaluation_dataset, and batch_job tables.

    Tables are dropped in reverse dependency order: evaluation_run first
    (it holds foreign keys into both batch_job and evaluation_dataset),
    then evaluation_dataset, then batch_job. Each table's indexes are
    dropped before the table itself.
    """
    # Drop evaluation_run table first (has foreign keys to batch_job and evaluation_dataset)
    op.drop_index("idx_eval_run_status_project", table_name="evaluation_run")
    op.drop_index("idx_eval_run_status_org", table_name="evaluation_run")
    op.drop_index(op.f("ix_evaluation_run_run_name"), table_name="evaluation_run")
    op.drop_table("evaluation_run")

    # Drop evaluation_dataset table
    op.drop_index(op.f("ix_evaluation_dataset_name"), table_name="evaluation_dataset")
    op.drop_table("evaluation_dataset")

    # Drop batch_job table
    op.drop_index("idx_batch_job_status_project", table_name="batch_job")
    op.drop_index("idx_batch_job_status_org", table_name="batch_job")
    op.drop_index(op.f("ix_batch_job_project_id"), table_name="batch_job")
    op.drop_index(op.f("ix_batch_job_organization_id"), table_name="batch_job")
    op.drop_index(op.f("ix_batch_job_job_type"), table_name="batch_job")
    op.drop_table("batch_job")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| Start an evaluation using OpenAI Batch API. | ||
|
|
||
| This endpoint: | ||
| 1. Fetches the dataset from database and validates it has Langfuse dataset ID | ||
| 2. Creates an EvaluationRun record in the database | ||
| 3. Fetches dataset items from Langfuse | ||
| 4. Builds JSONL for batch processing (config is used as-is) | ||
| 5. Creates a batch job via the generic batch infrastructure | ||
| 6. Returns the evaluation run details with batch_job_id | ||
|
|
||
| The batch will be processed asynchronously by Celery Beat (every 60s). | ||
| Use GET /evaluations/{evaluation_id} to check progress. | ||
|
|
||
| ## Request Body | ||
|
|
||
| - **dataset_id** (required): ID of the evaluation dataset (from /evaluations/datasets) | ||
| - **experiment_name** (required): Name for this evaluation experiment/run | ||
| - **config** (optional): Configuration dict that will be used as-is in JSONL generation. Can include any OpenAI Responses API parameters like: | ||
| - model: str (e.g., "gpt-4o", "gpt-5") | ||
| - instructions: str | ||
| - tools: list (e.g., [{"type": "file_search", "vector_store_ids": [...]}]) | ||
| - reasoning: dict (e.g., {"effort": "low"}) | ||
| - text: dict (e.g., {"verbosity": "low"}) | ||
| - temperature: float | ||
| - include: list (e.g., ["file_search_call.results"]) | ||
| - Note: "input" will be added automatically from the dataset | ||
| - **assistant_id** (optional): Assistant ID to fetch configuration from. If provided, configuration will be fetched from the assistant in the database. Config can be passed as empty dict {} when using assistant_id. | ||
|
|
||
| ## Example with config | ||
|
|
||
| ```json | ||
| { | ||
| "dataset_id": 123, | ||
| "experiment_name": "test_run", | ||
| "config": { | ||
| "model": "gpt-4.1", | ||
| "instructions": "You are a helpful FAQ assistant.", | ||
| "tools": [ | ||
| { | ||
| "type": "file_search", | ||
| "vector_store_ids": ["vs_12345"], | ||
| "max_num_results": 3 | ||
| } | ||
| ], | ||
| "include": ["file_search_call.results"] | ||
| } | ||
| } | ||
| ``` | ||
|
|
||
| ## Example with assistant_id | ||
|
|
||
| ```json | ||
| { | ||
| "dataset_id": 123, | ||
| "experiment_name": "test_run", | ||
| "config": {}, | ||
| "assistant_id": "asst_xyz" | ||
| } | ||
| ``` | ||
|
|
||
| ## Returns | ||
|
|
||
| EvaluationRunPublic with batch details and status: | ||
| - id: Evaluation run ID | ||
| - run_name: Name of the evaluation run | ||
| - dataset_name: Name of the dataset used | ||
| - dataset_id: ID of the dataset used | ||
| - config: Configuration used for the evaluation | ||
| - batch_job_id: ID of the batch job processing this evaluation | ||
| - status: Current status (pending, running, completed, failed) | ||
| - total_items: Total number of items being evaluated | ||
| - completed_items: Number of items completed so far | ||
| - results: Evaluation results (when completed) | ||
| - error_message: Error message if failed | ||
|
|
||
| ## Error Responses | ||
|
|
||
| - **404**: Dataset or assistant not found or not accessible | ||
| - **400**: Missing required credentials (OpenAI or Langfuse), dataset missing Langfuse ID, or config missing required fields | ||
| - **500**: Failed to configure API clients or start batch evaluation |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| Delete a dataset by ID. | ||
|
|
||
| This will remove the dataset record from the database. The CSV file in the object store (if it exists) will remain for audit purposes, but the dataset will no longer be accessible for creating new evaluations. ||
|
|
||
| ## Path Parameters | ||
|
|
||
| - **dataset_id**: ID of the dataset to delete | ||
|
|
||
| ## Returns | ||
|
|
||
| Success message with deleted dataset details: | ||
| - message: Confirmation message | ||
| - dataset_id: ID of the deleted dataset | ||
|
|
||
| ## Error Responses | ||
|
|
||
| - **404**: Dataset not found or not accessible to your organization/project | ||
| - **400**: Dataset cannot be deleted (e.g., has active evaluation runs) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| Get details of a specific dataset by ID. | ||
|
|
||
| Retrieves comprehensive information about a dataset including metadata, object store URL, and Langfuse integration details. | ||
|
|
||
| ## Path Parameters | ||
|
|
||
| - **dataset_id**: ID of the dataset to retrieve | ||
|
|
||
| ## Returns | ||
|
|
||
| DatasetUploadResponse with dataset details: | ||
| - dataset_id: Unique identifier for the dataset | ||
| - dataset_name: Name of the dataset (sanitized) | ||
| - total_items: Total number of items including duplication | ||
| - original_items: Number of original items before duplication | ||
| - duplication_factor: Factor by which items were duplicated | ||
| - langfuse_dataset_id: ID of the dataset in Langfuse | ||
| - object_store_url: URL to the CSV file in object storage | ||
|
|
||
| ## Error Responses | ||
|
|
||
| - **404**: Dataset not found or not accessible to your organization/project |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add return type hint to the upgrade function.
The function signature is missing a return type hint, which violates the project's coding guidelines for Python 3.11+.
As per coding guidelines.
Apply this diff:
📝 Committable suggestion
🤖 Prompt for AI Agents