# cli runner for benchmarks #200
**File 1 of 2: GitHub Actions workflow for the RAG benchmark** (new file, +94 lines)

````yaml
name: RAG Benchmark

run-name: RAG Benchmark by ${{ github.actor }}

on:
  workflow_dispatch:

jobs:
  benchmark:
    environment: main

    runs-on: ubuntu-latest

    strategy:
      matrix:
        dataset: [kunji, sneha]
        service: [assistants, responses]
        count: [100]

    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
      LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
      LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }}
      LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY: ${{ secrets.LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY }}
      LOCAL_CREDENTIALS_API_KEY: ${{ secrets.LOCAL_CREDENTIALS_API_KEY }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - run: |
          cp .env.example .env
          sed -i 's/changethis/secret123/g' .env

      - name: Run server
        run: |
          docker compose up -d
          sleep 10

      - name: Prestart logs on failure
        if: failure()
        run: |
          docker compose logs -f prestart
          exit 1

      - name: Create local credentials
        run: |
          curl -X POST "http://localhost:8000/api/v1/credentials/" \
            -H "Content-Type: application/json" \
            -H "X-API-KEY: ${{ env.LOCAL_CREDENTIALS_API_KEY }}" \
            -d '{
              "organization_id": 1,
              "project_id": 1,
              "is_active": true,
              "credential": {
                "openai": {
                  "api_key": "${{ env.LOCAL_CREDENTIALS_ORG_OPENAI_API_KEY }}"
                }
              }
            }'

      - name: Run benchmark
        run: |
          docker compose exec backend uv run ai-cli bench ${{ matrix.service }} --dataset ${{ matrix.dataset }} --count ${{ matrix.count }} | tee benchmark_output.txt
          # Extract the mean duration from the benchmark output
          MEAN_DURATION=$(grep '^Mean duration:' benchmark_output.txt | awk '{print $3}')
          echo "## Benchmark Results for ${{ matrix.service }} - ${{ matrix.dataset }} (${{ matrix.count }} queries, ${MEAN_DURATION} avg)" >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY
          cat benchmark_output.txt >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY
          # Find the latest benchmark CSV inside the container first
          CONTAINER_LATEST=$(docker compose exec backend sh -c "ls -t bench_results_*.csv | head -n1")
          # Copy that specific file out of the container
          docker compose cp backend:/app/$CONTAINER_LATEST ./
          cp $CONTAINER_LATEST bench-${{ matrix.service }}-${{ matrix.dataset }}-${{ matrix.count }}.csv
          ls -l bench-${{ matrix.service }}-${{ matrix.dataset }}-${{ matrix.count }}.csv

      - name: Backend logs on failure
        if: failure()
        timeout-minutes: 1
        run: |
          docker compose logs -f backend
          exit 1

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: bench-${{ matrix.service }}-${{ matrix.dataset }}-${{ matrix.count }}.csv
          path: bench-${{ matrix.service }}-${{ matrix.dataset }}-${{ matrix.count }}.csv

      - name: Cleanup
        if: always()
        run: docker compose down
````

**Comment on lines +86 to +90** (the "Upload benchmark results" step)

> **Collaborator:** Where are we uploading the results of the benchmark from here?
>
> **Author:** In the GitHub Actions UI; see link.
>
> **Collaborator:** Thanks.
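For reference, each matrix job reduces to a single CLI invocation inside the backend container. A local equivalent for one matrix cell, assuming the compose stack is up and local credentials have been created as in the steps above:

```sh
# Run one benchmark locally: service=responses, dataset=kunji, count=100
docker compose exec backend uv run ai-cli bench responses --dataset kunji --count 100 | tee benchmark_output.txt

# The workflow summary step then pulls the mean from stdout, expecting a
# line of the form "Mean duration: <value> ...":
grep '^Mean duration:' benchmark_output.txt | awk '{print $3}'
```

The `Mean duration:` line format is what the summary step's `grep`/`awk` pipeline depends on; if the CLI output changes, the step summary header will silently show an empty average.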
**File 2 of 2: FastAPI router for the OpenAI Responses API benchmark endpoint** (new file, +132 lines)

```python
from typing import Optional

import openai
from fastapi import APIRouter, Depends
from openai import OpenAI
from pydantic import BaseModel
from sqlmodel import Session

from app.api.deps import get_current_user_org, get_db
from app.crud.credentials import get_provider_credential
from app.models import UserOrganization
from app.utils import APIResponse

router = APIRouter(tags=["responses"])


def handle_openai_error(e: openai.OpenAIError) -> str:
    """Extract a human-readable message from an OpenAI error."""
    if isinstance(e.body, dict) and "message" in e.body:
        return e.body["message"]
    return str(e)


class ResponsesAPIRequest(BaseModel):
    project_id: int
    model: str
    instructions: str
    vector_store_ids: list[str]
    max_num_results: Optional[int] = 20
    temperature: Optional[float] = 0.1
    response_id: Optional[str] = None
    question: str


class Diagnostics(BaseModel):
    input_tokens: int
    output_tokens: int
    total_tokens: int
    model: str
```
**Comment on lines +24 to +42** (`ResponsesAPIRequest` / `Diagnostics`)

> **Collaborator:** Move models to …
>
> **Author:** These are for parsing requests, not for backing database models. We can introduce a `/schema/` folder for these.
>
> **Collaborator:** Sure.
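Following up on the thread above, a minimal sketch of the suggested split, assuming a hypothetical `app/schema/responses.py` module (the path and module name are illustrative, not part of this PR):

```python
# app/schema/responses.py  (hypothetical module; path is illustrative)
from typing import Optional

from pydantic import BaseModel


class ResponsesAPIRequest(BaseModel):
    """Request payload for the benchmarking endpoint."""

    project_id: int
    model: str
    instructions: str
    vector_store_ids: list[str]
    max_num_results: Optional[int] = 20
    temperature: Optional[float] = 0.1
    response_id: Optional[str] = None
    question: str
```

The router would then import these from the schema package and keep only endpoint logic, which also keeps SQLModel table models and request-parsing schemas clearly separated.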
*(file continues)*

```python
class FileResultChunk(BaseModel):
    score: float
    text: str


class _APIResponse(BaseModel):
    status: str
    response_id: str
    message: str
    chunks: list[FileResultChunk]
    diagnostics: Optional[Diagnostics] = None
```
**Comment on lines +50 to +57** (`_APIResponse`)

> **Collaborator:** We already have an `APIResponse` model in …
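For context on the thread above: the code below reuses that model via `APIResponse[_APIResponse]`. A generic wrapper exposing the `success_response`/`failure_response` interface used in this diff typically looks something like the following sketch; the field names are assumptions, since the actual `app.utils.APIResponse` implementation is not shown in the PR:

```python
from typing import Generic, Optional, TypeVar

from pydantic import BaseModel

T = TypeVar("T")


class APIResponse(BaseModel, Generic[T]):
    # Sketch inferred from usage in this PR; the real model in app.utils may differ.
    success: bool
    data: Optional[T] = None
    error: Optional[str] = None

    @classmethod
    def success_response(cls, data: T) -> "APIResponse[T]":
        return cls(success=True, data=data)

    @classmethod
    def failure_response(cls, error: str) -> "APIResponse[T]":
        return cls(success=False, error=error)
```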
*(file continues)*

```python
class ResponsesAPIResponse(APIResponse[_APIResponse]):
    pass


def get_file_search_results(response) -> list[FileResultChunk]:
    """Collect scored text chunks from any file_search tool calls in the response."""
    results: list[FileResultChunk] = []
    for tool_call in response.output:
        if tool_call.type == "file_search_call":
            # Each file_search call carries its own scored hits in tool_call.results
            results.extend(
                FileResultChunk(score=hit.score, text=hit.text)
                for hit in tool_call.results
            )
    return results
```
**Comment on `responses_sync`** (below)

> **Collaborator:** Do you mean synchronous or asynchronous in the description? I'm assuming this runs asynchronously to speed up completion by running queries in parallel.
*(file continues)*

```python
@router.post("/responses/sync", response_model=ResponsesAPIResponse)
async def responses_sync(
    request: ResponsesAPIRequest,
    _session: Session = Depends(get_db),
    _current_user: UserOrganization = Depends(get_current_user_org),
):
    """
    Temporary synchronous endpoint for benchmarking the OpenAI Responses API.
    """
    credentials = get_provider_credential(
        session=_session,
        org_id=_current_user.organization_id,
        provider="openai",
        project_id=request.project_id,
    )
    if not credentials or "api_key" not in credentials:
        return APIResponse.failure_response(
            error="OpenAI API key not configured for this organization."
        )

    client = OpenAI(api_key=credentials["api_key"])

    try:
        response = client.responses.create(
            model=request.model,
            previous_response_id=request.response_id,
            instructions=request.instructions,
            tools=[
                {
                    "type": "file_search",
                    "vector_store_ids": request.vector_store_ids,
                    "max_num_results": request.max_num_results,
                }
            ],
            temperature=request.temperature,
            input=[{"role": "user", "content": request.question}],
            include=["file_search_call.results"],
        )

        response_chunks = get_file_search_results(response)

        return ResponsesAPIResponse.success_response(
            data=_APIResponse(
                status="success",
                response_id=response.id,
                message=response.output_text,
                chunks=response_chunks,
                diagnostics=Diagnostics(
                    input_tokens=response.usage.input_tokens,
                    output_tokens=response.usage.output_tokens,
                    total_tokens=response.usage.total_tokens,
                    model=response.model,
                ),
            ),
        )
    except openai.OpenAIError as e:
        return ResponsesAPIResponse.failure_response(error=handle_openai_error(e))
```
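On the sync-versus-async question above: the handler is declared `async def` but uses the blocking `OpenAI` client, so the call holds the event loop until OpenAI returns. A sketch of a non-blocking variant using the SDK's `AsyncOpenAI` client (illustrative, not part of this PR; `create_response_async` is a hypothetical helper):

```python
from openai import AsyncOpenAI


async def create_response_async(api_key: str, request: ResponsesAPIRequest):
    # AsyncOpenAI mirrors the sync client's surface; awaiting the call
    # releases the event loop so parallel benchmark queries can overlap.
    client = AsyncOpenAI(api_key=api_key)
    return await client.responses.create(
        model=request.model,
        previous_response_id=request.response_id,
        instructions=request.instructions,
        tools=[
            {
                "type": "file_search",
                "vector_store_ids": request.vector_store_ids,
                "max_num_results": request.max_num_results,
            }
        ],
        temperature=request.temperature,
        input=[{"role": "user", "content": request.question}],
        include=["file_search_call.results"],
    )
```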
**Comment on the "Create local credentials" step**

> **Collaborator:** Do we also need to run the seeder before adding credentials for `organization_id` 1?
>
> **Author:** The seeder runs in prestart during `docker compose up`.
>
> **Collaborator:** OK, cool.