diff --git a/.github/workflows/pr-evaluation-run.yml b/.github/workflows/pr-evaluation-run.yml deleted file mode 100644 index a417b8e..0000000 --- a/.github/workflows/pr-evaluation-run.yml +++ /dev/null @@ -1,292 +0,0 @@ -name: Router Submission Evaluation - -on: - workflow_dispatch: - inputs: - pr_number: - description: Pull request number to evaluate - required: true - type: string - base_ref: - description: Base branch ref for evaluation scripts checkout - required: true - type: string - base_sha: - description: Base commit SHA for PR diff/evaluation - required: true - type: string - -jobs: - evaluate-router: - runs-on: self-hosted - permissions: - contents: read - issues: write - checks: write - pull-requests: write - env: - PR_NUMBER: ${{ inputs.pr_number }} - BASE_REF: ${{ inputs.base_ref }} - BASE_SHA: ${{ inputs.base_sha }} - PR_CHECKOUT_REF: ${{ format('refs/pull/{0}/head', inputs.pr_number) }} - steps: - - name: Fetch PR head SHA - id: prmeta - uses: actions/github-script@v7 - with: - script: | - const pr = await github.rest.pulls.get({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: Number('${{ env.PR_NUMBER }}') - }); - core.setOutput('head_sha', pr.data.head.sha); - - - name: Create in-progress PR check - id: checkrun - uses: actions/github-script@v7 - with: - script: | - const result = await github.rest.checks.create({ - owner: context.repo.owner, - repo: context.repo.repo, - name: 'Router Submission Evaluation (/evaluate)', - head_sha: '${{ steps.prmeta.outputs.head_sha }}', - status: 'in_progress', - started_at: new Date().toISOString(), - details_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, - output: { - title: 'Evaluation started', - summary: 'Router evaluation is running.' - } - }); - core.setOutput('id', String(result.data.id)); - - - name: Checkout base repository (for evaluation scripts) - uses: actions/checkout@v4 - with: - ref: ${{ env.BASE_REF }} - path: base - fetch-depth: 0 - - - name: Checkout PR branch (for prediction file only) - uses: actions/checkout@v4 - with: - ref: ${{ env.PR_CHECKOUT_REF }} - path: pr - fetch-depth: 0 - - - name: Detect changed prediction file - id: detect - shell: bash - working-directory: pr - run: | - set -euo pipefail - BASE_REF="${{ env.BASE_REF }}" - BASE_SHA="${{ env.BASE_SHA }}" - - if [[ -z "$BASE_SHA" ]]; then - echo "Error: Could not determine PR base SHA" >&2 - exit 1 - fi - - git fetch origin "$BASE_REF" || true - - if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then - echo "Base SHA $BASE_SHA not found locally, attempting to fetch..." - git fetch origin "$BASE_SHA" || git fetch origin "$BASE_REF" || true - fi - - mapfile -t CHANGED_FILES < <(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>/dev/null | awk '$1 == "A" || $1 == "M" {print $2}') - - if [[ ${#CHANGED_FILES[@]} -eq 0 ]]; then - echo "No changed prediction file detected; skipping evaluation." - echo "router=" >> "$GITHUB_OUTPUT" - exit 0 - fi - - router_name="" - has_base=0 - has_robustness=0 - - for file in "${CHANGED_FILES[@]}"; do - filename=$(basename "$file") - name="${filename%.json}" - if [[ "$name" == *-robustness ]]; then - has_robustness=1 - name="${name%-robustness}" - else - has_base=1 - fi - - if [[ -z "$name" ]]; then - echo "Unable to determine router name from $file" >&2 - exit 1 - fi - - if [[ -z "$router_name" ]]; then - router_name="$name" - elif [[ "$router_name" != "$name" ]]; then - echo "Prediction files belong to different routers:" >&2 - printf ' %s\n' "${CHANGED_FILES[@]}" >&2 - exit 1 - fi - done - - if [[ ${#CHANGED_FILES[@]} -ne 2 || $has_base -ne 1 || $has_robustness -ne 1 ]]; then - echo "Expected exactly two prediction files (router and router-robustness), found:" >&2 - printf ' %s\n' "${CHANGED_FILES[@]}" >&2 - exit 1 - fi - - ROUTER_NAME="$router_name" - echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT" - - PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json" - if [[ ! -f "$PREDICTION_FILE" ]]; then - echo "Error: Prediction file not found at $PREDICTION_FILE" >&2 - exit 1 - fi - - ENTRY_COUNT=$(python3 -c "import json; print(len(json.load(open('$PREDICTION_FILE'))))") - echo "Prediction file contains $ENTRY_COUNT entries" - - if [[ "$ENTRY_COUNT" -eq 8400 ]]; then - SPLIT="full" - elif [[ "$ENTRY_COUNT" -eq 809 ]]; then - SPLIT="sub_10" - else - echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to full." >&2 - SPLIT="full" - fi - echo "split=$SPLIT" >> "$GITHUB_OUTPUT" - - - name: Show detected router - if: ${{ steps.detect.outputs.router != '' }} - run: | - set -euo pipefail - echo "Detected router submission: ${{ steps.detect.outputs.router }}" - echo "Detected split: ${{ steps.detect.outputs.split }}" - - - name: Prepare dataset - if: ${{ steps.detect.outputs.router != '' }} - working-directory: base - run: | - set -euo pipefail - echo "Preparing dataset..." - mkdir -p "${{ github.workspace }}/dataset" - uv run python scripts/process_datasets/prep_datasets.py - - - name: Copy PR prediction file to base workspace - if: ${{ steps.detect.outputs.router != '' }} - run: | - set -euo pipefail - ROUTER_NAME="${{ steps.detect.outputs.router }}" - mkdir -p base/router_inference/predictions - cp "pr/router_inference/predictions/${ROUTER_NAME}.json" \ - "base/router_inference/predictions/${ROUTER_NAME}.json" - cp "pr/router_inference/predictions/${ROUTER_NAME}-robustness.json" \ - "base/router_inference/predictions/${ROUTER_NAME}-robustness.json" - echo "Copied prediction files from PR to base workspace" - - - name: Evaluate submission - if: ${{ steps.detect.outputs.router != '' }} - id: evaluate - working-directory: base - env: - ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset - run: | - set -euo pipefail; trap 'cat evaluation_output.txt' EXIT - BASE_SHA="${{ env.BASE_SHA }}" - uv run python automation/process_pr_submission.py \ - --pr "${{ env.PR_NUMBER }}" \ - --router "${{ steps.detect.outputs.router }}" \ - --split "${{ steps.detect.outputs.split }}" \ - --base-ref "$BASE_SHA" > evaluation_output.txt 2>&1 - cat evaluation_output.txt - - - name: Post evaluation results as PR comment - if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }} - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const path = require('path'); - - let comment = '## Router Evaluation Results\n\n'; - comment += `**Router:** \`${{ steps.detect.outputs.router }}\`\n`; - comment += `**Dataset Split:** \`${{ steps.detect.outputs.split }}\`\n\n`; - - const metricsPath = path.join('base', 'metrics.json'); - if (!fs.existsSync(metricsPath)) { - throw new Error(`metrics.json not found at ${metricsPath}. Evaluation must produce metrics.json file.`); - } - - const metrics = JSON.parse(fs.readFileSync(metricsPath, 'utf8')); - comment += '### RouterArena Metrics\n\n'; - comment += '| Metric | Value |\n'; - comment += '|--------|-------|\n'; - comment += `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`; - comment += `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`; - comment += `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`; - comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`; - comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`; - comment += `| **Number of Queries** | ${metrics.num_queries} |\n`; - const robustnessScore = metrics.robustness_score; - const robustnessCell = robustnessScore !== undefined ? robustnessScore.toFixed(4) : 'N/A'; - comment += `| **Robustness Score** | ${robustnessCell} |\n`; - - if (metrics.optimality) { - comment += '\n### Optimality Metrics\n\n'; - comment += '| Metric | Value |\n'; - comment += '|--------|-------|\n'; - comment += `| **Opt.Sel** (Optimal Selection) | ${metrics.optimality.opt_sel.toFixed(4)} |\n`; - comment += `| **Opt.Cost** (Cost Efficiency) | ${metrics.optimality.opt_cost.toFixed(4)} |\n`; - comment += `| **Opt.Acc** (Accuracy vs Optimal) | ${metrics.optimality.opt_acc.toFixed(4)} |\n`; - } - - comment += '\n---\n'; - comment += '*Evaluation completed by RouterArena automated workflow*'; - - await github.rest.issues.createComment({ - issue_number: Number('${{ env.PR_NUMBER }}'), - owner: context.repo.owner, - repo: context.repo.repo, - body: comment - }); - console.log('Successfully posted evaluation results as PR comment'); - - - name: Complete PR check - if: ${{ always() && steps.checkrun.outputs.id != '' }} - uses: actions/github-script@v7 - env: - DETECTED_ROUTER: ${{ steps.detect.outputs.router }} - DETECT_OUTCOME: ${{ steps.detect.outcome }} - EVALUATE_OUTCOME: ${{ steps.evaluate.outcome }} - with: - script: | - let conclusion = 'success'; - let title = 'Evaluation completed'; - let summary = 'Router evaluation finished successfully.'; - - if (!process.env.DETECTED_ROUTER) { - conclusion = process.env.DETECT_OUTCOME === 'success' ? 'neutral' : 'failure'; - title = process.env.DETECT_OUTCOME === 'success' ? 'No router file detected' : 'Evaluation setup failed'; - summary = process.env.DETECT_OUTCOME === 'success' - ? 'No changed prediction file was detected for this PR, so evaluation was skipped.' - : 'Failed while detecting prediction files for this PR.'; - } else if (process.env.EVALUATE_OUTCOME !== 'success') { - conclusion = 'failure'; - title = 'Evaluation failed'; - summary = 'The evaluation step failed. Check this workflow run logs for details.'; - } - - await github.rest.checks.update({ - owner: context.repo.owner, - repo: context.repo.repo, - check_run_id: Number('${{ steps.checkrun.outputs.id }}'), - status: 'completed', - conclusion, - completed_at: new Date().toISOString(), - output: { title, summary } - }); diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml index 5635825..b34998b 100644 --- a/.github/workflows/pr-evaluation.yml +++ b/.github/workflows/pr-evaluation.yml @@ -1,11 +1,11 @@ -name: Router Submission Evaluation Trigger +name: Router Submission Evaluation on: issue_comment: types: [created] jobs: - request-evaluation: + evaluate-router: if: >- github.event.issue.pull_request && startsWith(github.event.comment.body, '/evaluate') && @@ -17,32 +17,20 @@ jobs: ) runs-on: self-hosted permissions: - actions: write - issues: write - pull-requests: write checks: write contents: read + pull-requests: write steps: - name: Acknowledge /evaluate command uses: actions/github-script@v7 with: script: | - try { - await github.rest.reactions.createForIssueComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: context.payload.comment.id, - content: 'eyes' - }); - } catch (error) { - // Some org/repo token policies disallow reactions for GITHUB_TOKEN. - // Do not block evaluation trigger on this cosmetic action. - if (error.status === 403) { - core.warning(`Skipping reaction due to permission restriction: ${error.message}`); - } else { - throw error; - } - } + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'eyes' + }); - name: Fetch PR details id: pr @@ -54,22 +42,281 @@ jobs: repo: context.repo.repo, pull_number: context.payload.issue.number }); - core.setOutput('number', String(pr.data.number)); + core.setOutput('head_sha', pr.data.head.sha); core.setOutput('base_ref', pr.data.base.ref); core.setOutput('base_sha', pr.data.base.sha); + core.setOutput('number', pr.data.number); - - name: Dispatch evaluation workflow + - name: Create in-progress PR check + id: checkrun uses: actions/github-script@v7 with: script: | - await github.rest.actions.createWorkflowDispatch({ + const result = await github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, - workflow_id: 'pr-evaluation-run.yml', - ref: '${{ steps.pr.outputs.base_ref }}', - inputs: { - pr_number: '${{ steps.pr.outputs.number }}', - base_ref: '${{ steps.pr.outputs.base_ref }}', - base_sha: '${{ steps.pr.outputs.base_sha }}' + name: 'Router Submission Evaluation (/evaluate)', + head_sha: '${{ steps.pr.outputs.head_sha }}', + status: 'in_progress', + started_at: new Date().toISOString(), + details_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, + output: { + title: 'Evaluation started', + summary: 'Router evaluation was triggered via `/evaluate` and is now running.' } }); + core.setOutput('id', String(result.data.id)); + + - name: Checkout base repository (for evaluation scripts) + uses: actions/checkout@v4 + with: + ref: ${{ steps.pr.outputs.base_ref }} + path: base + fetch-depth: 0 + + - name: Checkout PR branch (for prediction file only) + uses: actions/checkout@v4 + with: + ref: ${{ steps.pr.outputs.head_sha }} + path: pr + fetch-depth: 0 + + - name: Detect changed prediction file + id: detect + shell: bash + working-directory: pr + run: | + set -euo pipefail + # Compare against the upstream base branch + # This ensures each router submission is evaluated independently + BASE_REF="${{ steps.pr.outputs.base_ref }}" + BASE_SHA="${{ steps.pr.outputs.base_sha }}" + + if [[ -z "$BASE_SHA" ]]; then + echo "Error: Could not determine PR base SHA" >&2 + exit 1 + fi + + # Fetch the base branch to ensure it's available for comparison + git fetch origin "$BASE_REF" || true + + # Try to fetch the specific base SHA if it's not already available + if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then + echo "Base SHA $BASE_SHA not found locally, attempting to fetch..." + git fetch origin "$BASE_SHA" || git fetch origin "$BASE_REF" || true + fi + + # Compare against base to show only changes in this PR + # Use three-dot diff to show changes from merge-base to HEAD (only PR changes) + mapfile -t CHANGED_FILES < <(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>/dev/null | awk '$1 == "A" || $1 == "M" {print $2}') + + if [[ ${#CHANGED_FILES[@]} -eq 0 ]]; then + echo "No changed prediction file detected; skipping evaluation." + echo "router=" >> "$GITHUB_OUTPUT" + exit 0 + fi + + router_name="" + has_base=0 + has_robustness=0 + + for file in "${CHANGED_FILES[@]}"; do + filename=$(basename "$file") + name="${filename%.json}" + if [[ "$name" == *-robustness ]]; then + has_robustness=1 + name="${name%-robustness}" + else + has_base=1 + fi + + if [[ -z "$name" ]]; then + echo "Unable to determine router name from $file" >&2 + exit 1 + fi + + if [[ -z "$router_name" ]]; then + router_name="$name" + elif [[ "$router_name" != "$name" ]]; then + echo "Prediction files belong to different routers:" >&2 + printf ' %s\n' "${CHANGED_FILES[@]}" >&2 + exit 1 + fi + done + + if [[ ${#CHANGED_FILES[@]} -ne 2 || $has_base -ne 1 || $has_robustness -ne 1 ]]; then + echo "Expected exactly two prediction files (router and router-robustness), found:" >&2 + printf ' %s\n' "${CHANGED_FILES[@]}" >&2 + exit 1 + fi + + ROUTER_NAME="$router_name" + echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT" + + # Detect split based on prediction file size (from PR branch) + PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json" + if [[ ! -f "$PREDICTION_FILE" ]]; then + echo "Error: Prediction file not found at $PREDICTION_FILE" >&2 + exit 1 + fi + ENTRY_COUNT=$(python3 -c "import json; print(len(json.load(open('$PREDICTION_FILE'))))") + echo "Prediction file contains $ENTRY_COUNT entries" + + if [[ "$ENTRY_COUNT" -eq 8400 ]]; then + SPLIT="full" + elif [[ "$ENTRY_COUNT" -eq 809 ]]; then + SPLIT="sub_10" + else + echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to full." >&2 + SPLIT="full" + fi + echo "split=$SPLIT" >> "$GITHUB_OUTPUT" + + - name: Show detected router + if: ${{ steps.detect.outputs.router != '' }} + run: | + set -euo pipefail + echo "Detected router submission: ${{ steps.detect.outputs.router }}" + echo "Detected split: ${{ steps.detect.outputs.split }}" + + - name: Prepare dataset + if: ${{ steps.detect.outputs.router != '' }} + working-directory: base + run: | + set -euo pipefail + # Prepare dataset from public repository + # Uses base repo's script (safe - not from PR) + echo "Preparing dataset..." + mkdir -p "${{ github.workspace }}/dataset" + uv run python scripts/process_datasets/prep_datasets.py + + - name: Copy PR prediction file to base workspace + if: ${{ steps.detect.outputs.router != '' }} + run: | + set -euo pipefail + ROUTER_NAME="${{ steps.detect.outputs.router }}" + # Copy prediction file from PR to base workspace + mkdir -p base/router_inference/predictions + cp "pr/router_inference/predictions/${ROUTER_NAME}.json" \ + "base/router_inference/predictions/${ROUTER_NAME}.json" + cp "pr/router_inference/predictions/${ROUTER_NAME}-robustness.json" \ + "base/router_inference/predictions/${ROUTER_NAME}-robustness.json" + echo "Copied prediction file from PR to base workspace" + + - name: Evaluate submission + if: ${{ steps.detect.outputs.router != '' }} + id: evaluate + working-directory: base + env: + ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset + run: | + set -euo pipefail; trap 'cat evaluation_output.txt' EXIT + # Uses base repo's evaluation script (safe - not from PR) + BASE_SHA="${{ steps.pr.outputs.base_sha }}" + uv run python automation/process_pr_submission.py \ + --pr "${{ steps.pr.outputs.number }}" \ + --router "${{ steps.detect.outputs.router }}" \ + --split "${{ steps.detect.outputs.split }}" \ + --base-ref "$BASE_SHA" > evaluation_output.txt 2>&1 + cat evaluation_output.txt + + - name: Post evaluation results as PR comment + if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }} + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + let comment = '## Router Evaluation Results\n\n'; + comment += `**Router:** \`${{ steps.detect.outputs.router }}\`\n`; + comment += `**Dataset Split:** \`${{ steps.detect.outputs.split }}\`\n\n`; + + // Read metrics from metrics.json file (required - no fallback) + const metricsPath = path.join('base', 'metrics.json'); + if (!fs.existsSync(metricsPath)) { + throw new Error(`metrics.json not found at ${metricsPath}. Evaluation must produce metrics.json file.`); + } + + const metrics = JSON.parse(fs.readFileSync(metricsPath, 'utf8')); + comment += '### RouterArena Metrics\n\n'; + comment += '| Metric | Value |\n'; + comment += '|--------|-------|\n'; + comment += `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`; + comment += `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`; + comment += `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`; + comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`; + comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`; + comment += `| **Number of Queries** | ${metrics.num_queries} |\n`; + const robustnessScore = metrics.robustness_score; + const robustnessCell = robustnessScore !== undefined ? robustnessScore.toFixed(4) : 'N/A'; + comment += `| **Robustness Score** | ${robustnessCell} |\n`; + + // Add optimality scores if available + if (metrics.optimality) { + comment += '\n### Optimality Metrics\n\n'; + comment += '| Metric | Value |\n'; + comment += '|--------|-------|\n'; + comment += `| **Opt.Sel** (Optimal Selection) | ${metrics.optimality.opt_sel.toFixed(4)} |\n`; + comment += `| **Opt.Cost** (Cost Efficiency) | ${metrics.optimality.opt_cost.toFixed(4)} |\n`; + comment += `| **Opt.Acc** (Accuracy vs Optimal) | ${metrics.optimality.opt_acc.toFixed(4)} |\n`; + } + + comment += '\n---\n'; + comment += '*Evaluation completed by RouterArena automated workflow*'; + + // Post comment to PR + await github.rest.issues.createComment({ + issue_number: context.payload.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: comment + }); + console.log('Successfully posted evaluation results as PR comment'); + + - name: React with success + if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }} + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'rocket' + }); + + - name: Complete PR check + if: ${{ always() && steps.checkrun.outputs.id != '' }} + uses: actions/github-script@v7 + env: + DETECTED_ROUTER: ${{ steps.detect.outputs.router }} + DETECT_OUTCOME: ${{ steps.detect.outcome }} + EVALUATE_OUTCOME: ${{ steps.evaluate.outcome }} + with: + script: | + let conclusion = 'success'; + let title = 'Evaluation completed'; + let summary = 'Router evaluation finished successfully.'; + + if (!process.env.DETECTED_ROUTER) { + conclusion = process.env.DETECT_OUTCOME === 'success' ? 'neutral' : 'failure'; + title = process.env.DETECT_OUTCOME === 'success' ? 'No router file detected' : 'Evaluation setup failed'; + summary = process.env.DETECT_OUTCOME === 'success' + ? 'No changed prediction file was detected for this PR, so evaluation was skipped.' + : 'Failed while detecting prediction files for this PR.'; + } else if (process.env.EVALUATE_OUTCOME !== 'success') { + conclusion = 'failure'; + title = 'Evaluation failed'; + summary = 'The evaluation step failed. Check this workflow run logs for details.'; + } + + await github.rest.checks.update({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: Number('${{ steps.checkrun.outputs.id }}'), + status: 'completed', + conclusion, + completed_at: new Date().toISOString(), + output: { title, summary } + });