From abbe3e5846bdef5ce59fc372a4ff93fb62dae786 Mon Sep 17 00:00:00 2001
From: yl231
Date: Tue, 27 Jan 2026 21:37:02 -0600
Subject: [PATCH 1/7] feat: Integrate GPQA dataset with proper answer extraction and option shuffling

Add support for GPQA (Graduate-Level Google-Proof Q&A) dataset integration
into the RouterArena evaluation pipeline.

Key changes:
- Add GPQA evaluation config (config/eval_config/zero-shot/GPQA.json)
- Create prepare_gpqa_data.py script to:
  * Load GPQA dataset from HuggingFace (Idavidrein/gpqa)
  * Extract correct answers from "Correct Answer" field (fixes empty answer bug)
  * Shuffle MCQ options deterministically to distribute answers across A-D
  * Generate formatted prompts and ground truth files
- Update evaluation pipeline (llm_evaluation/run.py, evaluate_models.py):
  * Add GPQA split handling in load_ground_truth_dataset()
  * Support GPQA dataset name detection
- Update inference pipeline (llm_inference/run.py, model_inference.py):
  * Add GPQA split support for router predictions
  * Handle GPQA-specific data loading
- Add OpenRouter router configuration
- Update model cost configurations for GPQA models
- Add universal model name mappings
---
 config/eval_config/zero-shot/GPQA.json       |  18 ++
 llm_evaluation/evaluate_models.py            |  68 +++--
 llm_evaluation/run.py                        |  73 ++++--
 llm_inference/model_inference.py             |   1 +
 llm_inference/run.py                         |  46 +++-
 model_cost/cost.json                         |   9 +
 .../config/openrouter-router.json            |   9 +
 router_inference/generate_prediction_file.py |   8 +-
 scripts/prepare_gpqa_data.py                 | 241 ++++++++++++++++++
 universal_model_names.py                     |   2 +
 10 files changed, 426 insertions(+), 49 deletions(-)
 create mode 100644 config/eval_config/zero-shot/GPQA.json
 create mode 100644 router_inference/config/openrouter-router.json
 create mode 100644 scripts/prepare_gpqa_data.py

diff --git a/config/eval_config/zero-shot/GPQA.json b/config/eval_config/zero-shot/GPQA.json
new file mode 100644
index 0000000..6f00598
--- /dev/null
+++ b/config/eval_config/zero-shot/GPQA.json
@@ -0,0 +1,18 @@
+{
+  "eval_params": {
+    "dataset": "GPQA",
+    "eval_metrics": [
+      "mcq_accuracy"
+    ],
+    "setting": "zero-shot",
+    "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences."
+ }, + "management": { + "sub_dir": { + "input_config": "input_config/", + "raw_results": "raw_results.json", + "result_vis": "result_vis.png", + "output_config": "output_config.json" + } + } +} \ No newline at end of file diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py index a7eede3..4f627e6 100644 --- a/llm_evaluation/evaluate_models.py +++ b/llm_evaluation/evaluate_models.py @@ -144,10 +144,15 @@ def load_dataset_configs(self): def load_cost_config(self): """Load cost configuration from model_cost/cost.json""" # Try multiple possible paths for cost file + # Get the directory of this file and construct paths relative to project root + current_file_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(current_file_dir) # Go up from llm_evaluation/ to project root + possible_paths = [ - "./model_cost/cost.json", - "../model_cost/cost.json", - "model_cost/cost.json", + os.path.join(project_root, "model_cost", "cost.json"), # From project root + "./model_cost/cost.json", # Current working directory + "../model_cost/cost.json", # Parent directory + "model_cost/cost.json", # Relative to current dir ] cost_file = None @@ -160,6 +165,7 @@ def load_cost_config(self): print( f"Warning: Could not find cost configuration file. Tried: {possible_paths}" ) + print(f"Current working directory: {os.getcwd()}") self.cost_config = {} return @@ -177,7 +183,11 @@ def calculate_inference_cost( self, model_name: str, token_usage: Dict[str, int] ) -> float: """Calculate inference cost based on token usage and model pricing.""" - if not token_usage or not self.cost_config: + if not token_usage: + return 0.0 + + if not self.cost_config: + print("Warning: Cost config is empty!") return 0.0 # Remove _batch suffix if present for cost lookup @@ -185,27 +195,33 @@ def calculate_inference_cost( if model_name.endswith("_batch"): cost_lookup_name = model_name[:-6] # Remove '_batch' suffix - # Normalize model name to match cost config - if model_name_manager: - normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + # Try exact match first (cost config uses original model names) + if cost_lookup_name in self.cost_config: + cost_info = self.cost_config[cost_lookup_name] else: - normalized_name = cost_lookup_name + # Normalize model name to match cost config + if model_name_manager: + normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + else: + normalized_name = cost_lookup_name - # Try to find exact match first - if normalized_name in self.cost_config: - cost_info = self.cost_config[normalized_name] - else: - # Try to find partial matches - cost_info = None - for config_name in self.cost_config.keys(): - if config_name in normalized_name or normalized_name in config_name: - cost_info = self.cost_config[config_name] - break + # Try to find exact match with normalized name + if normalized_name in self.cost_config: + cost_info = self.cost_config[normalized_name] + else: + # Try to find partial matches + cost_info = None + for config_name in self.cost_config.keys(): + if config_name in normalized_name or normalized_name in config_name: + cost_info = self.cost_config[config_name] + break if not cost_info: print( - f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name}, normalized: {normalized_name})" + f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})" ) + if len(self.cost_config) > 0: + print(f"Available cost config keys (first 10): 
{list(self.cost_config.keys())[:10]}") return 0.0 # Calculate cost @@ -239,6 +255,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str: "FinQA": "FinQA", "GeoBench": "GeoBench", "GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name + "GPQA": "GPQA", "GSM8K": "GSM8K", "LiveCodeBench": "LiveCodeBench", "MATH": "MATH", @@ -468,7 +485,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An except Exception as e: print(f"Error loading LiveCodeBench dataset: {e}") return None - + elif dataset_name == "GPQA": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if os.path.exists(gpqa_gt_path): + try: + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + for item in gpqa_data: + if item.get("global_index") == global_index: + return item["answer"] + except Exception as e: + print(f"Error loading GPQA ground truth: {e}") + return None # For other datasets, find the entry with matching global_index if self.all_data is None: return None diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 3e3365b..7ccaa8a 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -100,17 +100,23 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044): return S -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split (same logic as llm_inference/run.py) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -136,15 +142,21 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: return json.load(f) -def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None: +def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str, split: str | None = None) -> None: """ Save predictions back to file. Args: predictions: List of prediction dictionaries router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist os.makedirs(os.path.dirname(prediction_path), exist_ok=True) @@ -170,7 +182,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]: """ from datasets import load_from_disk import pandas as pd - + ground_truth_map = {} + + # Handle GPQA split + if split == "gpqa": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if not os.path.exists(gpqa_gt_path): + raise FileNotFoundError( + f"GPQA ground truth not found at {gpqa_gt_path}. " + f"Please create it using the preparation script." 
+ ) + logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...") + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + + for item in gpqa_data: + global_index = item["global_index"] + ground_truth_map[global_index] = { + "question": item.get("question", ""), + "global_index": global_index, + "context": item.get("context", ""), + "answer": item["answer"], + "options": item.get("options", []), + "metadata": item.get("metadata", {}), + } + + logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples") + return ground_truth_map if split not in ["sub_10", "full"]: raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'") @@ -354,9 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost + # Use original model name for cost lookup since cost config uses original names token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - universal_model_name, token_usage + model_name, token_usage # Use original model_name instead of universal_model_name ) # Update the prediction with evaluation results @@ -396,7 +435,7 @@ def process_router_predictions( logger.info(f"Using {num_workers} worker threads for parallel processing") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=split) # Separate regular and optimality entries regular_predictions = [p for p in predictions if not p.get("for_optimality", False)] @@ -439,11 +478,13 @@ def process_router_predictions( # Note: This loop runs in the main thread before threading starts, so no lock needed tasks = [] for i, prediction in enumerate(predictions): - # Check if already evaluated (has accuracy and cost) + # Check if already evaluated (has accuracy and cost > 0) # Skip if already evaluated AND force is False + # Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated) if not force and ( prediction.get("accuracy") is not None and prediction.get("cost") is not None + and prediction.get("cost", 0) > 0 # Cost must be > 0 to be considered evaluated ): already_evaluated_count += 1 evaluated_count += 1 @@ -494,7 +535,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: with save_lock: # Save the entire predictions list # This is safe because each thread modifies a different index - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) elapsed_time = ( datetime.datetime.now() - start_time @@ -542,7 +583,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: # Final save with save_lock: - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) # Final summary end_time = datetime.datetime.now() @@ -901,7 +942,7 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non target_path, ) - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=None) # Load base file for robustness try: robustness_predictions = load_predictions_from_path(target_path) @@ -1096,10 +1137,10 @@ def main(): "split", nargs="?", type=str, - choices=["sub_10", "full", "robustness"], + choices=["sub_10", "full", "robustness", "gpqa"], help=( "Dataset split to use for evaluation ('sub_10' for testing with answers, " - "'full' for submission, 'robustness' to compute robustness score only)." 
+ "'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)." ), ) parser.add_argument( @@ -1161,7 +1202,7 @@ def main(): # Run evaluation try: # If save_interval is 0, only save at the end - predictions = load_predictions_file(args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) save_interval = ( args.save_interval if args.save_interval > 0 else len(predictions) + 1 ) @@ -1177,8 +1218,8 @@ def main(): logger.info("\nInterrupted by user. Saving partial results...") try: # Try to save current state if possible - predictions = load_predictions_file(args.router_name) - save_predictions_file(predictions, args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) + save_predictions_file(predictions, args.router_name, split=args.split) logger.info("Partial results saved successfully.") except Exception as e: logger.warning(f"Could not save partial results: {e}") diff --git a/llm_inference/model_inference.py b/llm_inference/model_inference.py index 3a50ba6..d961e12 100644 --- a/llm_inference/model_inference.py +++ b/llm_inference/model_inference.py @@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str: "qwen/qwen3-vl-235b-a22b-instruct": "openrouter", "qwen/qwen3-coder": "openrouter", "x-ai/grok-code-fast-1": "openrouter", + "xiaomi/mimo-v2-flash": "openrouter", "xiaomi/mimo-v2-flash:free": "openrouter", "openai/gpt-oss-120b": "openrouter", "qwen/qwen3-235b-a22b-2507": "openrouter", diff --git a/llm_inference/run.py b/llm_inference/run.py index 6d72ee3..d455c64 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -38,17 +38,22 @@ logger = logging.getLogger(__name__) -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router - + split: Dataset split ("sub_10", "full", "robustness", "gpqa") Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -105,7 +110,9 @@ def load_cached_results_for_predictions( # Load cache for each model for universal_model_name, model_predictions in model_to_predictions.items(): - cached_file = os.path.join(cached_results_dir, f"{universal_model_name}.jsonl") + # Sanitize model name for filename (replace / with _) + model_filename = universal_model_name.replace("/", "_") + cached_file = os.path.join(cached_results_dir, f"{model_filename}.jsonl") if not os.path.exists(cached_file): continue @@ -162,7 +169,7 @@ def load_cached_results_for_predictions( def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False + predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, split: str | None = None ) -> None: """ Save predictions back to file. @@ -171,8 +178,14 @@ def save_predictions_file( predictions: List of prediction dictionaries router_name: Name of the router create_backup: Whether to create a backup before saving (only needed once) + split: Dataset split (optional). Used to determine prediction file name. 
""" - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" with open(prediction_path, "w", encoding="utf-8") as f: json.dump(predictions, f, ensure_ascii=False, indent=2) @@ -185,6 +198,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", + split: str | None = None, ) -> None: """ Process router predictions using parallel inference system. @@ -202,11 +216,11 @@ def process_router_predictions( logger.info(f"Target runs per query: {num_runs}") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split) logger.info(f"Loaded {len(predictions)} predictions") # Create backup of original predictions file - save_predictions_file(predictions, router_name, create_backup=True) + save_predictions_file(predictions, router_name, create_backup=True, split=split) # Filter out entries without required fields and convert to universal model names valid_predictions = [] @@ -297,7 +311,7 @@ def process_router_predictions( updated_count += 1 # Save updated predictions - save_predictions_file(predictions, router_name, create_backup=False) + save_predictions_file(predictions, router_name, create_backup=False, split=split) # Final summary end_time = datetime.datetime.now() @@ -319,8 +333,13 @@ def process_router_predictions( logger.info(f" Successful: {stats['successful']}") logger.info(f" Failed: {stats['failed']}") + # Construct filename for log message + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name logger.info( - f"\nPredictions saved to: ./router_inference/predictions/{router_name}.json" + f"\nPredictions saved to: ./router_inference/predictions/{filename}.json" ) logger.info("=" * 80) @@ -347,6 +366,12 @@ def main(): type=str, help="Name of the router (corresponds to ./router_inference/predictions/.json)", ) + parser.add_argument( + "--split", + type=str, + choices=["sub_10", "full", "robustness", "gpqa"], + help="Dataset split (optional). Used to determine prediction file name." + ) parser.add_argument( "--num-workers", type=int, @@ -394,6 +419,7 @@ def main(): num_workers=args.num_workers, num_runs=args.num_runs, cached_results_dir=args.cached_results_dir, + split=args.split, ) except KeyboardInterrupt: logger.info("\nInterrupted by user. 
Partial results have been saved.") diff --git a/model_cost/cost.json b/model_cost/cost.json index 4e8c38d..0d76794 100644 --- a/model_cost/cost.json +++ b/model_cost/cost.json @@ -178,5 +178,14 @@ "glm-4-plus": { "input_token_price_per_million": 0.7, "output_token_price_per_million": 0.7 + }, + "xiaomi/mimo-v2-flash": { + "input_token_price_per_million": 0.09, + "output_token_price_per_million": 0.29 + }, + + "mistralai/devstral-2512:free": { + "input_token_price_per_million": 0.0001, + "output_token_price_per_million": 0.0001 } } diff --git a/router_inference/config/openrouter-router.json b/router_inference/config/openrouter-router.json new file mode 100644 index 0000000..9d7a7b1 --- /dev/null +++ b/router_inference/config/openrouter-router.json @@ -0,0 +1,9 @@ +{ + "pipeline_params": { + "router_name": "openrouter-router", + "models": [ + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free" + ] + } +} diff --git a/router_inference/generate_prediction_file.py b/router_inference/generate_prediction_file.py index 694ce3f..1a190b1 100644 --- a/router_inference/generate_prediction_file.py +++ b/router_inference/generate_prediction_file.py @@ -30,6 +30,7 @@ "sub_10": "./dataset/router_data_10.json", "full": "./dataset/router_data.json", "robustness": "./dataset/router_robustness.json", + "gpqa": "./dataset/gpqa_data.json", } @@ -38,8 +39,7 @@ def load_dataset(split: str) -> List[Dict[str, Any]]: Load dataset file. Args: - split: One of the supported dataset splits (sub_10, full, robustness) - + split: One of the supported dataset splits (sub_10, full, robustness, gpqa) Returns: List of dataset entries """ @@ -182,6 +182,8 @@ def save_predictions( filename = router_name if split == "robustness": filename = f"{router_name}-robustness" + elif split == "gpqa": + filename = f"{router_name}-gpqa" prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist @@ -207,7 +209,7 @@ def main(): "split", type=str, choices=list(DATASET_PATHS.keys()), - help="Dataset split: 'sub_10', 'full', or 'robustness'", + help="Dataset split: 'sub_10', 'full', 'robustness', or 'gpqa'", ) parser.add_argument( "--no-optimality", diff --git a/scripts/prepare_gpqa_data.py b/scripts/prepare_gpqa_data.py new file mode 100644 index 0000000..a09de82 --- /dev/null +++ b/scripts/prepare_gpqa_data.py @@ -0,0 +1,241 @@ +""" +Prepare GPQA dataset for RouterArena pipeline. + +This script: +1. Loads GPQA dataset from HuggingFace +2. Formats prompts according to RouterArena requirements +3. Creates dataset/gpqa_data.json (for router inference) +4. Creates dataset/gpqa_ground_truth.json (for evaluation) + +How to run: + uv run python scripts/prepare_gpqa_data.py + +Prerequisites: + - Install packages: uv sync (or pip install datasets) + - (Optional) If authentication needed: huggingface-cli login +""" + +from datasets import load_dataset, DatasetDict, Dataset +import json +import os +import random + +# Ensure dataset directory exists +os.makedirs("dataset", exist_ok=True) + +# Load the dataset. If authentication is needed, ensure you are logged in. 
+print("Loading GPQA dataset from HuggingFace...") +raw_gpqa = load_dataset("Idavidrein/gpqa", "gpqa_diamond") + +# Extract the actual dataset from DatasetDict if needed +if isinstance(raw_gpqa, DatasetDict): + split_names = list(raw_gpqa.keys()) + print(f"Found splits: {split_names}") + # Get the first split (usually 'train') + split_key = 'train' if 'train' in raw_gpqa else split_names[0] + print(f"Using split: {split_key}") + gpqa_dataset = raw_gpqa[split_key] +else: + # If it's already a single Dataset, use it directly + gpqa_dataset = raw_gpqa + +print(f"Loaded {len(gpqa_dataset)} GPQA entries") +print(f"Dataset features: {gpqa_dataset.features}") + +# Inspect first entry to understand structure +if len(gpqa_dataset) > 0: + print("\nFirst entry sample:") + print(gpqa_dataset[0]) + print() + +# Define prompt template (must match eval config!) +PROMPT_TEMPLATE = """Please read the following multiple-choice questions and provide the most likely correct answer based on the options given. + +Context: {context} + +Question: {question} + +Options: +{options} + +Provide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.""" + +# Step 2.2: Create dataset file with formatted prompts +print("\n[Step 2.2] Formatting prompts and creating dataset file...") +formatted_data = [] +# Store shuffled options and answer letters for use in ground truth generation +shuffled_data = {} # {index: (shuffled_options, answer_letter)} + +for i, item in enumerate(gpqa_dataset): + # Extract fields (adjust field names based on actual dataset structure) + question = item.get("question", item.get("Question", "")) + # GPQA dataset has separate fields for correct and incorrect answers + # Construct options list from these fields + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + + # Build options list with all answers + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + # If we have a pre-formatted options list, use that instead + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + # Still need to find correct answer in the list + if not correct_answer: + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + + # Shuffle options using deterministic seed based on index for reproducibility + # This ensures the same question always gets the same shuffle + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + # Find the index of the correct answer in the shuffled list + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + # Try case-insensitive matching + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + # Convert index to letter (A, B, C, D, etc.) 
+ if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" # Default fallback + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + # Store shuffled data for ground truth generation + shuffled_data[i] = (shuffled_options, answer_letter) + + context = item.get("context", item.get("Context", "")) + + # Format shuffled options as "A. option1\nB. option2\n..." + options_str = "" + for j, opt in enumerate(shuffled_options): + letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[j] + options_str += f"{letter}. {opt}\n" + + # Build the complete prompt + prompt = PROMPT_TEMPLATE.format( + context=context or "None", + question=question, + options=options_str.strip() + ) + + # Create the dataset entry + formatted_data.append({ + "prompt_formatted": prompt, + "global index": f"GPQA_{i}" # CRITICAL: Prefix must match dataset name + }) + +# Save dataset file +dataset_path = "dataset/gpqa_data.json" +with open(dataset_path, "w", encoding="utf-8") as f: + json.dump(formatted_data, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(formatted_data)} GPQA entries in {dataset_path}") + +# Step 2.3: Create ground truth file +print("\n[Step 2.3] Creating ground truth file...") +ground_truth = [] +for i, item in enumerate(gpqa_dataset): + # Extract fields (same as above) + question = item.get("question", item.get("Question", "")) + context = item.get("context", item.get("Context", "")) + + # Use the same shuffled options and answer letter from step 2.2 + if i in shuffled_data: + shuffled_options, answer_letter = shuffled_data[i] + else: + # Fallback: regenerate shuffle if somehow missing (shouldn't happen) + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + ground_truth.append({ + "global_index": f"GPQA_{i}", # MUST match dataset file + "question": question, + "answer": answer_letter, # Store as letter (A, B, C, D) + "options": shuffled_options, # Use shuffled options to match the prompt + "context": context or "", + "metadata": item.get("metadata", {}) + }) + +# Save ground truth file +gt_path = "dataset/gpqa_ground_truth.json" +with open(gt_path, "w", encoding="utf-8") as f: + json.dump(ground_truth, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(ground_truth)} GPQA ground truth entries in {gt_path}") + +# Step 2.4: Verify files +print("\n[Step 2.4] Verifying files...") +try: + # Check dataset file structure + with open(dataset_path, "r", encoding="utf-8") as f: + data = json.load(f) + print(f"✓ Dataset 
file: {len(data)} entries") + print(f" First entry keys: {list(data[0].keys())}") + print(f" First global_index: {data[0].get('global index')}") + + # Check ground truth file structure + with open(gt_path, "r", encoding="utf-8") as f: + gt = json.load(f) + print(f"✓ Ground truth file: {len(gt)} entries") + print(f" First entry keys: {list(gt[0].keys())}") + print(f" First answer: {gt[0].get('answer')}") + + # Verify matching indices + data_indices = {e.get("global index") for e in data} + gt_indices = {e.get("global_index") for e in gt} + if data_indices == gt_indices: + print(f"✓ All {len(data_indices)} indices match between dataset and ground truth") + else: + missing_in_data = gt_indices - data_indices + missing_in_gt = data_indices - gt_indices + if missing_in_data: + print(f"⚠ Warning: {len(missing_in_data)} indices in ground truth not in dataset") + if missing_in_gt: + print(f"⚠ Warning: {len(missing_in_gt)} indices in dataset not in ground truth") + + print("\n✓ All files created and verified successfully!") + print(f"\nNext steps:") + print(f"1. Review dataset/gpqa_data.json to ensure prompts are formatted correctly") + print(f"2. Review dataset/gpqa_ground_truth.json to ensure answers are correct") + print(f"3. Proceed to Step 3: Router Inference Setup") + +except Exception as e: + print(f"✗ Verification failed: {e}") + raise \ No newline at end of file diff --git a/universal_model_names.py b/universal_model_names.py index cc1016c..7fbd935 100644 --- a/universal_model_names.py +++ b/universal_model_names.py @@ -51,6 +51,8 @@ "meta-llama/llama-3-8b-instruct", "anthropic/claude-3.5-sonnet", "Qwen/QwQ-32B", + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free", # Replicate "meta/codellama-34b-instruct", # AWS Bedrock From 5a06c3101fd11251b0214f958d723e5e14657a95 Mon Sep 17 00:00:00 2001 From: Michael Yu Date: Tue, 27 Jan 2026 21:37:02 -0600 Subject: [PATCH 2/7] feat: Integrate GPQA dataset with proper answer extraction and option shuffling Add support for GPQA (Graduate-Level Google-Proof Q&A) dataset integration into the RouterArena evaluation pipeline. 
Key changes: - Add GPQA evaluation config (config/eval_config/zero-shot/GPQA.json) - Create prepare_gpqa_data.py script to: * Load GPQA dataset from HuggingFace (Idavidrein/gpqa) * Extract correct answers from \"Correct Answer\" field (fixes empty answer bug) * Shuffle MCQ options deterministically to distribute answers across A-D * Generate formatted prompts and ground truth files - Update evaluation pipeline (llm_evaluation/run.py, evaluate_models.py): * Add GPQA split handling in load_ground_truth_dataset() * Support GPQA dataset name detection - Update inference pipeline (llm_inference/run.py, model_inference.py): * Add GPQA split support for router predictions * Handle GPQA-specific data loading - Add OpenRouter router configuration - Update model cost configurations for GPQA models - Add universal model name mappings --- config/eval_config/zero-shot/GPQA.json | 18 ++ llm_evaluation/evaluate_models.py | 68 +++-- llm_evaluation/run.py | 73 ++++-- llm_inference/model_inference.py | 1 + llm_inference/run.py | 46 +++- model_cost/cost.json | 9 + .../config/openrouter-router.json | 9 + router_inference/generate_prediction_file.py | 8 +- scripts/prepare_gpqa_data.py | 241 ++++++++++++++++++ universal_model_names.py | 2 + 10 files changed, 426 insertions(+), 49 deletions(-) create mode 100644 config/eval_config/zero-shot/GPQA.json create mode 100644 router_inference/config/openrouter-router.json create mode 100644 scripts/prepare_gpqa_data.py diff --git a/config/eval_config/zero-shot/GPQA.json b/config/eval_config/zero-shot/GPQA.json new file mode 100644 index 0000000..6f00598 --- /dev/null +++ b/config/eval_config/zero-shot/GPQA.json @@ -0,0 +1,18 @@ +{ + "eval_params": { + "dataset": "GPQA", + "eval_metrics": [ + "mcq_accuracy" + ], + "setting": "zero-shot", + "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences." + }, + "management": { + "sub_dir": { + "input_config": "input_config/", + "raw_results": "raw_results.json", + "result_vis": "result_vis.png", + "output_config": "output_config.json" + } + } +} \ No newline at end of file diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py index a7eede3..4f627e6 100644 --- a/llm_evaluation/evaluate_models.py +++ b/llm_evaluation/evaluate_models.py @@ -144,10 +144,15 @@ def load_dataset_configs(self): def load_cost_config(self): """Load cost configuration from model_cost/cost.json""" # Try multiple possible paths for cost file + # Get the directory of this file and construct paths relative to project root + current_file_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(current_file_dir) # Go up from llm_evaluation/ to project root + possible_paths = [ - "./model_cost/cost.json", - "../model_cost/cost.json", - "model_cost/cost.json", + os.path.join(project_root, "model_cost", "cost.json"), # From project root + "./model_cost/cost.json", # Current working directory + "../model_cost/cost.json", # Parent directory + "model_cost/cost.json", # Relative to current dir ] cost_file = None @@ -160,6 +165,7 @@ def load_cost_config(self): print( f"Warning: Could not find cost configuration file. 
Tried: {possible_paths}" ) + print(f"Current working directory: {os.getcwd()}") self.cost_config = {} return @@ -177,7 +183,11 @@ def calculate_inference_cost( self, model_name: str, token_usage: Dict[str, int] ) -> float: """Calculate inference cost based on token usage and model pricing.""" - if not token_usage or not self.cost_config: + if not token_usage: + return 0.0 + + if not self.cost_config: + print("Warning: Cost config is empty!") return 0.0 # Remove _batch suffix if present for cost lookup @@ -185,27 +195,33 @@ def calculate_inference_cost( if model_name.endswith("_batch"): cost_lookup_name = model_name[:-6] # Remove '_batch' suffix - # Normalize model name to match cost config - if model_name_manager: - normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + # Try exact match first (cost config uses original model names) + if cost_lookup_name in self.cost_config: + cost_info = self.cost_config[cost_lookup_name] else: - normalized_name = cost_lookup_name + # Normalize model name to match cost config + if model_name_manager: + normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + else: + normalized_name = cost_lookup_name - # Try to find exact match first - if normalized_name in self.cost_config: - cost_info = self.cost_config[normalized_name] - else: - # Try to find partial matches - cost_info = None - for config_name in self.cost_config.keys(): - if config_name in normalized_name or normalized_name in config_name: - cost_info = self.cost_config[config_name] - break + # Try to find exact match with normalized name + if normalized_name in self.cost_config: + cost_info = self.cost_config[normalized_name] + else: + # Try to find partial matches + cost_info = None + for config_name in self.cost_config.keys(): + if config_name in normalized_name or normalized_name in config_name: + cost_info = self.cost_config[config_name] + break if not cost_info: print( - f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name}, normalized: {normalized_name})" + f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})" ) + if len(self.cost_config) > 0: + print(f"Available cost config keys (first 10): {list(self.cost_config.keys())[:10]}") return 0.0 # Calculate cost @@ -239,6 +255,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str: "FinQA": "FinQA", "GeoBench": "GeoBench", "GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name + "GPQA": "GPQA", "GSM8K": "GSM8K", "LiveCodeBench": "LiveCodeBench", "MATH": "MATH", @@ -468,7 +485,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An except Exception as e: print(f"Error loading LiveCodeBench dataset: {e}") return None - + elif dataset_name == "GPQA": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if os.path.exists(gpqa_gt_path): + try: + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + for item in gpqa_data: + if item.get("global_index") == global_index: + return item["answer"] + except Exception as e: + print(f"Error loading GPQA ground truth: {e}") + return None # For other datasets, find the entry with matching global_index if self.all_data is None: return None diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 3e3365b..7ccaa8a 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -100,17 +100,23 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044): return S -def 
load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split (same logic as llm_inference/run.py) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -136,15 +142,21 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: return json.load(f) -def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None: +def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str, split: str | None = None) -> None: """ Save predictions back to file. Args: predictions: List of prediction dictionaries router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist os.makedirs(os.path.dirname(prediction_path), exist_ok=True) @@ -170,7 +182,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]: """ from datasets import load_from_disk import pandas as pd - + ground_truth_map = {} + + # Handle GPQA split + if split == "gpqa": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if not os.path.exists(gpqa_gt_path): + raise FileNotFoundError( + f"GPQA ground truth not found at {gpqa_gt_path}. " + f"Please create it using the preparation script." + ) + logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...") + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + + for item in gpqa_data: + global_index = item["global_index"] + ground_truth_map[global_index] = { + "question": item.get("question", ""), + "global_index": global_index, + "context": item.get("context", ""), + "answer": item["answer"], + "options": item.get("options", []), + "metadata": item.get("metadata", {}), + } + + logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples") + return ground_truth_map if split not in ["sub_10", "full"]: raise ValueError(f"Invalid split: {split}. 
Must be 'sub_10' or 'full'") @@ -354,9 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost + # Use original model name for cost lookup since cost config uses original names token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - universal_model_name, token_usage + model_name, token_usage # Use original model_name instead of universal_model_name ) # Update the prediction with evaluation results @@ -396,7 +435,7 @@ def process_router_predictions( logger.info(f"Using {num_workers} worker threads for parallel processing") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=split) # Separate regular and optimality entries regular_predictions = [p for p in predictions if not p.get("for_optimality", False)] @@ -439,11 +478,13 @@ def process_router_predictions( # Note: This loop runs in the main thread before threading starts, so no lock needed tasks = [] for i, prediction in enumerate(predictions): - # Check if already evaluated (has accuracy and cost) + # Check if already evaluated (has accuracy and cost > 0) # Skip if already evaluated AND force is False + # Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated) if not force and ( prediction.get("accuracy") is not None and prediction.get("cost") is not None + and prediction.get("cost", 0) > 0 # Cost must be > 0 to be considered evaluated ): already_evaluated_count += 1 evaluated_count += 1 @@ -494,7 +535,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: with save_lock: # Save the entire predictions list # This is safe because each thread modifies a different index - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) elapsed_time = ( datetime.datetime.now() - start_time @@ -542,7 +583,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: # Final save with save_lock: - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) # Final summary end_time = datetime.datetime.now() @@ -901,7 +942,7 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non target_path, ) - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=None) # Load base file for robustness try: robustness_predictions = load_predictions_from_path(target_path) @@ -1096,10 +1137,10 @@ def main(): "split", nargs="?", type=str, - choices=["sub_10", "full", "robustness"], + choices=["sub_10", "full", "robustness", "gpqa"], help=( "Dataset split to use for evaluation ('sub_10' for testing with answers, " - "'full' for submission, 'robustness' to compute robustness score only)." + "'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)." ), ) parser.add_argument( @@ -1161,7 +1202,7 @@ def main(): # Run evaluation try: # If save_interval is 0, only save at the end - predictions = load_predictions_file(args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) save_interval = ( args.save_interval if args.save_interval > 0 else len(predictions) + 1 ) @@ -1177,8 +1218,8 @@ def main(): logger.info("\nInterrupted by user. 
Saving partial results...") try: # Try to save current state if possible - predictions = load_predictions_file(args.router_name) - save_predictions_file(predictions, args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) + save_predictions_file(predictions, args.router_name, split=args.split) logger.info("Partial results saved successfully.") except Exception as e: logger.warning(f"Could not save partial results: {e}") diff --git a/llm_inference/model_inference.py b/llm_inference/model_inference.py index 3a50ba6..d961e12 100644 --- a/llm_inference/model_inference.py +++ b/llm_inference/model_inference.py @@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str: "qwen/qwen3-vl-235b-a22b-instruct": "openrouter", "qwen/qwen3-coder": "openrouter", "x-ai/grok-code-fast-1": "openrouter", + "xiaomi/mimo-v2-flash": "openrouter", "xiaomi/mimo-v2-flash:free": "openrouter", "openai/gpt-oss-120b": "openrouter", "qwen/qwen3-235b-a22b-2507": "openrouter", diff --git a/llm_inference/run.py b/llm_inference/run.py index 6d72ee3..d455c64 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -38,17 +38,22 @@ logger = logging.getLogger(__name__) -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router - + split: Dataset split ("sub_10", "full", "robustness", "gpqa") Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -105,7 +110,9 @@ def load_cached_results_for_predictions( # Load cache for each model for universal_model_name, model_predictions in model_to_predictions.items(): - cached_file = os.path.join(cached_results_dir, f"{universal_model_name}.jsonl") + # Sanitize model name for filename (replace / with _) + model_filename = universal_model_name.replace("/", "_") + cached_file = os.path.join(cached_results_dir, f"{model_filename}.jsonl") if not os.path.exists(cached_file): continue @@ -162,7 +169,7 @@ def load_cached_results_for_predictions( def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False + predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, split: str | None = None ) -> None: """ Save predictions back to file. @@ -171,8 +178,14 @@ def save_predictions_file( predictions: List of prediction dictionaries router_name: Name of the router create_backup: Whether to create a backup before saving (only needed once) + split: Dataset split (optional). Used to determine prediction file name. 
""" - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" with open(prediction_path, "w", encoding="utf-8") as f: json.dump(predictions, f, ensure_ascii=False, indent=2) @@ -185,6 +198,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", + split: str | None = None, ) -> None: """ Process router predictions using parallel inference system. @@ -202,11 +216,11 @@ def process_router_predictions( logger.info(f"Target runs per query: {num_runs}") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split) logger.info(f"Loaded {len(predictions)} predictions") # Create backup of original predictions file - save_predictions_file(predictions, router_name, create_backup=True) + save_predictions_file(predictions, router_name, create_backup=True, split=split) # Filter out entries without required fields and convert to universal model names valid_predictions = [] @@ -297,7 +311,7 @@ def process_router_predictions( updated_count += 1 # Save updated predictions - save_predictions_file(predictions, router_name, create_backup=False) + save_predictions_file(predictions, router_name, create_backup=False, split=split) # Final summary end_time = datetime.datetime.now() @@ -319,8 +333,13 @@ def process_router_predictions( logger.info(f" Successful: {stats['successful']}") logger.info(f" Failed: {stats['failed']}") + # Construct filename for log message + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name logger.info( - f"\nPredictions saved to: ./router_inference/predictions/{router_name}.json" + f"\nPredictions saved to: ./router_inference/predictions/{filename}.json" ) logger.info("=" * 80) @@ -347,6 +366,12 @@ def main(): type=str, help="Name of the router (corresponds to ./router_inference/predictions/.json)", ) + parser.add_argument( + "--split", + type=str, + choices=["sub_10", "full", "robustness", "gpqa"], + help="Dataset split (optional). Used to determine prediction file name." + ) parser.add_argument( "--num-workers", type=int, @@ -394,6 +419,7 @@ def main(): num_workers=args.num_workers, num_runs=args.num_runs, cached_results_dir=args.cached_results_dir, + split=args.split, ) except KeyboardInterrupt: logger.info("\nInterrupted by user. 
Partial results have been saved.") diff --git a/model_cost/cost.json b/model_cost/cost.json index 4e8c38d..0d76794 100644 --- a/model_cost/cost.json +++ b/model_cost/cost.json @@ -178,5 +178,14 @@ "glm-4-plus": { "input_token_price_per_million": 0.7, "output_token_price_per_million": 0.7 + }, + "xiaomi/mimo-v2-flash": { + "input_token_price_per_million": 0.09, + "output_token_price_per_million": 0.29 + }, + + "mistralai/devstral-2512:free": { + "input_token_price_per_million": 0.0001, + "output_token_price_per_million": 0.0001 } } diff --git a/router_inference/config/openrouter-router.json b/router_inference/config/openrouter-router.json new file mode 100644 index 0000000..9d7a7b1 --- /dev/null +++ b/router_inference/config/openrouter-router.json @@ -0,0 +1,9 @@ +{ + "pipeline_params": { + "router_name": "openrouter-router", + "models": [ + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free" + ] + } +} diff --git a/router_inference/generate_prediction_file.py b/router_inference/generate_prediction_file.py index 694ce3f..1a190b1 100644 --- a/router_inference/generate_prediction_file.py +++ b/router_inference/generate_prediction_file.py @@ -30,6 +30,7 @@ "sub_10": "./dataset/router_data_10.json", "full": "./dataset/router_data.json", "robustness": "./dataset/router_robustness.json", + "gpqa": "./dataset/gpqa_data.json", } @@ -38,8 +39,7 @@ def load_dataset(split: str) -> List[Dict[str, Any]]: Load dataset file. Args: - split: One of the supported dataset splits (sub_10, full, robustness) - + split: One of the supported dataset splits (sub_10, full, robustness, gpqa) Returns: List of dataset entries """ @@ -182,6 +182,8 @@ def save_predictions( filename = router_name if split == "robustness": filename = f"{router_name}-robustness" + elif split == "gpqa": + filename = f"{router_name}-gpqa" prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist @@ -207,7 +209,7 @@ def main(): "split", type=str, choices=list(DATASET_PATHS.keys()), - help="Dataset split: 'sub_10', 'full', or 'robustness'", + help="Dataset split: 'sub_10', 'full', 'robustness', or 'gpqa'", ) parser.add_argument( "--no-optimality", diff --git a/scripts/prepare_gpqa_data.py b/scripts/prepare_gpqa_data.py new file mode 100644 index 0000000..a09de82 --- /dev/null +++ b/scripts/prepare_gpqa_data.py @@ -0,0 +1,241 @@ +""" +Prepare GPQA dataset for RouterArena pipeline. + +This script: +1. Loads GPQA dataset from HuggingFace +2. Formats prompts according to RouterArena requirements +3. Creates dataset/gpqa_data.json (for router inference) +4. Creates dataset/gpqa_ground_truth.json (for evaluation) + +How to run: + uv run python scripts/prepare_gpqa_data.py + +Prerequisites: + - Install packages: uv sync (or pip install datasets) + - (Optional) If authentication needed: huggingface-cli login +""" + +from datasets import load_dataset, DatasetDict, Dataset +import json +import os +import random + +# Ensure dataset directory exists +os.makedirs("dataset", exist_ok=True) + +# Load the dataset. If authentication is needed, ensure you are logged in. 
+print("Loading GPQA dataset from HuggingFace...") +raw_gpqa = load_dataset("Idavidrein/gpqa", "gpqa_diamond") + +# Extract the actual dataset from DatasetDict if needed +if isinstance(raw_gpqa, DatasetDict): + split_names = list(raw_gpqa.keys()) + print(f"Found splits: {split_names}") + # Get the first split (usually 'train') + split_key = 'train' if 'train' in raw_gpqa else split_names[0] + print(f"Using split: {split_key}") + gpqa_dataset = raw_gpqa[split_key] +else: + # If it's already a single Dataset, use it directly + gpqa_dataset = raw_gpqa + +print(f"Loaded {len(gpqa_dataset)} GPQA entries") +print(f"Dataset features: {gpqa_dataset.features}") + +# Inspect first entry to understand structure +if len(gpqa_dataset) > 0: + print("\nFirst entry sample:") + print(gpqa_dataset[0]) + print() + +# Define prompt template (must match eval config!) +PROMPT_TEMPLATE = """Please read the following multiple-choice questions and provide the most likely correct answer based on the options given. + +Context: {context} + +Question: {question} + +Options: +{options} + +Provide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.""" + +# Step 2.2: Create dataset file with formatted prompts +print("\n[Step 2.2] Formatting prompts and creating dataset file...") +formatted_data = [] +# Store shuffled options and answer letters for use in ground truth generation +shuffled_data = {} # {index: (shuffled_options, answer_letter)} + +for i, item in enumerate(gpqa_dataset): + # Extract fields (adjust field names based on actual dataset structure) + question = item.get("question", item.get("Question", "")) + # GPQA dataset has separate fields for correct and incorrect answers + # Construct options list from these fields + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + + # Build options list with all answers + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + # If we have a pre-formatted options list, use that instead + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + # Still need to find correct answer in the list + if not correct_answer: + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + + # Shuffle options using deterministic seed based on index for reproducibility + # This ensures the same question always gets the same shuffle + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + # Find the index of the correct answer in the shuffled list + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + # Try case-insensitive matching + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + # Convert index to letter (A, B, C, D, etc.) 
+ if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" # Default fallback + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + # Store shuffled data for ground truth generation + shuffled_data[i] = (shuffled_options, answer_letter) + + context = item.get("context", item.get("Context", "")) + + # Format shuffled options as "A. option1\nB. option2\n..." + options_str = "" + for j, opt in enumerate(shuffled_options): + letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[j] + options_str += f"{letter}. {opt}\n" + + # Build the complete prompt + prompt = PROMPT_TEMPLATE.format( + context=context or "None", + question=question, + options=options_str.strip() + ) + + # Create the dataset entry + formatted_data.append({ + "prompt_formatted": prompt, + "global index": f"GPQA_{i}" # CRITICAL: Prefix must match dataset name + }) + +# Save dataset file +dataset_path = "dataset/gpqa_data.json" +with open(dataset_path, "w", encoding="utf-8") as f: + json.dump(formatted_data, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(formatted_data)} GPQA entries in {dataset_path}") + +# Step 2.3: Create ground truth file +print("\n[Step 2.3] Creating ground truth file...") +ground_truth = [] +for i, item in enumerate(gpqa_dataset): + # Extract fields (same as above) + question = item.get("question", item.get("Question", "")) + context = item.get("context", item.get("Context", "")) + + # Use the same shuffled options and answer letter from step 2.2 + if i in shuffled_data: + shuffled_options, answer_letter = shuffled_data[i] + else: + # Fallback: regenerate shuffle if somehow missing (shouldn't happen) + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + ground_truth.append({ + "global_index": f"GPQA_{i}", # MUST match dataset file + "question": question, + "answer": answer_letter, # Store as letter (A, B, C, D) + "options": shuffled_options, # Use shuffled options to match the prompt + "context": context or "", + "metadata": item.get("metadata", {}) + }) + +# Save ground truth file +gt_path = "dataset/gpqa_ground_truth.json" +with open(gt_path, "w", encoding="utf-8") as f: + json.dump(ground_truth, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(ground_truth)} GPQA ground truth entries in {gt_path}") + +# Step 2.4: Verify files +print("\n[Step 2.4] Verifying files...") +try: + # Check dataset file structure + with open(dataset_path, "r", encoding="utf-8") as f: + data = json.load(f) + print(f"✓ Dataset 
file: {len(data)} entries")
+    print(f"  First entry keys: {list(data[0].keys())}")
+    print(f"  First global_index: {data[0].get('global index')}")
+
+    # Check ground truth file structure
+    with open(gt_path, "r", encoding="utf-8") as f:
+        gt = json.load(f)
+    print(f"✓ Ground truth file: {len(gt)} entries")
+    print(f"  First entry keys: {list(gt[0].keys())}")
+    print(f"  First answer: {gt[0].get('answer')}")
+
+    # Verify matching indices
+    data_indices = {e.get("global index") for e in data}
+    gt_indices = {e.get("global_index") for e in gt}
+    if data_indices == gt_indices:
+        print(f"✓ All {len(data_indices)} indices match between dataset and ground truth")
+    else:
+        missing_in_data = gt_indices - data_indices
+        missing_in_gt = data_indices - gt_indices
+        if missing_in_data:
+            print(f"⚠ Warning: {len(missing_in_data)} indices in ground truth not in dataset")
+        if missing_in_gt:
+            print(f"⚠ Warning: {len(missing_in_gt)} indices in dataset not in ground truth")
+
+    print("\n✓ All files created and verified successfully!")
+    print(f"\nNext steps:")
+    print(f"1. Review dataset/gpqa_data.json to ensure prompts are formatted correctly")
+    print(f"2. Review dataset/gpqa_ground_truth.json to ensure answers are correct")
+    print(f"3. Proceed to Step 3: Router Inference Setup")
+
+except Exception as e:
+    print(f"✗ Verification failed: {e}")
+    raise
\ No newline at end of file
diff --git a/universal_model_names.py b/universal_model_names.py
index cc1016c..7fbd935 100644
--- a/universal_model_names.py
+++ b/universal_model_names.py
@@ -51,6 +51,8 @@
     "meta-llama/llama-3-8b-instruct",
     "anthropic/claude-3.5-sonnet",
     "Qwen/QwQ-32B",
+    "xiaomi/mimo-v2-flash",
+    "mistralai/devstral-2512:free",
     # Replicate
     "meta/codellama-34b-instruct",
     # AWS Bedrock

From 239f5ff329677232d005c113950836d8cc36a977 Mon Sep 17 00:00:00 2001
From: ZhiboYu1
Date: Sun, 1 Feb 2026 15:31:43 -0600
Subject: [PATCH 3/7] fix: lint error caused by the str | None union syntax, which Python 3.6 does not support, and use the universal model name when calculating prices

---
 llm_evaluation/run.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py
index 7ccaa8a..e1e988c 100644
--- a/llm_evaluation/run.py
+++ b/llm_evaluation/run.py
@@ -100,7 +100,7 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044):
     return S


-def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]:
+def load_predictions_file(router_name: str, split: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Load router predictions from JSON file.
@@ -392,10 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost - # Use original model name for cost lookup since cost config uses original names + # Use universal model name for cost lookup to respect user-defined mappings token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - model_name, token_usage # Use original model_name instead of universal_model_name + universal_model_name, token_usage # Use universal_model_name to respect mapping in universal_model_names.py ) # Update the prediction with evaluation results From 1ad4f30e74cb4b907245aafc9febe163afde7a11 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Sun, 1 Feb 2026 15:53:45 -0600 Subject: [PATCH 4/7] docs: updated .gitignore to ignore markdown files except for README --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 49b22b9..3b49a5b 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,9 @@ Thumbs.db tools/.addlicense.lock tools/addlicense + +# Ignore all Markdown files +*.md + +# Except for this one specific file +!README.md From d3cbb82fcc1cc84314b800b8d55be1d1baf53fa0 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:43:19 -0600 Subject: [PATCH 5/7] fix some typo --- llm_evaluation/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 32175f1..337ea61 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -143,7 +143,7 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, split: str | None = None + predictions: List[Dict[str, Any]], router_name: str, split: Optional[str] = None ) -> None: """ Save predictions back to file. From 4c71a1867289f1c4cdd428db99f194a75d25d5b8 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:46:13 -0600 Subject: [PATCH 6/7] Refactor type hints to use Optional for better clarity --- automation/process_pr_submission.py | 2 +- global_utils/robustness.py | 2 +- llm_inference/run.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py index ebf1202..c22db34 100644 --- a/automation/process_pr_submission.py +++ b/automation/process_pr_submission.py @@ -57,7 +57,7 @@ class CommandError(RuntimeError): """Raised when a subprocess fails and we want a cleaner error message.""" def __init__( - self, message: str, *, stdout: str | None = None, stderr: str | None = None + self, message: str, *, stdout: Optional[str] = None, stderr: Optional[str] = None ): super().__init__(message) self.stdout = stdout diff --git a/global_utils/robustness.py b/global_utils/robustness.py index e57b762..d410fc0 100644 --- a/global_utils/robustness.py +++ b/global_utils/robustness.py @@ -28,7 +28,7 @@ def compute_robustness_score( full_predictions: list[dict[str, Any]], robustness_predictions: list[dict[str, Any]], *, - name_manager: ModelNameManager | None = None, + name_manager: Optional[ModelNameManager] = None, ) -> Optional[float]: """ Compute the robustness flip ratio between full and robustness prediction sets. 
diff --git a/llm_inference/run.py b/llm_inference/run.py index c139d19..a2135f4 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -16,7 +16,7 @@ import sys import logging import datetime -from typing import Dict, Any, List, Tuple +from typing import Dict, Any, List, Tuple, Optional from collections import defaultdict # Add parent directory to path for imports @@ -39,7 +39,7 @@ def load_predictions_file( - router_name: str, split: str | None = None + router_name: str, split: Optional[str] = None ) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. @@ -174,7 +174,7 @@ def save_predictions_file( predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, - split: str | None = None, + split: Optional[str] = None, ) -> None: """ Save predictions back to file. @@ -203,7 +203,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", - split: str | None = None, + split: Optional[str] = None, ) -> None: """ Process router predictions using parallel inference system. From 19ffea1e8e8af920a828084a8939571cb0804ed5 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:49:29 -0600 Subject: [PATCH 7/7] Refactor type --- automation/process_pr_submission.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py index c22db34..001c8f7 100644 --- a/automation/process_pr_submission.py +++ b/automation/process_pr_submission.py @@ -57,7 +57,11 @@ class CommandError(RuntimeError): """Raised when a subprocess fails and we want a cleaner error message.""" def __init__( - self, message: str, *, stdout: Optional[str] = None, stderr: Optional[str] = None + self, + message: str, + *, + stdout: Optional[str] = None, + stderr: Optional[str] = None, ): super().__init__(message) self.stdout = stdout
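A note on the Optional[...] refactor in patches 3 and 5-7 (an illustrative sketch, not part of the patch series): the bare "X | None" annotation is evaluated when the function is defined and raises TypeError before Python 3.10 unless "from __future__ import annotations" defers it, while typing.Optional[X] expresses the same type on older interpreters. The load_split helper below is a made-up name used only for demonstration:

from typing import Optional

# Portable spelling: Optional[str] is shorthand for Union[str, None]
# and also works on interpreters older than Python 3.10.
def load_split(split: Optional[str] = None) -> str:
    """Return the requested split name, or 'full' when none is given."""
    return split if split is not None else "full"

# The equivalent signature `split: str | None = None` only evaluates
# cleanly on Python 3.10+, which is why the patches above rewrite it
# as Optional[str].
print(load_split())        # full
print(load_split("GPQA"))  # GPQA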
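A second sketch (standalone, not code from scripts/prepare_gpqa_data.py): the data-preparation script seeds random with the question index before shuffling, and that per-question seed is the invariant that lets the ground-truth fallback in step 2.3 re-run the shuffle and land on the same option order and answer letter. The helper name and sample options below are hypothetical:

import random

def shuffle_question(correct, incorrect, index):
    """Deterministically shuffle one question's options and return
    (shuffled_options, answer_letter)."""
    options = [correct] + [opt for opt in incorrect if opt]
    random.seed(index)        # per-question seed -> same order on every run
    shuffled = options.copy()
    random.shuffle(shuffled)
    letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[shuffled.index(correct)]
    return shuffled, letter

# Shuffling twice with the same index reproduces the result, so the prompt
# file and the ground-truth file stay in sync even if one pass is redone.
first = shuffle_question("4", ["3", "5", "6"], index=7)
second = shuffle_question("4", ["3", "5", "6"], index=7)
assert first == second
print(first)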