6 changes: 6 additions & 0 deletions .gitignore
@@ -35,3 +35,9 @@ Thumbs.db

tools/.addlicense.lock
tools/addlicense

# Ignore all Markdown files
*.md

# Except for this one specific file
!README.md
6 changes: 5 additions & 1 deletion automation/process_pr_submission.py
@@ -57,7 +57,11 @@ class CommandError(RuntimeError):
"""Raised when a subprocess fails and we want a cleaner error message."""

def __init__(
self, message: str, *, stdout: str | None = None, stderr: str | None = None
self,
message: str,
*,
stdout: Optional[str] = None,
stderr: Optional[str] = None,
):
super().__init__(message)
self.stdout = stdout
18 changes: 18 additions & 0 deletions config/eval_config/zero-shot/GPQA.json
@@ -0,0 +1,18 @@
{
"eval_params": {
"dataset": "GPQA",
"eval_metrics": [
"mcq_accuracy"
],
"setting": "zero-shot",
"prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences."
},
"management": {
"sub_dir": {
"input_config": "input_config/",
"raw_results": "raw_results.json",
"result_vis": "result_vis.png",
"output_config": "output_config.json"
}
}
}
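For reference, a minimal sketch of how this zero-shot prompt template could be rendered, assuming the eval runner fills it with Python's str.format() (the question and options below are illustrative placeholders, not taken from GPQA):

# Sketch only: render the GPQA prompt template with str.format(); the
# Context/Question/Options values here are made-up placeholders.
import json

with open("config/eval_config/zero-shot/GPQA.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

prompt = cfg["eval_params"]["prompt"].format(
    Context="",
    Question="Which particle mediates the electromagnetic force?",
    Options="A) Gluon\nB) Photon\nC) W boson\nD) Graviton",
)
# The doubled braces survive .format() as a literal \boxed{X}, so the model is
# asked to wrap its letter choice in \boxed{...} while the three placeholders
# are substituted.
print(prompt)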
2 changes: 1 addition & 1 deletion global_utils/robustness.py
@@ -28,7 +28,7 @@ def compute_robustness_score(
full_predictions: list[dict[str, Any]],
robustness_predictions: list[dict[str, Any]],
*,
name_manager: ModelNameManager | None = None,
name_manager: Optional[ModelNameManager] = None,
) -> Optional[float]:
"""
Compute the robustness flip ratio between full and robustness prediction sets.
31 changes: 29 additions & 2 deletions llm_evaluation/evaluate_models.py
@@ -169,6 +169,12 @@ def load_cost_config(self):
project_root = os.path.dirname(script_dir)

# Try multiple possible paths for cost file
# Get the directory of this file and construct paths relative to project root
current_file_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(
current_file_dir
) # Go up from llm_evaluation/ to project root

possible_paths = [
os.path.join(project_root, "model_cost", "model_cost.json"),
"./model_cost/model_cost.json",
@@ -186,6 +192,7 @@ def load_cost_config(self):
print(
f"Warning: Could not find cost configuration file. Tried: {possible_paths[:3]}..."
)
print(f"Current working directory: {os.getcwd()}")
self.cost_config = {}
return

@@ -203,7 +210,11 @@ def calculate_inference_cost(
self, model_name: str, token_usage: Dict[str, int]
) -> float:
"""Calculate inference cost based on token usage and model pricing."""
if not token_usage or not self.cost_config:
if not token_usage:
return 0.0

if not self.cost_config:
print("Warning: Cost config is empty!")
return 0.0

# Remove _batch suffix if present for cost lookup
@@ -227,6 +238,10 @@ def calculate_inference_cost(
print(
f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})"
)
if len(self.cost_config) > 0:
print(
f"Available cost config keys (first 10): {list(self.cost_config.keys())[:10]}"
)
return 0.0

# Calculate cost
@@ -260,6 +275,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str:
"FinQA": "FinQA",
"GeoBench": "GeoBench",
"GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name
"GPQA": "GPQA",
"GSM8K": "GSM8K",
"LiveCodeBench": "LiveCodeBench",
"MATH": "MATH",
@@ -480,7 +496,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An
except Exception as e:
print(f"Error loading LiveCodeBench dataset: {e}")
return None

elif dataset_name == "GPQA":
gpqa_gt_path = "./dataset/gpqa_ground_truth.json"
if os.path.exists(gpqa_gt_path):
try:
with open(gpqa_gt_path, "r", encoding="utf-8") as f:
gpqa_data = json.load(f)
for item in gpqa_data:
if item.get("global_index") == global_index:
return item["answer"]
except Exception as e:
print(f"Error loading GPQA ground truth: {e}")
return None
# For other datasets, find the entry with matching global_index
if self.all_data is None:
return None
78 changes: 64 additions & 14 deletions llm_evaluation/run.py
@@ -98,17 +98,25 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044):
return S


def load_predictions_file(router_name: str) -> List[Dict[str, Any]]:
def load_predictions_file(
router_name: str, split: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Load router predictions from JSON file.

Args:
router_name: Name of the router
split: Dataset split (optional). Used to determine prediction file name.

Returns:
List of prediction dictionaries
"""
prediction_path = f"./router_inference/predictions/{router_name}.json"
# Construct prediction path based on split (same logic as llm_inference/run.py)
if split and split in ["gpqa", "robustness"]:
filename = f"{router_name}-{split}"
else:
filename = router_name
prediction_path = f"./router_inference/predictions/{filename}.json"

if not os.path.exists(prediction_path):
raise FileNotFoundError(
@@ -134,15 +142,23 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]:
return json.load(f)


def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None:
def save_predictions_file(
predictions: List[Dict[str, Any]], router_name: str, split: Optional[str] = None
) -> None:
"""
Save predictions back to file.

Args:
predictions: List of prediction dictionaries
router_name: Name of the router
split: Dataset split (optional). Used to determine prediction file name.
"""
prediction_path = f"./router_inference/predictions/{router_name}.json"
# Construct filename based on split (same logic as load_predictions_file)
if split and split in ["gpqa", "robustness"]:
filename = f"{router_name}-{split}"
else:
filename = router_name
prediction_path = f"./router_inference/predictions/{filename}.json"

# Create directory if it doesn't exist
os.makedirs(os.path.dirname(prediction_path), exist_ok=True)
@@ -169,6 +185,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]:
from datasets import load_from_disk
import pandas as pd

ground_truth_map = {}

# Handle GPQA split
if split == "gpqa":
gpqa_gt_path = "./dataset/gpqa_ground_truth.json"
if not os.path.exists(gpqa_gt_path):
raise FileNotFoundError(
f"GPQA ground truth not found at {gpqa_gt_path}. "
f"Please create it using the preparation script."
)
logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...")
with open(gpqa_gt_path, "r", encoding="utf-8") as f:
gpqa_data = json.load(f)

for item in gpqa_data:
global_index = item["global_index"]
ground_truth_map[global_index] = {
"question": item.get("question", ""),
"global_index": global_index,
"context": item.get("context", ""),
"answer": item["answer"],
"options": item.get("options", []),
"metadata": item.get("metadata", {}),
}

logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples")
return ground_truth_map
if split not in ["sub_10", "full"]:
raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'")

@@ -352,9 +395,11 @@ def evaluate_single_prediction(
)

# Calculate inference cost
# Use universal model name for cost lookup to respect user-defined mappings
token_usage = generated_result.get("token_usage", {})
inference_cost = evaluator.calculate_inference_cost(
universal_model_name, token_usage
universal_model_name,
token_usage, # Use universal_model_name to respect mapping in universal_model_names.py
)

# Update the prediction with evaluation results
@@ -394,7 +439,7 @@ def process_router_predictions(
logger.info(f"Using {num_workers} worker threads for parallel processing")

# Load predictions
predictions = load_predictions_file(router_name)
predictions = load_predictions_file(router_name, split=split)

# Separate regular and optimality entries
regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]
@@ -433,11 +478,14 @@
# Prepare tasks: filter out already evaluated entries (unless force is True)
tasks = []
for i, prediction in enumerate(predictions):
# Check if already evaluated (has accuracy and cost)
# Check if already evaluated (has accuracy and cost > 0)
# Skip if already evaluated AND force is False
# Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated)
if not force and (
prediction.get("accuracy") is not None
and prediction.get("cost") is not None
and prediction.get("cost", 0)
> 0 # Cost must be > 0 to be considered evaluated
):
already_evaluated_count += 1
continue
@@ -466,7 +514,7 @@ def evaluate_task_wrapper(

def save_callback():
"""Callback to save predictions file."""
save_predictions_file(predictions, router_name)
save_predictions_file(predictions, router_name, split=split)

# Run parallel evaluation
manager.evaluate_entries_parallel(
@@ -839,7 +887,9 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non
target_path,
)

predictions = load_predictions_file(router_name)
predictions = load_predictions_file(
router_name, split=None
) # Load base file for robustness

try:
robustness_predictions = load_predictions_from_path(target_path)
@@ -1034,10 +1084,10 @@ def main():
"split",
nargs="?",
type=str,
choices=["sub_10", "full", "robustness"],
choices=["sub_10", "full", "robustness", "gpqa"],
help=(
"Dataset split to use for evaluation ('sub_10' for testing with answers, "
"'full' for submission, 'robustness' to compute robustness score only)."
"'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)."
),
)
parser.add_argument(
@@ -1099,7 +1149,7 @@
# Run evaluation
try:
# If save_interval is 0, only save at the end
predictions = load_predictions_file(args.router_name)
predictions = load_predictions_file(args.router_name, split=args.split)
save_interval = (
args.save_interval if args.save_interval > 0 else len(predictions) + 1
)
@@ -1115,8 +1165,8 @@
logger.info("\nInterrupted by user. Saving partial results...")
try:
# Try to save current state if possible
predictions = load_predictions_file(args.router_name)
save_predictions_file(predictions, args.router_name)
predictions = load_predictions_file(args.router_name, split=args.split)
save_predictions_file(predictions, args.router_name, split=args.split)
logger.info("Partial results saved successfully.")
except Exception as e:
logger.warning(f"Could not save partial results: {e}")
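Both GPQA loaders above (_get_ground_truth in llm_evaluation/evaluate_models.py and load_ground_truth_dataset in llm_evaluation/run.py) read ./dataset/gpqa_ground_truth.json as a list of records keyed by global_index. A minimal sketch of the expected shape, with purely illustrative values (the index format and letter-style answer are assumptions, not taken from the dataset):

# Sketch only: write an example ./dataset/gpqa_ground_truth.json containing the
# fields the two loaders read; every value below is a made-up placeholder.
import json

example_records = [
    {
        "global_index": "GPQA_0",  # assumed format; must match the predictions' global_index
        "question": "Which particle mediates the electromagnetic force?",
        "context": "",
        "options": ["A) Gluon", "B) Photon", "C) W boson", "D) Graviton"],
        "answer": "B",  # returned as the ground truth used for mcq_accuracy
        "metadata": {},
    }
]

with open("./dataset/gpqa_ground_truth.json", "w", encoding="utf-8") as f:
    json.dump(example_records, f, indent=2)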
1 change: 1 addition & 0 deletions llm_inference/model_inference.py
@@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str:
"qwen/qwen3-vl-235b-a22b-instruct": "openrouter",
"qwen/qwen3-coder": "openrouter",
"x-ai/grok-code-fast-1": "openrouter",
"xiaomi/mimo-v2-flash": "openrouter",
"xiaomi/mimo-v2-flash:free": "openrouter",
"openai/gpt-oss-120b": "openrouter",
"qwen/qwen3-235b-a22b-2507": "openrouter",