From abbe3e5846bdef5ce59fc372a4ff93fb62dae786 Mon Sep 17 00:00:00 2001
From: yl231
Date: Tue, 27 Jan 2026 21:37:02 -0600
Subject: [PATCH 1/7] feat: Integrate GPQA dataset with proper answer extraction and option shuffling

Add support for GPQA (Graduate-Level Google-Proof Q&A) dataset integration
into the RouterArena evaluation pipeline.

Key changes:
- Add GPQA evaluation config (config/eval_config/zero-shot/GPQA.json)
- Create prepare_gpqa_data.py script to:
  * Load GPQA dataset from HuggingFace (Idavidrein/gpqa)
  * Extract correct answers from "Correct Answer" field (fixes empty answer bug)
  * Shuffle MCQ options deterministically to distribute answers across A-D
  * Generate formatted prompts and ground truth files
- Update evaluation pipeline (llm_evaluation/run.py, evaluate_models.py):
  * Add GPQA split handling in load_ground_truth_dataset()
  * Support GPQA dataset name detection
- Update inference pipeline (llm_inference/run.py, model_inference.py):
  * Add GPQA split support for router predictions
  * Handle GPQA-specific data loading
- Add OpenRouter router configuration
- Update model cost configurations for GPQA models
- Add universal model name mappings
---
 config/eval_config/zero-shot/GPQA.json       |  18 ++
 llm_evaluation/evaluate_models.py            |  68 +++--
 llm_evaluation/run.py                        |  73 ++++--
 llm_inference/model_inference.py             |   1 +
 llm_inference/run.py                         |  46 +++-
 model_cost/cost.json                         |   9 +
 .../config/openrouter-router.json            |   9 +
 router_inference/generate_prediction_file.py |   8 +-
 scripts/prepare_gpqa_data.py                 | 241 ++++++++++++++++++
 universal_model_names.py                     |   2 +
 10 files changed, 426 insertions(+), 49 deletions(-)
 create mode 100644 config/eval_config/zero-shot/GPQA.json
 create mode 100644 router_inference/config/openrouter-router.json
 create mode 100644 scripts/prepare_gpqa_data.py

diff --git a/config/eval_config/zero-shot/GPQA.json b/config/eval_config/zero-shot/GPQA.json
new file mode 100644
index 0000000..6f00598
--- /dev/null
+++ b/config/eval_config/zero-shot/GPQA.json
@@ -0,0 +1,18 @@
+{
+  "eval_params": {
+    "dataset": "GPQA",
+    "eval_metrics": [
+      "mcq_accuracy"
+    ],
+    "setting": "zero-shot",
+    "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences."
+ }, + "management": { + "sub_dir": { + "input_config": "input_config/", + "raw_results": "raw_results.json", + "result_vis": "result_vis.png", + "output_config": "output_config.json" + } + } +} \ No newline at end of file diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py index a7eede3..4f627e6 100644 --- a/llm_evaluation/evaluate_models.py +++ b/llm_evaluation/evaluate_models.py @@ -144,10 +144,15 @@ def load_dataset_configs(self): def load_cost_config(self): """Load cost configuration from model_cost/cost.json""" # Try multiple possible paths for cost file + # Get the directory of this file and construct paths relative to project root + current_file_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(current_file_dir) # Go up from llm_evaluation/ to project root + possible_paths = [ - "./model_cost/cost.json", - "../model_cost/cost.json", - "model_cost/cost.json", + os.path.join(project_root, "model_cost", "cost.json"), # From project root + "./model_cost/cost.json", # Current working directory + "../model_cost/cost.json", # Parent directory + "model_cost/cost.json", # Relative to current dir ] cost_file = None @@ -160,6 +165,7 @@ def load_cost_config(self): print( f"Warning: Could not find cost configuration file. Tried: {possible_paths}" ) + print(f"Current working directory: {os.getcwd()}") self.cost_config = {} return @@ -177,7 +183,11 @@ def calculate_inference_cost( self, model_name: str, token_usage: Dict[str, int] ) -> float: """Calculate inference cost based on token usage and model pricing.""" - if not token_usage or not self.cost_config: + if not token_usage: + return 0.0 + + if not self.cost_config: + print("Warning: Cost config is empty!") return 0.0 # Remove _batch suffix if present for cost lookup @@ -185,27 +195,33 @@ def calculate_inference_cost( if model_name.endswith("_batch"): cost_lookup_name = model_name[:-6] # Remove '_batch' suffix - # Normalize model name to match cost config - if model_name_manager: - normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + # Try exact match first (cost config uses original model names) + if cost_lookup_name in self.cost_config: + cost_info = self.cost_config[cost_lookup_name] else: - normalized_name = cost_lookup_name + # Normalize model name to match cost config + if model_name_manager: + normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + else: + normalized_name = cost_lookup_name - # Try to find exact match first - if normalized_name in self.cost_config: - cost_info = self.cost_config[normalized_name] - else: - # Try to find partial matches - cost_info = None - for config_name in self.cost_config.keys(): - if config_name in normalized_name or normalized_name in config_name: - cost_info = self.cost_config[config_name] - break + # Try to find exact match with normalized name + if normalized_name in self.cost_config: + cost_info = self.cost_config[normalized_name] + else: + # Try to find partial matches + cost_info = None + for config_name in self.cost_config.keys(): + if config_name in normalized_name or normalized_name in config_name: + cost_info = self.cost_config[config_name] + break if not cost_info: print( - f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name}, normalized: {normalized_name})" + f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})" ) + if len(self.cost_config) > 0: + print(f"Available cost config keys (first 10): 
{list(self.cost_config.keys())[:10]}") return 0.0 # Calculate cost @@ -239,6 +255,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str: "FinQA": "FinQA", "GeoBench": "GeoBench", "GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name + "GPQA": "GPQA", "GSM8K": "GSM8K", "LiveCodeBench": "LiveCodeBench", "MATH": "MATH", @@ -468,7 +485,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An except Exception as e: print(f"Error loading LiveCodeBench dataset: {e}") return None - + elif dataset_name == "GPQA": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if os.path.exists(gpqa_gt_path): + try: + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + for item in gpqa_data: + if item.get("global_index") == global_index: + return item["answer"] + except Exception as e: + print(f"Error loading GPQA ground truth: {e}") + return None # For other datasets, find the entry with matching global_index if self.all_data is None: return None diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 3e3365b..7ccaa8a 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -100,17 +100,23 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044): return S -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split (same logic as llm_inference/run.py) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -136,15 +142,21 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: return json.load(f) -def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None: +def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str, split: str | None = None) -> None: """ Save predictions back to file. Args: predictions: List of prediction dictionaries router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist os.makedirs(os.path.dirname(prediction_path), exist_ok=True) @@ -170,7 +182,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]: """ from datasets import load_from_disk import pandas as pd - + ground_truth_map = {} + + # Handle GPQA split + if split == "gpqa": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if not os.path.exists(gpqa_gt_path): + raise FileNotFoundError( + f"GPQA ground truth not found at {gpqa_gt_path}. " + f"Please create it using the preparation script." 
+ ) + logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...") + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + + for item in gpqa_data: + global_index = item["global_index"] + ground_truth_map[global_index] = { + "question": item.get("question", ""), + "global_index": global_index, + "context": item.get("context", ""), + "answer": item["answer"], + "options": item.get("options", []), + "metadata": item.get("metadata", {}), + } + + logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples") + return ground_truth_map if split not in ["sub_10", "full"]: raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'") @@ -354,9 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost + # Use original model name for cost lookup since cost config uses original names token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - universal_model_name, token_usage + model_name, token_usage # Use original model_name instead of universal_model_name ) # Update the prediction with evaluation results @@ -396,7 +435,7 @@ def process_router_predictions( logger.info(f"Using {num_workers} worker threads for parallel processing") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=split) # Separate regular and optimality entries regular_predictions = [p for p in predictions if not p.get("for_optimality", False)] @@ -439,11 +478,13 @@ def process_router_predictions( # Note: This loop runs in the main thread before threading starts, so no lock needed tasks = [] for i, prediction in enumerate(predictions): - # Check if already evaluated (has accuracy and cost) + # Check if already evaluated (has accuracy and cost > 0) # Skip if already evaluated AND force is False + # Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated) if not force and ( prediction.get("accuracy") is not None and prediction.get("cost") is not None + and prediction.get("cost", 0) > 0 # Cost must be > 0 to be considered evaluated ): already_evaluated_count += 1 evaluated_count += 1 @@ -494,7 +535,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: with save_lock: # Save the entire predictions list # This is safe because each thread modifies a different index - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) elapsed_time = ( datetime.datetime.now() - start_time @@ -542,7 +583,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: # Final save with save_lock: - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) # Final summary end_time = datetime.datetime.now() @@ -901,7 +942,7 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non target_path, ) - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=None) # Load base file for robustness try: robustness_predictions = load_predictions_from_path(target_path) @@ -1096,10 +1137,10 @@ def main(): "split", nargs="?", type=str, - choices=["sub_10", "full", "robustness"], + choices=["sub_10", "full", "robustness", "gpqa"], help=( "Dataset split to use for evaluation ('sub_10' for testing with answers, " - "'full' for submission, 'robustness' to compute robustness score only)." 
+ "'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)." ), ) parser.add_argument( @@ -1161,7 +1202,7 @@ def main(): # Run evaluation try: # If save_interval is 0, only save at the end - predictions = load_predictions_file(args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) save_interval = ( args.save_interval if args.save_interval > 0 else len(predictions) + 1 ) @@ -1177,8 +1218,8 @@ def main(): logger.info("\nInterrupted by user. Saving partial results...") try: # Try to save current state if possible - predictions = load_predictions_file(args.router_name) - save_predictions_file(predictions, args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) + save_predictions_file(predictions, args.router_name, split=args.split) logger.info("Partial results saved successfully.") except Exception as e: logger.warning(f"Could not save partial results: {e}") diff --git a/llm_inference/model_inference.py b/llm_inference/model_inference.py index 3a50ba6..d961e12 100644 --- a/llm_inference/model_inference.py +++ b/llm_inference/model_inference.py @@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str: "qwen/qwen3-vl-235b-a22b-instruct": "openrouter", "qwen/qwen3-coder": "openrouter", "x-ai/grok-code-fast-1": "openrouter", + "xiaomi/mimo-v2-flash": "openrouter", "xiaomi/mimo-v2-flash:free": "openrouter", "openai/gpt-oss-120b": "openrouter", "qwen/qwen3-235b-a22b-2507": "openrouter", diff --git a/llm_inference/run.py b/llm_inference/run.py index 6d72ee3..d455c64 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -38,17 +38,22 @@ logger = logging.getLogger(__name__) -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router - + split: Dataset split ("sub_10", "full", "robustness", "gpqa") Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -105,7 +110,9 @@ def load_cached_results_for_predictions( # Load cache for each model for universal_model_name, model_predictions in model_to_predictions.items(): - cached_file = os.path.join(cached_results_dir, f"{universal_model_name}.jsonl") + # Sanitize model name for filename (replace / with _) + model_filename = universal_model_name.replace("/", "_") + cached_file = os.path.join(cached_results_dir, f"{model_filename}.jsonl") if not os.path.exists(cached_file): continue @@ -162,7 +169,7 @@ def load_cached_results_for_predictions( def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False + predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, split: str | None = None ) -> None: """ Save predictions back to file. @@ -171,8 +178,14 @@ def save_predictions_file( predictions: List of prediction dictionaries router_name: Name of the router create_backup: Whether to create a backup before saving (only needed once) + split: Dataset split (optional). Used to determine prediction file name. 
""" - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" with open(prediction_path, "w", encoding="utf-8") as f: json.dump(predictions, f, ensure_ascii=False, indent=2) @@ -185,6 +198,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", + split: str | None = None, ) -> None: """ Process router predictions using parallel inference system. @@ -202,11 +216,11 @@ def process_router_predictions( logger.info(f"Target runs per query: {num_runs}") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split) logger.info(f"Loaded {len(predictions)} predictions") # Create backup of original predictions file - save_predictions_file(predictions, router_name, create_backup=True) + save_predictions_file(predictions, router_name, create_backup=True, split=split) # Filter out entries without required fields and convert to universal model names valid_predictions = [] @@ -297,7 +311,7 @@ def process_router_predictions( updated_count += 1 # Save updated predictions - save_predictions_file(predictions, router_name, create_backup=False) + save_predictions_file(predictions, router_name, create_backup=False, split=split) # Final summary end_time = datetime.datetime.now() @@ -319,8 +333,13 @@ def process_router_predictions( logger.info(f" Successful: {stats['successful']}") logger.info(f" Failed: {stats['failed']}") + # Construct filename for log message + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name logger.info( - f"\nPredictions saved to: ./router_inference/predictions/{router_name}.json" + f"\nPredictions saved to: ./router_inference/predictions/{filename}.json" ) logger.info("=" * 80) @@ -347,6 +366,12 @@ def main(): type=str, help="Name of the router (corresponds to ./router_inference/predictions/.json)", ) + parser.add_argument( + "--split", + type=str, + choices=["sub_10", "full", "robustness", "gpqa"], + help="Dataset split (optional). Used to determine prediction file name." + ) parser.add_argument( "--num-workers", type=int, @@ -394,6 +419,7 @@ def main(): num_workers=args.num_workers, num_runs=args.num_runs, cached_results_dir=args.cached_results_dir, + split=args.split, ) except KeyboardInterrupt: logger.info("\nInterrupted by user. 
Partial results have been saved.") diff --git a/model_cost/cost.json b/model_cost/cost.json index 4e8c38d..0d76794 100644 --- a/model_cost/cost.json +++ b/model_cost/cost.json @@ -178,5 +178,14 @@ "glm-4-plus": { "input_token_price_per_million": 0.7, "output_token_price_per_million": 0.7 + }, + "xiaomi/mimo-v2-flash": { + "input_token_price_per_million": 0.09, + "output_token_price_per_million": 0.29 + }, + + "mistralai/devstral-2512:free": { + "input_token_price_per_million": 0.0001, + "output_token_price_per_million": 0.0001 } } diff --git a/router_inference/config/openrouter-router.json b/router_inference/config/openrouter-router.json new file mode 100644 index 0000000..9d7a7b1 --- /dev/null +++ b/router_inference/config/openrouter-router.json @@ -0,0 +1,9 @@ +{ + "pipeline_params": { + "router_name": "openrouter-router", + "models": [ + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free" + ] + } +} diff --git a/router_inference/generate_prediction_file.py b/router_inference/generate_prediction_file.py index 694ce3f..1a190b1 100644 --- a/router_inference/generate_prediction_file.py +++ b/router_inference/generate_prediction_file.py @@ -30,6 +30,7 @@ "sub_10": "./dataset/router_data_10.json", "full": "./dataset/router_data.json", "robustness": "./dataset/router_robustness.json", + "gpqa": "./dataset/gpqa_data.json", } @@ -38,8 +39,7 @@ def load_dataset(split: str) -> List[Dict[str, Any]]: Load dataset file. Args: - split: One of the supported dataset splits (sub_10, full, robustness) - + split: One of the supported dataset splits (sub_10, full, robustness, gpqa) Returns: List of dataset entries """ @@ -182,6 +182,8 @@ def save_predictions( filename = router_name if split == "robustness": filename = f"{router_name}-robustness" + elif split == "gpqa": + filename = f"{router_name}-gpqa" prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist @@ -207,7 +209,7 @@ def main(): "split", type=str, choices=list(DATASET_PATHS.keys()), - help="Dataset split: 'sub_10', 'full', or 'robustness'", + help="Dataset split: 'sub_10', 'full', 'robustness', or 'gpqa'", ) parser.add_argument( "--no-optimality", diff --git a/scripts/prepare_gpqa_data.py b/scripts/prepare_gpqa_data.py new file mode 100644 index 0000000..a09de82 --- /dev/null +++ b/scripts/prepare_gpqa_data.py @@ -0,0 +1,241 @@ +""" +Prepare GPQA dataset for RouterArena pipeline. + +This script: +1. Loads GPQA dataset from HuggingFace +2. Formats prompts according to RouterArena requirements +3. Creates dataset/gpqa_data.json (for router inference) +4. Creates dataset/gpqa_ground_truth.json (for evaluation) + +How to run: + uv run python scripts/prepare_gpqa_data.py + +Prerequisites: + - Install packages: uv sync (or pip install datasets) + - (Optional) If authentication needed: huggingface-cli login +""" + +from datasets import load_dataset, DatasetDict, Dataset +import json +import os +import random + +# Ensure dataset directory exists +os.makedirs("dataset", exist_ok=True) + +# Load the dataset. If authentication is needed, ensure you are logged in. 
+print("Loading GPQA dataset from HuggingFace...") +raw_gpqa = load_dataset("Idavidrein/gpqa", "gpqa_diamond") + +# Extract the actual dataset from DatasetDict if needed +if isinstance(raw_gpqa, DatasetDict): + split_names = list(raw_gpqa.keys()) + print(f"Found splits: {split_names}") + # Get the first split (usually 'train') + split_key = 'train' if 'train' in raw_gpqa else split_names[0] + print(f"Using split: {split_key}") + gpqa_dataset = raw_gpqa[split_key] +else: + # If it's already a single Dataset, use it directly + gpqa_dataset = raw_gpqa + +print(f"Loaded {len(gpqa_dataset)} GPQA entries") +print(f"Dataset features: {gpqa_dataset.features}") + +# Inspect first entry to understand structure +if len(gpqa_dataset) > 0: + print("\nFirst entry sample:") + print(gpqa_dataset[0]) + print() + +# Define prompt template (must match eval config!) +PROMPT_TEMPLATE = """Please read the following multiple-choice questions and provide the most likely correct answer based on the options given. + +Context: {context} + +Question: {question} + +Options: +{options} + +Provide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.""" + +# Step 2.2: Create dataset file with formatted prompts +print("\n[Step 2.2] Formatting prompts and creating dataset file...") +formatted_data = [] +# Store shuffled options and answer letters for use in ground truth generation +shuffled_data = {} # {index: (shuffled_options, answer_letter)} + +for i, item in enumerate(gpqa_dataset): + # Extract fields (adjust field names based on actual dataset structure) + question = item.get("question", item.get("Question", "")) + # GPQA dataset has separate fields for correct and incorrect answers + # Construct options list from these fields + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + + # Build options list with all answers + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + # If we have a pre-formatted options list, use that instead + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + # Still need to find correct answer in the list + if not correct_answer: + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + + # Shuffle options using deterministic seed based on index for reproducibility + # This ensures the same question always gets the same shuffle + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + # Find the index of the correct answer in the shuffled list + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + # Try case-insensitive matching + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + # Convert index to letter (A, B, C, D, etc.) 
+ if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" # Default fallback + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + # Store shuffled data for ground truth generation + shuffled_data[i] = (shuffled_options, answer_letter) + + context = item.get("context", item.get("Context", "")) + + # Format shuffled options as "A. option1\nB. option2\n..." + options_str = "" + for j, opt in enumerate(shuffled_options): + letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[j] + options_str += f"{letter}. {opt}\n" + + # Build the complete prompt + prompt = PROMPT_TEMPLATE.format( + context=context or "None", + question=question, + options=options_str.strip() + ) + + # Create the dataset entry + formatted_data.append({ + "prompt_formatted": prompt, + "global index": f"GPQA_{i}" # CRITICAL: Prefix must match dataset name + }) + +# Save dataset file +dataset_path = "dataset/gpqa_data.json" +with open(dataset_path, "w", encoding="utf-8") as f: + json.dump(formatted_data, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(formatted_data)} GPQA entries in {dataset_path}") + +# Step 2.3: Create ground truth file +print("\n[Step 2.3] Creating ground truth file...") +ground_truth = [] +for i, item in enumerate(gpqa_dataset): + # Extract fields (same as above) + question = item.get("question", item.get("Question", "")) + context = item.get("context", item.get("Context", "")) + + # Use the same shuffled options and answer letter from step 2.2 + if i in shuffled_data: + shuffled_options, answer_letter = shuffled_data[i] + else: + # Fallback: regenerate shuffle if somehow missing (shouldn't happen) + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + ground_truth.append({ + "global_index": f"GPQA_{i}", # MUST match dataset file + "question": question, + "answer": answer_letter, # Store as letter (A, B, C, D) + "options": shuffled_options, # Use shuffled options to match the prompt + "context": context or "", + "metadata": item.get("metadata", {}) + }) + +# Save ground truth file +gt_path = "dataset/gpqa_ground_truth.json" +with open(gt_path, "w", encoding="utf-8") as f: + json.dump(ground_truth, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(ground_truth)} GPQA ground truth entries in {gt_path}") + +# Step 2.4: Verify files +print("\n[Step 2.4] Verifying files...") +try: + # Check dataset file structure + with open(dataset_path, "r", encoding="utf-8") as f: + data = json.load(f) + print(f"✓ Dataset 
file: {len(data)} entries") + print(f" First entry keys: {list(data[0].keys())}") + print(f" First global_index: {data[0].get('global index')}") + + # Check ground truth file structure + with open(gt_path, "r", encoding="utf-8") as f: + gt = json.load(f) + print(f"✓ Ground truth file: {len(gt)} entries") + print(f" First entry keys: {list(gt[0].keys())}") + print(f" First answer: {gt[0].get('answer')}") + + # Verify matching indices + data_indices = {e.get("global index") for e in data} + gt_indices = {e.get("global_index") for e in gt} + if data_indices == gt_indices: + print(f"✓ All {len(data_indices)} indices match between dataset and ground truth") + else: + missing_in_data = gt_indices - data_indices + missing_in_gt = data_indices - gt_indices + if missing_in_data: + print(f"⚠ Warning: {len(missing_in_data)} indices in ground truth not in dataset") + if missing_in_gt: + print(f"⚠ Warning: {len(missing_in_gt)} indices in dataset not in ground truth") + + print("\n✓ All files created and verified successfully!") + print(f"\nNext steps:") + print(f"1. Review dataset/gpqa_data.json to ensure prompts are formatted correctly") + print(f"2. Review dataset/gpqa_ground_truth.json to ensure answers are correct") + print(f"3. Proceed to Step 3: Router Inference Setup") + +except Exception as e: + print(f"✗ Verification failed: {e}") + raise \ No newline at end of file diff --git a/universal_model_names.py b/universal_model_names.py index cc1016c..7fbd935 100644 --- a/universal_model_names.py +++ b/universal_model_names.py @@ -51,6 +51,8 @@ "meta-llama/llama-3-8b-instruct", "anthropic/claude-3.5-sonnet", "Qwen/QwQ-32B", + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free", # Replicate "meta/codellama-34b-instruct", # AWS Bedrock From 5a06c3101fd11251b0214f958d723e5e14657a95 Mon Sep 17 00:00:00 2001 From: Michael Yu Date: Tue, 27 Jan 2026 21:37:02 -0600 Subject: [PATCH 2/7] feat: Integrate GPQA dataset with proper answer extraction and option shuffling Add support for GPQA (Graduate-Level Google-Proof Q&A) dataset integration into the RouterArena evaluation pipeline. 
Key changes: - Add GPQA evaluation config (config/eval_config/zero-shot/GPQA.json) - Create prepare_gpqa_data.py script to: * Load GPQA dataset from HuggingFace (Idavidrein/gpqa) * Extract correct answers from \"Correct Answer\" field (fixes empty answer bug) * Shuffle MCQ options deterministically to distribute answers across A-D * Generate formatted prompts and ground truth files - Update evaluation pipeline (llm_evaluation/run.py, evaluate_models.py): * Add GPQA split handling in load_ground_truth_dataset() * Support GPQA dataset name detection - Update inference pipeline (llm_inference/run.py, model_inference.py): * Add GPQA split support for router predictions * Handle GPQA-specific data loading - Add OpenRouter router configuration - Update model cost configurations for GPQA models - Add universal model name mappings --- config/eval_config/zero-shot/GPQA.json | 18 ++ llm_evaluation/evaluate_models.py | 68 +++-- llm_evaluation/run.py | 73 ++++-- llm_inference/model_inference.py | 1 + llm_inference/run.py | 46 +++- model_cost/cost.json | 9 + .../config/openrouter-router.json | 9 + router_inference/generate_prediction_file.py | 8 +- scripts/prepare_gpqa_data.py | 241 ++++++++++++++++++ universal_model_names.py | 2 + 10 files changed, 426 insertions(+), 49 deletions(-) create mode 100644 config/eval_config/zero-shot/GPQA.json create mode 100644 router_inference/config/openrouter-router.json create mode 100644 scripts/prepare_gpqa_data.py diff --git a/config/eval_config/zero-shot/GPQA.json b/config/eval_config/zero-shot/GPQA.json new file mode 100644 index 0000000..6f00598 --- /dev/null +++ b/config/eval_config/zero-shot/GPQA.json @@ -0,0 +1,18 @@ +{ + "eval_params": { + "dataset": "GPQA", + "eval_metrics": [ + "mcq_accuracy" + ], + "setting": "zero-shot", + "prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences." + }, + "management": { + "sub_dir": { + "input_config": "input_config/", + "raw_results": "raw_results.json", + "result_vis": "result_vis.png", + "output_config": "output_config.json" + } + } +} \ No newline at end of file diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py index a7eede3..4f627e6 100644 --- a/llm_evaluation/evaluate_models.py +++ b/llm_evaluation/evaluate_models.py @@ -144,10 +144,15 @@ def load_dataset_configs(self): def load_cost_config(self): """Load cost configuration from model_cost/cost.json""" # Try multiple possible paths for cost file + # Get the directory of this file and construct paths relative to project root + current_file_dir = os.path.dirname(os.path.abspath(__file__)) + project_root = os.path.dirname(current_file_dir) # Go up from llm_evaluation/ to project root + possible_paths = [ - "./model_cost/cost.json", - "../model_cost/cost.json", - "model_cost/cost.json", + os.path.join(project_root, "model_cost", "cost.json"), # From project root + "./model_cost/cost.json", # Current working directory + "../model_cost/cost.json", # Parent directory + "model_cost/cost.json", # Relative to current dir ] cost_file = None @@ -160,6 +165,7 @@ def load_cost_config(self): print( f"Warning: Could not find cost configuration file. 
Tried: {possible_paths}" ) + print(f"Current working directory: {os.getcwd()}") self.cost_config = {} return @@ -177,7 +183,11 @@ def calculate_inference_cost( self, model_name: str, token_usage: Dict[str, int] ) -> float: """Calculate inference cost based on token usage and model pricing.""" - if not token_usage or not self.cost_config: + if not token_usage: + return 0.0 + + if not self.cost_config: + print("Warning: Cost config is empty!") return 0.0 # Remove _batch suffix if present for cost lookup @@ -185,27 +195,33 @@ def calculate_inference_cost( if model_name.endswith("_batch"): cost_lookup_name = model_name[:-6] # Remove '_batch' suffix - # Normalize model name to match cost config - if model_name_manager: - normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + # Try exact match first (cost config uses original model names) + if cost_lookup_name in self.cost_config: + cost_info = self.cost_config[cost_lookup_name] else: - normalized_name = cost_lookup_name + # Normalize model name to match cost config + if model_name_manager: + normalized_name = model_name_manager.get_universal_name(cost_lookup_name) + else: + normalized_name = cost_lookup_name - # Try to find exact match first - if normalized_name in self.cost_config: - cost_info = self.cost_config[normalized_name] - else: - # Try to find partial matches - cost_info = None - for config_name in self.cost_config.keys(): - if config_name in normalized_name or normalized_name in config_name: - cost_info = self.cost_config[config_name] - break + # Try to find exact match with normalized name + if normalized_name in self.cost_config: + cost_info = self.cost_config[normalized_name] + else: + # Try to find partial matches + cost_info = None + for config_name in self.cost_config.keys(): + if config_name in normalized_name or normalized_name in config_name: + cost_info = self.cost_config[config_name] + break if not cost_info: print( - f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name}, normalized: {normalized_name})" + f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})" ) + if len(self.cost_config) > 0: + print(f"Available cost config keys (first 10): {list(self.cost_config.keys())[:10]}") return 0.0 # Calculate cost @@ -239,6 +255,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str: "FinQA": "FinQA", "GeoBench": "GeoBench", "GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name + "GPQA": "GPQA", "GSM8K": "GSM8K", "LiveCodeBench": "LiveCodeBench", "MATH": "MATH", @@ -468,7 +485,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An except Exception as e: print(f"Error loading LiveCodeBench dataset: {e}") return None - + elif dataset_name == "GPQA": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if os.path.exists(gpqa_gt_path): + try: + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + for item in gpqa_data: + if item.get("global_index") == global_index: + return item["answer"] + except Exception as e: + print(f"Error loading GPQA ground truth: {e}") + return None # For other datasets, find the entry with matching global_index if self.all_data is None: return None diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 3e3365b..7ccaa8a 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -100,17 +100,23 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044): return S -def 
load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split (same logic as llm_inference/run.py) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -136,15 +142,21 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: return json.load(f) -def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None: +def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str, split: str | None = None) -> None: """ Save predictions back to file. Args: predictions: List of prediction dictionaries router_name: Name of the router + split: Dataset split (optional). Used to determine prediction file name. """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist os.makedirs(os.path.dirname(prediction_path), exist_ok=True) @@ -170,7 +182,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]: """ from datasets import load_from_disk import pandas as pd - + ground_truth_map = {} + + # Handle GPQA split + if split == "gpqa": + gpqa_gt_path = "./dataset/gpqa_ground_truth.json" + if not os.path.exists(gpqa_gt_path): + raise FileNotFoundError( + f"GPQA ground truth not found at {gpqa_gt_path}. " + f"Please create it using the preparation script." + ) + logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...") + with open(gpqa_gt_path, "r", encoding="utf-8") as f: + gpqa_data = json.load(f) + + for item in gpqa_data: + global_index = item["global_index"] + ground_truth_map[global_index] = { + "question": item.get("question", ""), + "global_index": global_index, + "context": item.get("context", ""), + "answer": item["answer"], + "options": item.get("options", []), + "metadata": item.get("metadata", {}), + } + + logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples") + return ground_truth_map if split not in ["sub_10", "full"]: raise ValueError(f"Invalid split: {split}. 
Must be 'sub_10' or 'full'") @@ -354,9 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost + # Use original model name for cost lookup since cost config uses original names token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - universal_model_name, token_usage + model_name, token_usage # Use original model_name instead of universal_model_name ) # Update the prediction with evaluation results @@ -396,7 +435,7 @@ def process_router_predictions( logger.info(f"Using {num_workers} worker threads for parallel processing") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=split) # Separate regular and optimality entries regular_predictions = [p for p in predictions if not p.get("for_optimality", False)] @@ -439,11 +478,13 @@ def process_router_predictions( # Note: This loop runs in the main thread before threading starts, so no lock needed tasks = [] for i, prediction in enumerate(predictions): - # Check if already evaluated (has accuracy and cost) + # Check if already evaluated (has accuracy and cost > 0) # Skip if already evaluated AND force is False + # Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated) if not force and ( prediction.get("accuracy") is not None and prediction.get("cost") is not None + and prediction.get("cost", 0) > 0 # Cost must be > 0 to be considered evaluated ): already_evaluated_count += 1 evaluated_count += 1 @@ -494,7 +535,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: with save_lock: # Save the entire predictions list # This is safe because each thread modifies a different index - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) elapsed_time = ( datetime.datetime.now() - start_time @@ -542,7 +583,7 @@ def evaluate_task(seq_idx: int, prediction: Dict[str, Any]) -> bool: # Final save with save_lock: - save_predictions_file(predictions, router_name) + save_predictions_file(predictions, router_name, split=split) # Final summary end_time = datetime.datetime.now() @@ -901,7 +942,7 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non target_path, ) - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split=None) # Load base file for robustness try: robustness_predictions = load_predictions_from_path(target_path) @@ -1096,10 +1137,10 @@ def main(): "split", nargs="?", type=str, - choices=["sub_10", "full", "robustness"], + choices=["sub_10", "full", "robustness", "gpqa"], help=( "Dataset split to use for evaluation ('sub_10' for testing with answers, " - "'full' for submission, 'robustness' to compute robustness score only)." + "'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)." ), ) parser.add_argument( @@ -1161,7 +1202,7 @@ def main(): # Run evaluation try: # If save_interval is 0, only save at the end - predictions = load_predictions_file(args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) save_interval = ( args.save_interval if args.save_interval > 0 else len(predictions) + 1 ) @@ -1177,8 +1218,8 @@ def main(): logger.info("\nInterrupted by user. 
Saving partial results...") try: # Try to save current state if possible - predictions = load_predictions_file(args.router_name) - save_predictions_file(predictions, args.router_name) + predictions = load_predictions_file(args.router_name, split=args.split) + save_predictions_file(predictions, args.router_name, split=args.split) logger.info("Partial results saved successfully.") except Exception as e: logger.warning(f"Could not save partial results: {e}") diff --git a/llm_inference/model_inference.py b/llm_inference/model_inference.py index 3a50ba6..d961e12 100644 --- a/llm_inference/model_inference.py +++ b/llm_inference/model_inference.py @@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str: "qwen/qwen3-vl-235b-a22b-instruct": "openrouter", "qwen/qwen3-coder": "openrouter", "x-ai/grok-code-fast-1": "openrouter", + "xiaomi/mimo-v2-flash": "openrouter", "xiaomi/mimo-v2-flash:free": "openrouter", "openai/gpt-oss-120b": "openrouter", "qwen/qwen3-235b-a22b-2507": "openrouter", diff --git a/llm_inference/run.py b/llm_inference/run.py index 6d72ee3..d455c64 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -38,17 +38,22 @@ logger = logging.getLogger(__name__) -def load_predictions_file(router_name: str) -> List[Dict[str, Any]]: +def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. Args: router_name: Name of the router - + split: Dataset split ("sub_10", "full", "robustness", "gpqa") Returns: List of prediction dictionaries """ - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct prediction path based on split + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" if not os.path.exists(prediction_path): raise FileNotFoundError( @@ -105,7 +110,9 @@ def load_cached_results_for_predictions( # Load cache for each model for universal_model_name, model_predictions in model_to_predictions.items(): - cached_file = os.path.join(cached_results_dir, f"{universal_model_name}.jsonl") + # Sanitize model name for filename (replace / with _) + model_filename = universal_model_name.replace("/", "_") + cached_file = os.path.join(cached_results_dir, f"{model_filename}.jsonl") if not os.path.exists(cached_file): continue @@ -162,7 +169,7 @@ def load_cached_results_for_predictions( def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False + predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, split: str | None = None ) -> None: """ Save predictions back to file. @@ -171,8 +178,14 @@ def save_predictions_file( predictions: List of prediction dictionaries router_name: Name of the router create_backup: Whether to create a backup before saving (only needed once) + split: Dataset split (optional). Used to determine prediction file name. 
""" - prediction_path = f"./router_inference/predictions/{router_name}.json" + # Construct filename based on split (same logic as load_predictions_file) + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name + prediction_path = f"./router_inference/predictions/{filename}.json" with open(prediction_path, "w", encoding="utf-8") as f: json.dump(predictions, f, ensure_ascii=False, indent=2) @@ -185,6 +198,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", + split: str | None = None, ) -> None: """ Process router predictions using parallel inference system. @@ -202,11 +216,11 @@ def process_router_predictions( logger.info(f"Target runs per query: {num_runs}") # Load predictions - predictions = load_predictions_file(router_name) + predictions = load_predictions_file(router_name, split) logger.info(f"Loaded {len(predictions)} predictions") # Create backup of original predictions file - save_predictions_file(predictions, router_name, create_backup=True) + save_predictions_file(predictions, router_name, create_backup=True, split=split) # Filter out entries without required fields and convert to universal model names valid_predictions = [] @@ -297,7 +311,7 @@ def process_router_predictions( updated_count += 1 # Save updated predictions - save_predictions_file(predictions, router_name, create_backup=False) + save_predictions_file(predictions, router_name, create_backup=False, split=split) # Final summary end_time = datetime.datetime.now() @@ -319,8 +333,13 @@ def process_router_predictions( logger.info(f" Successful: {stats['successful']}") logger.info(f" Failed: {stats['failed']}") + # Construct filename for log message + if split and split in ["gpqa", "robustness"]: + filename = f"{router_name}-{split}" + else: + filename = router_name logger.info( - f"\nPredictions saved to: ./router_inference/predictions/{router_name}.json" + f"\nPredictions saved to: ./router_inference/predictions/{filename}.json" ) logger.info("=" * 80) @@ -347,6 +366,12 @@ def main(): type=str, help="Name of the router (corresponds to ./router_inference/predictions/.json)", ) + parser.add_argument( + "--split", + type=str, + choices=["sub_10", "full", "robustness", "gpqa"], + help="Dataset split (optional). Used to determine prediction file name." + ) parser.add_argument( "--num-workers", type=int, @@ -394,6 +419,7 @@ def main(): num_workers=args.num_workers, num_runs=args.num_runs, cached_results_dir=args.cached_results_dir, + split=args.split, ) except KeyboardInterrupt: logger.info("\nInterrupted by user. 
Partial results have been saved.") diff --git a/model_cost/cost.json b/model_cost/cost.json index 4e8c38d..0d76794 100644 --- a/model_cost/cost.json +++ b/model_cost/cost.json @@ -178,5 +178,14 @@ "glm-4-plus": { "input_token_price_per_million": 0.7, "output_token_price_per_million": 0.7 + }, + "xiaomi/mimo-v2-flash": { + "input_token_price_per_million": 0.09, + "output_token_price_per_million": 0.29 + }, + + "mistralai/devstral-2512:free": { + "input_token_price_per_million": 0.0001, + "output_token_price_per_million": 0.0001 } } diff --git a/router_inference/config/openrouter-router.json b/router_inference/config/openrouter-router.json new file mode 100644 index 0000000..9d7a7b1 --- /dev/null +++ b/router_inference/config/openrouter-router.json @@ -0,0 +1,9 @@ +{ + "pipeline_params": { + "router_name": "openrouter-router", + "models": [ + "xiaomi/mimo-v2-flash", + "mistralai/devstral-2512:free" + ] + } +} diff --git a/router_inference/generate_prediction_file.py b/router_inference/generate_prediction_file.py index 694ce3f..1a190b1 100644 --- a/router_inference/generate_prediction_file.py +++ b/router_inference/generate_prediction_file.py @@ -30,6 +30,7 @@ "sub_10": "./dataset/router_data_10.json", "full": "./dataset/router_data.json", "robustness": "./dataset/router_robustness.json", + "gpqa": "./dataset/gpqa_data.json", } @@ -38,8 +39,7 @@ def load_dataset(split: str) -> List[Dict[str, Any]]: Load dataset file. Args: - split: One of the supported dataset splits (sub_10, full, robustness) - + split: One of the supported dataset splits (sub_10, full, robustness, gpqa) Returns: List of dataset entries """ @@ -182,6 +182,8 @@ def save_predictions( filename = router_name if split == "robustness": filename = f"{router_name}-robustness" + elif split == "gpqa": + filename = f"{router_name}-gpqa" prediction_path = f"./router_inference/predictions/{filename}.json" # Create directory if it doesn't exist @@ -207,7 +209,7 @@ def main(): "split", type=str, choices=list(DATASET_PATHS.keys()), - help="Dataset split: 'sub_10', 'full', or 'robustness'", + help="Dataset split: 'sub_10', 'full', 'robustness', or 'gpqa'", ) parser.add_argument( "--no-optimality", diff --git a/scripts/prepare_gpqa_data.py b/scripts/prepare_gpqa_data.py new file mode 100644 index 0000000..a09de82 --- /dev/null +++ b/scripts/prepare_gpqa_data.py @@ -0,0 +1,241 @@ +""" +Prepare GPQA dataset for RouterArena pipeline. + +This script: +1. Loads GPQA dataset from HuggingFace +2. Formats prompts according to RouterArena requirements +3. Creates dataset/gpqa_data.json (for router inference) +4. Creates dataset/gpqa_ground_truth.json (for evaluation) + +How to run: + uv run python scripts/prepare_gpqa_data.py + +Prerequisites: + - Install packages: uv sync (or pip install datasets) + - (Optional) If authentication needed: huggingface-cli login +""" + +from datasets import load_dataset, DatasetDict, Dataset +import json +import os +import random + +# Ensure dataset directory exists +os.makedirs("dataset", exist_ok=True) + +# Load the dataset. If authentication is needed, ensure you are logged in. 
+print("Loading GPQA dataset from HuggingFace...") +raw_gpqa = load_dataset("Idavidrein/gpqa", "gpqa_diamond") + +# Extract the actual dataset from DatasetDict if needed +if isinstance(raw_gpqa, DatasetDict): + split_names = list(raw_gpqa.keys()) + print(f"Found splits: {split_names}") + # Get the first split (usually 'train') + split_key = 'train' if 'train' in raw_gpqa else split_names[0] + print(f"Using split: {split_key}") + gpqa_dataset = raw_gpqa[split_key] +else: + # If it's already a single Dataset, use it directly + gpqa_dataset = raw_gpqa + +print(f"Loaded {len(gpqa_dataset)} GPQA entries") +print(f"Dataset features: {gpqa_dataset.features}") + +# Inspect first entry to understand structure +if len(gpqa_dataset) > 0: + print("\nFirst entry sample:") + print(gpqa_dataset[0]) + print() + +# Define prompt template (must match eval config!) +PROMPT_TEMPLATE = """Please read the following multiple-choice questions and provide the most likely correct answer based on the options given. + +Context: {context} + +Question: {question} + +Options: +{options} + +Provide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences.""" + +# Step 2.2: Create dataset file with formatted prompts +print("\n[Step 2.2] Formatting prompts and creating dataset file...") +formatted_data = [] +# Store shuffled options and answer letters for use in ground truth generation +shuffled_data = {} # {index: (shuffled_options, answer_letter)} + +for i, item in enumerate(gpqa_dataset): + # Extract fields (adjust field names based on actual dataset structure) + question = item.get("question", item.get("Question", "")) + # GPQA dataset has separate fields for correct and incorrect answers + # Construct options list from these fields + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + + # Build options list with all answers + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + # If we have a pre-formatted options list, use that instead + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + # Still need to find correct answer in the list + if not correct_answer: + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + + # Shuffle options using deterministic seed based on index for reproducibility + # This ensures the same question always gets the same shuffle + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + # Find the index of the correct answer in the shuffled list + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + # Try case-insensitive matching + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + # Convert index to letter (A, B, C, D, etc.) 
+ if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" # Default fallback + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + # Store shuffled data for ground truth generation + shuffled_data[i] = (shuffled_options, answer_letter) + + context = item.get("context", item.get("Context", "")) + + # Format shuffled options as "A. option1\nB. option2\n..." + options_str = "" + for j, opt in enumerate(shuffled_options): + letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[j] + options_str += f"{letter}. {opt}\n" + + # Build the complete prompt + prompt = PROMPT_TEMPLATE.format( + context=context or "None", + question=question, + options=options_str.strip() + ) + + # Create the dataset entry + formatted_data.append({ + "prompt_formatted": prompt, + "global index": f"GPQA_{i}" # CRITICAL: Prefix must match dataset name + }) + +# Save dataset file +dataset_path = "dataset/gpqa_data.json" +with open(dataset_path, "w", encoding="utf-8") as f: + json.dump(formatted_data, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(formatted_data)} GPQA entries in {dataset_path}") + +# Step 2.3: Create ground truth file +print("\n[Step 2.3] Creating ground truth file...") +ground_truth = [] +for i, item in enumerate(gpqa_dataset): + # Extract fields (same as above) + question = item.get("question", item.get("Question", "")) + context = item.get("context", item.get("Context", "")) + + # Use the same shuffled options and answer letter from step 2.2 + if i in shuffled_data: + shuffled_options, answer_letter = shuffled_data[i] + else: + # Fallback: regenerate shuffle if somehow missing (shouldn't happen) + correct_answer = item.get("Correct Answer", item.get("correct_answer", item.get("answer", item.get("Answer", "")))) + incorrect_1 = item.get("Incorrect Answer 1", "") + incorrect_2 = item.get("Incorrect Answer 2", "") + incorrect_3 = item.get("Incorrect Answer 3", "") + all_options = [opt for opt in [correct_answer, incorrect_1, incorrect_2, incorrect_3] if opt] + + if item.get("options") or item.get("Options"): + all_options = item.get("options", item.get("Options", [])) + + random.seed(i) + shuffled_options = all_options.copy() + random.shuffle(shuffled_options) + + answer_index = -1 + if correct_answer: + try: + answer_index = shuffled_options.index(correct_answer) + except ValueError: + for idx, opt in enumerate(shuffled_options): + if opt and correct_answer and opt.strip().lower() == correct_answer.strip().lower(): + answer_index = idx + break + + if answer_index >= 0: + answer_letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[answer_index] + else: + answer_letter = "A" + print(f"Warning: Could not find correct answer in options for GPQA_{i}, defaulting to A") + + ground_truth.append({ + "global_index": f"GPQA_{i}", # MUST match dataset file + "question": question, + "answer": answer_letter, # Store as letter (A, B, C, D) + "options": shuffled_options, # Use shuffled options to match the prompt + "context": context or "", + "metadata": item.get("metadata", {}) + }) + +# Save ground truth file +gt_path = "dataset/gpqa_ground_truth.json" +with open(gt_path, "w", encoding="utf-8") as f: + json.dump(ground_truth, f, indent=2, ensure_ascii=False) + +print(f"✓ Created {len(ground_truth)} GPQA ground truth entries in {gt_path}") + +# Step 2.4: Verify files +print("\n[Step 2.4] Verifying files...") +try: + # Check dataset file structure + with open(dataset_path, "r", encoding="utf-8") as f: + data = json.load(f) + print(f"✓ Dataset 
file: {len(data)} entries")
+    print(f"  First entry keys: {list(data[0].keys())}")
+    print(f"  First global_index: {data[0].get('global index')}")
+
+    # Check ground truth file structure
+    with open(gt_path, "r", encoding="utf-8") as f:
+        gt = json.load(f)
+    print(f"✓ Ground truth file: {len(gt)} entries")
+    print(f"  First entry keys: {list(gt[0].keys())}")
+    print(f"  First answer: {gt[0].get('answer')}")
+
+    # Verify matching indices
+    data_indices = {e.get("global index") for e in data}
+    gt_indices = {e.get("global_index") for e in gt}
+    if data_indices == gt_indices:
+        print(f"✓ All {len(data_indices)} indices match between dataset and ground truth")
+    else:
+        missing_in_data = gt_indices - data_indices
+        missing_in_gt = data_indices - gt_indices
+        if missing_in_data:
+            print(f"⚠ Warning: {len(missing_in_data)} indices in ground truth not in dataset")
+        if missing_in_gt:
+            print(f"⚠ Warning: {len(missing_in_gt)} indices in dataset not in ground truth")
+
+    print("\n✓ All files created and verified successfully!")
+    print(f"\nNext steps:")
+    print(f"1. Review dataset/gpqa_data.json to ensure prompts are formatted correctly")
+    print(f"2. Review dataset/gpqa_ground_truth.json to ensure answers are correct")
+    print(f"3. Proceed to Step 3: Router Inference Setup")
+
+except Exception as e:
+    print(f"✗ Verification failed: {e}")
+    raise
\ No newline at end of file
diff --git a/universal_model_names.py b/universal_model_names.py
index cc1016c..7fbd935 100644
--- a/universal_model_names.py
+++ b/universal_model_names.py
@@ -51,6 +51,8 @@
     "meta-llama/llama-3-8b-instruct",
     "anthropic/claude-3.5-sonnet",
     "Qwen/QwQ-32B",
+    "xiaomi/mimo-v2-flash",
+    "mistralai/devstral-2512:free",
     # Replicate
     "meta/codellama-34b-instruct",
     # AWS Bedrock

From 239f5ff329677232d005c113950836d8cc36a977 Mon Sep 17 00:00:00 2001
From: ZhiboYu1
Date: Sun, 1 Feb 2026 15:31:43 -0600
Subject: [PATCH 3/7] fix: lint error caused by the str | None union syntax, which Python 3.6 does not support, and use the universal model name when calculating prices

---
 llm_evaluation/run.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py
index 7ccaa8a..e1e988c 100644
--- a/llm_evaluation/run.py
+++ b/llm_evaluation/run.py
@@ -100,7 +100,7 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044):
     return S


-def load_predictions_file(router_name: str, split: str | None = None) -> List[Dict[str, Any]]:
+def load_predictions_file(router_name: str, split: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Load router predictions from JSON file.
@@ -392,10 +392,10 @@ def evaluate_single_prediction( ) # Calculate inference cost - # Use original model name for cost lookup since cost config uses original names + # Use universal model name for cost lookup to respect user-defined mappings token_usage = generated_result.get("token_usage", {}) inference_cost = evaluator.calculate_inference_cost( - model_name, token_usage # Use original model_name instead of universal_model_name + universal_model_name, token_usage # Use universal_model_name to respect mapping in universal_model_names.py ) # Update the prediction with evaluation results From 1ad4f30e74cb4b907245aafc9febe163afde7a11 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Sun, 1 Feb 2026 15:53:45 -0600 Subject: [PATCH 4/7] docs: updated .gitignore to ignore markdown files except for README --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 49b22b9..3b49a5b 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,9 @@ Thumbs.db tools/.addlicense.lock tools/addlicense + +# Ignore all Markdown files +*.md + +# Except for this one specific file +!README.md From d3cbb82fcc1cc84314b800b8d55be1d1baf53fa0 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:43:19 -0600 Subject: [PATCH 5/7] fix some typo --- llm_evaluation/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py index 32175f1..337ea61 100644 --- a/llm_evaluation/run.py +++ b/llm_evaluation/run.py @@ -143,7 +143,7 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]: def save_predictions_file( - predictions: List[Dict[str, Any]], router_name: str, split: str | None = None + predictions: List[Dict[str, Any]], router_name: str, split: Optional[str] = None ) -> None: """ Save predictions back to file. From 4c71a1867289f1c4cdd428db99f194a75d25d5b8 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:46:13 -0600 Subject: [PATCH 6/7] Refactor type hints to use Optional for better clarity --- automation/process_pr_submission.py | 2 +- global_utils/robustness.py | 2 +- llm_inference/run.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py index ebf1202..c22db34 100644 --- a/automation/process_pr_submission.py +++ b/automation/process_pr_submission.py @@ -57,7 +57,7 @@ class CommandError(RuntimeError): """Raised when a subprocess fails and we want a cleaner error message.""" def __init__( - self, message: str, *, stdout: str | None = None, stderr: str | None = None + self, message: str, *, stdout: Optional[str] = None, stderr: Optional[str] = None ): super().__init__(message) self.stdout = stdout diff --git a/global_utils/robustness.py b/global_utils/robustness.py index e57b762..d410fc0 100644 --- a/global_utils/robustness.py +++ b/global_utils/robustness.py @@ -28,7 +28,7 @@ def compute_robustness_score( full_predictions: list[dict[str, Any]], robustness_predictions: list[dict[str, Any]], *, - name_manager: ModelNameManager | None = None, + name_manager: Optional[ModelNameManager] = None, ) -> Optional[float]: """ Compute the robustness flip ratio between full and robustness prediction sets. 
diff --git a/llm_inference/run.py b/llm_inference/run.py index c139d19..a2135f4 100644 --- a/llm_inference/run.py +++ b/llm_inference/run.py @@ -16,7 +16,7 @@ import sys import logging import datetime -from typing import Dict, Any, List, Tuple +from typing import Dict, Any, List, Tuple, Optional from collections import defaultdict # Add parent directory to path for imports @@ -39,7 +39,7 @@ def load_predictions_file( - router_name: str, split: str | None = None + router_name: str, split: Optional[str] = None ) -> List[Dict[str, Any]]: """ Load router predictions from JSON file. @@ -174,7 +174,7 @@ def save_predictions_file( predictions: List[Dict[str, Any]], router_name: str, create_backup: bool = False, - split: str | None = None, + split: Optional[str] = None, ) -> None: """ Save predictions back to file. @@ -203,7 +203,7 @@ def process_router_predictions( num_workers: int = 16, num_runs: int = 1, cached_results_dir: str = "./cached_results", - split: str | None = None, + split: Optional[str] = None, ) -> None: """ Process router predictions using parallel inference system. From 19ffea1e8e8af920a828084a8939571cb0804ed5 Mon Sep 17 00:00:00 2001 From: ZhiboYu1 Date: Tue, 3 Feb 2026 12:49:29 -0600 Subject: [PATCH 7/7] Refactor type --- automation/process_pr_submission.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py index c22db34..001c8f7 100644 --- a/automation/process_pr_submission.py +++ b/automation/process_pr_submission.py @@ -57,7 +57,11 @@ class CommandError(RuntimeError): """Raised when a subprocess fails and we want a cleaner error message.""" def __init__( - self, message: str, *, stdout: Optional[str] = None, stderr: Optional[str] = None + self, + message: str, + *, + stdout: Optional[str] = None, + stderr: Optional[str] = None, ): super().__init__(message) self.stdout = stdout
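A note on the Optional[...] refactor in patches 3 and 5-7 (an illustrative sketch, not part of the patch series): the bare "X | None" annotation is evaluated when the function is defined and raises TypeError before Python 3.10 unless "from __future__ import annotations" defers it, while typing.Optional[X] expresses the same type on older interpreters. The load_split helper below is a made-up name used only for demonstration:

from typing import Optional

# Portable spelling: Optional[str] is shorthand for Union[str, None]
# and also works on interpreters older than Python 3.10.
def load_split(split: Optional[str] = None) -> str:
    """Return the requested split name, or 'full' when none is given."""
    return split if split is not None else "full"

# The equivalent signature `split: str | None = None` only evaluates
# cleanly on Python 3.10+, which is why the patches above rewrite it
# as Optional[str].
print(load_split())        # full
print(load_split("GPQA"))  # GPQA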
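A second sketch (standalone, not code from scripts/prepare_gpqa_data.py): the data-preparation script seeds random with the question index before shuffling, and that per-question seed is the invariant that lets the ground-truth fallback in step 2.3 re-run the shuffle and land on the same option order and answer letter. The helper name and sample options below are hypothetical:

import random

def shuffle_question(correct, incorrect, index):
    """Deterministically shuffle one question's options and return
    (shuffled_options, answer_letter)."""
    options = [correct] + [opt for opt in incorrect if opt]
    random.seed(index)        # per-question seed -> same order on every run
    shuffled = options.copy()
    random.shuffle(shuffled)
    letter = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[shuffled.index(correct)]
    return shuffled, letter

# Shuffling twice with the same index reproduces the result, so the prompt
# file and the ground-truth file stay in sync even if one pass is redone.
first = shuffle_question("4", ["3", "5", "6"], index=7)
second = shuffle_question("4", ["3", "5", "6"], index=7)
assert first == second
print(first)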