6 changes: 6 additions & 0 deletions .gitignore
@@ -35,3 +35,9 @@ Thumbs.db

tools/.addlicense.lock
tools/addlicense

# Ignore all Markdown files
*.md

# Except for this one specific file
!README.md
6 changes: 5 additions & 1 deletion automation/process_pr_submission.py
@@ -57,7 +57,11 @@ class CommandError(RuntimeError):
"""Raised when a subprocess fails and we want a cleaner error message."""

def __init__(
self, message: str, *, stdout: str | None = None, stderr: str | None = None
self,
message: str,
*,
stdout: Optional[str] = None,
stderr: Optional[str] = None,
):
super().__init__(message)
self.stdout = stdout
18 changes: 18 additions & 0 deletions config/eval_config/zero-shot/GPQA.json
@@ -0,0 +1,18 @@
{
"eval_params": {
"dataset": "GPQA",
"eval_metrics": [
"mcq_accuracy"
],
"setting": "zero-shot",
"prompt": "Please read the following multiple-choice questions and provide the most likely correct answer based on the options given.\n\nContext: {Context}\n\nQuestion: {Question}\n\nOptions: \n{Options}\n\nProvide the correct letter choice in \\boxed{{X}}, where X is the correct letter choice. Keep the explanation or feedback within 3 sentences."
},
"management": {
"sub_dir": {
"input_config": "input_config/",
"raw_results": "raw_results.json",
"result_vis": "result_vis.png",
"output_config": "output_config.json"
}
}
}
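For reference, a minimal sketch of how this zero-shot prompt template could be rendered, assuming the eval runner fills it with Python's str.format() (the question and options below are illustrative placeholders, not taken from GPQA):

# Sketch only: render the GPQA prompt template with str.format(); the
# Context/Question/Options values here are made-up placeholders.
import json

with open("config/eval_config/zero-shot/GPQA.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

prompt = cfg["eval_params"]["prompt"].format(
    Context="",
    Question="Which particle mediates the electromagnetic force?",
    Options="A) Gluon\nB) Photon\nC) W boson\nD) Graviton",
)
# The doubled braces survive .format() as a literal \boxed{X}, so the model is
# asked to wrap its letter choice in \boxed{...} while the three placeholders
# are substituted.
print(prompt)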
2 changes: 1 addition & 1 deletion global_utils/robustness.py
@@ -28,7 +28,7 @@ def compute_robustness_score(
full_predictions: list[dict[str, Any]],
robustness_predictions: list[dict[str, Any]],
*,
name_manager: ModelNameManager | None = None,
name_manager: Optional[ModelNameManager] = None,
) -> Optional[float]:
"""
Compute the robustness flip ratio between full and robustness prediction sets.
31 changes: 29 additions & 2 deletions llm_evaluation/evaluate_models.py
@@ -169,6 +169,12 @@ def load_cost_config(self):
project_root = os.path.dirname(script_dir)

# Try multiple possible paths for cost file
# Get the directory of this file and construct paths relative to project root
current_file_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(
current_file_dir
) # Go up from llm_evaluation/ to project root

possible_paths = [
os.path.join(project_root, "model_cost", "model_cost.json"),
"./model_cost/model_cost.json",
@@ -186,6 +192,7 @@ def load_cost_config(self):
print(
f"Warning: Could not find cost configuration file. Tried: {possible_paths[:3]}..."
)
print(f"Current working directory: {os.getcwd()}")
self.cost_config = {}
return

@@ -203,7 +210,11 @@ def calculate_inference_cost(
self, model_name: str, token_usage: Dict[str, int]
) -> float:
"""Calculate inference cost based on token usage and model pricing."""
if not token_usage or not self.cost_config:
if not token_usage:
return 0.0

if not self.cost_config:
print("Warning: Cost config is empty!")
return 0.0

# Remove _batch suffix if present for cost lookup
@@ -227,6 +238,10 @@ def calculate_inference_cost(
print(
f"Warning: No cost configuration found for model {model_name} (lookup: {cost_lookup_name})"
)
if len(self.cost_config) > 0:
print(
f"Available cost config keys (first 10): {list(self.cost_config.keys())[:10]}"
)
return 0.0

# Calculate cost
@@ -260,6 +275,7 @@ def determine_dataset_from_global_index(self, global_index: str) -> str:
"FinQA": "FinQA",
"GeoBench": "GeoBench",
"GeoGraphyData": "GeoGraphyData_100k", # Fix the dataset name
"GPQA": "GPQA",
"GSM8K": "GSM8K",
"LiveCodeBench": "LiveCodeBench",
"MATH": "MATH",
@@ -480,7 +496,18 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[An
except Exception as e:
print(f"Error loading LiveCodeBench dataset: {e}")
return None

elif dataset_name == "GPQA":
gpqa_gt_path = "./dataset/gpqa_ground_truth.json"
if os.path.exists(gpqa_gt_path):
try:
with open(gpqa_gt_path, "r", encoding="utf-8") as f:
gpqa_data = json.load(f)
for item in gpqa_data:
if item.get("global_index") == global_index:
return item["answer"]
except Exception as e:
print(f"Error loading GPQA ground truth: {e}")
return None
# For other datasets, find the entry with matching global_index
if self.all_data is None:
return None
78 changes: 64 additions & 14 deletions llm_evaluation/run.py
@@ -98,17 +98,25 @@ def compute_arena_score(cost, accuracy, beta=0.1, c_max=200, c_min=0.0044):
return S


def load_predictions_file(router_name: str) -> List[Dict[str, Any]]:
def load_predictions_file(
router_name: str, split: Optional[str] = None
) -> List[Dict[str, Any]]:
"""
Load router predictions from JSON file.

Args:
router_name: Name of the router
split: Dataset split (optional). Used to determine prediction file name.

Returns:
List of prediction dictionaries
"""
prediction_path = f"./router_inference/predictions/{router_name}.json"
# Construct prediction path based on split (same logic as llm_inference/run.py)
if split and split in ["gpqa", "robustness"]:
filename = f"{router_name}-{split}"
else:
filename = router_name
prediction_path = f"./router_inference/predictions/{filename}.json"

if not os.path.exists(prediction_path):
raise FileNotFoundError(
@@ -134,15 +142,23 @@ def load_predictions_from_path(path: str) -> List[Dict[str, Any]]:
return json.load(f)


def save_predictions_file(predictions: List[Dict[str, Any]], router_name: str) -> None:
def save_predictions_file(
predictions: List[Dict[str, Any]], router_name: str, split: Optional[str] = None
) -> None:
"""
Save predictions back to file.

Args:
predictions: List of prediction dictionaries
router_name: Name of the router
split: Dataset split (optional). Used to determine prediction file name.
"""
prediction_path = f"./router_inference/predictions/{router_name}.json"
# Construct filename based on split (same logic as load_predictions_file)
if split and split in ["gpqa", "robustness"]:
filename = f"{router_name}-{split}"
else:
filename = router_name
prediction_path = f"./router_inference/predictions/{filename}.json"

# Create directory if it doesn't exist
os.makedirs(os.path.dirname(prediction_path), exist_ok=True)
@@ -169,6 +185,33 @@ def load_ground_truth_dataset(split: str) -> Dict[str, Dict[str, Any]]:
from datasets import load_from_disk
import pandas as pd

ground_truth_map = {}

# Handle GPQA split
if split == "gpqa":
gpqa_gt_path = "./dataset/gpqa_ground_truth.json"
if not os.path.exists(gpqa_gt_path):
raise FileNotFoundError(
f"GPQA ground truth not found at {gpqa_gt_path}. "
f"Please create it using the preparation script."
)
logger.info(f"Loading GPQA ground truth from {gpqa_gt_path}...")
with open(gpqa_gt_path, "r", encoding="utf-8") as f:
gpqa_data = json.load(f)

for item in gpqa_data:
global_index = item["global_index"]
ground_truth_map[global_index] = {
"question": item.get("question", ""),
"global_index": global_index,
"context": item.get("context", ""),
"answer": item["answer"],
"options": item.get("options", []),
"metadata": item.get("metadata", {}),
}

logger.info(f"Loaded {len(ground_truth_map)} GPQA ground truth samples")
return ground_truth_map
if split not in ["sub_10", "full"]:
raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'")

@@ -352,9 +395,11 @@ def evaluate_single_prediction(
)

# Calculate inference cost
# Use universal model name for cost lookup to respect user-defined mappings
token_usage = generated_result.get("token_usage", {})
inference_cost = evaluator.calculate_inference_cost(
universal_model_name, token_usage
universal_model_name,
token_usage, # Use universal_model_name to respect mapping in universal_model_names.py
)

# Update the prediction with evaluation results
@@ -394,7 +439,7 @@ def process_router_predictions(
logger.info(f"Using {num_workers} worker threads for parallel processing")

# Load predictions
predictions = load_predictions_file(router_name)
predictions = load_predictions_file(router_name, split=split)

# Separate regular and optimality entries
regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]
@@ -433,11 +478,14 @@
# Prepare tasks: filter out already evaluated entries (unless force is True)
tasks = []
for i, prediction in enumerate(predictions):
# Check if already evaluated (has accuracy and cost)
# Check if already evaluated (has accuracy and cost > 0)
# Skip if already evaluated AND force is False
# Note: cost > 0 check ensures costs were actually calculated (0.0 means not calculated)
if not force and (
prediction.get("accuracy") is not None
and prediction.get("cost") is not None
and prediction.get("cost", 0)
> 0 # Cost must be > 0 to be considered evaluated
):
already_evaluated_count += 1
continue
@@ -466,7 +514,7 @@ def evaluate_task_wrapper(

def save_callback():
"""Callback to save predictions file."""
save_predictions_file(predictions, router_name)
save_predictions_file(predictions, router_name, split=split)

# Run parallel evaluation
manager.evaluate_entries_parallel(
@@ -839,7 +887,9 @@ def run_robustness_only(router_name: str, robustness_path: Optional[str]) -> Non
target_path,
)

predictions = load_predictions_file(router_name)
predictions = load_predictions_file(
router_name, split=None
) # Load base file for robustness

try:
robustness_predictions = load_predictions_from_path(target_path)
@@ -1034,10 +1084,10 @@ def main():
"split",
nargs="?",
type=str,
choices=["sub_10", "full", "robustness"],
choices=["sub_10", "full", "robustness", "gpqa"],
help=(
"Dataset split to use for evaluation ('sub_10' for testing with answers, "
"'full' for submission, 'robustness' to compute robustness score only)."
"'full' for submission, 'robustness' to compute robustness score only, 'gpqa' for GPQA dataset)."
),
)
parser.add_argument(
@@ -1099,7 +1149,7 @@
# Run evaluation
try:
# If save_interval is 0, only save at the end
predictions = load_predictions_file(args.router_name)
predictions = load_predictions_file(args.router_name, split=args.split)
save_interval = (
args.save_interval if args.save_interval > 0 else len(predictions) + 1
)
@@ -1115,8 +1165,8 @@
logger.info("\nInterrupted by user. Saving partial results...")
try:
# Try to save current state if possible
predictions = load_predictions_file(args.router_name)
save_predictions_file(predictions, args.router_name)
predictions = load_predictions_file(args.router_name, split=args.split)
save_predictions_file(predictions, args.router_name, split=args.split)
logger.info("Partial results saved successfully.")
except Exception as e:
logger.warning(f"Could not save partial results: {e}")
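Both GPQA loaders above (_get_ground_truth in llm_evaluation/evaluate_models.py and load_ground_truth_dataset in llm_evaluation/run.py) read ./dataset/gpqa_ground_truth.json as a list of records keyed by global_index. A minimal sketch of the expected shape, with purely illustrative values (the index format and letter-style answer are assumptions, not taken from the dataset):

# Sketch only: write an example ./dataset/gpqa_ground_truth.json containing the
# fields the two loaders read; every value below is a made-up placeholder.
import json

example_records = [
    {
        "global_index": "GPQA_0",  # assumed format; must match the predictions' global_index
        "question": "Which particle mediates the electromagnetic force?",
        "context": "",
        "options": ["A) Gluon", "B) Photon", "C) W boson", "D) Graviton"],
        "answer": "B",  # returned as the ground truth used for mcq_accuracy
        "metadata": {},
    }
]

with open("./dataset/gpqa_ground_truth.json", "w", encoding="utf-8") as f:
    json.dump(example_records, f, indent=2)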
1 change: 1 addition & 0 deletions llm_inference/model_inference.py
@@ -172,6 +172,7 @@ def _get_provider(self, model_name: str) -> str:
"qwen/qwen3-vl-235b-a22b-instruct": "openrouter",
"qwen/qwen3-coder": "openrouter",
"x-ai/grok-code-fast-1": "openrouter",
"xiaomi/mimo-v2-flash": "openrouter",
"xiaomi/mimo-v2-flash:free": "openrouter",
"openai/gpt-oss-120b": "openrouter",
"qwen/qwen3-235b-a22b-2507": "openrouter",