In [15]:
#!/usr/bin/env python3
"""
Main entry point for running bizCon benchmarks.
"""

import os
import sys
import argparse
import yaml
import json
import datetime
from pathlib import Path
import sys

# Path of the program that started this interpreter, as reported by sys.argv[0].
# NOTE(review): inside a Jupyter kernel sys.argv[0] is typically the kernel
# launcher rather than a file of this project - confirm that the directory
# added below is the one that actually holds the project packages.
current_path = sys.argv[0]

# Put the directory containing `current_path` at the front of sys.path so the
# project-local imports that follow (models, scenarios, evaluators, tools,
# core) are looked up there first.
sys.path.insert(0, str(Path(current_path).resolve().parent))

# Import our modules
from models import get_model_client, list_supported_models
from scenarios import load_scenarios, list_available_scenarios
from evaluators import get_all_evaluators
from tools import get_default_tools
from core.pipeline import EvaluationPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

In [16]:
def load_config(config_path):
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file

    Returns:
        The parsed YAML document. An empty (or comment-only) file yields an
        empty dictionary instead of None, so callers can use
        ``config.get(...)`` without a separate None check.

    Raises:
        OSError: If the file cannot be opened
        yaml.YAMLError: If the content is not valid YAML
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    return {} if loaded is None else loaded

In [17]:
def load_scenarios_by_config(config, scenario_ids=None):
    """
    Load scenarios based on configuration and optional specific IDs.

    The first source that yields something wins, in this order:
      1. ``scenario_ids`` passed by the caller
      2. ``evaluation.scenarios`` in the config
      3. ``evaluation.scenario_categories`` in the config, matched against
         the part of each available scenario ID before the first underscore
      4. every available scenario

    Args:
        config: Configuration dictionary
        scenario_ids: Optional list of specific scenario IDs to load

    Returns:
        Whatever ``load_scenarios`` returns for the selected IDs
    """
    if scenario_ids:
        # Explicit IDs from the caller take precedence over the config
        return load_scenarios(scenario_ids)

    # A key written with no value in YAML (e.g. a bare `evaluation:`) parses
    # to None, so `.get(key, {})` is not enough; fall back explicitly.
    evaluation_cfg = config.get('evaluation') or {}

    # Scenarios listed directly in the config
    config_scenarios = evaluation_cfg.get('scenarios') or []
    if config_scenarios:
        return load_scenarios(config_scenarios)

    # Scenario categories listed in the config
    scenario_categories = evaluation_cfg.get('scenario_categories') or []
    if scenario_categories:
        # The category is the prefix of the scenario ID up to the first '_'
        matching_ids = [
            scenario_id
            for scenario_id in list_available_scenarios().keys()
            if scenario_id.split('_')[0] in scenario_categories
        ]
        return load_scenarios(matching_ids)

    # Default to all scenarios
    return load_scenarios(list(list_available_scenarios().keys()))

In [18]:
def load_models_from_config(config):
    """
    Load model clients from configuration.

    For each entry under ``models`` the API key is resolved in this order:
      1. the ``<PROVIDER>_API_KEY`` environment variable, if set and non-empty
      2. the entry's ``api_key`` value
      3. the placeholder ``True``, so a client is still created when no key is
         available (presumably for providers that need none - confirm)

    Args:
        config: Dictionary with model configurations

    Returns:
        List of model client instances, in configuration order
    """
    models = []
    # `models:` with no value parses to None in YAML; treat it as "no models".
    for model_config in config.get('models') or []:
        provider = model_config.get('provider')
        model_name = model_config.get('name')

        # Only consult the environment when a provider name exists; a missing
        # provider is left for get_model_client to report.
        env_key = os.environ.get(f"{provider.upper()}_API_KEY") if provider else None

        # `or` (rather than a .get default) lets a set-but-empty environment
        # variable fall through to the key from the config file.
        api_key = env_key or model_config.get('api_key') or True

        # Create model client
        model = get_model_client(
            provider=provider,
            model_name=model_name,
            api_key=api_key,
            temperature=model_config.get('temperature', 0.7),
            max_tokens=model_config.get('max_tokens', 1024),
            endpoint=model_config.get('azure_endpoint'),
            api_version=model_config.get('api_version'),
            # `parameters:` with no value parses to None; expand nothing then
            **(model_config.get('parameters') or {})
        )
        models.append(model)
        print(f"Initialized model: {model}")

    return models

In [19]:
def run_benchmark(config_path, output_dir, scenario_ids=None, parallel=False, verbose=False):
    """
    Execute a full benchmark run and persist its results.

    Steps: read the config, build model clients and scenarios, configure
    evaluators and tools from the ``evaluation`` section, run the pipeline,
    then write ``results.json`` and the report into a fresh
    ``benchmark_<timestamp>`` directory under ``output_dir``.

    Args:
        config_path: Path to configuration file
        output_dir: Directory under which the timestamped result directory is created
        scenario_ids: Optional list of scenario IDs to run
        parallel: Whether to run evaluations in parallel
        verbose: Whether to display detailed progress

    Returns:
        The results object produced by the pipeline, or None when no models
        or no scenarios could be loaded.
    """
    config = load_config(config_path)

    # Bail out early when there is nothing to evaluate with
    models = load_models_from_config(config)
    if not models:
        print("Error: No models loaded. Check your API keys and configuration.")
        return

    # ... or nothing to evaluate on
    scenarios = load_scenarios_by_config(config, scenario_ids)
    if not scenarios:
        print("Error: No scenarios loaded. Check your scenario IDs or configuration.")
        return

    print(f"Loaded {len(scenarios)} scenarios for evaluation")

    # All remaining settings come from the 'evaluation' section
    evaluation_cfg = config.get('evaluation', {})

    evaluators = get_all_evaluators(weights=evaluation_cfg.get('evaluator_weights', {}))

    # Override the error rate only for tools that the config mentions
    configured_rates = evaluation_cfg.get('tool_error_rates', {})
    tools = get_default_tools()
    for tool_id in tools:
        if tool_id in configured_rates:
            tools[tool_id].error_rate = configured_rates[tool_id]

    pipeline_settings = {
        "models": models,
        "scenarios": scenarios,
        "evaluators": evaluators,
        "tools": tools,
        "num_runs": evaluation_cfg.get('num_runs', 1),
        "parallel": parallel,
        "verbose": verbose,
    }
    pipeline = EvaluationPipeline(**pipeline_settings)

    print(f"Running benchmark with {len(models)} models on {len(scenarios)} scenarios...")
    results = pipeline.run()

    # Each run gets its own directory named after the current local time
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(output_dir, f"benchmark_{stamp}")
    os.makedirs(run_dir, exist_ok=True)

    # Raw results first, then the report
    results_path = os.path.join(run_dir, "results.json")
    with open(results_path, 'w') as out_file:
        json.dump(results, out_file, indent=2)
    print(f"Results saved to {results_path}")

    pipeline.generate_report(run_dir)
    print(f"Report generated in {run_dir}")

    # One line per model with its overall score
    print("\nSummary of Results:")
    overall_scores = results["summary"]["overall_scores"]
    for model_id, score in overall_scores.items():
        print(f"  {model_id}: {score:.2f}")

    return results

In [20]:
import copy

def remove_required_from_properties(tool_definitions):
    """
    Return a copy of the tool definitions without per-property 'required' keys.

    For every tool, each entry of ``function.parameters.properties`` that is a
    dictionary has its 'required' key removed. Nothing else is changed; in
    particular a 'required' key that sits next to 'properties' on the
    parameters object is left in place.

    Args:
        tool_definitions: List of tool definition dictionaries

    Returns:
        A deep copy of ``tool_definitions`` with the keys removed; the input
        itself is not modified.
    """
    cleaned_definitions = copy.deepcopy(tool_definitions)
    for tool in cleaned_definitions:
        # Any level may be missing or explicitly null; treat both as empty.
        parameters = (tool.get('function') or {}).get('parameters') or {}
        properties = parameters.get('properties') or {}
        for prop in properties.values():
            # Property schemas are not always dictionaries (e.g. the boolean
            # schema `True`); only dictionaries can carry a 'required' key.
            if isinstance(prop, dict):
                prop.pop('required', None)
    return cleaned_definitions

In [21]:
# Build everything needed for one manual call: the config, the model clients,
# only the first scenario, and the default tool set.
config = load_config("config/models.yaml")
models = load_models_from_config(config)
scenarios = load_scenarios_by_config(config)[:1]
tools = get_default_tools()

# Collect the definitions of the tools the scenario asks for (skipping any ID
# that is not in the default set), then strip per-property 'required' keys.
tool_definitions = remove_required_from_properties([
    tools[required_id].get_definition()
    for required_id in scenarios[0].tools_required
    if required_id in tools
])

# Send the scenario's initial message to the first model; pass None instead
# of an empty list when no tool definitions were collected.
initial_message = scenarios[0].get_initial_message()
response = models[0].generate_response(
    messages=[initial_message],
    tools=tool_definitions or None
)

Initialized model: AzureClient(gpt-4o-mini-1)


In [22]:
# Show the raw object returned by generate_response for manual inspection.
print("Response:", response)

Response: ChatCompletion(id='chatcmpl-Bs4nmqzNNQ59th8QLtNIa3CuCgoCF', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_ZJQT6cbqr2BEUUSYbSxgaemh', function=Function(arguments='{"product_category":"data_analytics","industry":"financial_services"}', name='product_catalog'), type='function')]), content_filter_results={})], created=1752227870, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_efad92c60b', usage=CompletionUsage(completion_tokens=23, prompt_tokens=344, total_tokens=367, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), prompt_filter_results=[{'prompt_index': 0, 'con