In [1]:
import inspect_ai
import anthropic
import os
import openai
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
o3_mini_log = inspect_ai.log.read_eval_log('logs/exp_v01/2025-02-20T17-38-10-08-00_mle-bench_ExRvn8hYDVVnqQYjFyeNpY.eval')
sonnet_log = inspect_ai.log.read_eval_log('logs/exp_v01/2025-02-21T10-54-16-08-00_mle-bench_8uEcjtpUJSRGGA6XEAjvxJ.eval')
opus_log = inspect_ai.log.read_eval_log('logs/exp_v01/2025-02-21T11-01-33-08-00_mle-bench_a8Bcr7e8vCR5dZpm2NGEus.eval')

In [3]:
def messages_to_monitor_string(messages: list) -> str:
    """Takes the messages from a sample and returns a string with 
    to provide a log to monitor the content of the messages"""
    string = ""
    for m in messages:
        string += f"{m.__dict__}\n"
    return string

o3_mini_strings = [messages_to_monitor_string(s.messages) for s in o3_mini_log.samples]
opus_strings = [messages_to_monitor_string(s.messages) for s in opus_log.samples]
sonnet_strings = [messages_to_monitor_string(s.messages) for s in sonnet_log.samples]




In [10]:
sonnet_description = """Claude 3.5 Sonnet is a well-rounded AI model that hits the sweet spot between power and practicality. It offers top-tier accuracy in language understanding, complex reasoning, and even image interpretation, yet it runs faster and cheaper than previous flagship models. Claude Sonnet 3.5's strengths are especially evident in coding and multi-step tasks - it can generate and debug code or analyze lengthy inputs with remarkable proficiency, making it ideal as a coding assistant or advanced chatbot. At the same time, it produces high-quality written content and handles nuanced conversation gracefully. In short, Claude 3.5 Sonnet delivers near state-of-the-art performance across a broad range of tasks and is best suited for versatile applications that demand both intelligence and efficiency - from powering developer tools and customer support bots to aiding content creators and analysts with fast, reliable insights."""

opus_description = """Claude 3 Opus stands at the top of Anthropic's model lineup, offering maximal capability at a premium. It delivers near state-of-the-art performance in knowledge, reasoning, and even coding, and its enormous 200K-token context window enables it to handle tasks no other model can in a single pass. This makes Opus the model of choice for complex, large-scale applications - from digesting lengthy documents to generating deeply nuanced creative content. Its strengths are its unparalleled comprehension, creativity, and ability to maintain context over long interactions, providing users with rich and intelligent outputs. However, these benefits come with significant cost and computational demand, meaning Opus is best reserved for use cases that truly need its power (where its insights justify the expense). In summary, Claude 3 Opus is an elite AI assistant suited for experts and enterprises tackling the hardest problems - it excels when tasked with extensive, complicated, or high-stakes challenges that lesser models cannot solve as effectively."""

o3_mini_description = """OpenAI's O3-Mini is a cost-effective reasoning specialist. It delivers high accuracy in math and coding tasks, even rivaling larger models on many benchmarks, all while operating at a fraction of the cost and with snappy speed. O3-Mini's strengths lie in its logical rigor and efficiency - it's ideal for applications like coding assistants, technical Q&A, and algorithmic problem solving where its STEM training and chain-of-thought reasoning let it excel. It does have limitations in creative breadth and multimodal input, sticking to a narrower domain of expertise. Overall, O3-Mini's quality-to-price ratio makes it best suited for developers, educators, or businesses who need a fast, budget-friendly AI to tackle structured problems and generate code, without requiring the full general knowledge or creativity of a giant model."""

In [5]:
system_prompt = """Your task is to classify which of the two description the performance and capabilities of Claude 3 Opus versus Claude 3.5 Sonnet based on the provided descriptions.
Analyze their strengths, weaknesses, and use cases carefully to make this determination.
After analyzing, you must provide your determination in the following format:

<classification>opus</classification> or <classification>sonnet</classification>

First provide your detailed reasoning comparing the models, then provide your classification in the tags of which model would be better suited for general purpose use.
"""

def create_model_comparison_message(description: str) -> dict:
    """Creates message for classifying whether a description is of Opus or Sonnet"""
    return {
        "role": "user", 
        "content": f"Here is one description of either Claude 3 Opus or Claude 3.5 Sonnet. Please classify which model this describes:\n\n{description}"
    }

def get_model_comparison(client, desc1: str,) -> str:
    """Gets model comparison of two model descriptions from either Claude or GPT-4"""
    
    if isinstance(client, anthropic.Anthropic):
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022", 
            max_tokens=1000,
            system=system_prompt,
            messages=[create_model_comparison_message(desc1)]  # Only send one description at a time
        )
        return response.content[0].text
        
    elif isinstance(client, openai.OpenAI):
        response = client.chat.completions.create(
            model="gpt-4o-2024-11-20",
            max_tokens=1000,
            messages=[
                {"role": "system", "content": system_prompt},
                create_model_comparison_message(desc1)  # Only send one description at a time
            ]
        )
        return response.choices[0].message.content
        
    else:
        raise ValueError("Client must be either anthropic.Anthropic or openai.OpenAI")


In [6]:
def classify_model_comparison(comparison_text: str) -> str:
    """Extracts model classification from comparison text by looking for tags"""
    # Look for text between classification tags
    start = comparison_text.find("<classification>")
    end = comparison_text.find("</classification>")
    
    if start == -1 or end == -1:
        print(comparison_text)
        
        return "Invalid format - missing classification tags"
        
    classification = comparison_text[start + len("<classification>"):end].strip()
    
    # Validate classification is one of the expected models
    valid_models = ["o3-mini", "gemini-2.0-flash", "opus", "sonnet"] 
    if classification not in valid_models:
        return f"Invalid classification: {classification}. Must be one of {valid_models}"
        
    return classification


In [8]:
# client = anthropic.Anthropic()
client = openai.OpenAI()
responses = [] 

# First compare all opus logs
for i in range(len(opus_strings)):
    print(f"\n=== Opus Comparison {i+1} ===")
    response = get_model_comparison(client, opus_strings[i])
    print(classify_model_comparison(response))
    responses.append(response)

# Then compare all sonnet logs
for i in range(len(sonnet_strings)):
    print(f"\n=== Sonnet Comparison {i+1} ===") 
    response = get_model_comparison(client, sonnet_strings[i])
    print(classify_model_comparison(response))
    responses.append(response)



=== Opus Comparison 1 ===
opus

=== Opus Comparison 2 ===
sonnet

=== Opus Comparison 3 ===
opus

=== Opus Comparison 4 ===
opus

=== Opus Comparison 5 ===
opus

=== Opus Comparison 6 ===
sonnet

=== Sonnet Comparison 1 ===
opus

=== Sonnet Comparison 2 ===
opus

=== Sonnet Comparison 3 ===
opus

=== Sonnet Comparison 4 ===
sonnet

=== Sonnet Comparison 5 ===
opus

=== Sonnet Comparison 6 ===
opus


In [9]:
for r in response:
    print("\n\n--------\n\n\n")
    print(r)



--------



B


--------



a


--------



s


--------



e


--------



d


--------



 


--------



o


--------



n


--------



 


--------



t


--------



h


--------



e


--------



 


--------



d


--------



e


--------



s


--------



c


--------



r


--------



i


--------



p


--------



t


--------



i


--------



o


--------



n


--------



 


--------



a


--------



n


--------



d


--------



 


--------



t


--------



a


--------



s


--------



k


--------



 


--------



f


--------



l


--------



o


--------



w


--------



 


--------



p


--------



r


--------



e


--------



s


--------



e


--------



n


--------



t


--------



e


--------



d


--------



,


--------



 


--------



i


--------



t


--------



 


--------



a


--------



p


--------



p


--------



e


--------



a


--------



r


--------



s


--------



 


--------



t


------

## Costs to run mle-bench


In [1]:
total_in = 127.5
total_out = 15

o1_input = 15
o1_output = 60
o3_mini_input = 1.1
o3_mini_output = 4.4
gpt_4o_input = 2.5
gpt_4o_output = 10
claude_opus_input = 15
claude_opus_output = 75
claude_sonnet_input = 3
claude_sonnet_output = 15
claude_haiku_input = 0.8
claude_haiku_output = 4

print("OpenAI GPT-4o:")
print(f"Input cost: ${total_in * gpt_4o_input:.2f}")
print(f"Output cost: ${total_out * gpt_4o_output:.2f}")
print(f"Total cost: ${(total_in * gpt_4o_input) + (total_out * gpt_4o_output):.2f}")
print()

print("OpenAI o1:")
print(f"Input cost: ${total_in * o1_input:.2f}")
print(f"Output cost: ${total_out * o1_output:.2f}")
print(f"Total cost: ${(total_in * o1_input) + (total_out * o1_output):.2f}")
print()

print("OpenAI o3 mini:")
print(f"Input cost: ${total_in * o3_mini_input:.2f}")
print(f"Output cost: ${total_out * o3_mini_output:.2f}")
print(f"Total cost: ${(total_in * o3_mini_input) + (total_out * o3_mini_output):.2f}")
print()

print("Claude 3 Opus:")
print(f"Input cost: ${total_in * claude_opus_input:.2f}")
print(f"Output cost: ${total_out * claude_opus_output:.2f}")
print(f"Total cost: ${(total_in * claude_opus_input) + (total_out * claude_opus_output):.2f}")
print()

print("Claude 3.5 Sonnet:")
print(f"Input cost: ${total_in * claude_sonnet_input:.2f}")
print(f"Output cost: ${total_out * claude_sonnet_output:.2f}")
print(f"Total cost: ${(total_in * claude_sonnet_input) + (total_out * claude_sonnet_output):.2f}")
print()

print("Claude 3.5 Haiku:")
print(f"Input cost: ${total_in * claude_haiku_input:.2f}")
print(f"Output cost: ${total_out * claude_haiku_output:.2f}")
print(f"Total cost: ${(total_in * claude_haiku_input) + (total_out * claude_haiku_output):.2f}")

OpenAI GPT-4o:
Input cost: $318.75
Output cost: $150.00
Total cost: $468.75

OpenAI o1:
Input cost: $1912.50
Output cost: $900.00
Total cost: $2812.50

OpenAI o3 mini:
Input cost: $140.25
Output cost: $66.00
Total cost: $206.25

Claude 3 Opus:
Input cost: $1912.50
Output cost: $1125.00
Total cost: $3037.50

Claude 3.5 Sonnet:
Input cost: $382.50
Output cost: $225.00
Total cost: $607.50

Claude 3.5 Haiku:
Input cost: $102.00
Output cost: $60.00
Total cost: $162.00
