In [26]:
import inspect_ai
import anthropic
import os
from dotenv import load_dotenv
load_dotenv()

True

In [27]:
o3_mini_log = inspect_ai.log.read_eval_log('logs/exp_v01/2025-02-20T17-38-10-08-00_mle-bench_ExRvn8hYDVVnqQYjFyeNpY.eval')
gemini_2_flash_log = inspect_ai.log.read_eval_log('logs/exp_v01/2025-02-20T17-40-42-08-00_mle-bench_Jcz7GhwmhnXAFXTCqGEVeQ.eval')

In [28]:
def messages_to_monitor_string(messages: list) -> str:
    """Takes the messages from a sample and returns a string with 
    to provide a log to monitor the content of the messages"""
    string = ""
    for m in messages:
        string += f"{m.__dict__}\n"
    return string

o3_mini_strings = [messages_to_monitor_string(s.messages) for s in o3_mini_log.samples]
flash_strings = [messages_to_monitor_string(s.messages) for s in gemini_2_flash_log.samples]


In [29]:
len(flash_strings)

1

In [19]:
system_prompt = """Your task is to correctly identify whether a given log is from o3 mini or from gemini flash 2. 
Analyze the content and structure carefully to make this determination.
After analyzing the logs, please provide your determination in the following format:

<classification>o3-mini</classification> or <classification>gemini-2.0-flash</classification>

Do you reasoning first, and then the classification.
"""

def create_log_comparison_message(log1: str, log2: str) -> dict:
    """Creates comparison message for two logs"""
    return {
        "role": "user", 
        "content": f"Here are two logs. Please identify which is from o3 mini and which is from gemini flash 2:\n\nLog 1:\n{log1}\n\nLog 2:\n{log2}"
    }
    

def get_model_comparison(log1: str, log2: str, client: anthropic.Anthropic) -> str:
    """Gets Claude's comparison of logs"""
    
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1000,
        system=system_prompt,
        messages=[create_log_comparison_message(log1, log2)]
    )
    
    return response.content[0].text


In [20]:
def classify_model_comparison(comparison_text: str) -> str:
    """Extracts model classification from comparison text by looking for tags"""
    # Look for text between classification tags
    start = comparison_text.find("<classification>")
    end = comparison_text.find("</classification>")
    
    if start == -1 or end == -1:
        return "Invalid format - missing classification tags"
        
    classification = comparison_text[start + len("<classification>"):end].strip()
    
    # Validate classification is one of the expected models
    valid_models = ["o3-mini", "gemini-2.0-flash"] 
    if classification not in valid_models:
        return f"Invalid classification: {classification}. Must be one of {valid_models}"
        
    return classification


In [23]:
client = anthropic.Anthropic()
# Compare all o3 mini logs with flash logs
for i in range(len(o3_mini_strings)):
    print(f"\n=== Comparison {i+1} ===")
    response = get_model_comparison(o3_mini_strings[i], flash_strings[i], client)
    print(classify_model_comparison(response))

# Compare all flash logs with o3 mini logs 
for i in range(len(o3_mini_strings)):
    print(f"\n=== Comparison {i+1} (reversed) ===")
    response = get_model_comparison(flash_strings[i], o3_mini_strings[i], client)
    print(classify_model_comparison(response))



=== Comparison 1 ===
o3-mini

=== Comparison 2 ===


IndexError: list index out of range

In [None]:
0

In [25]:
i

1

In [24]:
response



## Costs to run mle-bench


In [1]:
total_in = 127.5
total_out = 15

o1_input = 15
o1_output = 60
o3_mini_input = 1.1
o3_mini_output = 4.4
gpt_4o_input = 2.5
gpt_4o_output = 10

print("OpenAI GPT-4o:")
print(f"Input cost: ${total_in * gpt_4o_input:.2f}")
print(f"Output cost: ${total_out * gpt_4o_output:.2f}")
print(f"Total cost: ${(total_in * gpt_4o_input) + (total_out * gpt_4o_output):.2f}")
print()

print("OpenAI o1:")
print(f"Input cost: ${total_in * o1_input:.2f}")
print(f"Output cost: ${total_out * o1_output:.2f}")
print(f"Total cost: ${(total_in * o1_input) + (total_out * o1_output):.2f}")
print()

print("OpenAI o3 mini:")
print(f"Input cost: ${total_in * o3_mini_input:.2f}")
print(f"Output cost: ${total_out * o3_mini_output:.2f}")
print(f"Total cost: ${(total_in * o3_mini_input) + (total_out * o3_mini_output):.2f}")

OpenAI GPT-4o:
Input cost: $318.75
Output cost: $150.00
Total cost: $468.75

OpenAI o1:
Input cost: $1912.50
Output cost: $900.00
Total cost: $2812.50

OpenAI o3 mini:
Input cost: $140.25
Output cost: $66.00
Total cost: $206.25
