In [6]:
from repenseai.genai.agent import list_models
list_models('chat')

['deepseek-chat',
 'deepseek-reasoner',
 'sabia-3',
 'sabiazinho-3',
 'gpt-4o-mini',
 'gpt-4o',
 'gpt-4.1',
 'gpt-4.1-mini',
 'gpt-4.1-nano',
 'o1',
 'o1-mini',
 'o1-pro',
 'o3-mini',
 'o3',
 'o4-mini',
 'claude-3-5-haiku-20241022',
 'claude-3-5-sonnet-20241022',
 'claude-3-7-sonnet-20250219',
 'gemini-2.5-pro-exp-03-25',
 'gemini-1.5-pro',
 'gemini-1.5-flash',
 'gemini-2.0-flash',
 'gemini-2.0-flash-lite-preview-02-05',
 'mistral-large-latest',
 'mistral-small-latest',
 'pixtral-12b-2409',
 'command-r-plus-08-2024',
 'command-r-08-2024',
 'llama-3.3-70b-versatile',
 'Meta-Llama-3.1-405B-Instruct',
 'meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo',
 'meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo',
 'meta-llama/Llama-3.3-70B-Instruct-Turbo',
 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
 'Qwen/Qwen2-VL-72B-Instruct',
 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8',
 'mistralai/Mistral-7B-Instruct-v0.2',
 'grok-2',
 'grok-2-vision',


In [5]:
import json
import numpy as np
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

from repenseai.genai.agent import Agent
from repenseai.genai.tasks.api import Task
from repenseai.utils.text import extract_json_text

MODELS = [
    "claude-3-5-sonnet-20241022",
    "gpt-4.1",
    "o4-mini",
    "deepseek-chat",
    "deepseek-reasoner"
]

# Define equações matemáticas complexas para teste
EQUATIONS = {
    "baseline": "1 + 1",
    "basic_arithmetic": "2345678901234567890 * 1234567890987654321",
    "nested_parentheses": "(((5 * 7 + 3) * 4 - 2) * 6 + 8) / 2",
    "exponents": "2**64 + 3**21 - 5**15",
    "large_division": "987654321987654321 / 123456789",
    "mixed_operations": "sqrt(3**2 + 4**2) * (15 + 25) / 2",
    "complex_fractions": "(1/3 + 2/5) * (7/8 - 3/4)",
    "scientific_notation": "2.5e20 * 4.8e15 / 1.2e18",
    "trigonometry": "sin(45) + cos(60) + tan(30)",
    "logarithms": "log(1000000) + log2(256) + log10(1000000000)",
    "complex_roots": "((sqrt(169) + sqrt(144)) * sqrt(25))",
}

SOLVER_PROMPT = """
You are a mathematical expert. Solve the following equation and provide your solution and explanation.
Always return the final numerical result (not in scientific notation).

Equation: {equation}

Return your response in the following JSON format:
{
    "solution": <numerical_result>,
    "explanation": "Step by step explanation of how you solved it"
}
"""

def evaluate_equation(equation: str) -> float:
    """Avalia a equação usando eval() do Python com funções matemáticas do numpy"""
    namespace = {
        'sin': np.sin,
        'cos': np.cos,
        'tan': np.tan,
        'sqrt': np.sqrt,
        'log': np.log,
        'log2': np.log2,
        'log10': np.log10,
        'pi': np.pi,
        'e': np.e
    }
    return float(eval(equation, {"__builtins__": {}}, namespace))

def check_response(predicted: int | float, actual: float) -> bool:
    """Check if predicted value matches actual value"""
    if predicted == actual:
        return True
    return abs(predicted - actual) < 1e-10

def solve_equation(agent: Agent, eq_name: str, equation: str, print_lock: Lock) -> tuple[str, str]:
    """Solve a single equation using the agent"""
    task = Task(
        user=SOLVER_PROMPT,
        agent=agent,
        simple_response=True
    )
    response = task.run({"equation": equation})
    with print_lock:
        print(f"Completed: {eq_name}")
    return eq_name, response

def process_model_results(model_name: str, equations_results: Dict[str, Any], actual_results: Dict[str, float]) -> Dict[str, Any]:
    """Process results for a single model"""
    model_results = {}
    
    for eq_name, text_response in equations_results.items():
        try:
            json_response = extract_json_text(text_response)
            response = json.loads(json_response)
            predicted = float(response["solution"])
            correct = check_response(predicted, actual_results[eq_name])
        except Exception as e:
            predicted = None
            correct = False

        model_results[eq_name] = {
            "predicted": predicted,
            "actual": actual_results[eq_name],
            "correct": correct
        }
        
        print(f"\n{model_name} - {eq_name}:")
        print(f"Predicted: {predicted}")
        print(f"Actual: {actual_results[eq_name]}")
    
    return model_results

# Calculate actual results
actual_results = {name: evaluate_equation(equation) for name, equation in EQUATIONS.items()}
results = {}

# Process each model
for model_name in MODELS:
    print(f"\nProcessing model: {model_name}")
    
    agent = Agent(
        model=model_name, 
        model_type="chat",
    )
    
    equations_results = {}
    print_lock = Lock()
    
    # Create thread pool and submit tasks
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_eq = {
            executor.submit(solve_equation, agent, eq_name, equation, print_lock): eq_name 
            for eq_name, equation in EQUATIONS.items()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_eq):
            eq_name, response = future.result()
            equations_results[eq_name] = response
    
    # Process results for this model
    results[model_name] = process_model_results(model_name, equations_results, actual_results)

# Final Analysis
print("\n=== FINAL ANALYSIS ===")
for model_name in MODELS:
    correct_count = sum(1 for eq_result in results[model_name].values() if eq_result["correct"])
    total_count = len(EQUATIONS)
    accuracy = (correct_count / total_count) * 100
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy:.2f}%")


Processing model: claude-3-5-sonnet-20241022


2025-04-18 10:40:01 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 10:40:03 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 10:40:03 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 10:40:04 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 10:40:05 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: basic_arithmetic


2025-04-18 10:40:05 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 10:40:07 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 10:40:07 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 10:40:08 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 10:40:09 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 10:40:09 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: logarithms

claude-3-5-sonnet-20241022 - baseline:
Predicted: 2.0
Actual: 2.0

claude-3-5-sonnet-20241022 - nested_parentheses:
Predicted: 494.0
Actual: 454.0

claude-3-5-sonnet-20241022 - exponents:
Predicted: 1.8446744073709552e+19
Actual: 1.8446744053652326e+19

claude-3-5-sonnet-20241022 - mixed_operations:
Predicted: 200.0
Actual: 100.0

claude-3-5-sonnet-20241022 - basic_arithmetic:
Predicted: 2.8960196201625335e+39
Actual: 2.895899854031399e+36

claude-3-5-sonnet-20241022 - large_division:
Predicted: 8000000016000000.0
Actual: 8000000080.900001

claude-3-5-sonnet-20241022 - complex_fractions:
Predicted: 0.025
Actual: 0.09166666666666667

claude-3-5-sonnet-20241022 - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

claude-3-5-sonnet-20241022 - complex_roots:
Predicted: 65.0
Actual: 125.0

claude-3-5-sonnet-20241022 - trigonometry:
Predicted: 1.9330127018922194
Actual: -6.5068406525273135

claude-3-5-sonnet-20241022 - logarithms:
Predicted: 25.0
Actual: 30.815510557

2025-04-18 10:40:11 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:40:11 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline
Completed: basic_arithmetic


2025-04-18 10:40:12 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 10:40:12 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 10:40:13 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:40:13 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses
Completed: mixed_operations


2025-04-18 10:40:14 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 10:40:15 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 10:40:16 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 10:40:16 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 10:40:18 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry

gpt-4.1 - baseline:
Predicted: 2.0
Actual: 2.0

gpt-4.1 - basic_arithmetic:
Predicted: 2.895899852717331e+39
Actual: 2.895899854031399e+36

gpt-4.1 - large_division:
Predicted: 8000000072.0
Actual: 8000000080.900001

gpt-4.1 - exponents:
Predicted: None
Actual: 1.8446744053652326e+19

gpt-4.1 - nested_parentheses:
Predicted: 436.0
Actual: 454.0

gpt-4.1 - mixed_operations:
Predicted: 250.0
Actual: 100.0

gpt-4.1 - scientific_notation:
Predicted: None
Actual: 1e+18

gpt-4.1 - complex_roots:
Predicted: 130.0
Actual: 125.0

gpt-4.1 - logarithms:
Predicted: 27.0
Actual: 30.815510557964274

gpt-4.1 - complex_fractions:
Predicted: 0.175
Actual: 0.09166666666666667

gpt-4.1 - trigonometry:
Predicted: 1.618253968
Actual: -6.5068406525273135

Processing model: o4-mini


2025-04-18 10:40:20 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 10:40:22 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 10:40:26 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 10:40:26 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 10:40:32 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 10:40:33 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 10:40:37 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 10:40:39 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 10:40:46 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 10:41:00 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 10:43:07 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic

o4-mini - baseline:
Predicted: 2.0
Actual: 2.0

o4-mini - nested_parentheses:
Predicted: 454.0
Actual: 454.0

o4-mini - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

o4-mini - mixed_operations:
Predicted: 100.0
Actual: 100.0

o4-mini - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

o4-mini - trigonometry:
Predicted: 1.7844570502
Actual: -6.5068406525273135

o4-mini - complex_roots:
Predicted: 125.0
Actual: 125.0

o4-mini - logarithms:
Predicted: 23.0
Actual: 30.815510557964274

o4-mini - exponents:
Predicted: 1.8446744053652326e+19
Actual: 1.8446744053652326e+19

o4-mini - large_division:
Predicted: None
Actual: 8000000080.900001

o4-mini - basic_arithmetic:
Predicted: 2.895899854031399e+36
Actual: 2.895899854031399e+36

Processing model: deepseek-chat


2025-04-18 10:43:08 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:43:08 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:43:08 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:43:08 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:43:09 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 10:43:20 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 10:43:34 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 10:43:51 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 10:43:55 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic


2025-04-18 10:44:19 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 10:44:29 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions
Completed: scientific_notation
Completed: trigonometry
Completed: logarithms
Completed: complex_roots

deepseek-chat - nested_parentheses:
Predicted: 467.0
Actual: 454.0

deepseek-chat - large_division:
Predicted: 8000000073.0
Actual: 8000000080.900001

deepseek-chat - exponents:
Predicted: 1.8446744073709552e+19
Actual: 1.8446744053652326e+19

deepseek-chat - baseline:
Predicted: 2.0
Actual: 2.0

deepseek-chat - basic_arithmetic:
Predicted: 2.8958998532864333e+48
Actual: 2.895899854031399e+36

deepseek-chat - mixed_operations:
Predicted: 100.0
Actual: 100.0

deepseek-chat - complex_fractions:
Predicted: -0.03666666666666667
Actual: 0.09166666666666667

deepseek-chat - scientific_notation:
Predicted: 1000000000000000.0
Actual: 1e+18

deepseek-chat - trigonometry:
Predicted: 1.760344
Actual: -6.5068406525273135

deepseek-chat - logarithms:
Predicted: 19.0
Actual: 30.815510557964274

deepseek-chat - complex_roots:
Predicted: 65.0
Actual: 125.0

Processing mod

2025-04-18 10:45:34 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:45:34 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:45:34 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:45:34 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:45:35 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 10:45:59 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic


2025-04-18 10:48:48 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 10:52:23 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 10:52:47 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:55:59 - INFO - Erro na chamada da API - modelo deepseek-reasoner: Expecting value: line 10 column 1 (char 9)


Completed: mixed_operations


2025-04-18 10:55:59 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"
2025-04-18 10:58:49 - INFO - Erro na chamada da API - modelo deepseek-reasoner: Expecting value: line 10 column 1 (char 9)


Completed: complex_fractions


2025-04-18 10:58:49 - INFO - HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


Completed: large_division
Completed: scientific_notation
Completed: trigonometry
Completed: logarithms
Completed: complex_roots

deepseek-reasoner - nested_parentheses:
Predicted: 454.0
Actual: 454.0

deepseek-reasoner - basic_arithmetic:
Predicted: 2.8958998583384054e+37
Actual: 2.895899854031399e+36

deepseek-reasoner - exponents:
Predicted: 1.8446744053652326e+19
Actual: 1.8446744053652326e+19

deepseek-reasoner - baseline:
Predicted: None
Actual: 2.0

deepseek-reasoner - mixed_operations:
Predicted: None
Actual: 100.0

deepseek-reasoner - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

deepseek-reasoner - large_division:
Predicted: 8000000008.1
Actual: 8000000080.900001

deepseek-reasoner - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

deepseek-reasoner - trigonometry:
Predicted: 1.7845
Actual: -6.5068406525273135

deepseek-reasoner - logarithms:
Predicted: 23.0
Actual: 30.815510557964274

deepseek-reasoner - complex_roots:
Predicted: 125.0
Actual: 125.0

===

### Reasoning antes da resposta

In [None]:
import json
import numpy as np
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

from repenseai.genai.agent import Agent
from repenseai.genai.tasks.api import Task
from repenseai.utils.text import extract_json_text

MODELS = [
    "claude-3-5-sonnet-20241022",
    "gpt-4.1",
]

# Define equações matemáticas complexas para teste
EQUATIONS = {
    "baseline": "1 + 1",
    "basic_arithmetic": "2345678901234567890 * 1234567890987654321",
    "nested_parentheses": "(((5 * 7 + 3) * 4 - 2) * 6 + 8) / 2",
    "exponents": "2**64 + 3**21 - 5**15",
    "large_division": "987654321987654321 / 123456789",
    "mixed_operations": "sqrt(3**2 + 4**2) * (15 + 25) / 2",
    "complex_fractions": "(1/3 + 2/5) * (7/8 - 3/4)",
    "scientific_notation": "2.5e20 * 4.8e15 / 1.2e18",
    "trigonometry": "sin(45) + cos(60) + tan(30)",
    "logarithms": "log(1000000) + log2(256) + log10(1000000000)",
    "complex_roots": "((sqrt(169) + sqrt(144)) * sqrt(25))",
}

SOLVER_PROMPT = """
You are a mathematical expert. Solve the following equation and provide your solution and explanation.
Always return the final numerical result (not in scientific notation).

Equation: {equation}

Return your response in the following JSON format:
{
    "explanation": "Step by step explanation of how you solved it",
    "solution": <numerical_result>
}
"""

def evaluate_equation(equation: str) -> float:
    """Avalia a equação usando eval() do Python com funções matemáticas do numpy"""
    namespace = {
        'sin': np.sin,
        'cos': np.cos,
        'tan': np.tan,
        'sqrt': np.sqrt,
        'log': np.log,
        'log2': np.log2,
        'log10': np.log10,
        'pi': np.pi,
        'e': np.e
    }
    return float(eval(equation, {"__builtins__": {}}, namespace))

def check_response(predicted: int | float, actual: float) -> bool:
    """Check if predicted value matches actual value"""
    if predicted == actual:
        return True
    return abs(predicted - actual) < 1e-10

def solve_equation(agent: Agent, eq_name: str, equation: str, print_lock: Lock) -> tuple[str, str]:
    """Solve a single equation using the agent"""
    task = Task(
        user=SOLVER_PROMPT,
        agent=agent,
        simple_response=True
    )
    response = task.run({"equation": equation})
    with print_lock:
        print(f"Completed: {eq_name}")
    return eq_name, response

def process_model_results(model_name: str, equations_results: Dict[str, Any], actual_results: Dict[str, float]) -> Dict[str, Any]:
    """Process results for a single model"""
    model_results = {}
    
    for eq_name, text_response in equations_results.items():
        try:
            json_response = extract_json_text(text_response)
            response = json.loads(json_response)
            predicted = float(response["solution"])
            correct = check_response(predicted, actual_results[eq_name])
        except Exception as e:
            predicted = None
            correct = False

        model_results[eq_name] = {
            "predicted": predicted,
            "actual": actual_results[eq_name],
            "correct": correct
        }
        
        print(f"\n{model_name} - {eq_name}:")
        print(f"Predicted: {predicted}")
        print(f"Actual: {actual_results[eq_name]}")
    
    return model_results

# Calculate actual results
actual_results = {name: evaluate_equation(equation) for name, equation in EQUATIONS.items()}
results = {}

# Process each model
for model_name in MODELS:
    print(f"\nProcessing model: {model_name}")
    
    agent = Agent(
        model=model_name, 
        model_type="chat",
    )
    
    equations_results = {}
    print_lock = Lock()
    
    # Create thread pool and submit tasks
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_eq = {
            executor.submit(solve_equation, agent, eq_name, equation, print_lock): eq_name 
            for eq_name, equation in EQUATIONS.items()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_eq):
            eq_name, response = future.result()
            equations_results[eq_name] = response
    
    # Process results for this model
    results[model_name] = process_model_results(model_name, equations_results, actual_results)

# Final Analysis
print("\n=== FINAL ANALYSIS ===")
for model_name in MODELS:
    correct_count = sum(1 for eq_result in results[model_name].values() if eq_result["correct"])
    total_count = len(EQUATIONS)
    accuracy = (correct_count / total_count) * 100
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy:.2f}%")


Processing model: claude-3-5-sonnet-20241022


2025-04-18 11:10:19 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-04-18 11:10:19 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-04-18 11:10:19 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: baseline
Completed: basic_arithmetic
Completed: nested_parentheses


2025-04-18 11:10:20 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 11:10:20 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 11:10:22 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 11:10:24 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-04-18 11:10:24 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-04-18 11:10:24 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-04-18 11:10:24 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: scientific_notation
Completed: trigonometry
Completed: logarithms
Completed: complex_fractions


2025-04-18 11:10:25 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Completed: complex_roots

claude-3-5-sonnet-20241022 - baseline:
Predicted: 2.0
Actual: 2.0

claude-3-5-sonnet-20241022 - basic_arithmetic:
Predicted: 2.896019960666578e+45
Actual: 2.895899854031399e+36

claude-3-5-sonnet-20241022 - nested_parentheses:
Predicted: 454.0
Actual: 454.0

claude-3-5-sonnet-20241022 - large_division:
Predicted: 8000000007.0
Actual: 8000000080.900001

claude-3-5-sonnet-20241022 - exponents:
Predicted: 1.8446744073689496e+19
Actual: 1.8446744053652326e+19

claude-3-5-sonnet-20241022 - mixed_operations:
Predicted: 100.0
Actual: 100.0

claude-3-5-sonnet-20241022 - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

claude-3-5-sonnet-20241022 - trigonometry:
Predicted: 1.7844570503761732
Actual: -6.5068406525273135

claude-3-5-sonnet-20241022 - logarithms:
Predicted: 30.815510557964274
Actual: 30.815510557964274

claude-3-5-sonnet-20241022 - complex_fractions:
Predicted: 0.09166666666666667
Actual: 0.09166666666666667

claude-3-5-sonnet-20241022 - complex_roots:

2025-04-18 11:10:27 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 11:10:28 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-18 11:10:28 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-18 11:10:28 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses
Completed: basic_arithmetic
Completed: large_division


2025-04-18 11:10:29 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 11:10:30 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 11:10:30 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 11:10:31 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-18 11:10:31 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots
Completed: scientific_notation


2025-04-18 11:10:31 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 11:10:32 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations

gpt-4.1 - baseline:
Predicted: 2.0
Actual: 2.0

gpt-4.1 - nested_parentheses:
Predicted: 454.0
Actual: 454.0

gpt-4.1 - basic_arithmetic:
Predicted: 2.895899854691262e+39
Actual: 2.895899854031399e+36

gpt-4.1 - large_division:
Predicted: 8000000081.0
Actual: 8000000080.900001

gpt-4.1 - exponents:
Predicted: 1.8446724016484176e+19
Actual: 1.8446744053652326e+19

gpt-4.1 - trigonometry:
Predicted: 1.7845
Actual: -6.5068406525273135

gpt-4.1 - complex_fractions:
Predicted: 0.09166666666666666
Actual: 0.09166666666666667

gpt-4.1 - complex_roots:
Predicted: 125.0
Actual: 125.0

gpt-4.1 - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

gpt-4.1 - logarithms:
Predicted: 23.0
Actual: 30.815510557964274

gpt-4.1 - mixed_operations:
Predicted: 100.0
Actual: 100.0

=== FINAL ANALYSIS ===

claude-3-5-sonnet-20241022:
Accuracy: 63.64%

gpt-4.1:
Accuracy: 54.55%


### Reasoning models

In [8]:
import json
import numpy as np
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

from repenseai.genai.agent import Agent
from repenseai.genai.tasks.api import Task
from repenseai.utils.text import extract_json_text

MODELS = [
    "o4-mini",
    "o3",
]

# Define equações matemáticas complexas para teste
EQUATIONS = {
    "baseline": "1 + 1",
    "basic_arithmetic": "2345678901234567890 * 1234567890987654321",
    "nested_parentheses": "(((5 * 7 + 3) * 4 - 2) * 6 + 8) / 2",
    "exponents": "2**64 + 3**21 - 5**15",
    "large_division": "987654321987654321 / 123456789",
    "mixed_operations": "sqrt(3**2 + 4**2) * (15 + 25) / 2",
    "complex_fractions": "(1/3 + 2/5) * (7/8 - 3/4)",
    "scientific_notation": "2.5e20 * 4.8e15 / 1.2e18",
    "trigonometry": "sin(45) + cos(60) + tan(30)",
    "logarithms": "log(1000000) + log2(256) + log10(1000000000)",
    "complex_roots": "((sqrt(169) + sqrt(144)) * sqrt(25))",
}

SOLVER_PROMPT = """
You are a mathematical expert. Solve the following equation and provide your solution and explanation.
Always return the final numerical result (not in scientific notation).

Equation: {equation}

Return your response in the following JSON format:
{
    "explanation": "Step by step explanation of how you solved it",
    "solution": <numerical_result>
}
"""

def evaluate_equation(equation: str) -> float:
    """Avalia a equação usando eval() do Python com funções matemáticas do numpy"""
    namespace = {
        'sin': np.sin,
        'cos': np.cos,
        'tan': np.tan,
        'sqrt': np.sqrt,
        'log': np.log,
        'log2': np.log2,
        'log10': np.log10,
        'pi': np.pi,
        'e': np.e
    }
    return float(eval(equation, {"__builtins__": {}}, namespace))

def check_response(predicted: int | float, actual: float) -> bool:
    """Check if predicted value matches actual value"""
    if predicted == actual:
        return True
    return abs(predicted - actual) < 1e-10

def solve_equation(agent: Agent, eq_name: str, equation: str, print_lock: Lock) -> tuple[str, str]:
    """Solve a single equation using the agent"""
    task = Task(
        user=SOLVER_PROMPT,
        agent=agent,
        simple_response=True
    )
    response = task.run({"equation": equation})
    with print_lock:
        print(f"Completed: {eq_name}")
    return eq_name, response

def process_model_results(model_name: str, equations_results: Dict[str, Any], actual_results: Dict[str, float]) -> Dict[str, Any]:
    """Process results for a single model"""
    model_results = {}
    
    for eq_name, text_response in equations_results.items():
        try:
            json_response = extract_json_text(text_response)
            response = json.loads(json_response)
            predicted = float(response["solution"])
            correct = check_response(predicted, actual_results[eq_name])
        except Exception as e:
            predicted = None
            correct = False

        model_results[eq_name] = {
            "predicted": predicted,
            "actual": actual_results[eq_name],
            "correct": correct
        }
        
        print(f"\n{model_name} - {eq_name}:")
        print(f"Predicted: {predicted}")
        print(f"Actual: {actual_results[eq_name]}")
    
    return model_results

# Calculate actual results
actual_results = {name: evaluate_equation(equation) for name, equation in EQUATIONS.items()}
results = {}

# Process each model
for model_name in MODELS:
    print(f"\nProcessing model: {model_name}")
    
    agent = Agent(
        model=model_name, 
        model_type="chat",
    )
    
    equations_results = {}
    print_lock = Lock()
    
    # Create thread pool and submit tasks
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_eq = {
            executor.submit(solve_equation, agent, eq_name, equation, print_lock): eq_name 
            for eq_name, equation in EQUATIONS.items()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_eq):
            eq_name, response = future.result()
            equations_results[eq_name] = response
    
    # Process results for this model
    results[model_name] = process_model_results(model_name, equations_results, actual_results)

# Final Analysis
print("\n=== FINAL ANALYSIS ===")
for model_name in MODELS:
    correct_count = sum(1 for eq_result in results[model_name].values() if eq_result["correct"])
    total_count = len(EQUATIONS)
    accuracy = (correct_count / total_count) * 100
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy:.2f}%")


Processing model: o4-mini


2025-04-18 11:18:24 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 11:18:25 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 11:18:28 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 11:18:33 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 11:18:33 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 11:18:39 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 11:18:42 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 11:18:44 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 11:18:45 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 11:19:19 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 11:19:38 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic

o4-mini - nested_parentheses:
Predicted: 454.0
Actual: 454.0

o4-mini - baseline:
Predicted: 2.0
Actual: 2.0

o4-mini - mixed_operations:
Predicted: 100.0
Actual: 100.0

o4-mini - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

o4-mini - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

o4-mini - trigonometry:
Predicted: 1.78445705
Actual: -6.5068406525273135

o4-mini - complex_roots:
Predicted: 125.0
Actual: 125.0

o4-mini - logarithms:
Predicted: 23.0
Actual: 30.815510557964274

o4-mini - exponents:
Predicted: 1.8446744053652326e+19
Actual: 1.8446744053652326e+19

o4-mini - large_division:
Predicted: None
Actual: 8000000080.900001

o4-mini - basic_arithmetic:
Predicted: 2.895899854031399e+36
Actual: 2.895899854031399e+36

Processing model: o3


2025-04-18 11:19:41 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 11:19:42 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 11:19:44 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 11:19:49 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 11:19:50 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 11:19:54 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 11:19:56 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 11:20:08 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 11:20:14 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 11:21:03 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic


2025-04-18 11:22:09 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division

o3 - baseline:
Predicted: 2.0
Actual: 2.0

o3 - nested_parentheses:
Predicted: 454.0
Actual: 454.0

o3 - mixed_operations:
Predicted: 100.0
Actual: 100.0

o3 - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

o3 - scientific_notation:
Predicted: 1e+18
Actual: 1e+18

o3 - trigonometry:
Predicted: 1.784457050376
Actual: -6.5068406525273135

o3 - complex_roots:
Predicted: 125.0
Actual: 125.0

o3 - logarithms:
Predicted: 30.815510557964277
Actual: 30.815510557964274

o3 - exponents:
Predicted: 1.8446744053652326e+19
Actual: 1.8446744053652326e+19

o3 - basic_arithmetic:
Predicted: 2.895899854031399e+36
Actual: 2.895899854031399e+36

o3 - large_division:
Predicted: 8000000080.900001
Actual: 8000000080.900001

=== FINAL ANALYSIS ===

o4-mini:
Accuracy: 63.64%

o3:
Accuracy: 81.82%


In [1]:
import json
import numpy as np
from typing import Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

from repenseai.genai.agent import Agent
from repenseai.genai.tasks.api import Task
from repenseai.utils.text import extract_json_text

from pydantic import BaseModel

class SolverResponse(BaseModel):
    explanation: str
    solution: float | int

MODELS = [
    "o4-mini",
    "o3",
]

# Define equações matemáticas complexas para teste
EQUATIONS = {
    "baseline": "1 + 1",
    "basic_arithmetic": "2345678901234567890 * 1234567890987654321",
    "nested_parentheses": "(((5 * 7 + 3) * 4 - 2) * 6 + 8) / 2",
    "exponents": "2**64 + 3**21 - 5**15",
    "large_division": "987654321987654321 / 123456789",
    "mixed_operations": "sqrt(3**2 + 4**2) * (15 + 25) / 2",
    "complex_fractions": "(1/3 + 2/5) * (7/8 - 3/4)",
    "scientific_notation": "2.5e20 * 4.8e15 / 1.2e18",
    "trigonometry": "sin(45) + cos(60) + tan(30)",
    "logarithms": "log(1000000) + log2(256) + log10(1000000000)",
    "complex_roots": "((sqrt(169) + sqrt(144)) * sqrt(25))",
}

SOLVER_PROMPT = """
You are a mathematical expert. Solve the following equation and provide your solution and explanation.
Always return the final numerical result (not in scientific notation).

Equation: {equation}

Return your response in the following JSON format:
{
    "explanation": "Step by step explanation of how you solved it",
    "solution": <numerical_result>
}
"""

def evaluate_equation(equation: str) -> float:
    """Avalia a equação usando eval() do Python com funções matemáticas do numpy"""
    namespace = {
        'sin': np.sin,
        'cos': np.cos,
        'tan': np.tan,
        'sqrt': np.sqrt,
        'log': np.log,
        'log2': np.log2,
        'log10': np.log10,
        'pi': np.pi,
        'e': np.e
    }
    return float(eval(equation, {"__builtins__": {}}, namespace))

def check_response(predicted: int | float, actual: float) -> bool:
    """Check if predicted value matches actual value"""
    if predicted == actual:
        return True
    return abs(predicted - actual) < 1e-10

def solve_equation(agent: Agent, eq_name: str, equation: str, print_lock: Lock) -> tuple[str, str]:
    """Solve a single equation using the agent"""
    task = Task(
        user=SOLVER_PROMPT,
        agent=agent,
        simple_response=True
    )
    response = task.run({"equation": equation})
    with print_lock:
        print(f"Completed: {eq_name}")
    return eq_name, response

def process_model_results(model_name: str, equations_results: Dict[str, Any], actual_results: Dict[str, float]) -> Dict[str, Any]:
    """Process results for a single model"""
    model_results = {}
    
    for eq_name, text_response in equations_results.items():
        try:
            json_response = extract_json_text(text_response)
            response = json.loads(json_response)
            predicted = float(response["solution"])
            correct = check_response(predicted, actual_results[eq_name])
        except Exception as e:
            predicted = None
            correct = False

        model_results[eq_name] = {
            "predicted": predicted,
            "actual": actual_results[eq_name],
            "correct": correct
        }
        
        print(f"\n{model_name} - {eq_name}:")
        print(f"Predicted: {predicted}")
        print(f"Actual: {actual_results[eq_name]}")
    
    return model_results

# Calculate actual results
actual_results = {name: evaluate_equation(equation) for name, equation in EQUATIONS.items()}
results = {}

# Process each model
for model_name in MODELS:
    print(f"\nProcessing model: {model_name}")
    
    agent = Agent(
        model=model_name, 
        model_type="chat",
        json_schema=SolverResponse
    )
    
    equations_results = {}
    print_lock = Lock()
    
    # Create thread pool and submit tasks
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_eq = {
            executor.submit(solve_equation, agent, eq_name, equation, print_lock): eq_name 
            for eq_name, equation in EQUATIONS.items()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_eq):
            eq_name, response = future.result()
            equations_results[eq_name] = response
    
    # Process results for this model
    results[model_name] = process_model_results(model_name, equations_results, actual_results)

# Final Analysis
print("\n=== FINAL ANALYSIS ===")
for model_name in MODELS:
    correct_count = sum(1 for eq_result in results[model_name].values() if eq_result["correct"])
    total_count = len(EQUATIONS)
    accuracy = (correct_count / total_count) * 100
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy:.2f}%")


Processing model: o4-mini


2025-04-18 11:59:35 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 11:59:36 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 11:59:39 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 11:59:44 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 11:59:45 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 11:59:45 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 11:59:47 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 11:59:49 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 11:59:53 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 12:00:36 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 12:01:24 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic

o4-mini - nested_parentheses:
Predicted: None
Actual: 454.0

o4-mini - baseline:
Predicted: None
Actual: 2.0

o4-mini - mixed_operations:
Predicted: None
Actual: 100.0

o4-mini - scientific_notation:
Predicted: None
Actual: 1e+18

o4-mini - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

o4-mini - exponents:
Predicted: None
Actual: 1.8446744053652326e+19

o4-mini - complex_roots:
Predicted: None
Actual: 125.0

o4-mini - trigonometry:
Predicted: None
Actual: -6.5068406525273135

o4-mini - logarithms:
Predicted: None
Actual: 30.815510557964274

o4-mini - large_division:
Predicted: None
Actual: 8000000080.900001

o4-mini - basic_arithmetic:
Predicted: None
Actual: 2.895899854031399e+36

Processing model: o3


2025-04-18 12:01:26 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: nested_parentheses


2025-04-18 12:01:27 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: baseline


2025-04-18 12:01:29 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: mixed_operations


2025-04-18 12:01:32 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_fractions


2025-04-18 12:01:34 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: scientific_notation


2025-04-18 12:01:39 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: trigonometry


2025-04-18 12:01:41 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: logarithms


2025-04-18 12:01:42 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: complex_roots


2025-04-18 12:02:10 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: exponents


2025-04-18 12:03:51 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: large_division


2025-04-18 12:03:57 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Completed: basic_arithmetic

o3 - nested_parentheses:
Predicted: None
Actual: 454.0

o3 - baseline:
Predicted: None
Actual: 2.0

o3 - mixed_operations:
Predicted: None
Actual: 100.0

o3 - complex_fractions:
Predicted: None
Actual: 0.09166666666666667

o3 - scientific_notation:
Predicted: None
Actual: 1e+18

o3 - trigonometry:
Predicted: None
Actual: -6.5068406525273135

o3 - logarithms:
Predicted: None
Actual: 30.815510557964274

o3 - complex_roots:
Predicted: None
Actual: 125.0

o3 - exponents:
Predicted: None
Actual: 1.8446744053652326e+19

o3 - large_division:
Predicted: None
Actual: 8000000080.900001

o3 - basic_arithmetic:
Predicted: None
Actual: 2.895899854031399e+36

=== FINAL ANALYSIS ===

o4-mini:
Accuracy: 0.00%

o3:
Accuracy: 0.00%


In [9]:
actual_results

{'baseline': 2.0,
 'basic_arithmetic': 2.895899854031399e+36,
 'nested_parentheses': 454.0,
 'exponents': 1.8446744053652326e+19,
 'large_division': 8000000080.900001,
 'mixed_operations': 100.0,
 'complex_fractions': 0.09166666666666667,
 'scientific_notation': 1e+18,
 'trigonometry': -6.5068406525273135,
 'logarithms': 30.815510557964274,
 'complex_roots': 125.0}

In [12]:
chatgpt = [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1]
print(f"Resultado: {sum(chatgpt)/len(chatgpt):.2%}")

Resultado: 81.82%
