# Categorization & Generation - Find good models that succeed on N Tasks with M tests.

Creates and saves synthetic maths data to:
/home/ubuntu/pq-research/data/prompt_categorization/synthetic_arithmetic_data.csv

Finds good models that succeed on N tasks with M test examples and saves model names to say:
/home/ubuntu/pq-research/data/prompt_categorization/GoodOpenModels_6Tasks_5Tests.json

The proposal/hypothesis is here https://docs.google.com/document/d/1x7n2iy1_LZXZNLQpxCzF84lZ8BEG6ZT3KWXC59erhJA

Your .env file must contain your MARTIAN_API_KEY (obtained from app.withmartian.com)

In [42]:
# CatGen (singleton) config class
class CG:
    # Scan models for accuracy on first few tasks.
    # Configuration constants
    NUM_TEST_TASKS = 4
    NUM_EXAMPLES_PER_TASK = 5
    MAX_MODELS_TO_FIND = 10
    RANDOM_SEED = 42

    # Open source keywords for model filtering
    OPEN_SOURCE_KEYWORDS = {'meta', 'llama', 'qwen', 'oss'}

    # The Martian API may have no credit available
    insufficientBalance = False

    IN_COLAB = False


## Martian LLMs

Supported martian models are at https://app.withmartian.com/docs/index.html
and https://api.withmartian.com/v1/models

In [2]:
import pandas as pd
import random
import requests
import os
from dotenv import load_dotenv
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed
import httpx
import json

In [28]:
import sys
import os

try:
    import google.colab
    CG.IN_COLAB = True
except:
    CG.IN_COLAB = False

if CG.IN_COLAB:
    !pip install --upgrade git+https://github.com/PhilipQuirke/LlmPromptCategorization.git -q
else:
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import MathsCatGen as mcg

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [29]:
# Fetch Martian model data from the API
martian_models_url = "https://api.withmartian.com/v1/models"
response = requests.get(martian_models_url)
martian_models_json = response.json()

martian_models_json = martian_models_json['data']

In [30]:
# Remove from martian_models_json all models whose name contains ':cheap'
martian_models_json = [model for model in martian_models_json if ':cheap' not in model['id']]
print(f"Models after filtering ':cheap': {len(martian_models_json)}")

Models after filtering ':cheap': 281


In [31]:
# Extract model names from the new data structure
def extract_model_names():
    """Extract just the model ids from the new data structure"""
    return [model['id'] for model in martian_models_json]

# Group models by provider (if provider info is in id, e.g., 'provider/model')
def get_models_by_provider():
    providers = {}
    for model in martian_models_json:
        provider = model['id'].split('/')[0]
        if provider not in providers:
            providers[provider] = []
        providers[provider].append(model)
    return providers

# Find models by input cost
def find_models_by_cost(top_n=5, reverse=False):
    models_with_cost = [(model['id'], model.get('pricing', {}).get('prompt', float('inf')), model.get('pricing', {}).get('completion', float('inf'))) for model in martian_models_json]
    sorted_by_input = sorted(models_with_cost, key=lambda x: x[1], reverse=reverse)
    return sorted_by_input[:top_n]

In [32]:
martian_models_names = extract_model_names()

print("=== MARTIAN AI MODELS ANALYSIS ===\n")

providers = get_models_by_provider()
print(f"Number of providers: {len(providers)}")

print(f"\nCheapest Models (input cost):")
models = find_models_by_cost(reverse=False)
for i, model in enumerate(models, 1):
    print( "  ", i, model[0], model[1], model[2])

print(f"\nMost Expensive Models (input cost):")
models = find_models_by_cost(reverse=True)
for i, model in enumerate(models, 1):
    print( "  ", i, model[0], model[1], model[2])

# Print top-level and nested JSON keys for inspection
def print_json_keys(obj, prefix=""):
    if isinstance(obj, dict):
        for key, value in obj.items():
            print(f"{prefix}{key}")
            print_json_keys(value, prefix + "  ")
    elif isinstance(obj, list) and obj:
        print_json_keys(obj[0], prefix + "[0] ")
print("\nJSON key structure:")
print_json_keys(martian_models_json)

print(f"\nSample model data structure:")
for i in range(3):
    model = martian_models_json[i]
    print(f"   {model}")

=== MARTIAN AI MODELS ANALYSIS ===

Number of providers: 45

Cheapest Models (input cost):
   1 ibm-granite/granite-4.0-h-micro 0.000000017 0.00000011
   2 deepinfra/google/gemma-3-4b-it 0.00000001703012 0.0000000681536
   3 deepseek/deepseek-r1-0528-qwen3-8b 0.00000002 0.0000001
   4 meta-llama/llama-3.1-8b-instruct 0.00000002 0.00000003
   5 meta-llama/llama-3.2-3b-instruct 0.00000002 0.00000002

Most Expensive Models (input cost):
   1 openai/o1-pro 0.00015 0.0006
   2 openai/gpt-4 0.00003 0.00006
   3 openai/gpt-5.2-pro 0.000021 0.000168
   4 openai/gpt-5.2-pro-2025-12-11 0.000021 0.000168
   5 openai/o3-pro 0.00002 0.00008

JSON key structure:
[0] id
[0] pricing
[0]   prompt
[0]   completion
[0]   image
[0]   request
[0]   web_search
[0]   internal_reasoning
[0] added_at
[0] updated_at
[0] reliability_tier
[0] max_completion_tokens

Sample model data structure:
   {'id': 'ai21/jamba-large-1.7', 'pricing': {'prompt': '0.000002', 'completion': '0.000008', 'image': '0', 'request': '0

## Run Models

In [43]:
from google.colab import userdata


def get_colab_secret(key_name,key):
    if key is not None :
        return key

    try:
        return userdata.get(key_name)
    except userdata.SecretNotFoundError:
        return None


load_dotenv()
CG.DATA_DIR = os.getenv('DATA_DIR')
CG.HF_CACHE = os.getenv('HF_HOME')
CG.MARTIAN_API_KEY = os.getenv("MARTIAN_API_KEY")


if CG.IN_COLAB:
    CG.DATA_DIR = get_colab_secret("DATA_DIR",CG.DATA_DIR)
    CG.HF_CACHE = get_colab_secret("HF_CACHE",CG.HF_CACHE)
    CG.MARTIAN_API_KEY = get_colab_secret("MARTIAN_API_KEY",CG.MARTIAN_API_KEY)
else:
    # Suppliment with settings from persistent storage
    load_dotenv('/home/ubuntu/pq-research/.env')
    if CG.DATA_DIR is None :
        CG.DATA_DIR = os.getenv("DATA_DIR")
    if CG.HF_CACHE is None :
        CG.HF_CACHE = os.getenv("HF_CACHE")
    if CG.MARTIAN_API_KEY is None :
        CG.MARTIAN_API_KEY = os.getenv("MARTIAN_API_KEY")

    if CG.DATA_DIR is None :
        CG.DATA_DIR = '/home/ubuntu/pq-research/data/prompt_categorization'
    if CG.HF_CACHE is None :
        CG.HF_CACHE = '/home/ubuntu/pq-research/models'


print("DATA_DIR", CG.DATA_DIR)
print("HF_CACHE", CG.HF_CACHE)
assert CG.MARTIAN_API_KEY, "API key not found. Please set MARTIAN_API_KEY in your .env file."

DATA_DIR /home/ubuntu/pq-research/data/prompt_categorization
HF_CACHE /home/ubuntu/pq-research/models


AssertionError: API key not found. Please set MARTIAN_API_KEY in your .env file.

In [44]:
client = openai.OpenAI(
    base_url="https://api.withmartian.com/v1",
    api_key=CG.MARTIAN_API_KEY,
    max_retries=0,  # Don't retry on timeout
    timeout=httpx.Timeout(60.0, connect=10.0)  # Separate connect timeout
)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [61]:
def run_model_inference(model_name, prompt, ground_truth, timeout=60):
    """
    Send a model a prompt, get the response, compare it to the ground_truth.
    Any model taking longer than 60 seconds to respond is consider to have failed or died.
    Returns (answer, success). If timeout, returns ("TIMEOUT", False).
    """
    try:
        # The OpenAI client has its own timeout parameter
        response = client.chat.completions.create(
            model=model_name,
            max_completion_tokens=1024, # else max_tokens=1024
            messages=[{"role": "user", "content": prompt}],
            timeout=timeout
        )
        answer = response.choices[0].message.content.strip()
        success = mcg.is_ground_truth_correct(answer, ground_truth)
        return answer, success

    except openai.APITimeoutError:
        return "TIMEOUT", False
    except openai.APIError as e:
        return f"Error: {str(e)}", False
    except Exception as e:
        return f"Error: {str(e)}", False

In [11]:
# Evaluate multiple models concurrently, scoring them based on progressive test success.
def evaluate_models_progressive(tests, max_workers=32):
    model_scores = []

    def score_model(model_name):
        score = 0
        for test_idx, (prompt, ground_truth) in enumerate(tests):
            answer, success = run_model_inference(model_name, prompt, ground_truth)

            if success:
                score = test_idx + 1
            else:
                # Check for error codes
                if "TIMEOUT" in str(answer):
                    score = -408  # HTTP timeout code
                    break
                elif isinstance(answer, str) and answer.startswith("Error:"):
                    if "400" in answer:
                        score = -400
                    else:
                        score = -999
                    break
                else:
                    # Just got wrong answer
                    break

        return {"model": model_name, "score": score}

    print(f"Evaluating {len(martian_models_json)} models concurrently with {max_workers} workers...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_model = {
            executor.submit(score_model, model_name): model_name
            for model_name in martian_models_names
        }

        for idx, future in enumerate(as_completed(future_to_model, timeout=120), 1):
            model_name = future_to_model[future]
            try:
                result = future.result(timeout=90)  # Add safety margin over API timeout
            except Exception as exc:
                result = {"model": model_name, "score": -999}
            print(f"[{idx}/{len(martian_models_json)}] {result['model']}: Score = {result['score']}")
            model_scores.append(result)

    return model_scores

## Generate prompt and response data for tasks

In [36]:
# Generate the data
maths_tasks = mcg.get_maths_tasks()
prompt_template = mcg.get_prompt_template()
synthetic_data_df = mcg.generate_synthetic_data(maths_tasks, prompt_template, n_examples_per_task=200)

# Display sample
print("Sample of generated data:")
pd.set_option('display.max_colwidth', None)  # Show full column content
pd.set_option('display.width', None)         # Don't wrap lines
sample_df = synthetic_data_df.groupby('task').head(2)
print(sample_df[['task', 'x', 'y', 'prompt', 'ground_truth']].to_string(index=False))

if CG.DATA_DIR is not None:
  # Persist to file
  output_file = os.path.join(CG.DATA_DIR, 'synthetic_arithmetic_data.csv')
  synthetic_data_df.to_csv(output_file, index=False)
  print(f"Saved synthetic_data_df to: {output_file}")


Generated 1311 total examples across 7 tasks
Examples per task: {'minimum': 200, 'maximum': 200, 'sum': 200, 'difference': 200, 'product': 200, 'average': 200, 'exponential': 111}
Sample of generated data:
       task  x  y                                                                  prompt    ground_truth
    minimum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the minimum               7
    minimum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the minimum              27
    maximum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the maximum               8
    maximum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the maximum              92
        sum  8  7           Answer minimally: Given the numbers 8 and 7 calculate the sum              15
        sum 27 92         Answer minimally: Given the numbers 27 and 92 calculate the sum             119
 difference  8  7    Answer minimally: Given the nu

## Find good research models

Scan the model, using the synthetic data, to find models that can accurately perform several tasks

In [58]:
def filter_open_source_models(models: list[dict]) -> list[dict]:
    """
    Filter models to include only open source models based on keywords in model ID.

    Args:
        models: List of model dictionaries with 'id' field

    Returns:
        List of models matching open source keywords
    """
    return [
        model for model in models
        #if any(keyword in model['id'].lower() for keyword in CG.OPEN_SOURCE_KEYWORDS)
        if 'gpt-5.1' in model['id'].lower() # temp
    ]

In [59]:
# Run fresh evaluation
open_source_models = filter_open_source_models(martian_models_json)
print(f"Found {len(open_source_models)} open source models")
for the_model in open_source_models:
  print("  ", the_model["id"])

Found 5 open source models
   openai/gpt-5.1
   openai/gpt-5.1-2025-11-13
   openai/gpt-5.1-codex
   openai/gpt-5.1-codex-max
   openai/gpt-5.1-codex-mini


In [50]:
def get_test_tasks(num_tasks: int) -> list[str]:
    """
    Select the first N tasks for testing.

    Args:
        num_tasks: Number of tasks to return

    Returns:
        List of task names
    """
    return maths_tasks[:num_tasks]


def evaluate_model_on_tasks(
    model: dict,
    data_df: pd.DataFrame,
    tasks: list[str],
    num_examples: int,
    verbose: bool = False
) -> dict[str, int]:
    """
    Evaluate a model's accuracy on multiple tasks.

    For each task, tests the model on a sample of examples. Stops testing a task
    on first failure to save time.

    Args:
        model: Model dictionary with 'id' field
        data_df: DataFrame containing task examples with 'task', 'prompt', 'ground_truth' columns
        tasks: List of task names to evaluate
        num_examples: Number of examples to test per task
        verbose: Whether to print detailed results

    Returns:
        Dictionary mapping task names to number of correct answers
    """
    global CG

    model_id = model['id']
    task_results = {}

    for task in tasks:
        # Sample examples for this task
        task_examples = data_df[data_df['task'] == task].sample(
            n=num_examples,
            random_state=CG.RANDOM_SEED
        )

        correct_count = 0

        for _, example in task_examples.iterrows():
            try:
                answer, is_correct = run_model_inference(
                    model_id,
                    example['prompt'],
                    example['ground_truth']
                )
                if "Insufficient balance" in answer:
                    CG.insufficientBalance = True
            except Exception as e:
                is_correct = False
                answer = f"Error: {str(e)}"
                if "Insufficient balance" in str(e):
                    CG.insufficientBalance = True


            if verbose:
                print(f"Model: {model_id}\nOutput/Error: {answer}\nSuccess: {is_correct} \n{'-'*40}")

            if is_correct:
                correct_count += 1
            else:
                # Early exit on first failure for this task
                break

        task_results[task] = correct_count

    return task_results


def find_high_accuracy_models(
    candidate_models: list[dict],
    data_df: pd.DataFrame,
    tasks: list[str],
    num_examples: int,
    max_models: int,
    required_accuracy: int = None,
    verbose: bool = False
) -> list[str]:
    """
    Find models that achieve perfect or near-perfect accuracy on all tasks.

    Args:
        candidate_models: List of model dictionaries to evaluate
        data_df: DataFrame containing task examples
        tasks: List of task names to evaluate
        num_examples: Number of examples to test per task
        max_models: Maximum number of good models to find
        required_accuracy: Required correct answers per task (defaults to num_examples)

    Returns:
        List of model IDs that meet the accuracy threshold
    """
    global CG

    if required_accuracy is None:
        required_accuracy = num_examples

    print(f"Some models with perfect accuracy on {num_examples} instances of first {CG.NUM_TEST_TASKS} tasks:")
    high_accuracy_models = []

    for model in candidate_models:

        task_scores = evaluate_model_on_tasks(model, data_df, tasks, num_examples, verbose)

        # Check if model achieved required accuracy on all tasks
        if all(score >= required_accuracy for score in task_scores.values()):
            high_accuracy_models.append(model['id'])
            print( "  Good:", model['id'] )
        elif verbose:
            print(f"Model: {model['id']}, Task Scores: {task_scores}")

        # Stop if we've found enough models
        if len(high_accuracy_models) >= max_models:
            break

        # Need Martian API credits to call LLMs
        if CG.insufficientBalance == True:
            break

    return high_accuracy_models

In [62]:
CG.insufficientBalance = False

test_tasks = get_test_tasks(CG.NUM_TEST_TASKS)

good_models = find_high_accuracy_models(
    open_source_models,
    synthetic_data_df,
    test_tasks,
    num_examples=CG.NUM_EXAMPLES_PER_TASK,
    max_models=CG.MAX_MODELS_TO_FIND,
    verbose=True
)

Some models with perfect accuracy on 5 instances of first 4 tasks:
Model: openai/gpt-5.1
Output/Error: Error: Error code: 402 - {'error': 'Insufficient balance', 'request_id': 'ba942a32-1d75-4cfb-ab2a-5dd950468c32'}
Success: False 
----------------------------------------
Model: openai/gpt-5.1
Output/Error: Error: Error code: 402 - {'error': 'Insufficient balance', 'request_id': '421bbc7f-6ef0-4f2d-b9ab-5a410a2b7550'}
Success: False 
----------------------------------------
Model: openai/gpt-5.1
Output/Error: Error: Error code: 402 - {'error': 'Insufficient balance', 'request_id': '9759a0e9-6863-43d3-9997-cfb4fe081cfa'}
Success: False 
----------------------------------------
Model: openai/gpt-5.1
Output/Error: Error: Error code: 402 - {'error': 'Insufficient balance', 'request_id': '7b27a342-61ae-4910-9d88-e77003197f2b'}
Success: False 
----------------------------------------
Model: openai/gpt-5.1, Task Scores: {'minimum': 0, 'maximum': 0, 'sum': 0, 'difference': 0}


In [41]:
if CG.insufficientBalance == True:
    print("Insufficient balance on Martian API to call LLMs")

Insufficient balance on Martian API to call LLMs


In [18]:
if CG.DATA_DIR is not None:
    # Persist good models to file
    output_file = os.path.join(CG.DATA_DIR, f"GoodOpenModels_{CG.NUM_TEST_TASKS}Tasks_{CG.NUM_EXAMPLES_PER_TASK}Tests.json")
    with open(output_file, 'w') as f:
        json.dump(good_models, f, indent=2)
    print(f"Saved good models to: {output_file}")

## Manually inspect model output

We want to avoid models that use a python sandbox to do math. This is sometimes visible in the answer detail. View sample answers here.

In [19]:
# For each of the top models, ask one instance of one task and show the answer
def inspect_model_answers():
    for model_name in good_models:
        task = random.choice(maths_tasks)
        example_df = synthetic_data_df[synthetic_data_df['task'] == task].sample(n=1, random_state=42).iloc[0]
        print(f"\nModel: {model_name}\nTask: {task}\nPrompt: {example_df['prompt']}\nGround Truth: {example_df['ground_truth']}")
        answer, success = run_model_inference(model_name, example_df['prompt'], example_df['ground_truth'])
        print(f"Answer: {answer}\nSuccess: {success}\n{'-'*60}")

# inspect_model_answers()