# Quanta: Categorization & Generation - Model Selection 

Refer to the proposal at https://docs.google.com/document/d/1x7n2iy1_LZXZNLQpxCzF84lZ8BEG6ZT3KWXC59erhJA

Your .env file must contain the MARTIAN_API_KEY and HF_TOKEN tokens.

## Martian LLMs

Supported Martian models are listed at https://app.withmartian.com/docs/index.html
and https://api.withmartian.com/v1/models

In [1]:
import pandas as pd
import random
import requests
import os
from dotenv import load_dotenv
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed
import httpx

In [2]:
import sys
# Append the parent of the current working directory to the module search path
# so that MathsCatGen can be imported from there (presumably a local checkout —
# confirm against the repository layout).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
try:
    import MathsCatGen as mcg
except ImportError:
    # Not importable: install/upgrade the package from GitHub with the running
    # interpreter's pip, then retry the import. The leading "!" is an IPython
    # shell escape, so this cell only runs inside a notebook/IPython session.
    !{sys.executable} -m pip install --upgrade git+https://github.com/PhilipQuirke/quanta_maths.git
    import MathsCatGen as mcg

In [3]:
# Fetch Martian model data from the API
martian_models_url = "https://api.withmartian.com/v1/models"
# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error body as JSON.
response = requests.get(martian_models_url, timeout=30)
response.raise_for_status()

# The list of model records sits under the top-level 'data' key of the payload.
martian_models_json = response.json()['data']

In [4]:
# Keep only the models whose id does not contain ':cheap'
martian_models_json = list(
    filter(lambda entry: ':cheap' not in entry['id'], martian_models_json)
)
print(f"Models after filtering ':cheap': {len(martian_models_json)}")

Models after filtering ':cheap': 261


In [5]:
# Extract model ids from the model records
def extract_model_names(models=None):
    """Return the ``id`` of every model record, in input order.

    Args:
        models: Optional iterable of model records (dicts with an ``'id'``
            key). When omitted, the notebook-level ``martian_models_json``
            is used, which keeps existing zero-argument calls working.

    Returns:
        list: The ``'id'`` value of each record.
    """
    if models is None:
        models = martian_models_json
    return [model['id'] for model in models]

# Group models by provider (the part of the id before the first '/', e.g. 'provider/model')
def get_models_by_provider(models=None):
    """Group model records by the provider prefix of their id.

    The provider is the text before the first ``'/'`` in ``model['id']``; an
    id without a slash is its own provider.

    Args:
        models: Optional iterable of model records (dicts with an ``'id'``
            key). When omitted, the notebook-level ``martian_models_json``
            is used.

    Returns:
        dict: Maps each provider string to the list of its model records, in
        input order.
    """
    if models is None:
        models = martian_models_json
    providers = {}
    for model in models:
        provider = model['id'].split('/', 1)[0]
        providers.setdefault(provider, []).append(model)
    return providers

# Find models by input cost
def find_models_by_cost(top_n=5, reverse=False, models=None):
    """Return the ``top_n`` cheapest (or most expensive) models by prompt price.

    Prices are compared numerically. The API reports them as strings, so
    comparing the raw values would order them lexicographically and would raise
    ``TypeError`` as soon as one record lacked pricing (``str`` vs ``float``).
    Records whose prompt price is missing or not a number are always placed
    last, in either direction.

    Args:
        top_n: Maximum number of entries to return.
        reverse: ``False`` for cheapest first, ``True`` for most expensive first.
        models: Optional iterable of model records. When omitted, the
            notebook-level ``martian_models_json`` is used.

    Returns:
        list: ``(id, prompt_price, completion_price)`` tuples. Prices are
        returned exactly as stored in the record; a missing price is
        ``float('inf')``.
    """
    if models is None:
        models = martian_models_json

    def _sort_key(entry):
        try:
            cost = float(entry[1])
        except (TypeError, ValueError):
            cost = float('inf')
        if cost == float('inf'):
            # Unknown price: sort after every known price in both directions.
            return (1, 0.0)
        # Negating instead of passing reverse=True keeps ties in input order,
        # matching what sorted(..., reverse=True) does.
        return (0, -cost if reverse else cost)

    models_with_cost = []
    for model in models:
        pricing = model.get('pricing') or {}
        models_with_cost.append((
            model['id'],
            pricing.get('prompt', float('inf')),
            pricing.get('completion', float('inf')),
        ))
    return sorted(models_with_cost, key=_sort_key)[:top_n]

In [6]:
martian_models_names = extract_model_names()

print("=== MARTIAN AI MODELS ANALYSIS ===\n")

providers = get_models_by_provider()
print(f"Number of providers: {len(providers)}")

# The two price listings differ only in heading and sort direction.
for heading, most_expensive_first in (
    ("\nCheapest Models (input cost):", False),
    ("\nMost Expensive Models (input cost):", True),
):
    print(heading)
    models = find_models_by_cost(reverse=most_expensive_first)
    for i, model in enumerate(models, 1):
        model_id, prompt_cost, completion_cost = model
        print("  ", i, model_id, prompt_cost, completion_cost)

# Print top-level and nested JSON keys for inspection
def print_json_keys(obj, prefix=""):
    """Print the key hierarchy of a JSON-like object, one key per line.

    Dict keys are printed with ``prefix``; their values are walked with two
    extra spaces of indentation. For a non-empty list only the first element
    is walked, with ``"[0] "`` appended to the prefix. Anything else prints
    nothing.
    """
    if isinstance(obj, list):
        if obj:
            print_json_keys(obj[0], prefix + "[0] ")
        return
    if not isinstance(obj, dict):
        return
    child_prefix = prefix + "  "
    for key in obj:
        print(f"{prefix}{key}")
        print_json_keys(obj[key], child_prefix)
print("\nJSON key structure:")
print_json_keys(martian_models_json)

print("\nSample model data structure:")
# Slice instead of indexing 0..2 so a catalogue with fewer than three models
# prints what is available rather than raising IndexError.
for model in martian_models_json[:3]:
    print(f"   {model}")

=== MARTIAN AI MODELS ANALYSIS ===

Number of providers: 43

Cheapest Models (input cost):
   1 meta-llama/llama-3.2-1b-instruct 0.000000005 0.00000001
   2 liquid/lfm-7b 0.00000001 0.00000001
   3 liquid/lfm-3b 0.00000002 0.00000002
   4 meta-llama/llama-3.1-8b-instruct 0.00000002 0.00000003
   5 meta-llama/llama-3.2-3b-instruct 0.00000002 0.00000002

Most Expensive Models (input cost):
   1 openai/o1-pro 0.00015 0.0006
   2 openai/gpt-4 0.00003 0.00006
   3 anthropic/claude-3-opus-20240229 0.000015 0.000075
   4 anthropic/claude-opus-4-0 0.000015 0.000075
   5 anthropic/claude-opus-4-1 0.000015 0.000075

JSON key structure:
[0] id
[0] pricing
[0]   prompt
[0]   completion
[0]   image
[0]   request
[0]   web_search
[0]   internal_reasoning
[0] added_at
[0] updated_at
[0] reliability_tier

Sample model data structure:
   {'id': 'ai21/jamba-large-1.7', 'pricing': {'prompt': '0.000002', 'completion': '0.000008', 'image': '0', 'request': '0', 'web_search': '0', 'internal_reasoning': '0'},

## Run Models

In [7]:
# Load variables from a .env file into the process environment, then read the key.
load_dotenv()
MARTIAN_API_KEY = os.getenv("MARTIAN_API_KEY")
# Raise explicitly rather than assert: assertions are stripped when Python runs
# with -O, which would let a missing key slip through to the first API call.
if not MARTIAN_API_KEY:
    raise RuntimeError("API key not found. Please set MARTIAN_API_KEY in your .env file.")

In [8]:
# OpenAI-compatible client pointed at the Martian endpoint.
# 60s default for each timeout phase, with a separate 10s limit for connecting.
_martian_timeout = httpx.Timeout(60.0, connect=10.0)
client = openai.OpenAI(
    api_key=MARTIAN_API_KEY,
    base_url="https://api.withmartian.com/v1",
    timeout=_martian_timeout,
    max_retries=0,  # Don't retry on timeout
)

In [9]:
def run_model_inference(model_name, prompt, ground_truth, timeout=60):
    """
    Send a model a prompt, get the response, compare it to the ground_truth.

    Args:
        model_name: Model id passed to the chat-completions endpoint.
        prompt: Text sent as a single user message.
        ground_truth: Expected answer, checked with mcg.is_ground_truth_correct.
        timeout: Per-request timeout in seconds. A model that takes longer is
            considered to have failed or died.

    Returns:
        (answer, success). On a client-side timeout returns ("TIMEOUT", False).
        On any other failure returns ("Error: <message>", False). A reply with
        no text content is graded as an empty answer, not reported as an error.
    """
    try:
        # The OpenAI client has its own timeout parameter
        response = client.chat.completions.create(
            model=model_name,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
            timeout=timeout,
        )
        # message.content may be None; calling .strip() on it would raise and
        # be misreported as an API error, so treat it as an empty answer.
        content = response.choices[0].message.content
        answer = (content or "").strip()
        success = mcg.is_ground_truth_correct(answer, ground_truth)
        return answer, success

    except openai.APITimeoutError:
        return "TIMEOUT", False
    except Exception as e:
        # Covers openai.APIError as well as unexpected failures (e.g. an empty
        # choices list); both are reported to the caller in the same format.
        return f"Error: {str(e)}", False

In [10]:
# Evaluate multiple models concurrently, scoring them based on progressive test success.
def evaluate_models_progressive(tests, max_workers=32, model_names=None, overall_timeout=None):
    """Score each model by how many consecutive tests it passes, in parallel.

    Each model is given the tests in order and stops at its first failure.

    Args:
        tests: Sequence of (prompt, ground_truth) pairs, tried in order.
        max_workers: Size of the thread pool.
        model_names: Optional iterable of model ids to evaluate. Defaults to
            the notebook-level ``martian_models_names``.
        overall_timeout: Optional limit in seconds for the whole evaluation,
            passed to ``as_completed``. ``None`` (the default) waits for every
            model; each request is already bounded by the timeout inside
            run_model_inference. A fixed overall limit would abort the run with
            an uncaught TimeoutError once it elapsed, discarding the results
            gathered so far.

    Returns:
        list: ``{"model": <id>, "score": <int>}`` dicts in completion order.
        The score is the number of leading tests passed, or a negative code
        when the first failure was not a plain wrong answer: -408 for a
        timeout, -400 when the error text contains "400", -999 for any other
        error.
    """
    if model_names is None:
        model_names = martian_models_names
    model_names = list(model_names)
    total = len(model_names)
    model_scores = []

    def score_model(model_name):
        score = 0
        for test_idx, (prompt, ground_truth) in enumerate(tests):
            answer, success = run_model_inference(model_name, prompt, ground_truth)
            if success:
                score = test_idx + 1
                continue

            # First failure ends the run for this model. Match the sentinel
            # exactly so a wrong answer that merely mentions "TIMEOUT" keeps
            # the score it has earned so far.
            if answer == "TIMEOUT":
                score = -408  # HTTP timeout code
            elif isinstance(answer, str) and answer.startswith("Error:"):
                score = -400 if "400" in answer else -999
            # Otherwise it was just a wrong answer: keep the current score.
            break

        return {"model": model_name, "score": score}

    print(f"Evaluating {total} models concurrently with {max_workers} workers...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_model = {
            executor.submit(score_model, model_name): model_name
            for model_name in model_names
        }

        for idx, future in enumerate(as_completed(future_to_model, timeout=overall_timeout), 1):
            model_name = future_to_model[future]
            try:
                # The future is already finished here, so this never blocks.
                result = future.result()
            except Exception as exc:
                # Report the failure instead of hiding it behind the -999 score.
                print(f"  {model_name}: scoring raised {exc!r}")
                result = {"model": model_name, "score": -999}
            print(f"[{idx}/{total}] {result['model']}: Score = {result['score']}")
            model_scores.append(result)

    return model_scores

## Generate prompt and response data for tasks

In [11]:
# Generate the data
maths_tasks = mcg.get_maths_tasks()
prompt_template = mcg.get_prompt_template()
synthetic_data_df = mcg.generate_synthetic_data(
    maths_tasks,
    prompt_template,
    n_examples_per_task=200,
)

# Display sample: the first two rows of every task, with full column content
# and no line wrapping
for option_name in ('display.max_colwidth', 'display.width'):
    pd.set_option(option_name, None)
print("Sample of generated data:")
sample_df = synthetic_data_df.groupby('task').head(2)
sample_columns = ['task', 'x', 'y', 'prompt', 'ground_truth']
print(sample_df[sample_columns].to_string(index=False))

# Save to file
# synthetic_data_df.to_csv('synthetic_arithmetic_data.csv', index=False)


Generated 1311 total examples across 7 tasks
Examples per task: {'minimum': 200, 'maximum': 200, 'sum': 200, 'difference': 200, 'product': 200, 'average': 200, 'exponential': 111}
Sample of generated data:
       task  x  y                                                                  prompt    ground_truth
    minimum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the minimum               7
    minimum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the minimum              27
    maximum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the maximum               8
    maximum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the maximum              92
        sum  8  7           Answer minimally: Given the numbers 8 and 7 calculate the sum              15
        sum 27 92         Answer minimally: Given the numbers 27 and 92 calculate the sum             119
 difference  8  7    Answer minimally: Given the nu

## Find good research models

Scan the models, using the synthetic data, to find those that can accurately perform several tasks.

In [12]:
# Cached results from prior runs on 13Oct25.

# These models can accurately answer the first 4 tasks, but are closed source.
cached_good_closed_models = [
    'anthropic/claude-3-5-sonnet-20240620',
    'anthropic/claude-3-7-sonnet-latest',
    'anthropic/claude-3-haiku-20240307',
    'google/gemini-2.0-flash',
    'google/gemini-2.0-flash-001',
    'google/gemini-2.0-flash-lite',
    'google/gemini-2.0-flash-lite-001',
    'google/gemini-2.0-flash-lite-preview',
    'google/gemini-2.0-flash-lite-preview-02-05',
    # The comma after this entry was missing, which silently concatenated it
    # with the next string into 'google/gemini-2.5-flashx-ai/grok-3'.
    'google/gemini-2.5-flash',
    'x-ai/grok-3',
    'x-ai/grok-3-beta',
    'x-ai/grok-3-mini',
    'x-ai/grok-3-mini-beta',
    'x-ai/grok-code-fast-1',
    'deepinfra/google/gemini-2.0-flash-001',
    'deepinfra/google/gemini-2.5-flash',
    'deepinfra/google/gemini-2.5-pro',
    'liquid/lfm-3b', #  Liquid AI LFM-3B model closed. Open models 350M, 700M, 1.2B, and 2.6B under Apache 2.0
    'mistralai/ministral-3b', # Not open source. Research allowed on 8B.
]

def _hf_model(name, hf_repo, notes, **extra):
    """Build one cached-model record.

    ``url`` is the Hugging Face page for ``hf_repo``. Any extra keyword
    arguments (e.g. ``instruct_variant``, ``base_model``) are appended to the
    record after the four standard keys.
    """
    return {
        'name': name,
        'hf_repo': hf_repo,
        'url': f'https://huggingface.co/{hf_repo}',
        'notes': notes,
        **extra,
    }


# Good open-source models that passed the first 4 tasks for 5 instances each. Took 25mins to run.
cached_good_open_models_4tasks_5instances = [
    _hf_model(
        name='deepcogito/cogito-v2-preview-llama-109b-moe',
        hf_repo='deepcogito/cogito-v2-preview-llama-109B-MoE',
        notes='109B MoE with reasoning capabilities, trained with IDA',
    ),
    _hf_model(
        name='deepinfra/openai/gpt-oss-120b',
        hf_repo='openai/gpt-oss-120b',
        notes="OpenAI's 117B MoE model (5.1B active params), Apache 2.0 license",
    ),
    _hf_model(
        name='deepinfra/openai/gpt-oss-20b',
        hf_repo='openai/gpt-oss-20b',
        notes="OpenAI's 21B MoE model (3.6B active params), Apache 2.0 license",
    ),
    _hf_model(
        name='deepseek/deepseek-r1-distill-qwen-14b',
        hf_repo='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        notes='Distilled from DeepSeek-R1, reasoning model',
    ),
    _hf_model(
        name='deepseek/deepseek-r1-distill-qwen-32b',
        hf_repo='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
        notes='Distilled from DeepSeek-R1, outperforms o1-mini, SOTA for dense models',
    ),
    _hf_model(
        name='meta-llama/llama-3-70b-instruct',
        hf_repo='meta-llama/Meta-Llama-3-70B-Instruct',
        notes='Original Llama 3 70B, released April 2024',
    ),
    _hf_model(
        name='meta-llama/llama-3.1-70b-instruct',
        hf_repo='meta-llama/Llama-3.1-70B-Instruct',
        notes='Llama 3.1 with 128K context length',
    ),
    _hf_model(
        name='meta-llama/llama-3.2-90b-vision-instruct',
        hf_repo='meta-llama/Llama-3.2-90B-Vision-Instruct',
        notes='Multimodal (text + images), vision reasoning capabilities',
    ),
    _hf_model(
        name='meta-llama/llama-3.3-70b-instruct',
        hf_repo='meta-llama/Llama-3.3-70B-Instruct',
        notes='Latest 70B instruct model, December 2023 cutoff',
    ),
    _hf_model(
        name='meta-llama/llama-4-maverick',
        hf_repo='meta-llama/Llama-4-Maverick-17B-128E',
        notes='17B active params (~400B total), 128 experts, natively multimodal, 1M context',
        instruct_variant='meta-llama/Llama-4-Maverick-17B-128E-Instruct',
    ),
]

# Good open-source models that passed the first 6 tasks for 5 instances each. Took 60mins to run.
cached_good_open_models_6tasks_5instances = [
    _hf_model(
        name='meta-llama/llama-3.2-90b-vision-instruct',
        hf_repo='meta-llama/Llama-3.2-90B-Vision-Instruct',
        notes='Multimodal (text + images), vision reasoning capabilities',
    ),
    _hf_model(
        name='meta-llama/llama-4-maverick',
        hf_repo='meta-llama/Llama-4-Maverick-17B-128E',
        notes='17B active params (~400B total), 128 experts, natively multimodal, 1M context',
        instruct_variant='meta-llama/Llama-4-Maverick-17B-128E-Instruct',
    ),
    _hf_model(
        name='meta-llama/llama-4-scout',
        hf_repo='meta-llama/Llama-4-Scout-17B-16E',
        notes='17B active params (~109B total), 16 experts, natively multimodal, 10M context, fits on single H100 GPU',
        instruct_variant='meta-llama/Llama-4-Scout-17B-16E-Instruct',
    ),
    _hf_model(
        name='nvidia/llama-3.1-nemotron-70b-instruct',
        hf_repo='nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
        notes='70B model fine-tuned by NVIDIA using RLHF, #1 on Arena Hard/AlpacaEval 2 LC/MT-Bench as of Oct 2024, trained for helpfulness',
        base_model='meta-llama/Llama-3.1-70B-Instruct',
    ),
    _hf_model(
        name='qwen/qwen-2.5-coder-32b-instruct',
        hf_repo='Qwen/Qwen2.5-Coder-32B-Instruct',
        notes='SOTA open-source code LLM, matches GPT-4o coding abilities, 128K context, 5.5T tokens training',
    ),
]

In [13]:
# Scan models for accuracy on first few tasks.
# num_test_tasks: how many leading entries of maths_tasks get_scan_tasks() returns.
# num_per_task: how many rows scan_model_accuracy() samples for each task.
# num_models_to_find: get_good_models() stops once it has this many passing models.
num_test_tasks = 6
num_per_task = 5
num_models_to_find = 10 # Maximum

# 1. Search for good open source models (ids containing 'meta', 'llama', 'qwen' or 'oss')
def get_open_models(possible_models):
    """Keep the model records whose id contains an open-source keyword.

    Matching is a case-sensitive substring test against ``model['id']``.
    Input order is preserved.
    """
    open_source_keywords = ('meta', 'llama', 'qwen', 'oss')

    def looks_open(model_id):
        for keyword in open_source_keywords:
            if keyword in model_id:
                return True
        return False

    return [model for model in possible_models if looks_open(model['id'])]

# 2. Select first few tasks
def get_scan_tasks():
    """Return the leading ``num_test_tasks`` entries of ``maths_tasks``."""
    leading_tasks = maths_tasks[:num_test_tasks]
    return leading_tasks

# 3. For each model, check accuracy on a few instances of each task
def scan_model_accuracy(model, df, scan_tasks):
    """Count, per task, how many sampled examples the model answers correctly.

    For every task in ``scan_tasks`` a fixed-seed sample of ``num_per_task``
    rows is drawn from ``df`` and sent to the model one at a time. Testing of
    a task stops at the first failure, so the count is the number of
    consecutive successes from the start of the sample.

    Returns:
        dict: Maps each task to its count of correct answers.
    """
    correct_by_task = {}
    for task in scan_tasks:
        # Select the examples for this task (same rows on every run)
        examples = df[df['task'] == task].sample(n=num_per_task, random_state=42)
        passed = 0
        for _, example in examples.iterrows():
            try:
                answer, success = run_model_inference(model['id'], example["prompt"], example["ground_truth"])
            except Exception as e:
                answer, success = f"Error: {str(e)}", False
            print(f"Model: {model['id']}\nOutput/Error: {answer}\nSuccess: {success}\n{'-'*40}")

            if not success:
                # If any failure, stop testing this task
                break
            passed += 1
        correct_by_task[task] = passed
    return correct_by_task

# 4. Find some models that get all questions correct for each task
def get_good_models(small_models,scan_tasks):
    """Return ids of models that answer every sampled instance of every task.

    Models are scanned in order with scan_model_accuracy against
    ``synthetic_data_df``; scanning stops once ``num_models_to_find`` models
    have passed.

    Args:
        small_models: Iterable of model records (dicts with an ``'id'`` key).
        scan_tasks: Tasks each model must answer perfectly.

    Returns:
        list: Ids of the passing models, in the order they were found.
    """
    good_models = []

    for model in small_models:
        acc = scan_model_accuracy(model, synthetic_data_df, scan_tasks)
        # Compare with num_per_task, the number of instances actually sampled
        # per task; a literal 5 stops matching as soon as that setting changes.
        if all(v == num_per_task for v in acc.values()):
            good_models.append(model['id'])
        if len(good_models) >= num_models_to_find:
            break

    return good_models

use_cached_models = True
if not use_cached_models:
    # Full scan: slow, queries every candidate model through the API
    open_models = get_open_models(martian_models_json)
    print(f"Found {len(open_models)} models")

    scan_tasks = get_scan_tasks()
    good_models = get_good_models(open_models, scan_tasks)
else:
    # For speed, use cached results from prior runs
    cached_good_models = cached_good_open_models_6tasks_5instances
    good_models = [entry['name'] for entry in cached_good_models]

In [14]:
# These models are often downloadable from HuggingFace else available via API
print(f"Some models with perfect accuracy on {num_per_task} instances of first {num_test_tasks} tasks:")
for model_name in good_models:
    # Three spaces of indent, the same output as print("  ", model_name)
    print(f"   {model_name}")

Some models with perfect accuracy on 5 instances of first 6 tasks:
   meta-llama/llama-3.2-90b-vision-instruct
   meta-llama/llama-4-maverick
   meta-llama/llama-4-scout
   nvidia/llama-3.1-nemotron-70b-instruct
   qwen/qwen-2.5-coder-32b-instruct


## Manually inspect model output

We want to avoid models that use a Python sandbox to do the maths. This is sometimes visible in the detail of the answer. View sample answers here.

In [15]:
# For each of the top models, ask one instance of one task and show the answer
def inspect_model_answers():
    """Print one prompt, its ground truth and the model's answer for each good model.

    The task is picked at random for every model; the example within that task
    is drawn with a fixed seed.
    """
    for model_name in good_models:
        task = random.choice(maths_tasks)
        task_rows = synthetic_data_df[synthetic_data_df['task'] == task]
        example_df = task_rows.sample(n=1, random_state=42).iloc[0]
        prompt = example_df['prompt']
        ground_truth = example_df['ground_truth']
        print(f"\nModel: {model_name}\nTask: {task}\nPrompt: {prompt}\nGround Truth: {ground_truth}")
        answer, success = run_model_inference(model_name, prompt, ground_truth)
        print(f"Answer: {answer}\nSuccess: {success}\n{'-'*60}")

#inspect_model_answers()