# Quanta: Categorization & Generation Investigation 

Refer proposal https://docs.google.com/document/d/1x7n2iy1_LZXZNLQpxCzF84lZ8BEG6ZT3KWXC59erhJA 

## Martian LLMs

Supported martian models are at https://app.withmartian.com/docs/index.html
and https://api.withmartian.com/v1/models

In [1]:
import requests
import pandas as pd
import re
from typing import List, Tuple, Optional, Dict, Any
import os
from dotenv import load_dotenv
import openai
import concurrent.futures
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import matplotlib.pyplot as plt
from collections import Counter
import random
import httpx
import numpy as np

In [2]:
# Fetch Martian model data from the API
martian_models_url = "https://api.withmartian.com/v1/models"
response = requests.get(martian_models_url)
martian_models_json = response.json()

martian_models_json = martian_models_json['data']

In [3]:
# Remove from martian_models_json all models whose name contains ':cheap'
martian_models_json = [model for model in martian_models_json if ':cheap' not in model['id']]
print(f"Models after filtering ':cheap': {len(martian_models_json)}")

Models after filtering ':cheap': 261


In [4]:
def extract_param_size(param_str):
    """
    Convert model name to parameter float (in billions)
    Examples: qwen/qwen-2.5-vl-7b-instruct -> 7.0, qwen/qwen3-14 -> 14.0, qwen/qwen3-235b-a22 -> 235.0
    """
    if param_str is None:
        return None
    
    param_str = str(param_str).lower()
    
    # Handle MoE models like "8x7b" (e.g., Mixtral)
    moe_match = re.search(r'(\d+(?:\.\d+)?)x(\d+(?:\.\d+)?)b?', param_str)
    if moe_match:
        try:
            return float(moe_match.group(1)) * float(moe_match.group(2))
        except (ValueError, AttributeError):
            pass
    
    # Look for patterns like "7b", "14b", "235b"
    param_match = re.search(r'(\d+(?:\.\d+)?)b', param_str)
    if param_match:
        try:
            return float(param_match.group(1))
        except (ValueError, AttributeError):
            pass
    
    # Look for trailing numbers like "-14" or "-72" (without 'b' suffix)
    trailing_match = re.search(r'-(\d+(?:\.\d+)?)(?:-|$)', param_str)
    if trailing_match:
        try:
            return float(trailing_match.group(1))
        except (ValueError, AttributeError):
            pass
    
    return None
    
# Extend the martian_models_json with extracted parameter sizes
for model in martian_models_json:
    model['size'] = extract_param_size(model.get('id'))    

In [5]:
# Extract model names from the new data structure
def extract_model_names():
    """Extract just the model ids from the new data structure"""
    return [model['id'] for model in martian_models_json]

# Group models by provider (if provider info is in id, e.g., 'provider/model')
def get_models_by_provider():
    providers = {}
    for model in martian_models_json:
        provider = model['id'].split('/')[0]
        if provider not in providers:
            providers[provider] = []
        providers[provider].append(model)
    return providers

# Find models by input cost
def find_models_by_cost(top_n=5, reverse=False):
    models_with_cost = [(model['id'], model.get('pricing', {}).get('prompt', float('inf')), model.get('pricing', {}).get('completion', float('inf'))) for model in martian_models_json]
    sorted_by_input = sorted(models_with_cost, key=lambda x: x[1], reverse=reverse)
    return sorted_by_input[:top_n]

# Find largest models by parameter count (if available)
def find_largest_models(top_n=5):
    models_with_params = [(model['id'], model['size']) for model in martian_models_json ]
    models_with_params = [(m[0], m[1]) for m in models_with_params if m[1] is not None]
    sorted_by_params = sorted(models_with_params, key=lambda x: x[1], reverse=True)
    return sorted_by_params[:top_n]

In [6]:
martian_models_names = extract_model_names()

print("=== MARTIAN AI MODELS ANALYSIS ===\n")

providers = get_models_by_provider()
print(f"Number of providers: {len(providers)}")

print(f"\nCheapest Models (input cost):")
models = find_models_by_cost(reverse=False)
for i, model in enumerate(models, 1):
    print( "  ", i, model[0], model[1], model[2])

print(f"\nMost Expensive Models (input cost):")
models = find_models_by_cost(reverse=True)
for i, model in enumerate(models, 1):
    print( "  ", i, model[0], model[1], model[2])

print(f"\nLargest Models:")
models = find_largest_models()
for i, model in enumerate(models, 1):
    print( "  ", i, model[0], model[1])

# Print top-level and nested JSON keys for inspection
def print_json_keys(obj, prefix=""):
    if isinstance(obj, dict):
        for key, value in obj.items():
            print(f"{prefix}{key}")
            print_json_keys(value, prefix + "  ")
    elif isinstance(obj, list) and obj:
        print_json_keys(obj[0], prefix + "[0] ")
print("\nJSON key structure:")
print_json_keys(martian_models_json)

print(f"\nSample model data structure:")
for i in range(3):
    model = martian_models_json[i]
    print(f"   {model}")

=== MARTIAN AI MODELS ANALYSIS ===

Number of providers: 43

Cheapest Models (input cost):
   1 meta-llama/llama-3.2-1b-instruct 0.000000005 0.00000001
   2 liquid/lfm-7b 0.00000001 0.00000001
   3 liquid/lfm-3b 0.00000002 0.00000002
   4 meta-llama/llama-3.1-8b-instruct 0.00000002 0.00000003
   5 meta-llama/llama-3.2-3b-instruct 0.00000002 0.00000002

Most Expensive Models (input cost):
   1 openai/o1-pro 0.00015 0.0006
   2 openai/gpt-4 0.00003 0.00006
   3 anthropic/claude-3-opus-20240229 0.000015 0.000075
   4 anthropic/claude-opus-4-0 0.000015 0.000075
   5 anthropic/claude-opus-4-1 0.000015 0.000075

Largest Models:
   1 mistralai/codestral-2508 2508.0
   2 mistralai/magistral-medium-2506 2506.0
   3 mistralai/magistral-small-2506 2506.0
   4 mistralai/devstral-small-2505 2505.0
   5 mistralai/codestral-2501 2501.0

JSON key structure:
[0] id
[0] pricing
[0]   prompt
[0]   completion
[0]   image
[0]   request
[0]   web_search
[0]   internal_reasoning
[0] added_at
[0] updated_at
[

## Analyze Response

In [25]:
def is_ground_truth_correct(answer: str, ground_truth: str) -> bool:
    """
    Returns True if the ground_truth appears as the final number in the answer, ignoring whitespace and punctuation.
    Accepts answers like '13', '13.', '13**', 'The answer is 13', '**13**', 'random text **13** random text', 'boxed{13}'.
    """
    # Remove trailing whitespace and punctuation
    answer_clean = answer.strip().rstrip('.!**')
    # Find all numbers in the answer (including negative numbers and those with commas)
    numbers = re.findall(r'-?[\d,]+', answer_clean)
    
    # Remove commas from the numbers for comparison
    numbers_clean = [num.replace(',', '') for num in numbers]

    answer_no_comma = answer.replace(",", "")

    return (ground_truth == answer_no_comma or
            "**"+ground_truth+"**" in answer or
            "boxed{"+ground_truth+"}" in answer or
            ""+ground_truth+" " in answer_no_comma  or
            ""+ground_truth+"." in answer_no_comma  or
            # Check that the last number matches within 0.001 tolerance
            (numbers_clean and abs(float(numbers_clean[-1]) - float(ground_truth)) < 0.001))

## Run Models

In [8]:
load_dotenv()
MARTIAN_API_KEY = os.getenv("MARTIAN_API_KEY")
assert MARTIAN_API_KEY, "API key not found. Please set MARTIAN_API_KEY in your .env file."

In [9]:
client = openai.OpenAI(
    base_url="https://api.withmartian.com/v1",
    api_key=MARTIAN_API_KEY,
    max_retries=0,  # Don't retry on timeout
    timeout=httpx.Timeout(60.0, connect=10.0)  # Separate connect timeout     
)

In [10]:
def run_model_inference(model_name, prompt, ground_truth, timeout=60):
    """
    Send a model a prompt, get the response, compare it to the ground_truth.
    Any model taking longer than 60 seconds to respond is consider to have failed or died. 
    Returns (answer, success). If timeout, returns ("TIMEOUT", False).
    """
    try:
        # The OpenAI client has its own timeout parameter
        response = client.chat.completions.create(
            model=model_name,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
            timeout=timeout  
        )
        answer = response.choices[0].message.content.strip()
        success = is_ground_truth_correct(answer, ground_truth)
        return answer, success
    
    except openai.APITimeoutError:
        return "TIMEOUT", False
    except openai.APIError as e:
        return f"Error: {str(e)}", False
    except Exception as e:
        return f"Error: {str(e)}", False

In [11]:
# Evaluate multiple models concurrently, scoring them based on progressive test success.
def evaluate_models_progressive(tests, max_workers=32):
    model_scores = []

    def score_model(model_name):
        score = 0
        for test_idx, (prompt, ground_truth) in enumerate(tests):
            answer, success = run_model_inference(model_name, prompt, ground_truth)
            
            if success:
                score = test_idx + 1
            else:
                # Check for error codes
                if "TIMEOUT" in str(answer):
                    score = -408  # HTTP timeout code
                    break
                elif isinstance(answer, str) and answer.startswith("Error:"):
                    if "400" in answer:
                        score = -400
                    else:
                        score = -999
                    break
                else:
                    # Just got wrong answer
                    break
        
        return {"model": model_name, "score": score}

    print(f"Evaluating {len(martian_models_json)} models concurrently with {max_workers} workers...")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_model = {
            executor.submit(score_model, model_name): model_name 
            for model_name in martian_models_names
        }
        
        for idx, future in enumerate(as_completed(future_to_model, timeout=120), 1):
            model_name = future_to_model[future]
            try:
                result = future.result(timeout=90)  # Add safety margin over API timeout
            except Exception as exc:
                result = {"model": model_name, "score": -999}
            print(f"[{idx}/{len(martian_models_json)}] {result['model']}: Score = {result['score']}")
            model_scores.append(result)
    
    return model_scores

## Generate prompt and response data for tasks

In [12]:
tasks = [
    "minimum",
    "maximum", 
    "sum",
    "difference",
    "product",
    "average",
    "exponential"
]

In [13]:
def generate_number_pairs(n_examples: int = 200, 
                         min_val: int = 1, 
                         max_val: int = 99,
                         include_negatives: bool = False,
                         seed: int = 42) -> List[Tuple[int, int]]:
    """Generate diverse number pairs for testing"""
    random.seed(seed)
    pairs = []
    
    # Strategy: Mix of different number ranges for variety
    ranges = [
        (1, 9),      # Single digits
        (10, 99),    # Double digits
        (1, 99),     # Mixed
    ]
    
    if include_negatives:
        ranges.extend([
            (-99, -1),   # Negative numbers
            (-50, 50),   # Mixed positive/negative
        ])
    
    examples_per_range = n_examples // len(ranges)
    
    for min_r, max_r in ranges:
        for _ in range(examples_per_range):
            x = random.randint(min_r, max_r)
            y = random.randint(min_r, max_r)
            pairs.append((x, y))
    
    # Fill remaining with random pairs from full range
    while len(pairs) < n_examples:
        x = random.randint(min_val, max_val)
        y = random.randint(min_val, max_val)
        pairs.append((x, y))
    
    random.shuffle(pairs)
    return pairs[:n_examples]

def calculate_ground_truth(x: int, y: int, operation: str) -> str:
    """Calculate the correct answer for a given operation"""
    if operation == "minimum":
        return str(min(x, y))
    elif operation == "maximum":
        return str(max(x, y))
    elif operation == "sum":
        return str(x + y)
    elif operation == "difference":
        return str(abs(x - y))  # Assuming absolute difference
    elif operation == "product":
        return str(x * y)
    elif operation == "average":
        return str((x + y) / 2)
    elif operation == "exponential":
        # Limit exponential to prevent overflow
        try:
            result = x ** y
            # Cap at reasonable size
            if result > 10**15:
                return "OVERFLOW"
            return str(result)
        except:
            return "OVERFLOW"
    else:
        raise ValueError(f"Unknown operation: {operation}")

def generate_synthetic_data(n_examples_per_task: int = 200) -> pd.DataFrame:
    """Generate synthetic data for all tasks"""
    
    all_data = []
    
    for task in tasks:
        #print(f"Generating {n_examples_per_task} examples for task: {task}")
        
        # For exponential, use smaller Y values to prevent overflow
        if task == "exponential":
            pairs = generate_number_pairs(n_examples_per_task, min_val=2, max_val=15)
            # Limit Y further for exponential
            pairs = [(x, min(y, 10)) for x, y in pairs]
        else:
            pairs = generate_number_pairs(n_examples_per_task)
        
        for x, y in pairs:
            prompt = f"Answer minimally: Given the numbers {x} and {y} calculate the {task}"
            ground_truth = calculate_ground_truth(x, y, task)
            
            # Skip overflow cases
            if ground_truth == "OVERFLOW":
                continue
                
            all_data.append({
                "task": task,
                "x": x,
                "y": y,
                "prompt": prompt,
                "ground_truth": ground_truth
            })
    
    df = pd.DataFrame(all_data)
    print(f"\nGenerated {len(df)} total examples across {len(tasks)} tasks")
    print(f"Examples per task: {df['task'].value_counts().to_dict()}")
    
    return df

In [14]:
# Generate the data
synthetic_data_df = generate_synthetic_data(n_examples_per_task=200)

# Display sample
print("Sample of generated data:")
pd.set_option('display.max_colwidth', None)  # Show full column content
pd.set_option('display.width', None)         # Don't wrap lines
sample_df = synthetic_data_df.groupby('task').head(2)
print(sample_df[['task', 'x', 'y', 'prompt', 'ground_truth']].to_string(index=False))

# Save to file
# synthetic_data_df.to_csv('synthetic_arithmetic_data.csv', index=False)


Generated 1311 total examples across 7 tasks
Examples per task: {'minimum': 200, 'maximum': 200, 'sum': 200, 'difference': 200, 'product': 200, 'average': 200, 'exponential': 111}
Sample of generated data:
       task  x  y                                                                  prompt    ground_truth
    minimum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the minimum               7
    minimum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the minimum              27
    maximum  8  7       Answer minimally: Given the numbers 8 and 7 calculate the maximum               8
    maximum 27 92     Answer minimally: Given the numbers 27 and 92 calculate the maximum              92
        sum  8  7           Answer minimally: Given the numbers 8 and 7 calculate the sum              15
        sum 27 92         Answer minimally: Given the numbers 27 and 92 calculate the sum             119
 difference  8  7    Answer minimally: Given the nu

## Find good research models

Scan the model, using the synthetic data, to find 5 models that can accurately perform the first 4 tasks 

In [19]:
# These models passed can accurately answer the first 4 tasks, but are closed source.
cached_good_closed_models = [
        'anthropic/claude-3-5-sonnet-20240620',
        'anthropic/claude-3-7-sonnet-latest',
        'anthropic/claude-3-haiku-20240307',
        'google/gemini-2.0-flash',
        'google/gemini-2.0-flash-001',
        'google/gemini-2.0-flash-lite',
        'google/gemini-2.0-flash-lite-001',
        'google/gemini-2.0-flash-lite-preview',
        'google/gemini-2.0-flash-lite-preview-02-05',
        'google/gemini-2.5-flash' 
        'x-ai/grok-3',
        'x-ai/grok-3-beta',
        'x-ai/grok-3-mini',
        'x-ai/grok-3-mini-beta',
        'x-ai/grok-code-fast-1',
        'deepinfra/google/gemini-2.0-flash-001',
        'deepinfra/google/gemini-2.5-flash',
        'deepinfra/google/gemini-2.5-pro',
        'liquid/lfm-3b', #  Liquid AI LFM-3B model closed. Open models 350M, 700M, 1.2B, and 2.6B under Apache 2.0 
        'mistralai/ministral-3b', # Not open source. Research allowed on 8B.
]

# Good open-source models 
cached_good_open_models = [
        'deepcogito/cogito-v2-preview-llama-109b-moe',
        'deepinfra/openai/gpt-oss-120b',
        'deepinfra/openai/gpt-oss-20b',
        'deepseek/deepseek-r1-distill-qwen-14b',
        'deepseek/deepseek-r1-distill-qwen-32b',
        'meta-llama/llama-3-70b-instruct',
        'meta-llama/llama-3.1-70b-instruct',
        'meta-llama/llama-3.2-90b-vision-instruct',
        'meta-llama/llama-3.3-70b-instruct',
        'meta-llama/llama-4-maverick'
]

In [21]:
# Scan models for accuracy on first 4 tasks. Task ~25mins
num_test_tasks = 4
num_per_task = 5
num_models_to_find = 10

# 0. Search for good open source models (containing 'llama', 'qwen', 'oss')
def get_open_models(possible_models):
    open_source_keywords = {'meta', 'llama', 'qwen', 'oss'}
    return [model for model in possible_models if any(keyword in model['id'] for keyword in open_source_keywords)]

# 1. Filter models <= max params
def get_small_models(possible_models, max_params):
    small_models = []
    for model in possible_models:
        param_size = model['size']
        if param_size is not None and param_size <= max_params:
            small_models.append(model)
    return small_models

# 2. Select first few tasks
def get_scan_tasks():
    return tasks[:num_test_tasks]

# 3. For each model, check accuracy on a few instances of each task
def scan_model_accuracy(model, df, scan_tasks):
    results = {}
    for task in scan_tasks:
        # Select 5 examples for this task
        task_df = df[df['task'] == task].sample(n=num_per_task, random_state=42)
        correct = 0
        for _, row in task_df.iterrows():
            try:
                answer, success = run_model_inference(model['id'], row["prompt"], row["ground_truth"])
            except Exception as e:
                answer = f"Error: {str(e)}"
                success = False
            print(f"Model: {model['id']}\nOutput/Error: {answer}\nSuccess: {success}\n{'-'*40}")

            if success:
                correct += 1
        results[task] = correct
    return results

# 4. Find top few models that get all questions correct for each task
def get_top_models(small_models,scan_tasks):
    top_models = []

    for model in small_models:
        acc = scan_model_accuracy(model, synthetic_data_df, scan_tasks)
        if all(v == 5 for v in acc.values()):
            top_models.append(model['id'])
        if len(top_models) >= num_models_to_find:
            break

    return top_models

use_cached_models = True
if use_cached_models:
    top_models = cached_good_open_models
else:
    open_models = get_open_models(martian_models_json)
    small_models = open_models # get_small_models(open_models, 3.0)
    print(f"Found {len(small_models)} models")

    scan_tasks = get_scan_tasks()
    top_models = get_top_models(small_models, scan_tasks)

In [22]:
print(f"Some models with perfect accuracy on {num_per_task} instances of first {num_test_tasks} tasks:")
for model_name in top_models :
    print( "  ", model_name )

Some models with perfect accuracy on 5 instances of first 4 tasks:
   deepcogito/cogito-v2-preview-llama-109b-moe
   deepinfra/openai/gpt-oss-120b
   deepinfra/openai/gpt-oss-20b
   deepseek/deepseek-r1-distill-qwen-14b
   deepseek/deepseek-r1-distill-qwen-32b
   meta-llama/llama-3-70b-instruct
   meta-llama/llama-3.1-70b-instruct
   meta-llama/llama-3.2-90b-vision-instruct
   meta-llama/llama-3.3-70b-instruct
   meta-llama/llama-4-maverick


## Inspect models

We want to avoid models that use a python sandbox to do math. This is sometimes visible in the answer detail. View sample answers here. 

In [27]:
# For each of the top models, ask one instance of one task and show the answer
def inspect_model_answers():
    for model_name in top_models:
        task = random.choice(tasks)
        example_df = synthetic_data_df[synthetic_data_df['task'] == task].sample(n=1, random_state=42).iloc[0]
        print(f"\nModel: {model_name}\nTask: {task}\nPrompt: {example_df['prompt']}\nGround Truth: {example_df['ground_truth']}")
        answer, success = run_model_inference(model_name, example_df['prompt'], example_df['ground_truth'])
        print(f"Answer: {answer}\nSuccess: {success}\n{'-'*60}")

#inspect_model_answers()