In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's lib.* helpers) are re-imported before each cell runs
# and edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<!-- Style overrides for ipywidget output: make the widget output background
     transparent and take the widget text color and font size from the
     VS Code editor CSS variables. -->
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [5]:
import asyncio
from itertools import islice
from lib.nyt_connections import get_connections_games, get_connections_tasks
from lib.tasks import ChatCompletionParams, get_task_results
import openai
import os


# Number of Connections puzzles every model is benchmarked on.
NUM_BENCHMARK_TASKS = 200

games = get_connections_games()
# islice stops after the first NUM_BENCHMARK_TASKS tasks produced by
# get_connections_tasks, so the benchmark set is a fixed-size prefix.
benchmark_tasks = list(
    islice(
        get_connections_tasks(games, parse_answers_liberally=True),
        NUM_BENCHMARK_TASKS,
    )
)

# Show the size and a short preview only: echoing the whole list floods the
# cell output with hundreds of near-identical prompts.
print(f"{len(benchmark_tasks)} benchmark tasks loaded")
benchmark_tasks[:3]

[Task(messages=[{'role': 'user', 'content': 'Find groups of four items that share something in common. Output them in the following format: four total lines. On each line, there should be four comma-separated items. No additional text (like group titles or descriptions) should be in the output. Also, there should not be anything in your output before or after the solution.\nWords:\n\nsnow\nlevel\nshift\nkayak\nheat\ntab\nbucks\nreturn\njazz\nhail\noption\nrain\nsleet\nracecar\nmom\nnets'}], grader=<function get_grader.<locals>.grader at 0x132876de0>),
 Task(messages=[{'role': 'user', 'content': 'Find groups of four items that share something in common. Output them in the following format: four total lines. On each line, there should be four comma-separated items. No additional text (like group titles or descriptions) should be in the output. Also, there should not be anything in your output before or after the solution.\nWords:\n\nPUMP\nFOOT\nTIME\nSEA\nLEAGUE\nLOAFER\nWHY\nUS\nBOOT\nY

In [4]:
fireworks = openai.AsyncOpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.getenv("FIREWORKS_API_KEY"),
)
openrouter = openai.AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY")
)
together = openai.AsyncOpenAI(
    base_url="https://api.together.xyz/v1", api_key=os.getenv("TOGETHER_API_KEY")
)

results = await asyncio.gather(
    get_task_results(
        tasks=benchmark_tasks,
        client=fireworks,
        model="accounts/fireworks/models/deepseek-r1",
        params=ChatCompletionParams(
            max_tokens=2**17,
            logprobs=True,
            top_logprobs=5,
        ),
        pbar_desc="deepseek-r1",
        prices=(8.0, 8.0),
    ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=fireworks,
    #     model="accounts/fireworks/models/deepseek-r1",
    #     params=ChatCompletionParams(
    #         logit_bias={128799: -4},  # type: ignore
    #         max_tokens=2**17,
    #         logprobs=True,
    #         temperature=0.6,
    #         top_logprobs=5,
    #     ),
    #     pbar_desc="deepseek-r1:high:0.6",
    #     prices=(8.0, 8.0),
    # ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=fireworks,
    #     model="accounts/fireworks/models/deepseek-r1",
    #     params=ChatCompletionParams(
    #         max_tokens=2**17,
    #         logprobs=True,
    #         temperature=0.6,
    #         top_logprobs=5,
    #     ),
    #     pbar_desc="deepseek-r1:0.6",
    #     prices=(8.0, 8.0),
    # ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=fireworks,
    #     model="accounts/fireworks/models/deepseek-r1",
    #     params=ChatCompletionParams(
    #         max_tokens=2**17,
    #         logprobs=True,
    #         temperature=0.85,
    #         top_logprobs=5,
    #     ),
    #     pbar_desc="deepseek-r1:0.85",
    #     prices=(8.0, 8.0),
    # ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-1.5b",
        pbar_desc="r1-qwen-1.5b",
        prices=(0.18, 0.18),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="qwen/qwen-2.5-7b-instruct",
        pbar_desc="qwen-2.5-7b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["DeepInfra"], "allow_fallbacks": False}},
        ),
        prices=(0.0025, 0.005),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-14b",
        pbar_desc="r1-qwen-14b",
        prices=(1.6, 1.6),
    ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=openrouter,
    #     model="deepseek/deepseek-r1-distill-qwen-14b",
    #     params=ChatCompletionParams(
    #         temperature=0.6,
    #     ),
    #     pbar_desc="r1-qwen-14b:0.6",
    #     prices=(1.6, 1.6),
    # ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-32b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["DeepInfra"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-qwen-32b",
        prices=(0.12, 0.18),
    ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=openrouter,
    #     model="deepseek/deepseek-r1-distill-qwen-32b",
    #     params=ChatCompletionParams(
    #         temperature=0.6,
    #         extra_body={"provider": {"order": ["DeepInfra"], "allow_fallbacks": False}},
    #     ),
    #     pbar_desc="r1-qwen-32b:0.6",
    #     prices=(0.12, 0.18),
    # ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-llama-70b:free",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["Targon"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-llama-70b:targon",
        prices=(0.0, 0.0),
    ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=openrouter,
    #     model="deepseek/deepseek-r1-distill-llama-70b:free",
    #     params=ChatCompletionParams(
    #         logit_bias={"</think>": 10},
    #         extra_body={"provider": {"order": ["Targon"], "allow_fallbacks": False}},
    #     ),
    #     pbar_desc="r1-llama-70b:low",
    #     prices=(0.0, 0.0),
    # ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=openrouter,
    #     model="deepseek/deepseek-r1-distill-llama-70b:free",
    #     params=ChatCompletionParams(
    #         logit_bias={"</think>": -2},
    #         extra_body={"provider": {"order": ["Targon"], "allow_fallbacks": False}},
    #     ),
    #     pbar_desc="r1-llama-70b:high",
    #     prices=(0.0, 0.0),
    # ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-llama-70b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["SambaNova"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-llama-70b:samba",
        prices=(0.7, 1.4),
    ),
    # get_task_results(
    #     tasks=benchmark_tasks,
    #     client=openrouter,
    #     model="deepseek/deepseek-r1-distill-llama-70b",
    #     params=ChatCompletionParams(
    #         temperature=0.6,
    #         extra_body={"provider": {"order": ["SambaNova"], "allow_fallbacks": False}},
    #     ),
    #     pbar_desc="r1-llama-70b:samba:0.6",
    #     prices=(0.7, 1.4),
    # ),
)

deepseek-r1:   0%|          | 0/100 [00:00<?, ?it/s]

r1-qwen-1.5b:   0%|          | 0/100 [00:00<?, ?it/s]

qwen-2.5-7b:   0%|          | 0/100 [00:00<?, ?it/s]

r1-qwen-14b:   0%|          | 0/100 [00:00<?, ?it/s]

r1-qwen-32b:   0%|          | 0/100 [00:00<?, ?it/s]

r1-llama-70b:targon:   0%|          | 0/100 [00:00<?, ?it/s]

r1-llama-70b:samba:   0%|          | 0/100 [00:00<?, ?it/s]

r1-qwen-14b: 100%|██████████| 100/100 [00:21<00:00,  4.55it/s, completion_tokens=3930, prompt_tokens=136, reward=0.237, spend=$0.00]
r1-qwen-1.5b: 100%|██████████| 100/100 [00:21<00:00,  3.56it/s, completion_tokens=3917, prompt_tokens=136, reward=0, spend=$0.00]
qwen-2.5-7b: 100%|██████████| 100/100 [00:21<00:00,  6.17it/s, completion_tokens=43, prompt_tokens=140, reward=0.0425, spend=$0.00]
r1-qwen-32b: 100%|██████████| 100/100 [00:21<00:00,  8.39it/s, completion_tokens=3258, prompt_tokens=136, reward=0.292, spend=$0.00]
r1-llama-70b:samba: 100%|██████████| 100/100 [00:21<00:00,  9.52it/s, completion_tokens=3013, prompt_tokens=135, reward=0.398, spend=$0.00]
r1-llama-70b:targon: 100%|██████████| 100/100 [00:21<00:00,  8.98it/s, completion_tokens=2772, prompt_tokens=137, reward=0.41, spend=$0.00]
deepseek-r1: 100%|██████████| 100/100 [00:22<00:00,  6.52it/s, completion_tokens=4857, prompt_tokens=139, reward=0.667, spend=$0.00]


In [18]:
# Debug helper: surface the first exception recorded for the run at index 1 of
# `results`, so its full error message is shown. next() stops at the first
# hit instead of collecting every exception into a list, and the None default
# avoids an unrelated IndexError when the run recorded no exceptions at all.
first_exception = next(
    (exception for result in results[1] for exception in result.exceptions),
    None,
)
if first_exception is None:
    print("No exceptions recorded for results[1]")
else:
    raise first_exception

BadRequestError: Error code: 400 - {'error': {'object': 'error', 'type': 'invalid_request_error', 'message': "Input should be a valid integer, unable to parse string as an integer, field: 'logit_bias.</think>.[key]', value: '</think>'"}}

In [15]:
from typing import Any, Callable


def wrap_bound_method(
    method: Callable, wrapper: Callable[[Callable], Callable]
) -> Callable:
    """Replace a bound method on its owner with a wrapped version.

    The underlying function of ``method`` is passed to ``wrapper``; the result
    is bound to the same object and stored on that object as an attribute
    that shadows the original. Only the object ``method`` is bound to is
    patched; its class and other instances are left untouched.

    Args:
        method: A bound method (must expose ``__self__`` and ``__func__``).
        wrapper: Callable that takes the original unbound function and returns
            the replacement function; the replacement receives the bound
            object as its first positional argument.

    Returns:
        The new bound method that was installed on the object.

    Raises:
        AttributeError: if ``method`` is not a bound method.
        ValueError: if no attribute referring to the method can be located.
    """
    import types

    # Get the bound instance and the original function
    bound_instance = method.__self__
    original_func = method.__func__

    # Create the wrapped function and bind it to the same instance
    bound_method = types.MethodType(wrapper(original_func), bound_instance)

    def refers_to_original(candidate: Any) -> bool:
        # Class namespaces store plain functions (matched by identity), while
        # instance namespaces and classmethod/staticmethod objects expose the
        # function through ``__func__``.
        return (
            candidate is original_func
            or getattr(candidate, "__func__", None) is original_func
        )

    # Namespaces to search, most specific first. Objects without a __dict__
    # (e.g. using __slots__) simply contribute an empty namespace.
    namespaces = [getattr(bound_instance, "__dict__", {})]
    if isinstance(bound_instance, type):
        # Bound to a class (classmethod): also look at its base classes.
        namespaces.extend(vars(klass) for klass in bound_instance.__mro__[1:])
    # Walk the whole MRO so that inherited methods are found as well.
    namespaces.extend(vars(klass) for klass in type(bound_instance).__mro__)

    attr_name = next(
        (
            name
            for namespace in namespaces
            for name, value in namespace.items()
            if refers_to_original(value)
        ),
        None,
    )

    if attr_name is None:
        # Last resort: patch the attribute that carries the function's own
        # name (covers methods that are reachable only dynamically).
        fallback_name = getattr(original_func, "__name__", None)
        if fallback_name is None or not hasattr(bound_instance, fallback_name):
            raise ValueError("Method not found and cannot patch directly")
        attr_name = fallback_name

    setattr(bound_instance, attr_name, bound_method)
    return bound_method


def wrapper(f: Callable) -> Callable:
    """Wrap an async callable so that every call is logged before it runs.

    Args:
        f: The coroutine function to wrap.

    Returns:
        A coroutine function that prints the wrapped function's name together
        with the positional and keyword arguments, then awaits ``f`` and
        returns its result.
    """
    import functools

    # functools.wraps keeps f's __name__/__doc__ on the wrapper, so wrapping
    # the same method again (e.g. after re-running the cell) still logs the
    # real method name instead of "wrapped".
    @functools.wraps(f)
    async def wrapped(*args: Any, **kwargs: Any) -> Any:
        print(f"Calling {f.__name__} with args: {args} and kwargs: {kwargs}")
        return await f(*args, **kwargs)

    return wrapped


# Patch `create` on the Together client's chat-completions resource so that
# every call is logged by `wrapper` before being forwarded.
wrap_bound_method(together.chat.completions.create, wrapper)
# Smoke test for the patch: the empty messages/model are placeholders, so the
# request itself is not expected to succeed; what matters is that the
# "Calling create ..." line from the wrapper is printed first.
await together.chat.completions.create(messages=[], model="")

Calling create with args: (<openai.resources.chat.completions.AsyncCompletions object at 0x14d273560>,) and kwargs: {'messages': [], 'model': ''}


NotFoundError: Error code: 404 - {'id': '91278fde2f885306', 'error': {'message': 'Unable to access model . Please visit https://api.together.ai/models to view the list of supported models.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_available'}}