In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [20]:
from src.model_client import ChatMessage, MessageRole, Prompt, LLMResponse, call_model, ModelClient

In [None]:
qwen_ok = "Qwen/Qwen3-Next-80B-A3B-Instruct"

In [12]:
ll3p2_3b = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
gemma_3n = "google/gemma-3n-e2b-it:free"
cheap_model = gemma_3n

In [26]:
ll_base = "meta-llama/llama-3.1-405b"

In [27]:
message = ChatMessage(role=MessageRole.USER, content="Hello, world!")
out = await call_model(ll_base, [message], logprobs=True, max_tokens=1)

In [28]:
out

LLMResponse(model_id='meta-llama/llama-3.1-405b', completion='', stop_reason=None, cost=None, duration=1.2952570915222168, api_duration=1.2949938774108887, usage={'prompt_tokens': 11, 'completion_tokens': 0, 'total_tokens': 11}, finish_reason=None)

In [31]:
async def test_logprobs(model_id: str):
    client = ModelClient()
    
    messages = [
        ChatMessage(role=MessageRole.USER, content="Answer with True or False: Is 2+2=4?")
    ]
    
    response = await client.openrouter_client.chat.completions.create(
        model=model_id,
        messages=[{"role": m.role.value, "content": m.content} for m in messages],
        max_tokens=2,
        temperature=0,
        logprobs=True,
        top_logprobs=5
    )

    return response

In [32]:
response = await test_logprobs(ll_base)

In [33]:
response

ChatCompletion(id='gen-1762255735-yYArQydtTFDuk467eMeB', choices=[Choice(finish_reason=None, index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning=None), native_finish_reason=None)], created=1762255735, model='meta-llama/llama-3.1-405b', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=0, prompt_tokens=21, total_tokens=21, completion_tokens_details=None, prompt_tokens_details=None), provider='Hyperbolic')

In [42]:
# If above still gives 0 tokens, try instruct model with direct question:

client = ModelClient()
response2 = await client.openrouter_client.chat.completions.create(
    model="meta-llama/llama-3.1-405b-instruct",
    messages=[{"role": "user", "content": "Say only: True"}],
    max_tokens=5,
    temperature=0,
    logprobs=True,
    top_logprobs=5,
)

In [43]:
response2

ChatCompletion(id='gen-1762256640-AIbY9CJICTYUQhOOhuSO', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='True', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning=None), native_finish_reason='stop')], created=1762256640, model='meta-llama/llama-3.1-405b-instruct', object='chat.completion', service_tier=None, system_fingerprint='', usage=CompletionUsage(completion_tokens=2, prompt_tokens=14, total_tokens=16, completion_tokens_details=None, prompt_tokens_details=None), provider='Hyperbolic')

In [56]:
import os
import openai

system_content = "You are a gourmet. Be descriptive and helpful."
user_content = "Tell me about Chinese hotpot"

client = openai.OpenAI(
    api_key=os.getenv("HYPERBOLIC_API_KEY"),
    base_url="https://api.hyperbolic.xyz/v1",
    )

chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    messages=[
        #{"role": "system", "content": system_content},
        {"role": "user", "content": "Does 1+2=4? Answer with True or False."},
    ],
    temperature=0.7,
    max_tokens=2,
    logprobs=True,
    top_logprobs=5,
)

response = chat_completion.choices[0].message.content
print("Response:\n", response)

Response:
 False


In [57]:
chat_completion

ChatCompletion(id='oHuh2F8-4YNCb4-9993f878293f5012', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='False', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None))], created=1762257832, model='meta-llama/Meta-Llama-3.1-70B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=2, prompt_tokens=49, total_tokens=51, completion_tokens_details=None, prompt_tokens_details=None))

In [59]:
test_msgs =[
        #{"role": "system", "content": system_content},
        {"role": "user", "content": "Does 1+2=4? Answer with True or False."},
    ],

In [67]:
"""
Hyperbolic API wrapper with logprobs support via raw REST API.
"""
import os
import httpx
from typing import Dict, List, Optional, Any


class HyperbolicLogprobsClient:
    """Async client for Hyperbolic API with logprobs extraction."""
    
    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("HYPERBOLIC_API_KEY")
        self.base_url = "https://api.hyperbolic.xyz/v1"
        self.client = httpx.AsyncClient(timeout=60.0)
    
    async def chat_completion_raw(
        self,
        messages: List[Dict[str, str]],
        model: str,
        max_tokens: int = 1,
        temperature: float = 0.0,
        # logprobs: bool = True,
        # top_logprobs: int = 2,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Raw chat completion call - returns full JSON response.
        
        Args:
            messages: Chat messages in OpenAI format
            model: Model ID
            max_tokens: Max tokens to generate
            temperature: Sampling temperature
            logprobs: Number of logprobs to return (integer!)
            top_logprobs: Number of top alternatives
            **kwargs: Additional API parameters
            
        Returns:
            Full JSON response from API
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        
        payload = {
            "messages": messages,
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            # "logprobs": logprobs,
            #"top_logprobs": top_logprobs,
            "stream": False,
            **kwargs,
        }
        
        
        response = await self.client.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload,
        )
        response.raise_for_status()
        
        return response.json()
    
    def extract_label_logprobs(
        self,
        response_data: Dict[str, Any],
        labels: List[str] = ["True", "False"],
    ) -> Dict[str, float]:
        """
        Extract log probabilities for specific labels from API response.
        
        Args:
            response_data: Raw JSON response from chat_completion_raw()
            labels: Labels to extract probabilities for
            
        Returns:
            Dict mapping label -> log probability
            
        Raises:
            ValueError: If logprobs cannot be extracted
        """
        logprobs_data = response_data["choices"][0]["logprobs"]
        
        if logprobs_data is None:
            raise ValueError("API returned logprobs=None")
        
        # Get first token's logprobs - try different formats
        if "content" in logprobs_data:
            # OpenAI-style format
            token_logprobs = logprobs_data["content"][0]["top_logprobs"]
        elif "top_logprobs" in logprobs_data:
            # Alternative format
            token_logprobs = logprobs_data["top_logprobs"][0]
        else:
            raise ValueError(f"Unexpected logprobs format: {logprobs_data}")
        
        # Build dict mapping label -> logprob
        result = {}
        for item in token_logprobs:
            token = item["token"].strip()
            if token in labels:
                result[token] = item["logprob"]
        
        # Check if we got all labels
        if len(result) != len(labels):
            missing = set(labels) - set(result.keys())
            raise ValueError(f"Missing logprobs for labels: {missing}")
        
        return result
    
    async def get_label_logprobs(
        self,
        messages: List[Dict[str, str]],
        model: str,
        labels: List[str] = ["True", "False"],
        **kwargs,
    ) -> Dict[str, float]:
        """
        Convenience method: get logprobs for labels in one call.
        """
        response = await self.chat_completion_raw(messages, model, **kwargs)
        return self.extract_label_logprobs(response, labels)
    
    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()

In [68]:
client = HyperbolicLogprobsClient()
response = await client.chat_completion_raw(test_msgs, ll_base, logprobs=False)

HTTPStatusError: Client error '422 Unprocessable Entity' for url 'https://api.hyperbolic.xyz/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422

In [115]:
"""
Minimal Hyperbolic API client - just get it working.
"""
import os
import httpx
from typing import Dict, List, Any


async def call_hyperbolic_raw(
    messages: List[Dict[str, str]],
    model: str,
    api_key: str = None,
) -> Dict[str, Any]:
    """
    Minimal raw API call to Hyperbolic.
    
    Returns the full JSON response.
    """
    api_key = api_key or os.getenv("HYPERBOLIC_API_KEY")
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    
    # Minimal payload matching their example
    payload = {
        "messages": messages,
        "model": model,
        "max_tokens": 10,
        "temperature": 0.0,
        "logprobs": True,
        "echo": True,
        "top_logprobs": 2,
    }
    
    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(
            "https://api.hyperbolic.xyz/v1/chat/completions",
            headers=headers,
            json=payload,
        )
        response.raise_for_status()
        return response.json()


# Test it
async def test():
    messages = [
        {"role": "user", "content": "Does 1+2=4? Reason about this then answer True or False."}
    ]
    
    result = await call_hyperbolic_raw(
        messages=messages,
        # model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        model="Qwen/Qwen3-Next-80B-A3B-Instruct"# "Qwen/Qwen2.5-Coder-32B-Instruct",#"openai/gpt-oss-20b", #meta-llama/Meta-Llama-3.1-405B-Instruct" 
        #"Qwen/Qwen3-Next-80B-A3B-Instruct"
    )
    
    print(result)
    return result

# Run: asyncio.run(test())

In [116]:
r = await test()
print(r['choices'][0]['logprobs'])

{'id': 'ddf4bee8930a486d9384801025fbf55a', 'object': 'chat.completion', 'created': 1762264452, 'model': 'Qwen/Qwen3-Next-80B-A3B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "Let's reason step by step:\n\nWe are asked", 'reasoning_content': None, 'tool_calls': None}, 'logprobs': {'content': [{'token': 'Let', 'bytes': [76, 101, 116], 'logprob': -0.0005292683490552008, 'top_logprobs': [{'token': 'Let', 'bytes': [76, 101, 116], 'logprob': -0.0005292683490552008}, {'token': 'No', 'bytes': [78, 111], 'logprob': -7.629513740539551}]}, {'token': "'s", 'bytes': [39, 115], 'logprob': -0.007975872606039047, 'top_logprobs': [{'token': "'s", 'bytes': [39, 115], 'logprob': -0.007975872606039047}, {'token': '’s', 'bytes': [226, 128, 153, 115], 'logprob': -4.836585521697998}]}, {'token': ' reason', 'bytes': [32, 114, 101, 97, 115, 111, 110], 'logprob': -0.00014780859055463225, 'top_logprobs': [{'token': ' reason', 'bytes': [32, 114, 101, 97, 115, 111, 110], 'logprob

None


In [93]:
from together import Together
import json

client = Together()

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    messages=[
        {
            "role": "system",
            "content": "What are the top 3 things to do in New York?",
        }
    ],
    max_tokens=3,
    logprobs=2,
)

print(json.dumps(completion.model_dump(), indent=1))

{
 "id": "oHvBvpn-4YNCb4-9994822a3c53989b",
 "object": "chat.completion",
 "created": 1762263471,
 "model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
 "choices": [
  {
   "index": 0,
   "logprobs": {
    "tokens": [
     "The",
     " city",
     " that"
    ],
    "token_logprobs": [
     -0.640625,
     -0.044921875,
     -0.00063323975
    ],
    "token_ids": [
     791,
     3363,
     430
    ],
    "top_logprobs": [
     {
      "The": -0.640625,
      "New": -0.765625
     },
     {
      " city": -0.044921875,
      " top": -3.796875
     },
     {
      " that": -0.00063323975,
      " of": -7.375
     }
    ]
   },
   "seed": 1547669890996867600,
   "finish_reason": "length",
   "message": {
    "role": "assistant",
    "content": "The city that",
    "tool_calls": []
   }
  }
 ],
 "prompt": [],
 "usage": {
  "prompt_tokens": 43,
  "completion_tokens": 3,
  "total_tokens": 46,
  "cached_tokens": 0
 }
}


In [91]:
completion.choices[0]

ChatCompletionChoicesData(index=0, logprobs=LogprobsPart(tokens=['The', ' city', ' that', ' never', ' sleeps', '!', ' New', ' York', ' is', ' a'], token_logprobs=[-0.640625, -0.044921875, -0.00063323975, -2.0742416e-05, -0.00012874603, -0.58203125, -0.0026397705, -5.9604645e-07, -0.69921875, -0.013061523], token_ids=[791, 3363, 430, 2646, 72490, 0, 1561, 4356, 374, 264], top_logprobs=[{'The': -0.640625}, {' city': -0.044921875}, {' that': -0.00063323975}, {' never': -2.0742416e-05}, {' sleeps': -0.00012874603}, {'!': -0.58203125}, {' New': -0.0026397705}, {' York': -5.9604645e-07}, {' is': -0.69921875}, {' a': -0.013061523}]), seed=1335840185419321600, finish_reason=<FinishReason.Length: 'length'>, message=ChatCompletionMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The city that never sleeps! New York is a', tool_calls=[]))