In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dataclasses import dataclass
from datasets import load_dataset

In [None]:
from common_utils.eval import EvalClient, EvalRunner
from common_utils.image_utils import show_image

In [None]:
from common_utils.buckets import CategoryEvalRunner

In [None]:
from common_utils.judge import JudgeClient, LLMJudge

#### Generate Different Model Outputs

In [None]:
@dataclass
class EvalConfig:
    """Connection and generation settings for the model under evaluation.

    All fields have defaults, so ``EvalConfig()`` yields a ready-to-use
    configuration pointing at a server on localhost port 8001.
    """

    # Endpoint of the server hosting the evaluated model.
    base_url: str = "http://localhost:8001/v1"
    # Placeholder key (presumably the local server does not validate it — confirm).
    api_key: str = "EMPTY"
    # Model identifier; a previously tried alternative is "Qwen/Qwen2.5-VL-3B-Instruct".
    model: str = "google/gemma-3-27b-it"
    # Sampling temperature passed along with the config.
    temperature: float = 0.0
    # Upper bound on generated tokens per answer.
    max_tokens: int = 128
    
# Default evaluation settings. NOTE: despite its name, this is an EvalConfig
# instance (plain settings), not a client object.
eval_client = EvalConfig()

@dataclass
class JudgeConfig:
    """Connection and retry settings for the judge model.

    All fields have defaults, so ``JudgeConfig()`` yields a ready-to-use
    configuration pointing at a server on localhost port 8000.
    """

    # Endpoint of the server hosting the judge model.
    base_url: str = "http://localhost:8000/v1"
    # Placeholder key (presumably the local server does not validate it — confirm).
    api_key: str = "EMPTY"
    # Model identifier of the judge.
    model: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    # Retry budget for judge requests.
    max_retries: int = 3
    # Backoff value used between retries (units/semantics not visible here — TODO confirm).
    backoff: float = 0.6
    # Upper bound on generated tokens per judgement.
    max_tokens: int = 128

# Default judge settings. NOTE: despite its name, this is a JudgeConfig
# instance (plain settings), not a client object.
judge_client = JudgeConfig()


In [None]:
# Show the judge settings as the cell output to confirm the active values.
judge_client

### Test the Evaluation

In [None]:
# Stream the VisionArena-Chat train split (streaming=True, so rows are read
# lazily) and keep only single-turn conversations labelled as English.
dataset = load_dataset(
    "lmarena-ai/VisionArena-Chat",
    split="train",
    streaming=True,
)
en_dataset = dataset.filter(
    lambda row: row["num_turns"] == 1 and row["language"] == "English"
)
# Work with the first 100 matching rows only.
small_ds = en_dataset.take(100)

In [None]:
# Wire the evaluation pipeline: build an EvalClient from the configured
# endpoint and API key, then an EvalRunner that receives the settings object
# and that client.
# NOTE(review): `eval_client` is the EvalConfig instance, not a client object.
client = EvalClient(eval_client.base_url, eval_client.api_key)
runner = EvalRunner(eval_client, client)

In [None]:
# Choose the categories to bucket samples into and build the per-category runner.
categories = ["ocr"]  # add/remove as needed
# NOTE(review): k=10 presumably caps each category at 10 samples, id_key names
# the sample field used as identifier, and exclusive=True presumably keeps a
# sample in at most one category — confirm against CategoryEvalRunner.
cat_runner = CategoryEvalRunner(runner, k=10, id_key="conversation_id", exclusive=True)

### Test one Sample

In [None]:
# # 2) pick one sample
# it = iter(small_ds)
# first = next(it)  # advance as you like

In [None]:
# show_image(first.get("images")[0]['bytes'])

In [None]:
# # 4) run eval on the sample
# out = runner.run_on_sample(first)
# print("=== Eval Output ===")
# print("conv_id:", out.conversation_id)
# print("dataset_model:", out.dataset_model)
# print("user_question:", out.user_question[:120], "...")
# print("reference_answer:", out.reference_answer[:120], "...")
# print("model_answer:", out.model_answer[:200], "...")
# print("latency:", out.latency_sec, "sec")
# show_image(out.images[0].get('bytes'))

### Test one batch

In [None]:
# Sort the streamed samples into the selected categories; the result is
# indexed by category name further down (e.g. buckets["ocr"]).
buckets = cat_runner.build_buckets(iter(small_ds), categories)

In [None]:
# Smoke-test the eval pipeline on the first sample of the "ocr" bucket.
ocr_samples = buckets["ocr"]
if len(ocr_samples) == 0:
    # Fail with an actionable message instead of a bare IndexError from [0].
    raise IndexError(
        'No samples landed in the "ocr" bucket; take more rows from the dataset '
        "or check the category name."
    )

out = runner.run_on_sample(ocr_samples[0])
print("=== Eval Output ===")
print("conv_id:", out.conversation_id)
print("dataset_model:", out.dataset_model)
# The full text is printed, so no trailing "..." is appended: it would wrongly
# suggest the values were truncated.
print("user_question:", out.user_question)
print("reference_answer:", out.reference_answer)
print("model_answer:", out.model_answer)
print("latency:", out.latency_sec, "sec")

# Only render when the result actually carries an image; indexing [0] on an
# empty or missing image list would otherwise raise.
if out.images:
    show_image(out.images[0].get('bytes'))
else:
    print("(no image attached to this sample)")

In [None]:
# for i in range(0,len(buckets['ocr'])):
#     print("***************************************************************************************************")
#     # print(i)
#     for k, v in buckets['ocr'][i].items():
#         if k != "images":   # skip the "images" field
#             print(k, v)