In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dataclasses import dataclass
from datasets import load_dataset

In [3]:
from common_utils.eval import EvalClient, EvalRunner
from common_utils.image_utils import show_image

In [4]:
from common_utils.buckets import CategoryEvalRunner

In [5]:
from common_utils.judge import JudgeClient, LLMJudge

#### Generate Different Model Outputs

In [None]:
@dataclass
class EvalConfig:
    """Settings for the model under evaluation.

    Holds the endpoint, credentials, model name and sampling parameters
    that the evaluation cells read when querying the model being tested.
    """

    # Endpoint of the server hosting the evaluated model.
    base_url: str = "http://localhost:8010/v1"
    # Placeholder key — presumably the local server does not validate it; confirm.
    api_key: str = "EMPTY"
    # Model currently under test. Alternatives that were tried:
    #   "google/gemma-3-27b-it"
    #   "moonshotai/Kimi-VL-A3B-Thinking-2506"
    #   "Qwen/Qwen2.5-VL-7B-Instruct"
    model: str = "Qwen/Qwen2.5-VL-3B-Instruct"
    # Sampling parameters for generation.
    temperature: float = 0.5
    max_tokens: int = 2048


# NOTE: despite the name, this is a configuration object, not a client.
eval_client = EvalConfig()

@dataclass
class JudgeConfig:
    """Settings for the LLM used as the judge.

    Holds the endpoint, credentials, model name, retry policy and
    generation parameters read by the judging cells.
    """

    # Endpoint of the server hosting the judge model.
    base_url: str = "http://localhost:8000/v1"
    # Placeholder key — presumably the local server does not validate it; confirm.
    api_key: str = "EMPTY"
    model: str = "google/gemma-3-27b-it"
    # Retry policy handed to the judge (backoff units are presumably seconds — confirm).
    max_retries: int = 3
    backoff: float = 0.6
    # Generation parameters for the judge.
    max_tokens: int = 2048
    temperature: float = 0.0


# NOTE: despite the name, this is a configuration object, not a client.
judge_client = JudgeConfig()


### Test the Evaluation

In [7]:
# Dataset: stream VisionArena-Chat and keep only single-turn English records.
LEN_SMALL_DS = 2000  # size of the working subset taken from the filtered stream

dataset = load_dataset(
    "lmarena-ai/VisionArena-Chat",
    split="train",
    streaming=True,
)
en_dataset = dataset.filter(
    lambda row: row["num_turns"] == 1 and row["language"] == "English"
)
small_ds = en_dataset.take(LEN_SMALL_DS)

In [8]:
# Wire the evaluation client and the runner from the eval configuration.
client = EvalClient(
    eval_client.base_url,
    eval_client.api_key,
)
runner = EvalRunner(eval_client, client)

In [9]:
# Categories to evaluate; edit this list to add or drop categories.
categories = [
    "ocr",
    "code",
    "is_code",
    "refusal",
]
# k=10: run 10 samples per category.
cat_runner = CategoryEvalRunner(
    runner,
    k=10,
    id_key="conversation_id",
    exclusive=True,
)

### Test one Sample

In [10]:
import random
import itertools

# Pick one record at a random position within the first tenth of the subset.
# randrange() requires an integer bound: `LEN_SMALL_DS / 10` is a float, which
# is deprecated since Python 3.10 and raises TypeError on 3.12+, so use floor
# division instead.
k = random.randrange(LEN_SMALL_DS // 10)
# islice skips the first k streamed records and yields exactly the k-th one.
sample = next(itertools.islice(small_ds, k, k + 1))


In [26]:
# Run the model under test on the single sample and print a short summary.
out = runner.run_on_sample(sample)

print("=== Eval Output ===")
print("conv_id:", out.conversation_id)
print("dataset_model:", out.dataset_model)
# Long text fields are truncated to keep the cell output readable.
print("user_question:", out.user_question[:120], "...")
print("reference_answer:", out.reference_answer[:120], "...")
print("model_answer:", out.model_answer[:200], "...")
print("latency:", out.latency_sec, "sec")

# Render the first image attached to the sample.
show_image(out.images[0].get("bytes"))

### Creating buckets

In [12]:
# Group records from the working subset into per-category buckets.
record_stream = iter(small_ds)
buckets = cat_runner.build_buckets(record_stream, categories)

### Execution

In [28]:
# Run the evaluation over every bucket.
# NOTE(review): `_run_buckets` is underscore-prefixed (private by convention);
# confirm whether CategoryEvalRunner exposes a public entry point to use instead.
bucket_run = cat_runner._run_buckets(buckets)
results_by_cat, all_results = bucket_run

In [29]:
# Summarise how many results were collected per category and overall.
n_collected = sum(map(len, results_by_cat.values()))
n_categories = len(results_by_cat)
print(
    f"\nCollected {n_collected} samples "
    f"across {n_categories} categories; flat array size: {len(all_results)}"
)

In [30]:
# Example: peek at the first item from the flat results array.
if all_results:
    # Index 0 is the first item. The previous `[1]` skipped it and raised
    # IndexError whenever exactly one result was collected, because the guard
    # above only ensures the list is non-empty.
    r0 = all_results[0]
    print("First result:")
    print(" conv_id:", r0.conversation_id)
    # Recover the category by finding which per-category list contains r0.
    print(" category:", next((c for c, rs in results_by_cat.items() if r0 in rs), None))
    print(" user_question:", r0.user_question, "...")
    print(" model_answer:", r0.model_answer, "...")
    print(" latency:", r0.latency_sec)
    show_image(r0.images[0].get('bytes'))

#### Judge Evaluation

In [31]:
# Display the judge configuration as the cell output for a quick sanity check.
judge_client

In [32]:
# Build the judge client and the LLM judge from the judge configuration.
# NOTE(review): the config's `temperature` field is not forwarded here —
# confirm whether LLMJudge accepts it or applies its own default.
judge_eval_client = JudgeClient(
    judge_client.base_url,
    judge_client.api_key,
)
judge = LLMJudge(
    judge_eval_client,
    judge_client.model,
    max_retries=judge_client.max_retries,
    backoff=judge_client.backoff,
    max_tokens=judge_client.max_tokens,
)

In [33]:
# Debugging aid: show the container type holding the per-category eval results.
type(results_by_cat)

In [34]:
from common_utils.judge import CategoryJudgeRunner

In [37]:
# Judge the eval outputs category by category.
judge_runner = CategoryJudgeRunner(
    judge,
    id_key="conversation_id",
    verbose=True,
)
judged_by_cat, judged_all = judge_runner.run(
    results_by_cat=results_by_cat,
    buckets=buckets,  # produced by the bucketing step
    judge_model_name=judge_client.model,
)

In [38]:
# Inspect one judged sample: index SAMPLE, clamped to the available results.
if judged_all:
    SAMPLE = 38
    # The guard above only ensures the list is non-empty; clamp the index so a
    # short run (fewer than SAMPLE + 1 judged items) does not raise IndexError.
    sample_idx = min(SAMPLE, len(judged_all) - 1)
    j0 = judged_all[sample_idx]
    # Label with the actual index shown rather than claiming it is the first.
    print(f"\nJudged sample {sample_idx}:")
    print(" conv_id:", j0.conversation_id)

    print(" user input:", j0.user_question)
    print(" category:", j0.category)
    print(" score:", j0.judge_score)
    print("orignal model:", j0.dataset_model)
    print(" justification:", j0.judge_justification)
    print("***************************************************************************************************")
    print("reference answer:", j0.reference_answer)
    print("***************************************************************************************************")
    print()
    print("model answer:", j0.model_answer)
    # Find the image by conversation id instead of by position: nothing here
    # guarantees judged_all and all_results share the same ordering or length,
    # so positional indexing could show the wrong image or go out of range.
    matching_eval = next(
        (r for r in all_results if r.conversation_id == j0.conversation_id),
        None,
    )
    if matching_eval is not None:
        show_image(matching_eval.images[0].get('bytes'))

#### Saving the Results

In [39]:
from common_utils.results import ResultsWriter

In [40]:
# Writer for persisting results under the "outputs" directory.
# NOTE(review): `prefix` already ends in ".json" — confirm ResultsWriter does
# not append its own extension or stem, which would yield an odd file name.
writer = ResultsWriter(
    output_dir="outputs",
    prefix="specific_qwen3b_eval_gemma27b_judge_new.json",
    timestamp=False,
)

# Alternative (disabled): save every artifact instead of one combined file.
# paths = writer.save_all(
#     all_results=all_results,     # EvalOutput list
#     judged_all=judged_all,       # JudgedOutput list
#     buckets=buckets,             # raw selected records per category
# )
#
# print("Saved files:")
# for sect, files in paths.items():
#     for kind, p in files.items():
#         print(f"  {sect:8} {kind:12} -> {p}")

In [41]:
# Save the per-category eval and judged outputs through save_one_json,
# using "combined" as the file stem; the returned value is kept in one_path.
one_path = writer.save_one_json(eval_by_cat=results_by_cat, judged_by_cat=judged_by_cat, stem="combined")
