In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import json
from pathlib import Path
from google import genai
from google.genai import types

# Results JSON file that is embedded into the evaluation prompt below and later
# compared field-by-field against the evaluator's output.
result_path = Path("/scratch/xiaowenz/datasets/surgeryvid_small/results/test/Qwen2.5-VL-3B-Instruct-openbiomedvid_qa_sys_default/sys_default_usr_default/results.json")

In [20]:
# Read the whole results file as text; the bare `len(text)` expression makes the
# notebook display its length in characters as the cell output.
text = result_path.read_text()
len(text)

92579

In [None]:
from pydantic import BaseModel


# Evaluation prompt template. It is filled in with
# `prompt.format(file_content=...)`, so the literal braces of the JSON example
# are doubled (`{{` / `}}`) and `{file_content}` is the only placeholder.
prompt = """You are an expert evaluator tasked with assessing whether a model's answer to a visual question is correct.

You will be given a JSON file. Each object in the file contains four fields:

```json
[
  {{
    "id": <id of question>,
    "question": <A question about visual content>,
    "answer": <the ground truth (correct) answer>,
    "model_answer": <The model's generated answer>
  }}
]
```

Your job is to determine if the `model_answer` is **semantically** correct, even if the wording differs from the ground truth.
**DO NOT MODIFY EXISTING FIELDS**.
Add a 5th field, `is_correct` to the new JSON objects with the value true if the model's answer is correct, and false if it is not.
Make sure that the value of the is_correct field of each object is based solely on data in the object, and not on any other objects in the file.

{file_content}
"""

# Shape of one evaluated prediction: the four input fields described in the
# prompt plus the evaluator's verdict. `list[OnePrediction]` is used as the
# `response_schema` of the evaluator request.
# NOTE(review): documented with `#` comments rather than a class docstring,
# because pydantic copies a model's docstring into the generated JSON schema
# description, which would alter the schema sent with the request.
class OnePrediction(BaseModel):
  id: int  # id of the question
  question: str  # question about the visual content
  answer: str  # ground-truth answer
  model_answer: str  # answer generated by the model under evaluation
  is_correct: bool  # evaluator's verdict on `model_answer`

# Evaluator model name; also used (with '-' replaced by '_') to build the
# filename of the saved evaluation output.
model = "gemini-2.5-pro"

# Fill the prompt template with the raw contents of the results file.
prompt = prompt.format(file_content=result_path.read_text())
client = genai.Client()
# file = client.files.upload(file=result_path, config={'mime_type': 'application/json'})
response = client.models.generate_content(
    model=model,
    contents=[prompt],
    config=types.GenerateContentConfig(
        # Ask for a JSON array whose items follow the OnePrediction schema.
        response_mime_type="application/json",
        response_schema=list[OnePrediction],
        temperature=0,
    ),
)

# `response.text` can be None when the response carries no text. Check it
# BEFORE opening the output file, so a failed request neither truncates an
# existing results file nor surfaces as an opaque TypeError from `f.write`.
eval_text = response.text
if not eval_text:
    raise ValueError(f"{model} returned no text; nothing to save")

eval_result_path = result_path.with_name(f"{model.replace('-', '_')}_results.json")
with open(eval_result_path, "w") as f:
    f.write(eval_text)

print(f"Eval results saved to {eval_result_path}")

# Aggregate the evaluator's per-question verdicts into summary.json, written
# next to the results file.
summary_path = result_path.with_name("summary.json")

with open(eval_result_path, "r") as f:
    responses = json.load(f)

total = 0
correct = 0
incorrect = 0
invalid = 0

for obj in responses:
    total += 1
    is_correct = obj.get("is_correct")
    # Each entry falls into exactly one bucket. A missing or non-boolean
    # verdict is "invalid" and must not also be counted as "incorrect".
    if not isinstance(is_correct, bool):
        invalid += 1
    elif is_correct:
        correct += 1
    else:
        incorrect += 1

# `accuracy` treats invalid entries as wrong; `accuracy_without_invalid`
# ignores them. Both fall back to 0 when there is nothing to divide by.
accuracy = correct / total if total > 0 else 0
valid = total - invalid
accuracy_without_invalid = correct / valid if valid > 0 else 0

summary = {
    "total": total,
    "correct": correct,
    "incorrect": incorrect,
    "invalid": invalid,
    "accuracy": accuracy,
    "accuracy_without_invalid": accuracy_without_invalid,
}

with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to {summary_path}")
print(summary)
# Sanity check: the evaluator was told not to modify existing fields, so every
# evaluated object should carry the same id/question/answer/model_answer as the
# original object at the same position. Mismatching pairs are printed.
with open(eval_result_path, "r") as f:
    eval_results = json.load(f)
with open(result_path, "r") as f:
    results = json.load(f)

# zip() stops at the shorter sequence, so dropped or extra objects would
# otherwise go unnoticed.
if len(results) != len(eval_results):
    print(
        f"Length mismatch: {len(results)} original results vs "
        f"{len(eval_results)} evaluated results"
    )

compared_fields = ("id", "question", "answer", "model_answer")
for og, gemini in zip(results, eval_results):
    # Explicit comparison instead of `assert`, which is stripped under -O.
    if any(og[field] != gemini[field] for field in compared_fields):
        print(og)
        print(gemini)
