This notebook evaluates the factual correctness score for all four comparison results.

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Hugging Face checkpoint used for both the model and its processor
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Loading options gathered in one place so they are easy to tweak:
#   - weights are loaded as bfloat16
#   - "sdpa" selects PyTorch's native optimized attention
#   - device placement is left to device_map="auto"
load_options = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "sdpa",
    "device_map": "auto",
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **load_options)

# Processor that matches the checkpoint above
processor = AutoProcessor.from_pretrained(model_id)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 58254.22it/s]
Loading weights: 100%|██████████| 729/729 [00:02<00:00, 327.41it/s, Materializing param=model.visual.patch_embed.proj.weight]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [None]:
from deepeval.models.base_model import DeepEvalBaseLLM
import torch

class QwenJudge(DeepEvalBaseLLM):
    """DeepEval judge wrapper around an already-loaded model/processor pair.

    Args:
        model: Loaded generation model; ``generate`` and ``device`` are used.
        processor: Matching processor; it is called on text and must provide
            ``apply_chat_template`` and ``batch_decode``.
        max_new_tokens: Upper bound on the number of tokens generated per
            reply. Defaults to 512, the value that used to be hard-coded;
            raise it if judge replies get cut off.
    """

    def __init__(self, model, processor, max_new_tokens: int = 512):
        self.model = model
        self.processor = processor
        self.max_new_tokens = max_new_tokens
        # DeepEval looks up a `name` attribute on the judge model
        self.name = "Qwen2.5-VL-7B-Judge"

    def load_model(self):
        """Return the model object handed to the constructor."""
        return self.model

    # This method is used by DeepEval to check multimodal capability
    def supports_multimodal(self) -> bool:
        """Report multimodal support to DeepEval (always ``True``)."""
        return True

    def generate(self, prompt: str) -> str:
        """Generate the judge's reply to ``prompt`` and return it as text.

        Args:
            prompt: Prompt text, sent as a single user message.

        Returns:
            The decoded completion, without the echoed prompt and without
            special tokens.
        """
        # NOTE(review): only a text part is placed in the message, so images
        # are never loaded here even though supports_multimodal() returns True.
        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
        chat_text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(
            text=[chat_text], padding=True, return_tensors="pt"
        ).to(self.model.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs, max_new_tokens=self.max_new_tokens
            )

        # Each output sequence starts with its input ids; slice them off so
        # that only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        decoded = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True
        )
        return decoded[0]

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the blocking ``generate``."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name stored in ``self.name``."""
        return self.name

# Build the judge from the model and processor loaded earlier in the notebook
local_judge = QwenJudge(model=model, processor=processor)

In [None]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Test-case fields handed to the judge when it scores an answer
judge_params = [
    LLMTestCaseParams.INPUT,
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.EXPECTED_OUTPUT,
]

# G-Eval metric that uses the local judge as its evaluation model
medical_judge_metric = GEval(
    name="Medical Accuracy Judge",
    criteria="Evaluate if the diagnosis is correct based on the text and images provided.",
    evaluation_params=judge_params,
    model=local_judge,
)
# NOTE(review): the installed DeepEval version may need extra configuration
# for multimodal handling - confirm.

In [None]:
import ast
import json
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# 1. Load the predictions file. JSON text is UTF-8, so the encoding is pinned
#    instead of relying on the platform's locale default.
with open('RAGQwenVL7B_predictions_output.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

test_cases = []
for item in data.values():
    image_list_raw = item["question"]["ImageList"]

    # Standardize the image list: it is either already a list or a string
    # holding a Python literal (parsed safely with ast.literal_eval).
    # An empty string or a missing value means "no images" rather than a crash.
    if isinstance(image_list_raw, str):
        image_urls = ast.literal_eval(image_list_raw) if image_list_raw.strip() else []
    else:
        image_urls = image_list_raw or []

    # Create MLLMImage objects
    images = [MLLMImage(url=url) for url in image_urls]

    # 2. Use the standard LLMTestCase: the question text followed by one
    #    "Image N: ..." line per image, each image formatted via its string form.
    image_lines = [f"\nImage {i}: {img}" for i, img in enumerate(images, start=1)]
    input_content = item["question"]["Text"] + "".join(image_lines)

    test_case = LLMTestCase(
        input=input_content,
        actual_output=item.get("answer", ""),  # a missing answer is scored as empty
        expected_output=item["ground_truth"]
    )
    test_cases.append(test_case)

print(f"Prepared {len(test_cases)} test cases.")

Prepared 16 test cases.


In [None]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Console reporting is switched on through a DisplayConfig object, which is
# passed to evaluate() in place of a direct argument.
display_config = DisplayConfig(print_results=True)

# Score every prepared test case with the medical judge metric
results = evaluate(
    test_cases=test_cases,
    metrics=[medical_judge_metric],
    display_config=display_config,
)



Metrics Summary

  - ❌ Medical Accuracy Judge [GEval] (score: 0.0, threshold: 0.5, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The response does not align with the provided evaluation steps. It incorrectly diagnoses malaria based on symptoms and treatment history without considering the explicit visual details from the images. The response also infers that the patient's symptoms are consistent with malaria, which is not supported by the images. The response does not address the uncertainty of the rapid diagnostic test results for EBOV, which is a critical aspect of the expected output., error: None)

For multimodal test case:

  - input: The first patient was a 33-year-old physician who had been working in Liberia since October 2013, during which time he had remained healthy while taking daily combination therapy with atovaquone and proguanil as prophylaxis against malaria. In April 2014, he and his team established an EVD care unit in Monrovia, and patients with 

In [19]:
import json
import os
from pathlib import Path

# 1. Make sure the output directory exists (parents are created as needed and
#    an already-existing directory is not an error).
# NOTE(review): the absolute path is environment-specific - consider making it
# configurable.
output_dir = Path("/workspace/evaluation_results")
output_dir.mkdir(parents=True, exist_ok=True)

# 2. Locate the per-test-case results.
#    - A tuple is unpacked to its first element (kept from the earlier fix).
#    - An object exposing `test_results` (DeepEval's EvaluationResult) is
#      unwrapped to that list; iterating the object itself would yield
#      (field, value) pairs and every report entry would fall back to 'N/A'.
#    - Anything else is assumed to already be an iterable of results.
candidate = results[0] if isinstance(results, tuple) else results
actual_results = getattr(candidate, "test_results", candidate) or []
full_report = []

for result in actual_results:
    # DeepEval's TestResult stores per-metric data under `metrics_data`;
    # `metrics` is still consulted as a fallback for other result shapes.
    metric_records = getattr(result, "metrics_data", None)
    if metric_records is None:
        metric_records = getattr(result, "metrics", None)

    full_report.append({
        "input": getattr(result, 'input', 'N/A'),
        "actual_output": getattr(result, 'actual_output', 'N/A'),
        "expected_output": getattr(result, 'expected_output', 'N/A'),
        "metrics": [
            {
                "metric_name": getattr(metric, 'name', 'Medical Judge'),
                "score": getattr(metric, 'score', 0),
                "reason": getattr(metric, 'reason', 'No reasoning found'),
            }
            for metric in (metric_records or [])
        ],
    })

# 3. Save using the Path object. `default=str` keeps the dump from aborting if
#    a field holds a value json cannot encode natively (it is stored as text).
output_file = output_dir / "medical_qwen_full_report.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(full_report, f, indent=4, default=str)

print(f"✅ Success! Data is safe at: {output_file}")

✅ Success! Data is safe at: /workspace/evaluation_results/medical_qwen_full_report.json
