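This notebook evaluates RAGAS-style metrics (faithfulness, answer relevancy, contextual precision and contextual recall) for a RAG pipeline, using DeepEval with a locally hosted Qwen2.5-VL-7B model as the judge.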

In [1]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint shared by the model weights and the processor
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Options forwarded to from_pretrained when loading the weights
load_options = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "sdpa",  # PyTorch's native optimized attention
    "device_map": "auto",
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **load_options)

# Processor that builds chat prompts and model inputs for the same checkpoint
processor = AutoProcessor.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 42886.54it/s]
Loading weights: 100%|██████████| 729/729 [00:05<00:00, 135.48it/s, Materializing param=model.visual.patch_embed.proj.weight]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


In [2]:
from deepeval.models.base_model import DeepEvalBaseLLM
import torch
import re

class QwenJudge(DeepEvalBaseLLM):
    """DeepEval judge backed by an already-loaded Qwen2.5-VL model.

    Wraps a model/processor pair behind the ``DeepEvalBaseLLM`` interface.
    Every prompt is sent to the model as plain text and the reply is reduced
    to a JSON object string before it is handed back to DeepEval.
    """

    def __init__(self, model, processor, max_new_tokens: int = 15000):
        """Store the model, its processor and the generation budget.

        Args:
            model: Loaded generation model; ``generate`` and ``device`` are used.
            processor: Matching processor; ``apply_chat_template``, ``__call__``
                and ``batch_decode`` are used.
            max_new_tokens: Upper bound on tokens generated per judge call.
                Defaults to the previously hard-coded value of 15000.
        """
        self.model = model
        self.processor = processor
        self.max_new_tokens = max_new_tokens
        self.name = "Qwen2.5-VL-7B-Judge"

    def load_model(self):
        """Return the model object that was passed to the constructor."""
        return self.model

    # Reports multimodal support to DeepEval.
    # NOTE(review): generate() below forwards only text to the model, so image
    # references inside a prompt are presumably seen as plain text. The old
    # comment here said "text-only" while returning True; confirm which is
    # intended before changing the return value.
    def supports_multimodal(self) -> bool:
        return True

    @staticmethod
    def _force_json(text: str) -> str:
        """Reduce a raw model reply to a JSON object string.

        The object starting at the first ``{`` is parsed strictly; when that
        succeeds, only that object is returned and any trailing text (even
        text containing braces) is dropped. When strict parsing fails, the
        span from the first ``{`` to the last ``}`` is returned unchanged so a
        more lenient downstream parser can still attempt it.

        Args:
            text: Decoded model output.

        Returns:
            The extracted JSON text.

        Raises:
            ValueError: If ``text`` contains no ``{...}`` span at all.
        """
        import json  # local import: the cell defining this class does not import json

        start = text.find("{")
        if start != -1:
            try:
                _, end = json.JSONDecoder().raw_decode(text, start)
            except json.JSONDecodeError:
                # Not strictly valid from the first brace; fall through to the
                # permissive brace-to-brace extraction below.
                pass
            else:
                return text[start:end]

        match = re.search(r"\{[\s\S]*\}", text)
        if not match:
            raise ValueError(f"Judge did not output JSON:\n{text}")
        return match.group(0)

    def generate(self, prompt: str) -> str:
        """Run the judge on ``prompt`` and return the JSON part of its reply.

        Args:
            prompt: Text prompt supplied by the DeepEval metric.

        Returns:
            JSON text extracted from the model output by ``_force_json``.

        Raises:
            ValueError: If the model output contains no ``{...}`` span.
        """
        system_prompt = (
            "You are a strict JSON generator.\n"
            "Return ONLY valid JSON.\n"
            "No explanations.\n"
            "No markdown.\n"
            "No trailing text."
        )

        messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": prompt}]}
        ]

        # Render the conversation to a single prompt string ending with the
        # assistant generation prefix.
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = self.processor(
            text=[text],
            padding=True,
            return_tensors="pt"
        ).to(self.model.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,   # required: greedy decoding keeps verdicts deterministic
                temperature=None,  # unset: a sampling-only flag; a value here only triggers
                                   # the "generation flags are not valid" warning
                top_p=1.0,
            )

        # Drop the prompt tokens so that only the newly generated reply is decoded.
        output_ids = generated_ids[:, inputs.input_ids.shape[1]:]
        raw_output = self.processor.batch_decode(
            output_ids,
            skip_special_tokens=True
        )[0].strip()

        return self._force_json(raw_output)

    def get_model_name(self):
        """Return the display name of this judge."""
        return self.name

    async def a_generate(self, prompt: str) -> str:
        """Async entry point that simply delegates to the synchronous ``generate``.

        Execution stays synchronous even when DeepEval calls the async API;
        the call blocks until generation has finished.
        """
        return self.generate(prompt)

# Instantiate the judge around the model and processor loaded in the first cell
local_judge = QwenJudge(model, processor)


In [3]:
import ast
import json
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# 1. Load the predictions file. JSON is read as UTF-8 explicitly so the result
#    does not depend on the platform's default encoding.
with open('combine_prediction.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

test_cases = []
for item in data.values():
    image_list_raw = item["question"]["ImageList"]

    # Standardize the image list: it may be a real list or the string repr of
    # one. An empty string or None is treated as "no images".
    if isinstance(image_list_raw, str):
        image_urls = ast.literal_eval(image_list_raw) if image_list_raw.strip() else []
    else:
        image_urls = image_list_raw or []

    # Create MLLMImage objects
    images = [MLLMImage(url=url) for url in image_urls]

    # 2. Use the standard LLMTestCase and append one line per image to the
    #    question text. Each image is inserted through its string form
    #    (presumably DeepEval's image placeholder; confirm against MLLMImage).
    input_content = item["question"]["Text"]
    for i, img in enumerate(images):
        input_content += f"\nImage {i+1}: {img}"

    # retrieval_context must be a flat list: wrap a single context, but pass an
    # existing list through instead of nesting it.
    # NOTE(review): "contexts" looks like one string per item in the current
    # data; verify against the file schema.
    contexts = item["contexts"]
    if isinstance(contexts, (list, tuple)):
        retrieval_context = list(contexts)
    else:
        retrieval_context = [contexts]

    test_case = LLMTestCase(
        input=input_content,
        actual_output=item.get("answer", ""),  # a missing answer is scored as empty output
        expected_output=item["ground_truth"],
        retrieval_context=retrieval_context
    )
    test_cases.append(test_case)

print(f"Prepared {len(test_cases)} test cases.")

Prepared 16 test cases.


In [4]:
# How many test cases go into each chunk
chunk_size = 2

# Walk test_cases in steps of chunk_size and collect one slice per step;
# the last slice is shorter when the total is not a multiple of chunk_size.
sub_test_cases = []
for start in range(0, len(test_cases), chunk_size):
    sub_test_cases.append(test_cases[start : start + chunk_size])

# Report how many chunks were built and the size of each one
print(f"Created {len(sub_test_cases)} sub-test cases.")
for idx, sub_list in enumerate(sub_test_cases):
    print(f"Sub-case {idx + 1} contains {len(sub_list)} samples.")

Created 8 sub-test cases.
Sub-case 1 contains 2 samples.
Sub-case 2 contains 2 samples.
Sub-case 3 contains 2 samples.
Sub-case 4 contains 2 samples.
Sub-case 5 contains 2 samples.
Sub-case 6 contains 2 samples.
Sub-case 7 contains 2 samples.
Sub-case 8 contains 2 samples.


In [5]:
from deepeval.metrics import (
    FaithfulnessMetric,
    AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric
)

# Settings common to all four metrics: pass threshold, the local Qwen judge,
# and synchronous execution.
shared_metric_kwargs = {
    "threshold": 0.3,
    "model": local_judge,
    "async_mode": False,
}

metrics = [
    # Faithfulness additionally receives truths_extraction_limit=50.
    FaithfulnessMetric(truths_extraction_limit=50, **shared_metric_kwargs),
    AnswerRelevancyMetric(**shared_metric_kwargs),
    ContextualPrecisionMetric(**shared_metric_kwargs),
    ContextualRecallMetric(**shared_metric_kwargs),
]


In [6]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 0 (the first chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[0],
)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.




Metrics Summary

  - ❌ Faithfulness (score: 0.25, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.25 because the actual output lacks specific details about the final diagnosis, presence of symptoms such as fever and fatigue, and results of diagnostic tests, making it impossible to confirm or contradict the claims made., error: None)
  - ❌ Answer Relevancy (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the image content does not provide any information about the diagnosis or symptoms of the patient., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the retrieval context ranks irrelevant nodes higher than the relevant ones. Specifically, the first node discusses various viral hemorrhagic fevers without mentioning Ebola virus disease or the patient's case detail

In [7]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 1 (the second chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[1],
)




Metrics Summary

  - ✅ Faithfulness (score: 1.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 1.00 because there are no contradictions found between the actual output and the retrieval context., error: None)
  - ✅ Answer Relevancy (score: 0.5714285714285714, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.57 because the provided JSON does not contain any information relevant to the patient's condition or the input. It lacks any details about the patient's symptoms, medical history, or diagnostic findings., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the first node, ranked 1st, is deemed irrelevant ('no') as it does not mention any information related to the patient's presentation or diagnosis, such as fever, vomiting, abdominal pain, or altered consciousness, which are key feat

In [7]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 2 (the third chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[2],
)




Metrics Summary

  - ❌ Faithfulness (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the actual output contradicts the retrieval context by mentioning Plasmodium falciparum instead of Plasmodium knowlesi, and it omits severe malaria manifestations, laboratory results for acute kidney injury, severe anaemia, acute liver failure, elevated creatinine levels, low haemoglobin levels, elevated AST, ALT, ALP, and total bilirubin levels, as well as the patient's history of mining in an endemic area or exposure to contaminated water, and it also fails to mention the progression of symptoms despite initial treatment., error: None)
  - ❌ Answer Relevancy (score: 0.2, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.20 because the provided JSON does not contain any information related to the patient's condition or the symptoms described in the input. The images referenced in the JS

In [8]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 3 (the fourth chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[3],
)




Metrics Summary

  - ❌ Faithfulness (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the actual output contradicts the retrieval context by mentioning a live adult nematode worm and its size or location, which are not present in the retrieval context., error: None)
  - ✅ Answer Relevancy (score: 0.5714285714285714, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.57 because the provided JSON does not contain any irrelevant statements. It appears to be a detailed medical report with relevant information about a patient's visit to the Nigerian Army Eye Centre, including symptoms, examination findings, and treatment. The JSON is concise and directly addresses the input without introducing any extraneous or irrelevant content., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.

In [9]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 4 (the fifth chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[4],
)




Metrics Summary

  - ❌ Faithfulness (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the actual output contains multiple contradictions regarding the patient's age, gender, and specific medical conditions compared to the retrieval context., error: None)
  - ✅ Answer Relevancy (score: 0.6666666666666666, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.67 because the provided JSON does not contain any relevant information to address the given input. The input discusses a patient's medical history and clinical findings, while the JSON appears to be unrelated to this context., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the retrieval context does not mention any information related to trachoma, dry eye disease, or corneal ulcers, which are relevant to the pati

In [10]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 5 (the sixth chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[5],
)




Metrics Summary

  - ❌ Faithfulness (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the actual output contradicts multiple aspects of the retrieval context, including the absence of a 53-year-old woman, specific histopathological findings, positive PCR results, and the need for a travel history to endemic areas., error: None)
  - ❌ Answer Relevancy (score: 0.14285714285714285, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.14 because the provided JSON does not contain any relevant information to address the input query. It appears to be a restatement of the diagnosis without providing new information about the patient's condition or treatment., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the retrieval context does not mention any information related to s

In [11]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 6 (the seventh chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[6],
)




Metrics Summary

  - ✅ Faithfulness (score: 0.6, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.60 because the actual output contradicts the retrieval context by specifying a 20-year-old woman from the Democratic Republic of the Congo, while the retrieval context describes a 46-year-old Thai woman, leading to discrepancies in both age and location., error: None)
  - ❌ Answer Relevancy (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the input does not contain any irrelevant statements. All information provided is relevant to the patient's medical history and current condition., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the retrieval context does not mention any information about a 56-year-old Guatemalan male landscaper presenting with symptoms of a chro

In [15]:
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig

# Score chunk 7 (the eighth chunk) with every configured metric and print the results.
display_config = DisplayConfig(print_results=True)

results = evaluate(
    display_config=display_config,
    metrics=metrics,
    test_cases=sub_test_cases[7],
)




Metrics Summary

  - ❌ Faithfulness (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the actual output includes references to Chronic Hepatitis C and HCV RNA, which are not mentioned in the retrieval context., error: None)
  - ✅ Answer Relevancy (score: 0.6, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.60 because the statement does not provide new information about the patient's condition and suggests a possible false-positive result, which is not directly relevant to the diagnosis., error: None)
  - ❌ Contextual Precision (score: 0.0, threshold: 0.3, strict: False, evaluation model: Qwen2.5-VL-7B-Judge, reason: The score is 0.00 because the retrieval context does not mention any information related to the patient's condition, symptoms, or laboratory findings described in the expected output. It focuses on different cases with various symptoms and diagnoses unrela