In [1]:
import json
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
import random
import time

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub identifier of the checkpoint; shared by the model and the processor so
# both are loaded from the same repository.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# Keyword arguments forwarded to from_pretrained: bfloat16 weights and
# device_map="auto" for placement.
_model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, **_model_load_kwargs)

# The processor builds the chat prompt and the model input tensors.
processor = AutoProcessor.from_pretrained(MODEL_ID)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 17.02it/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [3]:
# Instruction text that `query_model` supplies as the system prompt when it
# builds the chat prompt. The literal starts and ends with a newline, so the
# call site strips it before use. It asks the model to answer with a single
# float and nothing else, which the downstream parsing relies on.
SYSTEM_PROMPT = """
You are a smart assistant specialized in reading numerical values from scientific charts.

Each chart may contain bars, lines, or scattered points. When answering, always follow these steps:

1. First, analyze the X and Y axes — understand the scale, numeric ranges, and tick intervals.
2. Then, locate the chart element corresponding to the X value requested by the user.
3. Estimate the precise Y value by visually aligning the element with the Y-axis.
4. Return the value as a **single float number only**, without any explanation, unit, or extra wording.

Your answer should look like this: `42.7`
Never reply with full sentences or approximations like "around 40".
"""

def query_model(image: Image.Image, x_val: int) -> str:
    """Ask the model for the Y value at ``x_val`` on a chart image.

    The system prompt is sent as an explicit ``system`` role message. Passing
    it as a ``system=`` keyword to ``apply_chat_template`` only exposes it as a
    template variable; the model's chat template does not read such a
    variable, so the instructions never reached the prompt and the template's
    default system text was used instead.

    Args:
        image: Chart image to query.
        x_val: X coordinate to ask about; interpolated into the question text.

    Returns:
        The decoded completion, with the prompt tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f"What is the Y value at X={x_val} in this chart?"
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": SYSTEM_PROMPT.strip()}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ],
        },
    ]

    # Render the conversation to a prompt string; tokenization happens in the
    # processor call below, together with the image features.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=64)

    # generate() returns prompt + completion; keep only the newly generated ids.
    generated_ids_trimmed = outputs[:, inputs.input_ids.shape[-1]:]
    decoded = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0].strip()


In [None]:
# (chart type, folder/file stem) pairs; the stem differs from the chart type
# only for line charts, whose assets live under "line_poly".
_CHART_STEMS = (
    ("bar", "bar"),
    ("line", "line_poly"),
    ("scatter", "scatter"),
)

# Per-chart-type benchmark configuration: metadata JSONL path, image
# directory, and the CSV file the results are written to.
DATASETS = {
    chart_type: {
        "jsonl": f"benchmark_images/{stem}_charts/{stem}_metadata.jsonl",
        "img_dir": f"benchmark_images/{stem}_charts",
        "output_csv": f"qwen2_5_results_{chart_type}.csv",
    }
    for chart_type, stem in _CHART_STEMS
}


In [5]:
# Sampling knobs for the benchmark run. A dedicated, seeded generator makes the
# chosen images and points identical on every Restart & Run All.
SAMPLE_SEED = 42
IMAGE_KEEP_PROB = 0.5   # probability that a metadata entry is evaluated
POINTS_PER_IMAGE = 1    # number of points queried per evaluated image
RESULT_COLUMNS = [
    "chart_type", "image_id", "x", "y_true",
    "y_pred", "abs_error", "raw_output", "inference_time",
]

rng = random.Random(SAMPLE_SEED)

for chart_type, cfg in DATASETS.items():
    print(f"📁 Processing {chart_type} charts...")
    results = []

    with open(cfg["jsonl"], "r", encoding="utf-8") as f:
        # Drop blank lines (e.g. a trailing newline) so json.loads does not fail.
        lines = [line for line in f if line.strip()]

    for line in tqdm(lines, desc=f"🖼️ {chart_type}"):
        if rng.random() > IMAGE_KEEP_PROB:
            continue
        entry = json.loads(line)
        image_id = entry["id"]
        # Normalize backslashes BEFORE taking the basename: on POSIX,
        # os.path.basename does not treat "\" as a separator, so a
        # Windows-style path in the metadata would otherwise keep its
        # directory part.
        image_name = os.path.basename(entry["image"].replace("\\", "/"))
        image_path = os.path.join(cfg["img_dir"], image_name)
        points = rng.sample(entry["points"], min(POINTS_PER_IMAGE, len(entry["points"])))

        try:
            # convert() returns a fully loaded copy, so the file handle can be
            # closed as soon as the with-block exits.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
        except Exception as e:
            # Best effort: report unreadable images and continue with the rest.
            print(f"❌ Cannot open image {image_id}: {e}")
            continue

        for point in points:
            x_val = point["x"]
            y_true = point["y"]
            start_time = time.perf_counter()
            y_pred_raw = query_model(image, x_val)
            elapsed_time = time.perf_counter() - start_time

            try:
                # Take the first whitespace-separated token and accept a
                # decimal comma.
                y_pred = float(y_pred_raw.split()[0].replace(",", "."))
            except (ValueError, IndexError):
                # Empty or non-numeric reply: keep the row, mark it unparsed.
                y_pred = None

            abs_error = abs(y_pred - y_true) if y_pred is not None else None
            results.append({
                "chart_type": chart_type,
                "image_id": image_id,
                "x": x_val,
                "y_true": y_true,
                "y_pred": y_pred,
                "abs_error": abs_error,
                "raw_output": y_pred_raw,
                "inference_time": elapsed_time
            })

    # Explicit columns keep the CSV header even when no entry was evaluated.
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    df.to_csv(cfg["output_csv"], index=False)
    print(f"✅ Saved {chart_type} results to {cfg['output_csv']}")


📁 Processing bar charts...


🖼️ bar: 100%|██████████| 20/20 [17:43<00:00, 53.17s/it]


✅ Saved bar results to qwen2_5_results_bar.csv
📁 Processing line charts...


🖼️ line: 100%|██████████| 50/50 [31:21<00:00, 37.63s/it] 


✅ Saved line results to qwen2_5_results_line.csv
📁 Processing scatter charts...


🖼️ scatter: 100%|██████████| 50/50 [34:00<00:00, 40.82s/it] 

✅ Saved scatter results to qwen2_5_results_scatter.csv



