In [1]:
import json
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
import torch
import random
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === CONFIG ===
MODEL_ID = "google/gemma-3-12b-it"
DATASETS = {
    "bar": {
        "jsonl": "benchmark_images/bar_charts/bar_metadata.jsonl",
        "img_dir": "benchmark_images/bar_charts",
        "output_csv": "gemma3_12b_results_bar.csv"
    },
    "line": {
        "jsonl": "benchmark_images/line_poly_charts/line_poly_metadata.jsonl",
        "img_dir": "benchmark_images/line_poly_charts",
        "output_csv": "gemma3_12b_results_line.csv"
    },
    "scatter": {
        "jsonl": "benchmark_images/scatter_charts/scatter_metadata.jsonl",
        "img_dir": "benchmark_images/scatter_charts",
        "output_csv": "gemma3_12b_results_scatter.csv"
    }
}

In [3]:
# === LOAD MODEL ===
print("🔧 Loading model and processor...")
model = Gemma3ForConditionalGeneration.from_pretrained(MODEL_ID, device_map="auto").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID)

🔧 Loading model and processor...


Loading checkpoint shards: 100%|██████████| 5/5 [00:39<00:00,  7.99s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
system_prompt = """
You are an intelligent assistant that helps extract precise numeric values from scientific charts. 
Each chart may include bar plots, line graphs, or scatter points. To answer accurately:

1. First, analyze the chart axes — determine the visible x and y ranges, including tick values and scales (linear/logarithmic).
2. Then, identify the specific visual element referenced in the user's question — such as a bar, point, or curve corresponding to a particular x-value.
3. Carefully estimate the corresponding y-value by using the position of the element relative to the axis scale.
4. Always return a single numeric value (float) as your answer, without explanation, unit, or additional text.
"""

# === PREDICTION FUNCTION ===
def query_model(image: Image.Image, x_val: int) -> str:
    prompt = f"What is the Y value at X={x_val} in this chart?"
    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]}
    ]
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt"
    ).to(model.device, dtype=torch.bfloat16)

    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    decoded = processor.decode(output[0][input_len:], skip_special_tokens=True)
    return decoded.strip()

In [5]:
# === MAIN LOOP FOR EACH TYPE ===
for chart_type, cfg in DATASETS.items():
    print(f"📁 Processing {chart_type} charts...")
    results = []

    with open(cfg["jsonl"], "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in tqdm(lines, desc=f"🖼️ {chart_type}"):
        if random.random() > 0.5:
            continue
        entry = json.loads(line)
        print(f"🔍 Processing image {entry['id']}...")
        image_id = entry["id"]
        image_path = os.path.join(cfg["img_dir"], os.path.basename(entry["image"]).replace("\\", "/"))
        points = random.sample(entry["points"], min(1, len(entry["points"])))

        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"❌ Cannot open image {image_id}: {e}")
            continue

        for point in points:
            x_val = point["x"]
            y_true = point["y"]
            start_time = time.perf_counter()
            y_pred_raw = query_model(image, x_val)
            elapsed_time = time.perf_counter() - start_time

            # Try to parse float
            try:
                y_pred = float(y_pred_raw.split()[0].replace(",", "."))
            except Exception:
                y_pred = None

            abs_error = abs(y_pred - y_true) if y_pred is not None else None
            results.append({
                "chart_type": chart_type,
                "image_id": image_id,
                "x": x_val,
                "y_true": y_true,
                "y_pred": y_pred,
                "abs_error": abs_error,
                "raw_output": y_pred_raw,
                "inference_time": elapsed_time
            })

    df = pd.DataFrame(results)
    df.to_csv(cfg["output_csv"], index=False)
    print(f"✅ Saved {chart_type} results to {cfg['output_csv']}")

📁 Processing bar charts...


🖼️ bar:   0%|          | 0/20 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 4b7da27a-c42a-4ee8-96d7-1c7ed52024c8...


🖼️ bar:   5%|▌         | 1/20 [01:31<28:52, 91.18s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image d98bf745-daf2-4aba-81ea-4139bd84e02d...


🖼️ bar:  20%|██        | 4/20 [02:34<09:09, 34.34s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image ab66dde9-8626-41ae-b65c-5815f9785fd7...


🖼️ bar:  25%|██▌       | 5/20 [03:33<10:19, 41.27s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 286f8a36-3844-41a2-823e-5f1000b82595...


🖼️ bar:  50%|█████     | 10/20 [04:35<03:40, 22.01s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 0b2ce439-94e6-4c4f-b6b8-c1dbc4e00283...


🖼️ bar:  70%|███████   | 14/20 [05:39<01:56, 19.40s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 4f87a721-f6e5-4712-8691-6562227eac34...


🖼️ bar:  80%|████████  | 16/20 [06:40<01:28, 22.09s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 3a9ef51d-293d-478b-8866-3ada0b7b7ec1...


🖼️ bar: 100%|██████████| 20/20 [07:43<00:00, 23.20s/it]


✅ Saved bar results to gemma3_12b_results_bar.csv
📁 Processing line charts...


🖼️ line:   0%|          | 0/50 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 21b2eff1-d2a4-4163-ac29-839327bcde89...


🖼️ line:   2%|▏         | 1/50 [01:10<57:38, 70.57s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image ad899faa-e298-4b2d-beda-39797af0694d...


🖼️ line:   4%|▍         | 2/50 [02:16<54:04, 67.59s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image d8f7b67b-00ff-4e1e-93f4-e4d7fcba37a8...


🖼️ line:   6%|▌         | 3/50 [03:19<51:32, 65.79s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image a08d3a7d-0381-4351-9944-9735a5eedacd...


🖼️ line:   8%|▊         | 4/50 [04:23<49:42, 64.85s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 1a4643b7-4b9e-4839-8ce6-b904bf8f1f01...


🖼️ line:  18%|█▊        | 9/50 [05:29<18:20, 26.83s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image a155969b-5725-4460-b71c-1376fe53f5c8...


🖼️ line:  20%|██        | 10/50 [06:35<22:26, 33.65s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 8433fe6c-d4b3-4e2e-8f84-7baff4086504...


🖼️ line:  22%|██▏       | 11/50 [07:46<26:41, 41.07s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 0ab6874d-32ff-4060-a4a8-152b2c18d56f...


🖼️ line:  24%|██▍       | 12/50 [09:21<33:35, 53.04s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 47ae2615-b472-4376-b49d-8ead62bd2f03...


🖼️ line:  26%|██▌       | 13/50 [10:47<37:29, 60.80s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 62a79057-35d1-4adb-83c8-fddd0c83195c...


🖼️ line:  36%|███▌      | 18/50 [12:13<17:43, 33.24s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 3894973b-6b46-4862-8fb3-861065402af5...


🖼️ line:  42%|████▏     | 21/50 [13:38<15:16, 31.60s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 26e229f1-7146-42c7-bdf8-6e5a3ebf0696...


🖼️ line:  48%|████▊     | 24/50 [15:14<13:42, 31.64s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image aa99daea-ff21-4c66-87e1-30099bd2b4cd...


🖼️ line:  50%|█████     | 25/50 [16:44<16:31, 39.68s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 9d813866-f5ea-4dff-9107-13698c61f4f1...


🖼️ line:  60%|██████    | 30/50 [18:11<09:33, 28.68s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 5978f57e-bf22-4bef-a163-33bfea724340...


🖼️ line:  62%|██████▏   | 31/50 [20:44<13:57, 44.10s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 2f0f2537-7617-4a22-95fa-9b80c5f5c531...


🖼️ line:  70%|███████   | 35/50 [23:00<09:57, 39.86s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 5bfbca18-19e2-4f98-86a8-a8facb7fa03b...


🖼️ line:  76%|███████▌  | 38/50 [25:12<08:14, 41.21s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image eee94515-a207-48b4-9760-e265bbe240fd...


🖼️ line:  84%|████████▍ | 42/50 [27:50<05:24, 40.58s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 5d153492-de31-41a1-97d4-1dd8686ca562...


🖼️ line:  90%|█████████ | 45/50 [30:16<03:34, 42.86s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image e8323faa-7d04-4239-b853-aea77fc6cb5d...


🖼️ line:  94%|█████████▍| 47/50 [32:48<02:29, 49.91s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 26761cad-57b4-4759-92c5-d6bad407725e...


🖼️ line:  98%|█████████▊| 49/50 [35:22<00:56, 56.22s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 57583e33-e9b4-4971-bb51-5a6177dc1f30...


🖼️ line: 100%|██████████| 50/50 [38:03<00:00, 45.67s/it]


✅ Saved line results to gemma3_12b_results_line.csv
📁 Processing scatter charts...


🖼️ scatter:   0%|          | 0/50 [00:00<?, ?it/s]

🔍 Processing image 6f5a28bd-6368-45de-a197-c4da654185c4...


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🖼️ scatter:   6%|▌         | 3/50 [02:20<36:34, 46.70s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image faeaf246-1551-404a-8dbc-370aade99e1a...


🖼️ scatter:   8%|▊         | 4/50 [04:33<57:13, 74.64s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 9ac40a7b-84bd-42a2-851d-1bfc1b256ce2...


🖼️ scatter:  16%|█▌        | 8/50 [07:15<36:48, 52.58s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 062e16ea-d9ea-4b57-9ec6-80137a5fc78e...


🖼️ scatter:  22%|██▏       | 11/50 [09:47<33:36, 51.70s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 0b656397-1b14-4a86-961d-b69464d0a5ca...


🖼️ scatter:  26%|██▌       | 13/50 [12:03<34:43, 56.31s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 89c26174-fb33-4030-98ac-14a463777930...


🖼️ scatter:  28%|██▊       | 14/50 [14:31<42:56, 71.56s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 06fc2b0e-5923-405b-be4b-11bece5e1f1d...


🖼️ scatter:  32%|███▏      | 16/50 [16:35<38:48, 68.48s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 8e26dc90-137f-472f-bfcb-b9b40eb005ab...


🖼️ scatter:  34%|███▍      | 17/50 [19:01<45:42, 83.09s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image e17b56fe-c739-4af5-bb50-c4fad0a9d7f5...


🖼️ scatter:  38%|███▊      | 19/50 [21:37<41:59, 81.26s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 3b68516a-0083-46f9-8b2e-8e0a85f4cf31...


🖼️ scatter:  40%|████      | 20/50 [23:40<44:47, 89.58s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image b43ce73b-56a4-44db-a8aa-dc33824bc8f1...


🖼️ scatter:  42%|████▏     | 21/50 [27:03<55:32, 114.91s/it]

🔍 Processing image 873d974a-a5b1-43b0-9d7e-25d18c19e876...


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🖼️ scatter:  44%|████▍     | 22/50 [29:58<1:00:18, 129.23s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 63e1ffb6-1958-43a3-b543-957809e5ad3c...


🖼️ scatter:  50%|█████     | 25/50 [33:26<41:11, 98.87s/it]   

🔍 Processing image dfcff6a7-a72d-4bdf-9936-a452b3d694fe...


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
🖼️ scatter:  54%|█████▍    | 27/50 [35:34<33:33, 87.55s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image c1e29cae-8aab-492c-89be-bf0ec7d1a5f4...


🖼️ scatter:  56%|█████▌    | 28/50 [37:29<33:59, 92.72s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 0f805cc4-4e0c-47e8-839c-94df2e090309...


🖼️ scatter:  68%|██████▊   | 34/50 [40:00<13:34, 50.90s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 3e162208-21e8-4601-809e-dd0a68e260af...


🖼️ scatter:  70%|███████   | 35/50 [43:08<17:07, 68.49s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 47e01065-ae98-4e53-a379-3c85c9087759...


🖼️ scatter:  72%|███████▏  | 36/50 [46:15<20:16, 86.92s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image dc775ef1-344c-4680-ae62-99d985365e2d...


🖼️ scatter:  74%|███████▍  | 37/50 [49:22<22:45, 105.05s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 88d6cf96-b231-4362-8dac-7adabcb12496...


🖼️ scatter:  76%|███████▌  | 38/50 [51:24<21:41, 108.43s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 7dc5b844-528e-49d3-81b2-ec2e6b82617c...


🖼️ scatter:  78%|███████▊  | 39/50 [52:55<19:09, 104.52s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image dc77160e-04e3-4039-b234-35d35d71f898...


🖼️ scatter:  84%|████████▍ | 42/50 [55:41<10:42, 80.33s/it] The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 22caec9a-a323-46a2-bd66-84e41115eac8...


🖼️ scatter:  86%|████████▌ | 43/50 [57:45<10:20, 88.59s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 5dd3bf98-64c0-4209-894a-ad5dfa9b3674...


🖼️ scatter:  88%|████████▊ | 44/50 [1:00:24<10:21, 103.53s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image de4ed635-5bdb-47cf-8fe7-6ad0415fc6d0...


🖼️ scatter:  90%|█████████ | 45/50 [1:01:47<08:14, 98.83s/it] The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 6065e29b-5af0-40d5-aa41-5a2fea8a0199...


🖼️ scatter:  96%|█████████▌| 48/50 [1:03:40<02:16, 68.16s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing image 4cda0eb3-fb28-4444-b9a0-cc71f53efb1d...


🖼️ scatter: 100%|██████████| 50/50 [1:05:53<00:00, 79.06s/it]

✅ Saved scatter results to gemma3_12b_results_scatter.csv



