In [1]:
import json
import ipywidgets as widgets
from IPython.display import display, Image
import os
# from Qwen2VL_uigraph.model.processing_qwen2_vl import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import ast  # To safely evaluate JSON-like strings
from PIL import Image, ImageDraw, ImageFont
import io

In [2]:
def draw_click_point(
    img_path: str,
    naive_x: float, naive_y: float,
    uigraph_x: float, uigraph_y: float,
    bbox: list,
    instruction: str,
    gap_between_text_and_image: int = 20
) -> bytes:
    """
    Render one annotated comparison image and return it as PNG bytes.

    The output is a white banner holding the instruction text, a gap, and
    below that the original screenshot annotated with:
      - Green rectangle for ground-truth bbox
      - Blue dot for Naive prediction (naive_x, naive_y)
      - Red dot for UI-Graph prediction (uigraph_x, uigraph_y)

    Args:
        img_path: Path of the screenshot to annotate.
        naive_x, naive_y: Naive prediction, relative to image width/height
            (they are multiplied by the image size before drawing).
        uigraph_x, uigraph_y: UI-Graph prediction, same relative convention.
        bbox: Ground-truth box as [x0, y0, x1, y1] in the same relative
            coordinates. Corners may be given in either order.
        instruction: Text shown in the banner above the image.
        gap_between_text_and_image: Pixels of white space between the bottom
            of the instruction text and the top of the screenshot.

    Returns:
        The annotated image encoded as PNG.

    Raises:
        OSError: If img_path cannot be opened as an image.
    """
    # 1) Load original image. convert() returns an in-memory copy, so the
    #    file handle can be released as soon as the with-block exits.
    with Image.open(img_path) as src:
        image = src.convert("RGB")
    width, height = image.size

    # 2) Load a font that supports Chinese (fallback to default)
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/noto/NotoSansSC-Regular.otf", size=30
        )
    except OSError:
        font = ImageFont.load_default(size=30)

    # 3) Measure the instruction's ink box. textbbox() reports it relative to
    #    the drawing anchor, so left/top are generally not zero.
    left, top, right, bottom = ImageDraw.Draw(image).textbbox(
        (0, 0), instruction, font=font
    )
    text_w = right - left
    text_h = bottom - top

    # 4) Compute padding for banner and gap
    top_margin = 10
    bottom_margin = gap_between_text_and_image
    padding_top = text_h + top_margin + bottom_margin

    # 5) Create new canvas and paste original image below the banner
    new_img = Image.new("RGB", (width, height + padding_top), "white")
    new_img.paste(image, (0, padding_top))

    draw = ImageDraw.Draw(new_img)

    # 6) Draw instruction centered. Subtracting left/top places the ink box
    #    (not the anchor) at the requested position, so the text really ends
    #    bottom_margin pixels above the image. Clamping at 0 keeps the start
    #    of an over-long instruction visible instead of clipping both ends.
    x_text = max((width - text_w) / 2, 0) - left
    y_text = top_margin - top
    draw.text((x_text, y_text), instruction, fill="black", font=font)

    # 7) Helper to convert relative coords to absolute canvas pixels
    def to_abs(rx, ry):
        return rx * width, padding_top + ry * height

    nx, ny = to_abs(naive_x, naive_y)
    ux, uy = to_abs(uigraph_x, uigraph_y)
    bx0, by0 = to_abs(bbox[0], bbox[1])
    bx1, by1 = to_abs(bbox[2], bbox[3])

    # Normalise corner order: recent Pillow versions raise ValueError when
    # x1 < x0 or y1 < y0.
    bx0, bx1 = sorted((bx0, bx1))
    by0, by1 = sorted((by0, by1))

    # 8) Draw bounding box and dots
    draw.rectangle([bx0, by0, bx1, by1], outline="green", width=3)
    r = 8  # dot radius in pixels
    # Blue dot = Naive prediction
    draw.ellipse([nx - r, ny - r, nx + r, ny + r], fill="blue")
    # Red dot = UI-Graph prediction (drawn last, so it wins if the dots overlap)
    draw.ellipse([ux - r, uy - r, ux + r, uy + r], fill="red")

    # 9) Export to PNG bytes
    with io.BytesIO() as out:
        new_img.save(out, format="PNG")
        return out.getvalue()

In [3]:
# Results of the baseline ("naive") run; per its filename this is the dropratio-0 configuration.
naive_json_path = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/ScreenSpot/uigraph_prunelayer_0-04-24/screenspot_uigraph_qwen2vl-7b_max_pixels_1680_dropratio-0_web-prune-layer_0.json"

# Read the whole file as UTF-8 text and parse it into Python objects.
with open(naive_json_path, encoding="utf-8") as naive_file:
    naive_data = json.loads(naive_file.read())



In [25]:
# Results of the pruned run; per its filename this is the dropratio-0.1 configuration.
prune_json_path = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/ScreenSpot/uigraph_prunelayer_0-04-24/screenspot_uigraph_qwen2vl-7b_max_pixels_1680_dropratio-0.1_web-prune-layer_0.json"

# Read the whole file as UTF-8 text and parse it into Python objects.
with open(prune_json_path, encoding="utf-8") as prune_file:
    prune_data = json.loads(prune_file.read())


In [26]:
# Peek at the first pruned-run record to see which fields each sample carries.
prune_data[0]

{'img_path': '/data/data1/syc/intern/wanshan/datasets/ScreenSpot/screenspot_imgs/web_213f816e-8e80-4d13-970d-1347bbc7a2a8.png',
 'text': 'create a new project',
 'bbox': [0.906640625, 0.08958333333333333, 0.987890625, 0.13819444444444445],
 'pred': [0.95, 0.12],
 'matched': True,
 'response': '{"action_type": 4, "click_point": (0.95,0.12)}\n',
 'type': 'text',
 'source': 'gitlab'}

In [27]:
# Bounding box of the first naive-run record, for comparison with the prune_data[0] record shown above.
naive_data[0]['bbox']


[0.906640625, 0.08958333333333333, 0.987890625, 0.13819444444444445]

In [28]:
# Later cells pair naive_data and prune_data by list index, so the two counts should be equal.
len(naive_data), len(prune_data)

(436, 436)

In [29]:
# Samples are paired by list position, so the two result lists must line up one-to-one.
assert len(naive_data) == len(prune_data), "naive and pruned result files differ in length"

# Indices where the naive run hit the target but the pruned run missed it.
# Records lacking a 'matched' field on either side are skipped (.get() gives None).
# A comprehension keeps the loop variables out of the notebook namespace, where
# `sample_idx` is also used as the navigation counter of the review widget.
mismatched_idxs = [
    idx
    for idx, (naive_sample, prune_sample) in enumerate(zip(naive_data, prune_data))
    if naive_sample.get('matched') is True and prune_sample.get('matched') is False
]
len(mismatched_idxs)

17

In [30]:
# Sanity check: each selected index must refer to the same screenshot in both
# result files. Prints one line per disagreement; silent when everything lines up.
for sample_idx in mismatched_idxs:
    naive_path = naive_data[sample_idx]['img_path']
    if naive_path == prune_data[sample_idx]['img_path']:
        continue
    print('mismatched image_path', naive_path)

In [31]:
import os
import shutil
import ipywidgets as widgets
from IPython.display import display

# Relies on names defined in earlier cells: mismatched_idxs, naive_data, prune_data, draw_click_point

# Directory to save annotated images
save_dir = "./saved_preds"
os.makedirs(save_dir, exist_ok=True)

# Position within mismatched_idxs of the sample currently being shown
sample_idx = 0

# Widgets for display
sentence_label = widgets.HTML()
image_widget = widgets.Image(format='png', width=1000)

# PNG bytes of the most recently rendered image, kept so the save buttons can
# write it to disk. None until a sample has been rendered.
current_image_bytes = None

# Basename of the image currently shown; used to build the saved file name
current_image_name = None

# Display refresh logic
def _pred_xy(sample):
    """Return the sample's predicted click point as (x, y), or (0, 0) if it is missing or malformed."""
    try:
        x, y = sample.get('pred', (0, 0))
    except (TypeError, ValueError):
        # 'pred' is None / not iterable, or does not hold exactly two values
        return 0, 0
    return x, y


def update_display():
    """
    Render the sample at position `sample_idx` of `mismatched_idxs` into the widgets.

    Shows both raw records in `sentence_label`, draws the naive and pruned
    predictions plus the ground-truth box onto the screenshot via
    `draw_click_point`, and stores the rendered PNG bytes and the image's
    basename in the globals `current_image_bytes` / `current_image_name`
    so the save buttons can use them. When there is no sample at
    `sample_idx`, the widgets and both globals are cleared instead.
    """
    import html  # local import: only needed to escape the record dump below

    global current_image_bytes, current_image_name

    if sample_idx >= len(mismatched_idxs):
        # No more samples: blank the widgets and forget any previous image so
        # a later save cannot write stale data.
        sentence_label.value = ""
        image_widget.value = b""
        current_image_bytes = None
        current_image_name = None
        return

    idx = mismatched_idxs[sample_idx]
    naive_sample = naive_data[idx]
    uigraph_sample = prune_data[idx]

    # Show both raw records. widgets.HTML renders HTML, not Markdown, so use
    # <b> for emphasis and escape the dict dump (paths/responses may contain
    # characters that are special in HTML).
    instruction = naive_sample.get('text', '')
    sentence_label.value = (
        f"<b>Naive:</b> {html.escape(str(naive_sample))} <br> "
        f"<b>UIGRAPH:</b> {html.escape(str(uigraph_sample))}"
    )

    # Extract relative click coordinates. Each prediction is parsed on its
    # own so that one malformed 'pred' does not discard the other.
    nx, ny = _pred_xy(naive_sample)      # naive
    px, py = _pred_xy(uigraph_sample)    # prune

    # Ground-truth bounding box
    bbox = naive_sample.get('bbox', [0, 0, 0, 0])
    img_path = naive_sample.get('img_path', '')

    # Draw both Naive (blue) and UI-Graph (red) predictions on one image
    rendered = draw_click_point(
        img_path,
        nx, ny,
        px, py,
        bbox,
        instruction
    )

    # Publish name and bytes together, only after rendering succeeded, so the
    # save buttons never pair a new file name with an old image.
    current_image_name = os.path.basename(img_path)
    current_image_bytes = rendered

    # Update the image widget
    image_widget.value = current_image_bytes

# Button callbacks
def next_step(_):
    """Click handler: move to the following sample if one exists, then redraw.

    On the last sample the position is left unchanged and the current
    sample is simply rendered again.
    """
    global sample_idx
    has_next = sample_idx + 1 < len(mismatched_idxs)
    if has_next:
        sample_idx = sample_idx + 1
    update_display()

def _save_current_image(prefix, label):
    """Write the currently displayed annotated image to `save_dir` as '<prefix>-<image name>'.

    Prints a notice and writes nothing when no image has been rendered yet,
    so no empty file is left behind.
    """
    # Looked up via globals() because these names may never have been assigned
    # if no sample was rendered.
    image_bytes = globals().get("current_image_bytes")
    image_name = globals().get("current_image_name")
    if not image_bytes or not image_name:
        print("Nothing to save: no annotated image is currently displayed")
        return

    out_path = os.path.join(save_dir, f"{prefix}-{image_name}")
    with open(out_path, "wb") as f:
        f.write(image_bytes)
    print(f"{label} annotated image saved to {out_path}")


def save_total_incorrect(_):
    """Click handler: save the current image under the 'totally_incorrect-' prefix."""
    _save_current_image("totally_incorrect", "Totally incorrect")


def save_position_incorrect(_):
    """Click handler: save the current image under the 'position_incorrect-' prefix."""
    _save_current_image("position_incorrect", "Position incorrect")

# Build the three control buttons
next_button = widgets.Button(description="Next Step")
save_btn_total_inco = widgets.Button(description="Save Totally Incorrect", button_style='info')
save_btn_position_inco = widgets.Button(description="Save Position Incorrect", button_style='success')

# Attach each button to its click handler
next_button.on_click(next_step)
save_btn_total_inco.on_click(save_total_incorrect)
save_btn_position_inco.on_click(save_position_incorrect)

# Arrange the buttons in a single row
controls = widgets.HBox(children=[next_button, save_btn_total_inco, save_btn_position_inco])

# Show the (still empty) widgets, then fill them with the first sample
display(sentence_label, image_widget, controls)
update_display()

HTML(value='')

Image(value=b'', width='1000')

HBox(children=(Button(description='Next Step', style=ButtonStyle()), Button(button_style='info', description='…