In [1]:
import json
import ipywidgets as widgets
from IPython.display import display, Image
import os
from Qwen2VL_uigraph.model.processing_qwen2_vl import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import ast  # To safely evaluate JSON-like strings
from PIL import Image, ImageDraw, ImageFont
import io

In [15]:
def draw_click_point(
    img_path: str,
    naive_x: float, naive_y: float,
    uigraph_x: float, uigraph_y: float,
    bbox: list,
    instruction: str,
    gap_between_text_and_image: int = 15
) -> bytes:
    """
    Build a comparison image for one sample and return it as PNG bytes.

    Layout (top to bottom): a white banner with the centered text
    'Instruction : <instruction>', a gap, then the image loaded from
    ``img_path`` annotated with:
      - Green rectangle for ground-truth bbox
      - Blue dot for Naive prediction (naive_x, naive_y)
      - Red dot for UI-Graph prediction (uigraph_x, uigraph_y)

    Args:
        img_path: Path of the image to annotate.
        naive_x, naive_y: Naive prediction, as fractions of image width/height.
        uigraph_x, uigraph_y: UI-Graph prediction, as fractions of image
            width/height.
        bbox: Ground-truth box [x0, y0, x1, y1], as fractions of image
            width/height.
        instruction: Text shown in the banner.
        gap_between_text_and_image: Vertical space in pixels between the
            banner text and the image.

    Returns:
        The annotated image encoded as PNG bytes.

    Raises:
        OSError: If ``img_path`` cannot be opened as an image.
    """
    import math  # local import: only used to round the banner height up

    # 1) Load original image; the context manager closes the file once the
    #    pixel data has been copied into the RGB image.
    with Image.open(img_path) as source_image:
        image = source_image.convert("RGB")
    width, height = image.size

    # 2) Load a font that supports Chinese (fallback to default)
    font_size = 25
    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/noto/NotoSansSC-Regular.otf", size=font_size
        )
    except OSError:
        try:
            font = ImageFont.load_default(size=font_size)
        except TypeError:
            # Older Pillow releases do not accept a size for the default font.
            font = ImageFont.load_default()

    # 3) Measure the banner text. The SAME string is measured and drawn,
    #    otherwise the centering below is wrong.
    banner_text = 'Instruction : ' + instruction
    tmp_draw = ImageDraw.Draw(image)
    text_bbox = tmp_draw.textbbox((0, 0), banner_text, font=font)
    text_w = text_bbox[2] - text_bbox[0]
    text_h = text_bbox[3] - text_bbox[1]

    # 4) Compute padding for banner and gap (canvas sizes and paste offsets
    #    must be integers, so round the text height up).
    top_margin = 5
    bottom_margin = gap_between_text_and_image
    padding_top = int(math.ceil(text_h)) + top_margin + bottom_margin

    # 5) Create new canvas and paste original image
    new_img = Image.new("RGB", (width, height + padding_top), "white")
    new_img.paste(image, (0, padding_top))

    draw = ImageDraw.Draw(new_img)

    # 6) Draw instruction centered. Text wider than the image starts at the
    #    left edge instead of being cut off on both sides. The bbox origin is
    #    subtracted so the visible glyphs start exactly at the computed point
    #    and do not spill into the gap.
    x_text = max(0, (width - text_w) / 2) - text_bbox[0]
    y_text = top_margin - text_bbox[1]
    draw.text((x_text, y_text), banner_text, fill="black", font=font)

    # 7) Helper to convert relative coords to absolute canvas coords
    #    (shifted down by the banner height).
    def to_abs(rx, ry):
        return rx * width, padding_top + ry * height

    nx, ny = to_abs(naive_x, naive_y)
    ux, uy = to_abs(uigraph_x, uigraph_y)
    bx0, by0 = to_abs(bbox[0], bbox[1])
    bx1, by1 = to_abs(bbox[2], bbox[3])

    # 8) Draw bounding box and dots
    draw.rectangle([bx0, by0, bx1, by1], outline="green", width=3)
    dot_radius = 5
    # Blue dot = Naive prediction
    draw.ellipse(
        [nx - dot_radius, ny - dot_radius, nx + dot_radius, ny + dot_radius],
        fill="blue",
    )
    # Red dot = UI-Graph prediction
    draw.ellipse(
        [ux - dot_radius, uy - dot_radius, ux + dot_radius, uy + dot_radius],
        fill="red",
    )

    # 9) Export to PNG bytes
    with io.BytesIO() as out:
        new_img.save(out, format="PNG")
        return out.getvalue()

In [16]:
# Read the naive-baseline result records from disk.
naive_json_path = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/ScreenSpot/naive/screenspot_naive-desktop.json"

with open(naive_json_path, mode="r", encoding="utf-8") as f:
    naive_data = json.loads(f.read())



In [17]:
# Read the UI-Graph result records from disk (no filtering is done in this cell).
uigraph_json_path = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/ScreenSpot/uigraph_prunelayer_20-04-30/screenspot_uigraph_qwen2vl-7b_dropratio-0.6_desktop-prune-layer_20.json"
with open(uigraph_json_path, mode="r", encoding="utf-8") as f:
    uigraph_data = json.loads(f.read())


In [18]:
# Peek at the first UI-Graph record to see which fields each sample carries.
uigraph_data[0]

{'img_path': '/data/data1/syc/intern/wanshan/datasets/ScreenSpot/screenspot_imgs/pc_ede36f9b-1154-4f76-b7f8-c15d7d3f9b6e.png',
 'text': 'close',
 'bbox': [0.9479166666666666,
  0.14444444444444443,
  0.99375,
  0.2074074074074074],
 'pred': [0.98, 0.23],
 'matched': False,
 'response': '{"action_type": 4, "click_point": (0.98,0.23)}\n',
 'type': 'icon',
 'source': 'windows'}

In [19]:
# Ground-truth bbox of the first naive record, shown for comparison with the UI-Graph record.
naive_data[0]['bbox']


[0.9479166666666666, 0.14444444444444443, 0.99375, 0.2074074074074074]

In [20]:
# Record counts of both result lists; the later index-wise pairing reads uigraph_data at every naive index.
len(naive_data), len(uigraph_data)

(334, 334)

In [21]:
# Collect indices where the naive run missed but the UI-Graph run matched.
# Records without a 'matched' field on either side are skipped.
mismatched_idxs = []
for sample_idx, sample in enumerate(naive_data):
    if 'matched' not in sample:
        continue
    uigraph_counterpart = uigraph_data[sample_idx]
    if 'matched' not in uigraph_counterpart:
        continue
    naive_missed = sample['matched'] == False
    if naive_missed and uigraph_counterpart['matched'] == True:
        mismatched_idxs.append(sample_idx)
len(mismatched_idxs)

7

In [22]:
# for sample_idx, sample in enumerate(naive_data):
#     if 'text' in sample and sample['text'] == 'switch to groups view':
#         print(sample_idx)

In [23]:
# Sanity check: both result lists should point at the same image for every selected index.
for sample_idx in mismatched_idxs:
    naive_img_path = naive_data[sample_idx]['img_path']
    if naive_img_path == uigraph_data[sample_idx]['img_path']:
        continue
    print('mismatched image_path', naive_img_path)

In [24]:
# Model checkpoint used to build the processor.
model_path = "/data/data1/syc/intern/wanshan/models/Qwen2-VL-7B-Instruct"
# model_path = "/data/data1/syc/intern/wanshan/models/showlab/ShowUI-2B_edited"

# Upper bound on image pixels handed to the processor.
# min_pixel = 1344*28*28
max_pixel = 1680*28*28
# 1. Screenshot -> Graph
uigraph_train = True  # Enable ui graph during training
uigraph_test = True  # Enable ui graph during inference
uigraph_diff = 1  # Pixel difference used for constructing ui graph
uigraph_rand = False  # Enable random graph construction
# 2. Graph -> Mask
uimask_pre = True  # Prebuild patch selection mask in the preprocessor (not in model layers) for efficiency
uimask_ratio = 0.6 # Specify the percentage of patch tokens to skip per component
uimask_rand = False  # Enable random token selection instead of uniform selection


# Every option is forwarded from the variables above so that editing a
# variable actually changes the processor (uimask_pre used to be hard-coded).
processor = Qwen2VLProcessor.from_pretrained(
    model_path,
    # min_pixels= min_pixel,
    max_pixels=max_pixel,
    uigraph_train=uigraph_train,
    uigraph_test=uigraph_test,
    uigraph_diff=uigraph_diff,
    uigraph_rand=uigraph_rand,
    uimask_pre=uimask_pre,
    uimask_ratio=uimask_ratio,
    uimask_rand=uimask_rand,
)

In [25]:
def load_visualize(image_path, vis_dir="./visualize_imgs"):
    """
    Run the notebook's ``processor`` on one image and return the bytes of the
    visualization file ``demo.png`` found in ``vis_dir`` afterwards.

    Args:
        image_path: Path of the image to feed to the processor.
        vis_dir: Folder passed to the processor as ``vis_dir`` and from which
            ``demo.png`` is read back. Defaults to the folder used so far.

    Returns:
        Raw bytes of ``<vis_dir>/demo.png``.

    Raises:
        OSError: If ``demo.png`` is not present in ``vis_dir`` after the
            processor call.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                    # "min_pixels": min_pixel,
                    # "max_pixels": max_pixel,
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(messages)

    # Make sure the output folder exists before the processor is asked to use it.
    os.makedirs(vis_dir, exist_ok=True)

    # Called only for its side effect; the returned model inputs are not needed.
    # NOTE(review): presumably this call is what writes demo.png into vis_dir -
    # confirm against the processor implementation.
    processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
        vis_dir=vis_dir,  # this folder to save visualization
    )
    with open(os.path.join(vis_dir, "demo.png"), "rb") as f:
        return f.read()

In [26]:
import os
import shutil
import ipywidgets as widgets
from IPython.display import display

# Assumes mismatched_idxs, naive_data, uigraph_data, load_visualize, draw_click_point are defined earlier

# Directory to save annotated images
save_dir = "./saved_preds"
os.makedirs(save_dir, exist_ok=True)

# Initialize sample index (position inside mismatched_idxs)
sample_idx = 0

# Widgets for display
sentence_label = widgets.HTML()
image_widget = widgets.Image(format='png', width=1000)

# Hold the latest generated image bytes for saving.
# (Was misspelled "tcurrent_image_bytes", which left the name that the
# callbacks read undefined until the first image had been rendered.)
current_image_bytes = None

# current image name, use later when save annotation
current_image_name = None

# Function to update display
def update_display():
    """
    Render the sample at position ``sample_idx`` of ``mismatched_idxs``.

    Updates ``sentence_label`` with both raw records and ``image_widget`` with
    the annotated image, and stores the image bytes / file name in the globals
    ``current_image_bytes`` / ``current_image_name`` for the save buttons.
    When ``sample_idx`` is past the end of ``mismatched_idxs`` the widgets and
    the stored image state are cleared instead.
    """
    global sample_idx, current_image_bytes, current_image_name
    import html  # local import: only needed to escape the record dump below

    if sample_idx >= len(mismatched_idxs):
        # No more samples: clear the widgets and drop the stale image so the
        # save buttons cannot write a picture that is no longer shown.
        sentence_label.value = ""
        image_widget.value = b""
        current_image_bytes = None
        current_image_name = None
        return

    idx = mismatched_idxs[sample_idx]
    naive_sample = naive_data[idx]
    uigraph_sample = uigraph_data[idx]

    # Show the user instruction or text
    instruction = naive_sample.get('text', '')
    # widgets.HTML renders HTML, not Markdown, so use <b> for bold and escape
    # the record dump so characters such as '<' are shown literally.
    sentence_label.value = (
        f"<b>Naive:</b> {html.escape(str(naive_sample))} <br> "
        f"<b>UIGRAPH:</b> {html.escape(str(uigraph_sample))}"
    )

    # Extract relative click coordinates; fall back to (0, 0) for both models
    # when a prediction is missing or cannot be unpacked into two values.
    try:
        nx, ny = naive_sample.get('pred', (0, 0))
        ux, uy = uigraph_sample.get('pred', (0, 0))
    except (TypeError, ValueError):
        nx = ny = ux = uy = 0

    # Ground-truth bounding box
    bbox = naive_sample.get('bbox', [0, 0, 0, 0])
    img_path = naive_sample.get('img_path', '')
    current_image_name = os.path.basename(img_path)

    # Run the processor visualization; the returned bytes are not used here,
    # the annotated image below is drawn from the demo.png file instead.
    _ = load_visualize(img_path)

    # Draw both Naive (blue) and UI-Graph (red) predictions on one image
    current_image_bytes = draw_click_point(
        "./visualize_imgs/demo.png",
        nx, ny,
        ux, uy,
        bbox,
        instruction
    )

    # Update the image widget
    image_widget.value = current_image_bytes

# Button callbacks
def next_step(_):
    """Button callback: move to the following sample if one exists, then redraw."""
    global sample_idx
    has_following_sample = sample_idx + 1 < len(mismatched_idxs)
    if has_following_sample:
        sample_idx = sample_idx + 1
    update_display()

def _save_current_annotation(prefix, label):
    """Write the currently shown annotated image to save_dir as '<prefix>-<image name>'."""
    # Looked up through globals() because the state may be unset (or None)
    # before any sample has been rendered; in that case there is nothing to
    # save and no empty file should be created.
    image_bytes = globals().get("current_image_bytes")
    image_name = globals().get("current_image_name")
    if not image_bytes or not image_name:
        print("No annotated image to save yet.")
        return
    out_path = os.path.join(save_dir, f"{prefix}-{image_name}")
    with open(out_path, "wb") as f:
        f.write(image_bytes)
    print(f"{label} annotated image saved to {out_path}")

def save_total_incorrect(_):
    """Button callback: save the current image with the 'totally_incorrect' prefix."""
    _save_current_annotation("totally_incorrect", "Totally incorrect")

def save_position_incorrect(_):
    """Button callback: save the current image with the 'position_incorrect' prefix."""
    _save_current_annotation("position_incorrect", "Position incorrect")

# Build each button together with its click handler.
def _make_button(description, handler, **button_options):
    """Create a widgets.Button with the given options and register its click handler."""
    button = widgets.Button(description=description, **button_options)
    button.on_click(handler)
    return button

next_button = _make_button("Next Step", next_step)
save_btn_total_inco = _make_button(
    "Save Totally Incorrect", save_total_incorrect, button_style='info'
)
save_btn_position_inco = _make_button(
    "Save Position Incorrect", save_position_incorrect, button_style='success'
)

# Put the three buttons in one row.
controls = widgets.HBox([next_button, save_btn_total_inco, save_btn_position_inco])

# Show the widgets first, then fill them with the first sample.
display(sentence_label, image_widget, controls)
update_display()

HTML(value='')

Image(value=b'', width='1000')

HBox(children=(Button(description='Next Step', style=ButtonStyle()), Button(button_style='info', description='…

# Double Image Show

In [13]:

# # Function to draw click point on image
# def draw_click_point(img_path, click_x, click_y, bbox, pred=False):
#     """
#     img_path: str
#     click_x, click_y : relative coordinate (0-1)
#     bbox, list : [x_low, y_low, x_high, y_high] (0-1)
#     pred : model prediction | ground truth
#         if ground truth, no output visualized image
#     """
#     if os.path.exists(img_path):
#         img = Image.open(img_path)
#         w, h = img.size  # Get image dimensions
        
#         # Convert relative to absolute coordinates
#         abs_x = int(click_x * w)
#         abs_y = int(click_y * h)
        
#         # Draw the dot
#         draw = ImageDraw.Draw(img)
#         dot_radius = 10  # Adjust dot size if needed
#         draw.ellipse((abs_x - dot_radius, abs_y - dot_radius, abs_x + dot_radius, abs_y + dot_radius), fill="red")

#         # Draw the bounding box (if exists)
#         if bbox: # [0.278, 0.64, 0.528, 0.688]
#             bbox_x_top_left = round(bbox[0] * w)
#             bbox_y_top_left = round(bbox[1] * h)
#             bbox_x_bot_right = round(bbox[2] * w)
#             bbox_y_bot_right = round(bbox[3] * h)
            
#             bbox_coords = [(bbox_x_top_left, bbox_y_top_left), (bbox_x_bot_right, bbox_y_bot_right)]
#             draw.rectangle(bbox_coords, outline="blue", width=3)  # Blue bbox

#         # Save the modified image temporarily
#         if not pred:
#             temp_img_path = "./visualize_imgs/image.png"
#         else:
#             temp_img_path = "./visualize_imgs/pred_image.png"

#         img.save(temp_img_path)
#         # Return the new image path
#         return temp_img_path
#     else:
#         return None

In [14]:
# import os
# import shutil
# import ipywidgets as widgets
# from IPython.display import display

# # Assumes mismatched_idxs, naive_data, uigraph_data, load_visualize, draw_click_point are defined earlier

# # Directory to save predictions
# save_dir = "./saved_preds"
# os.makedirs(save_dir, exist_ok=True)

# # Initialize sample index
# sample_idx = 0

# # Widgets for display
# sentence_label = widgets.HTML()
# sentece_uigraph_label = widgets.HTML()
# image_widget = widgets.Image(format='png', width=600)
# image_pred = widgets.Image(format='png', width=600)

# # Function to update display
# def update_display():
#     global sample_idx, naive_image_path, uigraph_image_path
#     if sample_idx < len(mismatched_idxs):
#         idx = mismatched_idxs[sample_idx]
#         naive_sample = naive_data[idx]
#         uigraph_sample = uigraph_data[idx]

#         sentence_label.value = f"**Naive Action(s):** {naive_sample}"
#         sentece_uigraph_label.value = f"**UI Graph Action(s):** {uigraph_sample}"

#         try:
#             click_x, click_y = naive_sample.get("pred", (0, 0))
#             uigraph_click_x, uigraph_click_y = uigraph_sample.get("pred", (0, 0))
#         except Exception:
#             click_x, click_y = 0, 0
#             uigraph_click_x, uigraph_click_y = 0, 0

#         bbox = naive_sample['bbox']
#         img_path = naive_sample["img_path"]

#         # Generate annotated images
#         _ = load_visualize(img_path)
#         naive_image_path = draw_click_point(img_path, click_x, click_y, bbox)
#         uigraph_image_path = draw_click_point("./visualize_imgs/demo.png", uigraph_click_x, uigraph_click_y, bbox, pred=True)

#         # Load images into widgets
#         if os.path.exists(naive_image_path) and os.path.exists(uigraph_image_path):
#             with open(naive_image_path, "rb") as f:
#                 image_widget.value = f.read()
#             with open(uigraph_image_path, "rb") as f:
#                 image_pred.value = f.read()
#         else:
#             sentence_label.value += f"\n(Error: Image not found)"
#     else:
#         sentence_label.value = ""
#         image_widget.value = b""
#         image_pred.value = b""

# # Button callbacks
# def next_step(_):
#     global sample_idx
#     if sample_idx < len(mismatched_idxs) - 1:
#         sample_idx += 1
#     update_display()

# def save_naive(_):
#     out_path = os.path.join(save_dir, f"naive_pred_{sample_idx}.png")
#     shutil.copy(naive_image_path, out_path)
#     print(f"Naive prediction saved to {out_path}")

# def save_uigraph(_):
#     out_path = os.path.join(save_dir, f"uigraph_pred_{sample_idx}.png")
#     shutil.copy(uigraph_image_path, out_path)
#     print(f"UI-Graph prediction saved to {out_path}")

# # Create buttons
# next_button = widgets.Button(description="Next Step")
# next_button.on_click(next_step)

# save_naive_btn = widgets.Button(description="Save Naive Pred", button_style='info')
# save_naive_btn.on_click(save_naive)

# save_uigraph_btn = widgets.Button(description="Save UI-Graph Pred", button_style='success')
# save_uigraph_btn.on_click(save_uigraph)

# # Layout
# image_box = widgets.HBox([image_widget, image_pred])
# controls = widgets.HBox([next_button, save_naive_btn, save_uigraph_btn])

# # Initial display and render
# update_display()
# display(sentence_label, sentece_uigraph_label, image_box, controls)
