In [20]:
import json
import ipywidgets as widgets
from IPython.display import display, Image
import os
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import ast  # To safely evaluate JSON-like strings
from PIL import Image, ImageDraw, ImageFont
import io

In [57]:
import io
from PIL import Image, ImageDraw, ImageFont

def draw_click_point(
    img_path,
    qwen2vl_x, qwen2vl_y,
    seeclick_x, seeclick_y,
    bbox,
    instruction,
    gap_between_text_and_image: int = 15
):
    """
    Render a screenshot annotated with both models' click points.

    The result is a white banner containing ``"Instruction: <instruction>"``,
    followed by `gap_between_text_and_image` blank pixels, followed by the
    screenshot with the overlays drawn on top of it.

    Args:
        img_path: Path of the screenshot to annotate.
        qwen2vl_x, qwen2vl_y: Qwen2-VL click point as fractions of the image
            width / height (drawn as a blue dot).
        seeclick_x, seeclick_y: SeeClick click point, same convention (red dot).
        bbox: Reference box ``[x0, y0, x1, y1]`` in the same fractional
            coordinates (green outline). ``None`` draws no box.
        instruction: Text shown in the banner. ``None`` is treated as "".
        gap_between_text_and_image: Blank pixels between the banner text and
            the top edge of the screenshot.

    Returns:
        bytes: The annotated image encoded as PNG.
    """
    # 1) Load image. Converting inside the `with` copies the pixels out, so the
    #    file handle opened by Image.open is released before we continue.
    with Image.open(img_path) as source:
        image = source.convert("RGB")
    width, height = image.size

    # 2) Choose a Chinese-capable font (fallback to Pillow's default)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/noto/NotoSansSC-Regular.otf", size=20)
    except OSError:
        try:
            font = ImageFont.load_default(size=20)
        except TypeError:
            # Older Pillow releases do not accept `size` here.
            font = ImageFont.load_default()

    # 3) Measure the exact string that gets drawn (prefix included); measuring
    #    only `instruction` would push the banner text off-centre.
    label = "Instruction: " + ("" if instruction is None else str(instruction))
    left, top, right, bottom = ImageDraw.Draw(image).textbbox((0, 0), label, font=font)
    text_w, text_h = right - left, bottom - top

    # 4) Compute paddings (canvas dimensions must be integers)
    top_margin = 5
    bottom_margin = gap_between_text_and_image
    padding_top = int(round(text_h + top_margin + bottom_margin))

    # 5) New canvas: original height + that padding_top
    new_img = Image.new("RGB", (width, height + padding_top), "white")
    new_img.paste(image, (0, padding_top))

    draw = ImageDraw.Draw(new_img)

    # 6) Draw the label centred in its band. Subtracting the bbox offsets puts
    #    the visible glyphs (not the text origin) at the intended position, so
    #    the blank gap below the text is exactly `gap_between_text_and_image`.
    #    Labels wider than the image start at the left edge instead of being
    #    clipped on both sides.
    x_text = max((width - text_w) / 2, 0)
    draw.text((x_text - left, top_margin - top), label, fill="black", font=font)

    # 7) Convert relative -> absolute (accounting for the banner height)
    def to_abs(rx, ry):
        return rx * width, padding_top + ry * height

    qx, qy = to_abs(qwen2vl_x, qwen2vl_y)
    sx, sy = to_abs(seeclick_x, seeclick_y)

    # 8) Draw bbox + circles
    if bbox is not None:
        bx0, by0 = to_abs(bbox[0], bbox[1])
        bx1, by1 = to_abs(bbox[2], bbox[3])
        # Order the corners: recent Pillow versions raise ValueError when the
        # second corner lies before the first.
        draw.rectangle(
            [min(bx0, bx1), min(by0, by1), max(bx0, bx1), max(by0, by1)],
            outline="green", width=2,
        )
    r = 5
    draw.ellipse((qx - r, qy - r, qx + r, qy + r), fill="blue")
    draw.ellipse((sx - r, sy - r, sx + r, sy + r), fill="red")

    # 9) Return PNG bytes
    with io.BytesIO() as out:
        new_img.save(out, format="PNG")
        return out.getvalue()


In [58]:
# Qwen2-VL prediction file (edit the path to point at your own JSON).
qwen2vl_pred_path = "/home/syc/intern/wanshan/Thesis_result/Mind2Web/Text-History/Qwen2-VL-7B_naive/qwen2vl_resampler_7b_keep_1_date_0406_website.json"
# Alternative input kept for reference:
# json_file = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/custom_training_script/qwen2vl_train_train.json"
with open(qwen2vl_pred_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Copy each episode's steps into a fresh list. Nothing is filtered at this
# point; the Ele_match selection happens when the navigation sequence is built.
qwen2vl_data = [list(episode) for episode in data]

# Cursor defaults
episode_index, step_index = 0, 0

In [59]:
# SeeClick prediction file.
seeclick_pred_path = "/home/syc/intern/wanshan/SeeClick/visualize_result/seeclick_mind2map_ckpt_4000/seeclick_mind2web_website.json"
with open(seeclick_pred_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Copy each episode's steps into a fresh list; nothing is filtered here.
seeclick_data = [list(episode) for episode in data]

In [60]:
# Compare the number of steps in the first episode of each prediction set.
len(qwen2vl_data[0]), len(seeclick_data[0])

(7, 7)

In [61]:
# Disabled ground-truth loader, kept for reference:
# ground_truth_path = "/home/syc/intern/wanshan/Qwen2-VL/data/subset_100_samples.json"
# with open(ground_truth_path, "r", encoding="utf-8") as f:
#     ground_truth_data = json.load(f)

# Reference box of the first step of the first episode.
qwen2vl_data[0][0]['bbox_ref']  # [x0, y0, x1, y1]; draw_click_point scales x by image width and y by image height


[0.278, 0.64, 0.528, 0.688]

In [62]:
# Collect the (episode, step) positions where the Qwen2-VL record has
# Ele_match == True while the SeeClick record at the same position has
# Ele_match == False. These are the steps the viewer walks through.
navigation_sequence = []
for episode_idx, episode in enumerate(qwen2vl_data):
    for step_idx, step in enumerate(episode):
        if step['Ele_match'] != True:
            continue
        # Only looked up once the Qwen2-VL side has matched.
        if seeclick_data[episode_idx][step_idx]['Ele_match'] != False:
            continue
        navigation_sequence.append((episode_idx, step_idx))

# Position of the viewer inside navigation_sequence.
sequence_index = 0

In [63]:
# Number of (episode, step) positions selected for review.
len(navigation_sequence)

444

In [64]:
# Arguments of the most recent draw_click_point call, stored by update_display
# as (img_path, qx, qy, sx, sy, bbox, instruction). None until something is drawn.
last_draw_args = None


def save_image(_):
    """Button callback: write the last rendered annotation to a PNG file.

    The image is re-rendered from ``last_draw_args`` and saved in the current
    working directory as ``annotation_step_NNN.png``, where NNN is the current
    ``sequence_index``. The callback argument (the clicked button) is ignored.
    """
    if last_draw_args is not None:
        png_bytes = draw_click_point(*last_draw_args)
        out_name = f"annotation_step_{sequence_index:03d}.png"
        with open(out_name, "wb") as handle:
            handle.write(png_bytes)
        print(f"Saved ▶ {out_name}")
    else:
        print("Nothing to save yet.")

In [65]:

def _parse_click_point(step, first_of_list=False):
    """Return the (x, y) ``click_point`` stored in ``step["sentence"]``.

    The sentence is a Python/JSON-like dict literal; Qwen2-VL records wrap it
    in a list (``first_of_list=True``), SeeClick records store the string
    directly. Any missing key or malformed value yields ``(0, 0)`` (top-left).
    """
    try:
        raw = step["sentence"]
        if first_of_list:
            raw = raw[0]
        x, y = ast.literal_eval(raw).get("click_point", (0, 0))
    except (KeyError, IndexError, TypeError, ValueError, SyntaxError, AttributeError):
        return 0, 0
    return x, y


def update_display():
    """Refresh the text and image widgets for the current ``sequence_index``.

    Reads ``navigation_sequence``, ``qwen2vl_data`` and ``seeclick_data``;
    writes the ``sentence_qwen2vl`` / ``sentence_seeclick`` / ``image_widget``
    widgets. ``last_draw_args`` is set to the arguments of the image that is
    actually on screen, or to ``None`` when no image could be rendered, so the
    Save button never writes a stale image under the current index.
    """
    global sequence_index, last_draw_args

    # Until an image is rendered below there is nothing valid to save.
    last_draw_args = None

    if sequence_index >= len(navigation_sequence):
        sentence_qwen2vl.value = ""
        sentence_seeclick.value = ""
        image_widget.value = b""
        return

    episode_index, step_index = navigation_sequence[sequence_index]
    episode_qwen2vl = qwen2vl_data[episode_index]
    episode_seeclick = seeclick_data[episode_index]

    # Ensure that the step index is valid for both models' copies of the episode
    if step_index >= len(episode_qwen2vl) or step_index >= len(episode_seeclick):
        sentence_qwen2vl.value = "No more steps in this episode."
        sentence_seeclick.value = ""
        image_widget.value = b""
        return

    step_qwen2vl = episode_qwen2vl[step_index]
    step_seeclick = episode_seeclick[step_index]

    # A missing instruction becomes an empty banner rather than a None that
    # the renderer would have to concatenate to a string.
    instruction = step_qwen2vl.get('instruction') or ""

    # Update the text for each prediction
    sentence_qwen2vl.value = f"**Qwen2VL Action(s):** {step_qwen2vl}"
    sentence_seeclick.value = f"**SeeClick Action(s):** {step_seeclick}"

    # The reference box does not depend on sentence parsing, so read it up
    # front; it is then always bound even when a sentence is malformed.
    bbox = step_qwen2vl.get("bbox_ref")

    # Each model falls back to top-left on its own, so one unparseable
    # prediction does not hide the other model's valid click.
    qwen2vl_click_x, qwen2vl_click_y = _parse_click_point(step_qwen2vl, first_of_list=True)
    seeclick_click_x, seeclick_click_y = _parse_click_point(step_seeclick)

    # Load the image from the Qwen2VL step (assuming it's the same image used by SeeClick)
    img_path = step_qwen2vl["img_path"]

    if not os.path.exists(img_path):
        # Clear the previous step's picture so it is not shown next to this step's text.
        image_widget.value = b""
        sentence_qwen2vl.value += f"\n(Error: Image not found at {img_path})"
        return

    # Draw both click points on the image and update the widget
    draw_args = (
        img_path,
        qwen2vl_click_x, qwen2vl_click_y,
        seeclick_click_x, seeclick_click_y,
        bbox,
        instruction,
    )
    image_widget.value = draw_click_point(*draw_args)
    # Remember the arguments only after rendering succeeded.
    last_draw_args = draw_args

def next_step(_):
    """Button callback: advance to the next selected step and redraw.

    ``sequence_index`` is incremented only while another entry exists in
    ``navigation_sequence``; on the last entry it stays put. The display is
    refreshed in either case. The callback argument is ignored.
    """
    global sequence_index
    has_next = sequence_index + 1 < len(navigation_sequence)
    if has_next:
        sequence_index = sequence_index + 1
    update_display()

# --- Viewer widgets ---------------------------------------------------------
# One image pane for the annotated screenshot and two HTML panes that show the
# raw step record of each model.
image_widget = widgets.Image(format="png")
sentence_seeclick = widgets.HTML()
sentence_qwen2vl = widgets.HTML()

# Buttons, each wired to its callback right after creation.
save_button = widgets.Button(description="Save Image", button_style="success")
save_button.on_click(save_image)
next_button = widgets.Button(description="Next Step")
next_button.on_click(next_step)

# Stack the panes vertically with the button row at the bottom, then render
# the first selected step.
display(
    widgets.VBox([
        sentence_qwen2vl,
        sentence_seeclick,
        image_widget,
        widgets.HBox([next_button, save_button]),
    ])
)
update_display()

VBox(children=(HTML(value=''), HTML(value=''), Image(value=b''), HBox(children=(Button(description='Next Step'…

Saved ▶ annotation_step_000.png
Saved ▶ annotation_step_001.png
Saved ▶ annotation_step_002.png
Saved ▶ annotation_step_005.png
Saved ▶ annotation_step_007.png
Saved ▶ annotation_step_012.png
Saved ▶ annotation_step_015.png
Saved ▶ annotation_step_016.png
Saved ▶ annotation_step_017.png
Saved ▶ annotation_step_019.png
Saved ▶ annotation_step_020.png
Saved ▶ annotation_step_022.png
Saved ▶ annotation_step_024.png
Saved ▶ annotation_step_025.png
Saved ▶ annotation_step_026.png
Saved ▶ annotation_step_027.png
Saved ▶ annotation_step_029.png
Saved ▶ annotation_step_030.png
Saved ▶ annotation_step_031.png
Saved ▶ annotation_step_033.png
Saved ▶ annotation_step_034.png
Saved ▶ annotation_step_036.png
Saved ▶ annotation_step_037.png
Saved ▶ annotation_step_038.png
Saved ▶ annotation_step_039.png
Saved ▶ annotation_step_040.png
Saved ▶ annotation_step_041.png
Saved ▶ annotation_step_042.png
Saved ▶ annotation_step_044.png
Saved ▶ annotation_step_045.png
Saved ▶ annotation_step_046.png
Saved ▶ 