In [1]:
import json
import ipywidgets as widgets
from IPython.display import display, Image
import os
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import ast  # To safely evaluate JSON-like strings
from PIL import Image, ImageDraw
import io

In [2]:
def draw_click_point(img_path, qwen2vl_x, qwen2vl_y, seeclick_x, seeclick_y, bbox):
    """
    Annotate a screenshot with two predicted click points and, optionally,
    the reference bounding box, and return the result as PNG bytes.

      - Green rectangle (outline width 2) for the bounding box
      - Blue filled circle for the Qwen2VL click
      - Red filled circle for the SeeClick click

    Parameters:
      img_path: Path to the image file.
      qwen2vl_x, qwen2vl_y: Relative coordinates (0-1) of Qwen2VL click.
      seeclick_x, seeclick_y: Relative coordinates (0-1) of SeeClick click.
      bbox: Relative coordinates [x_left, y_left, x_right, y_right] of the
        bounding box, or None to skip drawing the box.

    Returns:
      Annotated image data in PNG format as binary.
    """
    # Fixed radius (in pixels) of the click point markers
    radius = 5

    # The context manager releases the underlying file even if drawing fails.
    with Image.open(img_path) as source:
        # The markers are told apart by colour, so non-colour modes
        # (e.g. greyscale) are promoted to RGB before drawing.
        if source.mode in ("RGB", "RGBA"):
            image = source
        else:
            image = source.convert("RGB")

        draw = ImageDraw.Draw(image)
        width, height = image.size

        if bbox is not None:
            # Scale to pixels and order each axis so that an inverted box is
            # still drawn (some Pillow versions reject x1 < x0 / y1 < y0).
            x_left, x_right = sorted((bbox[0] * width, bbox[2] * width))
            y_top, y_bottom = sorted((bbox[1] * height, bbox[3] * height))
            draw.rectangle([x_left, y_top, x_right, y_bottom], outline="green", width=2)

        # Qwen2VL first, SeeClick second: the red marker ends up on top when
        # both predictions overlap.
        for rel_x, rel_y, colour in (
            (qwen2vl_x, qwen2vl_y, "blue"),
            (seeclick_x, seeclick_y, "red"),
        ):
            abs_x = rel_x * width
            abs_y = rel_y * height
            draw.ellipse(
                (abs_x - radius, abs_y - radius, abs_x + radius, abs_y + radius),
                fill=colour,
            )

        # Serialise while the source file is still open
        with io.BytesIO() as output:
            image.save(output, format="PNG")
            return output.getvalue()

In [3]:
# Path to the Qwen2-VL prediction JSON -- change this to your own file
qwen2vl_pred_path = "/home/syc/intern/wanshan/Thesis_result/Mind2Web/Text-History/Qwen2-VL-7B_naive/qwen2vl_resampler_7b_keep_1_date_0406_website.json"
# Alternative input kept for reference:
# json_file = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/custom_training_script/qwen2vl_train_train.json"
with open(qwen2vl_pred_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Nothing is filtered here: each episode is copied into a fresh list of its steps
qwen2vl_data = [list(episode) for episode in data]

# Module-level cursors, both starting at zero
episode_index, step_index = 0, 0

In [4]:
# Path to the SeeClick prediction JSON
seeclick_pred_path = "/home/syc/intern/wanshan/SeeClick/visualize_result/seeclick_mind2map_ckpt_4000/seeclick_mind2web_website.json"
with open(seeclick_pred_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Nothing is filtered here: each episode is copied into a fresh list of its steps
seeclick_data = [list(episode) for episode in data]

In [5]:
# Sanity check: step counts of the first episode in each prediction set (Qwen2VL, SeeClick)
tuple(len(predictions[0]) for predictions in (qwen2vl_data, seeclick_data))

(7, 7)

In [6]:
# Optional ground-truth loading, currently disabled:
# ground_truth_path = "/home/syc/intern/wanshan/Qwen2-VL/data/subset_100_samples.json"
# with open(ground_truth_path, "r", encoding="utf-8") as f:
#     ground_truth_data = json.load(f)

# Inspect the reference bounding box stored on the first step of the first Qwen2VL episode.
qwen2vl_data[0][0]['bbox_ref'] # order: x_left, y_left, x_right, y_right


[0.278, 0.64, 0.528, 0.688]

In [7]:
# (episode, step) index pairs where Qwen2VL matched the element and SeeClick did not.
# The SeeClick entry is only looked up after the Qwen2VL check passes, and the
# explicit ==True / ==False comparisons are kept so values compare exactly as before.
navigation_sequence = [
    (episode_idx, step_idx)
    for episode_idx, episode in enumerate(qwen2vl_data)
    for step_idx, step in enumerate(episode)
    if step['Ele_match'] == True
    and seeclick_data[episode_idx][step_idx]['Ele_match'] == False
]

# Cursor into navigation_sequence used by the viewer
sequence_index = 0

In [8]:
# Number of (episode, step) pairs collected in navigation_sequence
len(navigation_sequence)

444

In [9]:
def _extract_click_point(step, sentence_is_list=False):
    """
    Parse the predicted click point out of a step's "sentence" field.

    Parameters:
      step: Prediction record holding the action under the "sentence" key.
      sentence_is_list: True when "sentence" is a list whose first item is
        the action string, False when it is the action string itself.

    Returns:
      ((x, y), error): the "click_point" of the parsed action and None, or
      (0, 0) and the caught exception when the action cannot be parsed.
      An action without a "click_point" entry yields (0, 0) and no error.
    """
    try:
        raw_action = step["sentence"][0] if sentence_is_list else step["sentence"]
        action = ast.literal_eval(raw_action)
        click_x, click_y = action.get("click_point", (0, 0))
    except Exception as exc:
        # Best effort: the caller falls back to the top-left corner and reports it
        return (0, 0), exc
    return (click_x, click_y), None


def update_display():
    """
    Refresh the widgets for the entry at sequence_index in navigation_sequence.

    Shows both models' raw step records as text and the screenshot annotated
    with both click points and the reference bounding box. When the sequence
    is exhausted, or the step index is out of range, the text and image
    widgets are cleared. Each model's click is parsed independently, so a
    malformed prediction from one model does not discard the other's click;
    a failed parse falls back to (0, 0) and is reported in that model's text.
    """
    # Nothing (left) to show: clear every widget
    if sequence_index >= len(navigation_sequence):
        sentence_qwen2vl.value = ""
        sentence_seeclick.value = ""
        image_widget.value = b""
        return

    episode_index, step_index = navigation_sequence[sequence_index]
    episode_qwen2vl = qwen2vl_data[episode_index]
    episode_seeclick = seeclick_data[episode_index]

    # Ensure that the step index is valid for the current episode
    if step_index >= len(episode_qwen2vl):
        sentence_qwen2vl.value = "No more steps in this episode."
        sentence_seeclick.value = ""
        image_widget.value = b""
        return

    step_qwen2vl = episode_qwen2vl[step_index]
    step_seeclick = episode_seeclick[step_index]

    # NOTE(review): widgets.HTML renders its value as HTML, so the
    # Markdown-style ** markers and "\n" below presumably show up literally --
    # confirm and switch to <b>/<br> if so.
    sentence_qwen2vl.value = f"**Qwen2VL Action(s):** {step_qwen2vl}"
    sentence_seeclick.value = f"**SeeClick Action(s):** {step_seeclick}"

    # Read the box outside any try block so it is always bound, even when
    # parsing a prediction fails.
    bbox = step_qwen2vl.get("bbox_ref", None)

    # Qwen2VL stores the action as the first item of a list, SeeClick as a plain string
    (qwen2vl_click_x, qwen2vl_click_y), qwen2vl_error = _extract_click_point(
        step_qwen2vl, sentence_is_list=True
    )
    (seeclick_click_x, seeclick_click_y), seeclick_error = _extract_click_point(step_seeclick)

    # Make the (0, 0) fallback visible instead of silently drawing a corner marker
    if qwen2vl_error is not None:
        sentence_qwen2vl.value += f"\n(Warning: could not parse click point: {qwen2vl_error!r})"
    if seeclick_error is not None:
        sentence_seeclick.value += f"\n(Warning: could not parse click point: {seeclick_error!r})"

    # Load the image from the Qwen2VL step (assuming it's the same image used by SeeClick)
    img_path = step_qwen2vl["img_path"]

    if os.path.exists(img_path):
        # Draw both click points on the image and update the widget
        image_widget.value = draw_click_point(
            img_path, qwen2vl_click_x, qwen2vl_click_y,
            seeclick_click_x, seeclick_click_y, bbox,
        )
    else:
        sentence_qwen2vl.value += f"\n(Error: Image not found at {img_path})"
        # Do not leave the previous step's screenshot next to this step's text
        image_widget.value = b""

def next_step(_):
    """
    Button callback: advance to the next navigation entry, then redraw.

    The cursor stops at the last entry of navigation_sequence; the display is
    refreshed on every click either way. The argument (the clicked button) is
    ignored.
    """
    global sequence_index
    has_next = sequence_index + 1 < len(navigation_sequence)
    if has_next:
        sequence_index = sequence_index + 1
    update_display()

# Widgets initialization.
# These module-level names are read and written by update_display().
sentence_qwen2vl = widgets.HTML()   # text of the Qwen2VL step
sentence_seeclick = widgets.HTML()  # text of the SeeClick step
image_widget = widgets.Image(format='png', width=1000)  # Single image widget for merged output
next_button = widgets.Button(description="Next Step")
# Each click runs next_step, which advances the cursor and redraws
next_button.on_click(next_step)

# Show the widgets first, then fill them with the entry at the current sequence_index
display(sentence_qwen2vl, sentence_seeclick, image_widget, next_button)
update_display()

HTML(value='')

HTML(value='')

Image(value=b'', width='1000')

Button(description='Next Step', style=ButtonStyle())