In [49]:
import json
import ipywidgets as widgets
from IPython.display import display, Image
import os
from model.processing_qwen2_vl import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import ast  # To safely evaluate JSON-like strings
from PIL import Image, ImageDraw


In [74]:
# Function to draw click point on image
def draw_click_point(img_path, click_x, click_y, bbox, pred=False):
    """Draw a click marker (and optional bounding box) on an image and save it as PNG.

    Args:
        img_path: Path of the source image on disk.
        click_x: Horizontal click position as a fraction of the image width.
        click_y: Vertical click position as a fraction of the image height.
        bbox: Optional dict with "x", "y", "width" and "height" keys. The values
            are used directly as pixel coordinates (they are not rescaled).
            Missing keys default to 0; a falsy value skips the box entirely.
        pred: Selects the output file name: "pred_image.png" when true,
            "image.png" otherwise.

    Returns:
        The path of the annotated image that was written, or None when
        ``img_path`` does not exist.
    """
    if not os.path.exists(img_path):
        return None

    # Both variants are written into the same scratch directory; each call
    # overwrites the previous file of the same variant.
    if not pred:
        temp_img_path = "./visualize_imgs/image.png"
    else:
        temp_img_path = "./visualize_imgs/pred_image.png"

    # img.save() does not create missing parent directories, so make sure the
    # scratch directory exists before writing.
    os.makedirs(os.path.dirname(temp_img_path), exist_ok=True)

    # Use a context manager so the underlying file handle is released even if
    # drawing or saving raises.
    with Image.open(img_path) as img:
        w, h = img.size  # Get image dimensions

        # Convert relative to absolute coordinates
        abs_x = int(click_x * w)
        abs_y = int(click_y * h)

        # Draw the dot
        draw = ImageDraw.Draw(img)
        dot_radius = 10  # Adjust dot size if needed
        draw.ellipse(
            (abs_x - dot_radius, abs_y - dot_radius, abs_x + dot_radius, abs_y + dot_radius),
            fill="red",
        )

        # Draw the bounding box (if exists)
        if bbox:
            bbox_x = bbox.get("x", 0)
            bbox_y = bbox.get("y", 0)
            bbox_w = bbox.get("width", 0)
            bbox_h = bbox.get("height", 0)

            bbox_coords = [(bbox_x, bbox_y), (bbox_x + bbox_w, bbox_y + bbox_h)]
            draw.rectangle(bbox_coords, outline="blue", width=3)  # Blue bbox

        # Save the modified image temporarily
        img.save(temp_img_path)

    # Return the new image path
    return temp_img_path

In [75]:
# Load the Qwen2-VL-2B result file: a JSON list of episodes, each a list of step records.
json_file = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/Qwen2-VL-2B/qwen2vl_mind2web_website.json"  # Change this to your JSON file path
with open(json_file, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE: despite the name, nothing is filtered out here. Every step of every
# episode is kept; each episode is only shallow-copied into a fresh list.
filtered_data = [list(episode) for episode in data]

# Start browsing from the first step of the first episode.
episode_index, step_index = 0, 0

In [76]:
# Load the UI-graph inference result file (same episode/step layout is assumed
# by the cells below, which index both result sets with the same positions).
uigraph_json_path = "/home/syc/intern/wanshan/Qwen2-VL/agent_tasks/Qwen2-VL_uigraph_inference_only/qwen2vl_uigraph_0.3_ratio_website.json"  # Change this to your JSON file path
with open(uigraph_json_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE: no filtering happens here; each episode is shallow-copied with all its steps.
uigraph_data = [list(episode) for episode in data]

In [77]:
# Sanity check: show the number of steps in the first episode of each result set.
# The comparison cell below indexes both sets with the same (episode, step) positions.
len(filtered_data[0]), len(uigraph_data[0])

(7, 7)

In [78]:
# Load the ground-truth annotations for the same test split.
ground_truth_path = "/data/data1/syc/intern/wanshan/mind2map_dataset/mind2web_data_test_website.json"
with open(ground_truth_path, mode="r", encoding="utf-8") as f:
    ground_truth_data = json.load(f)

# Keep only the action list of every episode (one entry per step).
ground_truth = [list(episode['actions']) for episode in ground_truth_data]

# Peek at the bounding box of the very first action.
ground_truth[0][0]['bbox']

{'x': 356.0, 'y': 461.0, 'width': 320.0, 'height': 34.0}

In [79]:
# Inspect the step records of the first episode from the Qwen2-VL-2B result file.
filtered_data[0]

[{'annot_id': '013781df-4391-4533-bcb1-15f6819064f6',
  'img_path': '/data/data1/syc/intern/wanshan/mind2map_dataset/mind2web_images/013781df-4391-4533-bcb1-15f6819064f6-79c4a963-4aa9-49c1-9257-6b0d5069c551.jpg',
  'instruction': 'What are the romantic reggae musics from BCD Studio that can be used in tik tok series in andorra',
  'sentence': ['{"action_type": 4, "click_point": (0.09,0.78)}'],
  'Op_match': True,
  'Ele_match': False,
  'Op_F1': [1.0, 4]},
 {'annot_id': '013781df-4391-4533-bcb1-15f6819064f6',
  'img_path': '/data/data1/syc/intern/wanshan/mind2map_dataset/mind2web_images/013781df-4391-4533-bcb1-15f6819064f6-7da11b14-b652-4507-8d50-ad7515c9e455.jpg',
  'instruction': 'What are the romantic reggae musics from BCD Studio that can be used in tik tok series in andorra',
  'sentence': ['{"action_type": 4, "click_point": (0.31,0.40)}'],
  'Op_match': True,
  'Ele_match': True,
  'Op_F1': [1.0, 4]},
 {'annot_id': '013781df-4391-4533-bcb1-15f6819064f6',
  'img_path': '/data/data

In [80]:
# Collect every (episode, step) position where the first result set matched the
# target element ('Ele_match' is True) but the UI-graph result set did not.
navigation_sequence = [
    (ep_pos, st_pos)
    for ep_pos, ep_steps in enumerate(filtered_data)
    for st_pos, record in enumerate(ep_steps)
    if record['Ele_match'] == True and uigraph_data[ep_pos][st_pos]['Ele_match'] == False
]

# Cursor into navigation_sequence used by the interactive viewer.
sequence_index = 0

In [83]:
model_path = "/data/data1/syc/intern/wanshan/models/Qwen2-VL-2B-Instruct"
# model_path = "/data/data1/syc/intern/wanshan/models/showlab/ShowUI-2B_edited"

# Lower / upper pixel budgets handed to the processor, written as multiples of 28*28.
min_pixel = 256*28*28
max_pixel = 1344*28*28
# 1. Screenshot -> Graph
uigraph_train = True        # Enable ui graph during training
uigraph_test = True         # Enable ui graph during inference
uigraph_diff = 1            # Pixel difference used for constructing ui graph
uigraph_rand = False        # Enable random graph construction 
# 2. Graph -> Mask 
uimask_pre = True           # Prebuild patch selection mask in the preprocessor (not in model layers) for efficiency
uimask_ratio = 0.3         # Specify the percentage of patch tokens to skip per component
uimask_rand = False         # Enable random token selection instead of uniform selection


# Every option above is forwarded by name so that editing a variable actually
# changes the processor. (uimask_pre used to be passed as a literal True, which
# silently ignored the variable.)
processor = Qwen2VLProcessor.from_pretrained(
        model_path,
        min_pixels=min_pixel,
        max_pixels=max_pixel,
        uigraph_train=uigraph_train, uigraph_test=uigraph_test, uigraph_diff=uigraph_diff, uigraph_rand=uigraph_rand,
        uimask_pre=uimask_pre, uimask_ratio=uimask_ratio, uimask_rand=uimask_rand,
)


In [84]:
def load_visualize(image_path, vis_dir="./visualize_imgs"):
    """Run the processor on one image and return the bytes of its visualization.

    The module-level ``processor`` is called with ``vis_dir`` so that it saves a
    visualization there; the file ``demo.png`` inside that directory is then read
    back. (That the processor writes ``demo.png`` is assumed from the path this
    function has always read — confirm against the processor implementation.)

    Args:
        image_path: Path of the image to feed to the processor.
        vis_dir: Directory passed to the processor for its visualization output
            and from which ``demo.png`` is read. Defaults to the directory the
            notebook has always used.

    Returns:
        The raw bytes of ``<vis_dir>/demo.png``.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                    "min_pixels": min_pixel,
                    "max_pixels": max_pixel,
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(messages)

    # The returned model inputs are not needed; the call is made only for the
    # visualization it saves into vis_dir.
    processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
        vis_dir=vis_dir,  # this folder to save visualization
    )
    with open(os.path.join(vis_dir, "demo.png"), "rb") as f:
        return f.read()

In [90]:

def _parse_click_point(step):
    """Return the (x, y) click point stored in ``step["sentence"][0]``, or (0, 0) on any parse error."""
    try:
        action_data = ast.literal_eval(step["sentence"][0])  # Convert string to dict
        click_x, click_y = action_data.get("click_point", (0, 0))
    except Exception:
        # Best effort: default to the top-left corner rather than aborting the viewer.
        return 0, 0
    return click_x, click_y


def _clear_images():
    """Blank all three image widgets so no stale picture stays on screen."""
    for image in (image_widget, image_pred, image_patch):
        image.value = b""


# Function to update display
def update_display():
    """Refresh the viewer widgets for the entry at ``sequence_index``.

    Reads the module-level ``navigation_sequence``, ``filtered_data``,
    ``uigraph_data`` and ``ground_truth`` and writes to the label and image
    widgets. For the current (episode, step) it shows both step records, draws
    each prediction's click point together with the ground-truth bounding box on
    the screenshot, and shows the processor visualization from ``load_visualize``.
    When the position is out of range, or the screenshot is missing, the image
    widgets are cleared.
    """
    if sequence_index >= len(navigation_sequence):
        # Nothing left to show.
        sentence_label.value = ""
        sentece_uigraph_label.value = ""
        _clear_images()
        return

    episode_index, step_index = navigation_sequence[sequence_index]
    episode = filtered_data[episode_index]
    episode_uigraph = uigraph_data[episode_index]
    episode_ref = ground_truth[episode_index]

    if step_index >= len(episode):
        sentence_label.value = "No more steps in this episode."
        sentece_uigraph_label.value = ""
        _clear_images()
        return

    step = episode[step_index]
    step_uigraph = episode_uigraph[step_index]
    step_ref = episode_ref[step_index]

    # NOTE: widgets.HTML renders HTML, not Markdown, so the ** markers are shown literally.
    sentence_label.value = f"**Action(s):** {step}"
    sentece_uigraph_label.value = f"**UI Graph Action(s):** {step_uigraph}"

    # Parse the two predictions independently: a malformed one falls back to
    # (0, 0) on its own, without discarding the other or leaving a name unbound.
    click_x, click_y = _parse_click_point(step)
    uigraph_click_x, uigraph_click_y = _parse_click_point(step_uigraph)

    # get bbox from groundtruth
    bbox = step_ref['bbox']

    # Both predictions are drawn on the screenshot referenced by the first result set.
    img_path = step["img_path"]
    modified_img_path = draw_click_point(img_path, click_x, click_y, bbox)
    modified_img_pred_path = draw_click_point(img_path, uigraph_click_x, uigraph_click_y, bbox, pred=True)

    # draw_click_point returns None when the source image does not exist.
    if modified_img_path is None or modified_img_pred_path is None:
        _clear_images()
        sentence_label.value += f"\n(Error: Image not found at {img_path})"
        return

    with open(modified_img_path, "rb") as f:
        image_widget.value = f.read()
    with open(modified_img_pred_path, "rb") as f:
        image_pred.value = f.read()
    image_patch.value = load_visualize(img_path)

# Next button function
def next_step(_):
    """Button callback: advance to the next navigation entry and redraw once.

    The argument is the clicked button supplied by ipywidgets and is ignored.
    On the last entry the index is left unchanged and the current entry is
    simply redrawn.
    """
    global sequence_index

    # Move to the next item in the navigation sequence
    if sequence_index < len(navigation_sequence) - 1:
        sequence_index += 1

    # Redraw exactly once; a second call would repeat the costly image
    # processing done inside update_display for the same entry.
    update_display()

# Text widgets: one for each result set's step record.
sentence_label, sentece_uigraph_label = widgets.HTML(), widgets.HTML()

# Image widgets, all PNG at the same display width: first result set's
# prediction, UI-graph prediction, and the processor visualization.
image_widget, image_pred, image_patch = (
    widgets.Image(format='png', width=600) for _ in range(3)
)

# Navigation button wired to the callback defined above.
next_button = widgets.Button(description="Next Step")
next_button.on_click(next_step)

# The two annotated screenshots are shown side by side.
image_box = widgets.HBox([image_widget, image_pred])

# Fill the widgets for the first entry, then render everything.
update_display()
display(sentence_label, sentece_uigraph_label, image_box, image_patch, next_button)

HTML(value='**Action(s):** {\'annot_id\': \'802babc8-fd6c-4e8b-bab2-0bdc0a03d129\', \'img_path\': \'/data/data…

HTML(value='**UI Graph Action(s):** {\'annot_id\': \'802babc8-fd6c-4e8b-bab2-0bdc0a03d129\', \'img_path\': \'/…

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x05\x00\x00\x00\x02\xd0\x08\x02\x00\x…

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x05\x08\x00\x00\x02\xd8\x08\x02\x00\x00\x00\xbf\x9b\…

Button(description='Next Step', style=ButtonStyle())