In [1]:
import math
import os
import time

from qwen_vl_utils import process_vision_info

from modeling_qwen2_vl import Qwen2VLForConditionalGeneration
from processing_qwen2_vl import Qwen2VLProcessor
# from configuration_qwen2_vl import Qwen2VLConfig
# from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

In [2]:
# Checkpoint location. The default is the original machine-specific path; set
# the QWEN2_VL_MODEL_PATH environment variable before starting the kernel to
# point the notebook at a different checkpoint without editing this cell.
# An unset or empty variable falls back to the default.
model_path = (
    os.environ.get("QWEN2_VL_MODEL_PATH")
    or "/data/data1/syc/intern/wanshan/models/Qwen2-VL-2B-Instruct"
)
# model_path = "/data/data1/syc/intern/wanshan/models/showlab/ShowUI-2B_edited"

# Target device: the processor outputs are moved to it and it is passed as the
# model's device_map further down in the notebook.
device = 'cuda'

In [3]:
import numpy

# Lower / upper pixel budget handed to the processor, written as multiples of 28*28.
_PIXELS_PER_UNIT = 28 * 28
min_pixel = 256 * _PIXELS_PER_UNIT
max_pixel = 1344 * _PIXELS_PER_UNIT

# --- Stage 1: screenshot -> graph ---
# Build the UI graph during training.
uigraph_train = True
# Build the UI graph during inference.
uigraph_test = True
# Pixel difference used when constructing the UI graph.
uigraph_diff = 1
# Random graph construction on/off.
uigraph_rand = False

# --- Stage 2: graph -> mask ---
# Prebuild the patch selection mask in the preprocessor (not in the model layers) for efficiency.
uimask_pre = True
# Fraction of patch tokens to skip per component.
uimask_ratio = 0.8
# Random token selection instead of uniform selection.
uimask_rand = False

In [4]:


# Build the processor from the checkpoint directory.
# Every UI-graph / UI-mask option is forwarded from its variable in the config
# cell (including uimask_pre, rather than a literal True), so editing the
# config cell is sufficient to change the processor behaviour.
processor = Qwen2VLProcessor.from_pretrained(
    model_path,
    min_pixels=min_pixel,
    max_pixels=max_pixel,
    uigraph_train=uigraph_train,
    uigraph_test=uigraph_test,
    uigraph_diff=uigraph_diff,
    uigraph_rand=uigraph_rand,
    uimask_pre=uimask_pre,
    uimask_ratio=uimask_ratio,
    uimask_rand=uimask_rand,
    use_fast=True,
)

# Single-turn user message: one image entry followed by one text instruction.
# The image entry carries the same pixel budget that is given to the processor.
image_item = {
    "type": "image",
    "image": "./chrome.png",
    "min_pixels": min_pixel,
    "max_pixels": max_pixel,
}
text_item = {"type": "text", "text": "Describe this image."}
messages = [{"role": "user", "content": [image_item, text_item]}]
# Render the conversation as a prompt string (no tokenization at this step),
# with the generation prompt appended.
text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the image / video inputs referenced by the messages.
vision_payloads = process_vision_info(messages)
image_inputs, video_inputs = vision_payloads

# Run the processor on the prompt and vision inputs, then move the result to
# the target device.
processor_kwargs = dict(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    vis_dir="./visualize_imgs",  # this folder to save visualization
)
inputs = processor(**processor_kwargs).to(device)



- Number of visual tokens

In [None]:
# 'patch_assign' as returned by the processor; per the original note its
# length is the number of visual tokens.
patch_assign = inputs["patch_assign"]
print(f'patch_assign shape : {patch_assign.shape}')

# Cross-check the visual token count: keep only the 'patch_pos' entries that
# are not -1 (presumably -1 marks non-visual / unused positions - TODO confirm)
# and show how many remain.
patch_pos = inputs["patch_pos"]
valid_patch_mask = patch_pos != -1
print(patch_pos[valid_patch_mask].shape)

from PIL import Image
# Open the same image file that the messages reference and read its original
# (un-resized) width and height; they feed the manual resize check below.
img_path = './chrome.png'
img = Image.open(img_path)
w = img.width
h = img.height
def nearest_multiple(value, base=28):
    """Return the multiple of ``base`` closest to ``value``.

    The quotient is rounded with Python's built-in ``round``, so a value that
    lies exactly halfway between two multiples goes to the one with the even
    quotient (e.g. 42 -> 56 and 70 -> 56 for the default base of 28).
    """
    quotient = round(value / base)
    return quotient * base

# Compute the new dimensions.
# Rounding each side to a multiple of 28 is only the first step: the resized
# image must also fit the [min_pixel, max_pixel] budget given to the processor,
# otherwise the expected token count is wrong for images outside that budget.
# The rescaling below follows the smart-resize rule of the stock Qwen2-VL image
# processor - TODO confirm the local processing_qwen2_vl module uses the same rule.
new_w = nearest_multiple(w, 28)
new_h = nearest_multiple(h, 28)
if new_w * new_h > max_pixel:
    # Too many pixels: shrink both sides by a common factor and round each
    # side down to a multiple of 28 so the result stays within the budget.
    shrink = math.sqrt((w * h) / max_pixel)
    new_w = math.floor(w / shrink / 28) * 28
    new_h = math.floor(h / shrink / 28) * 28
elif new_w * new_h < min_pixel:
    # Too few pixels: enlarge both sides by a common factor and round each
    # side up to a multiple of 28 so the result reaches the budget.
    grow = math.sqrt(min_pixel / (w * h))
    new_w = math.ceil(w * grow / 28) * 28
    new_h = math.ceil(h * grow / 28) * 28

# Expected token count: number of 28x28 cells in the resized image.
# Parenthesised so the result does not depend on operator precedence.
print((new_w // 28) * (new_h // 28))

- Number of UI components

In [None]:
# 'patch_assign_len' as returned by the processor; per the original note this
# is the total number of UI components. Left as the last expression so the
# notebook displays it.
patch_assign_len = inputs["patch_assign_len"]
patch_assign_len

# Model Generation

In [None]:
import torch
# Load the checkpoint with bfloat16 weights, using `device` as the device map,
# then switch the module to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map=device
)
model = model.eval()

In [None]:
# Sampling is stochastic: seed the torch RNG right before generating so that
# re-running this cell in the same environment is repeatable.
torch.manual_seed(0)
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True)

# Drop the leading prompt tokens from every generated sequence; the slicing
# relies on each output sequence starting with its own input ids.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

# Decode the first (only) sequence. Special tokens such as the end-of-turn
# marker are stripped so that only the generated description is printed.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

print(output_text)