In [1]:
# from modeling_qwen2_vl import Qwen2VLForConditionalGeneration
# from processing_qwen2_vl import Qwen2VLProcessor
# from configuration_qwen2_vl import Qwen2VLConfig
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import time 

In [2]:
# model_path = "/data/data1/syc/intern/wanshan/models/Qwen2-VL-2B-Instruct"
model_path = "/data/data1/syc/intern/wanshan/models/showlab/ShowUI-2B_edited"
device = 'cuda:2'
# uigraph_train = True        # Enable ui graph during training
# uigraph_test = True         # Enable ui graph during inference
# uigraph_diff = 1            # Pixel difference used for constructing ui graph
# uigraph_rand = False        # Enable random graph construction 
# # 2. Graph -> Mask 
# uimask_pre = True           # Prebuild patch selection mask in the preprocessor (not in model layers) for efficiency
# uimask_ratio = 0.5          # Specify the percentage of patch tokens to skip per component
# uimask_rand = False         # Enable random token selection instead of uniform selection

In [3]:
min_pixel = 256*28*28
max_pixel = 1000*28*28
processor = Qwen2VLProcessor.from_pretrained(
        model_path,
        min_pixels= min_pixel,
        max_pixels = max_pixel)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "./chrome.png",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

(1, 23, 42)


In [4]:
for key in inputs.keys():
    print(key)

input_ids
attention_mask
patch_pos
select_mask
pixel_values
image_grid_thw
patch_assign
patch_assign_sep
patch_assign_len


In [5]:
img_grid_thw = inputs['image_grid_thw']
img_grid_thw # [bsz, heigh // patch_size // merge_size, width // patch_size // merge_size]

tensor([[ 1, 46, 84]], device='cuda:2')

In [6]:
patch_pos = inputs['patch_pos']
patch_pos.shape # [bsz, seq_len]

torch.Size([1, 980])

In [7]:
select_mask = inputs['select_mask']
select_mask.sum() # total selected tokens

tensor(559, device='cuda:2')

- Number of visual tokens

In [8]:
patch_assign = inputs['patch_assign']
print(f'patch_assign shape : {patch_assign.shape}') # [# visual tokens]

# verify number of visual tokens 
patch_pos = inputs['patch_pos']
print(patch_pos[patch_pos != -1].shape) # total visual tokens

from PIL import Image
img_path = './chrome.png'
img = Image.open(img_path)
# smart resize 
w, h = img.size
# Function to find the nearest multiple of 28
def nearest_multiple(value, base=28):
    return round(value / base) * base

# Compute the new dimensions
new_w = nearest_multiple(w, 28)
new_h = nearest_multiple(h, 28)
print(new_w // 28 * new_h // 28) 

patch_assign shape : torch.Size([966])
torch.Size([966])
2691


- Number of UI components

In [9]:
patch_assign_len = inputs['patch_assign_len']
patch_assign_len # total ui components

tensor([138], device='cuda:2')

# Model Generation

In [10]:
import torch
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, 
    torch_dtype = torch.bfloat16,
    device_map = device).eval()

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


In [11]:
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True)

generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, 
)[0]

print(output_text)

<|im_end|>
