In [1]:
from model_edited.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
from model_edited.processing_qwen2_vl import Qwen2VLProcessor
# from configuration_qwen2_vl import Qwen2VLConfig
# from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from qwen_vl_utils import process_vision_info
import time 

In [2]:
# Checkpoint location handed to `from_pretrained` for both the processor and the model.
model_path = "/data/data1/syc/intern/wanshan/models/Qwen2-VL-2B-Instruct"

# Device string used for the processed inputs (`.to(device)`) and the model (`device_map`).
device = "cuda:3"

In [3]:
# --- Image pixel budget ------------------------------------------------------
# Both bounds are written as <count> * 28 * 28
# (presumably the pixel area behind one visual token in Qwen2-VL -- TODO confirm).
max_pixel = 1680 * 28 * 28
min_pixel = 1344 * 28 * 28

# --- Stage 1: screenshot -> UI graph -----------------------------------------
uigraph_train = True   # build the UI graph during training
uigraph_test = True    # build the UI graph during inference
uigraph_diff = 1       # pixel difference used when constructing the UI graph
uigraph_rand = False   # True switches to random graph construction

# --- Stage 2: UI graph -> patch mask -----------------------------------------
# Prebuild the patch selection mask in the preprocessor (not in the model layers)
# for efficiency.
uimask_pre = True
uimask_ratio = 0.8     # fraction of patch tokens to skip per component
uimask_rand = False    # True selects tokens randomly instead of uniformly

In [4]:
# Build the processor with the pixel budget and the UI-graph / UI-mask options
# defined in the configuration cell.
processor = Qwen2VLProcessor.from_pretrained(
    model_path,
    min_pixels=min_pixel,
    max_pixels=max_pixel,
    # Screenshot -> graph options.
    uigraph_train=uigraph_train,
    uigraph_test=uigraph_test,
    uigraph_diff=uigraph_diff,
    uigraph_rand=uigraph_rand,
    # Graph -> mask options. Forward the configured `uimask_pre` flag instead of
    # a literal True so that editing the configuration cell actually takes effect.
    uimask_pre=uimask_pre,
    uimask_ratio=uimask_ratio,
    uimask_rand=uimask_rand,
    use_fast=True,
)

# Single-turn conversation: one screenshot plus a text instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                image="./chrome.png",
                min_pixels=min_pixel,
                max_pixels=max_pixel,
            ),
            dict(type="text", text="Describe this image."),
        ],
    )
]

# Render the conversation to a prompt string (not token ids) that ends with
# the generation prompt.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Pull the image / video entries out of the messages for the processor.
image_inputs, video_inputs = process_vision_info(messages)

# Tokenize the prompt and preprocess the visual inputs into PyTorch tensors.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    # vis_dir="./visualize_imgs" # this folder to save visualization
)
# Move the batch to the configured device.
inputs = inputs.to(device)


# Model Generation

In [5]:
import re 
def parse_layer_type(str_ranges, L=28, default=0):
    """Expand a textual range spec into a per-layer flag list of length ``L``.

    ``str_ranges`` holds zero or more ``[start,end,value]`` triples where
    ``start`` and ``end`` are 1-based, inclusive layer numbers. A value of 0
    means "without layer token selection", 1 means "with layer token
    selection". Examples:

    * ``"[1,28,1]"`` -- every LM layer uses token selection.
    * ``"[1,28,0]"`` -- no LM layer uses token selection.
    * ``"[2,2,1],[4,4,1],[6,6,1]"`` -- interleaved, layer-wise selection.

    Args:
        str_ranges: String containing the ``[start,end,value]`` triples. Text
            that does not match the triple pattern is ignored.
        L: Number of layers, i.e. the length of the returned list.
        default: Flag assigned to layers not covered by any triple.

    Returns:
        A list of exactly ``L`` flags. Triples are applied in order, so a later
        triple overrides an earlier one where they overlap.

    Each range is intersected with ``[1, L]``: an ``end`` beyond ``L`` is
    clamped to ``L``, a ``start`` below 1 is clamped to 1, and a range that is
    empty after clamping is skipped.
    """
    result = [default] * L
    matches = re.findall(r"\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]", str_ranges)
    for start, end, value in matches:
        # Convert the 1-based inclusive bounds to 0-based indices.
        start, end, value = int(start) - 1, int(end) - 1, int(value)
        # Clamp both bounds. Without the lower clamp a start of 0 becomes -1,
        # and the slice assignment below would insert elements and make the
        # result longer than L.
        start = max(start, 0)
        end = min(end, L - 1)
        if start > end:
            # Nothing left to fill (reversed or fully out-of-range triple).
            continue
        result[start : end + 1] = [value] * (end - start + 1)
    return result


In [6]:
import torch
# Load the edited Qwen2-VL model in bfloat16 with FlashAttention-2 on `device`.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    # "[1,28,0]" expands to 28 zeros: flag 0 (no layer token selection) for
    # every one of layers 1..28.
    lm_skip_layer=parse_layer_type("[1,28,0]"),
    lm_skip_ratio=0.2,
    prune_layer=2,
)
# Put the module in evaluation mode; assigning the result keeps the cell from
# echoing the model repr.
model = model.eval()

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import time
# Sampling is enabled below; seed the RNG so a re-run reproduces the same text.
torch.manual_seed(0)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True)

# Slice each output sequence past the length of its prompt so that only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# skip_special_tokens=True keeps control tokens (e.g. the end-of-turn marker)
# out of the printed answer when generation stops before max_new_tokens.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
)[0]

print(output_text)

The image shows a screenshot of the Google Chrome browser interface. Here are the details:

1. **Header**: The top of the screen displays the Google Chrome logo, which is a blue and yellow "G" in the center.
2. **Search Bar**: Below the logo, there is a search bar with the placeholder text "Search Google or type a URL." The search bar is divided into two sections: the left section contains the placeholder text, and the right section has a magnifying glass icon and a plus sign, indicating the ability to add a new tab or search.
3. **Navigation Bar**: At the top right corner,
