In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint location and execution settings for the text-only baseline run.
path = "/data/share/Qwen2.5-1.5B-Instruct"
device = "cuda:7"
dtype = torch.bfloat16

# Load the tokenizer and the model (placed directly on `device`), then switch
# the model to evaluation mode.
tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(
    path,
    device_map=device,
    torch_dtype=dtype,
).eval()

# A minimal system + user conversation used as the probe prompt.
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What can you do for me?"},
]

# Render the conversation to plain text first (with the generation prompt
# appended), then tokenize that text and move the tensors to `device`.
prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inp = tok(prompt, return_tensors="pt").to(device)

# Sampling is disabled so this output can be compared with later runs.
with torch.no_grad():
    output = model.generate(**inp, max_new_tokens=128, do_sample=False)

# Show the single decoded sequence, keeping special tokens visible.
print(tok.decode(output[0], skip_special_tokens=False))

# Cell output: the output-projection weights, for inspection.
model.lm_head.weight

In [None]:
import qwen_utils
import rotation_utils

# Matrix sizes are taken from the model config: the hidden size, and the
# per-head size derived as hidden_size // num_attention_heads.
dim = model.config.hidden_size
num_heads = model.config.num_attention_heads
head_dim = dim // num_heads

# Build the two Hadamard-mode orthogonal matrices (hidden-size one first,
# then the head-size one) on the same device as the model.
R, R_v = (
    rotation_utils.get_orthogonal_matrix(size, mode="hadamard", device=device)
    for size in (dim, head_dim)
)

# NOTE(review): these helpers appear to modify `model` in place (their return
# values are unused), so run this cell once per freshly loaded model.
qwen_utils.untie_word_embeddings(model)
qwen_utils.fuse_layer_norms(model)
qwen_utils.rotate_model(model, R, R_v)

# Regenerate with the same inputs and decoding settings as the baseline cell.
with torch.no_grad():
    output = model.generate(**inp, max_new_tokens=128, do_sample=False)

print(tok.decode(output[0], skip_special_tokens=False))

# Cell output: the output-projection weights after the transformations.
model.lm_head.weight

In [None]:
import torch
from transformers import Qwen2ForCausalLM
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint location and execution settings for the vision-language baseline.
model_path = "/data/share/Qwen2-VL-2B-Instruct"

dtype = torch.bfloat16
device = "cuda:7"

# Default: load the model without requesting a specific attention backend.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_path, torch_dtype=dtype, device_map=device
# )

# flash_attention_2 is recommended for better acceleration and memory saving,
# especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
    device_map=device,
)

# Default processor:
# processor = AutoProcessor.from_pretrained(model_path)

# The default range for the number of visual tokens per image in the model is
# 4-16384. min_pixels / max_pixels can be set as needed, e.g. a token count
# range of 256-1280, to balance speed and memory usage.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    model_path,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
    use_fast=False,
)

# One user turn containing a local image followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "./aniya.png",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference: render the chat template to text, collect the
# vision inputs referenced by the messages, and build the model inputs.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Inference: sampling is disabled explicitly, matching the text-model cells, so
# this baseline does not depend on the checkpoint's default generation settings
# and can be compared with the outputs after the weights are transformed.
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Drop the prompt tokens from each sequence so only the new tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

['The image is an illustration featuring a group of characters from the anime "Spy x Family." The characters are depicted in a stylized, cartoonish manner, with a focus on their distinctive outfits and poses. The characters are arranged in a grid format, with each character occupying a different section of the image. The background is a light teal color, and the text "SPY×FAMILY" is prominently displayed at the top in bold, black letters. The overall style is reminiscent of the anime\'s art style, which is known for its detailed and expressive character designs.']
Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), ep

In [2]:
import qwen_utils
import rotation_utils

# Prepare the weights for rotation. Both helpers are applied to `model` in
# place (their return values are unused), so run this cell once per freshly
# loaded model.
# NOTE(review): fuse_layer_norms presumably folds the normalization weights
# into neighbouring layers — confirm against qwen_utils.
qwen_utils.untie_word_embeddings(model)
qwen_utils.fuse_layer_norms(model)

# Inference: regenerate from the same `inputs` as the baseline. Sampling is
# disabled explicitly so any difference in the text comes from the weight
# changes rather than from the checkpoint's default generation settings.
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Drop the prompt tokens from each sequence so only the new tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

tie word embeddings, clone lm_head from embed_tokens
['The image is an illustration featuring a group of characters from the anime "Spy x Family." The characters are depicted in a stylized, cartoonish manner, with a focus on their distinctive outfits and poses. The characters are arranged in a grid-like formation, with each character positioned in a different pose and attire. The background is a solid light blue color, and the characters are set against a darker blue circle, creating a striking contrast. The text "SPY×FAMILY" is prominently displayed at the top of the image in bold, black letters. The overall style is reminiscent of the anime\'s art style, which is known for its']


In [3]:
# Matrix sizes are taken from the model config: the hidden size, and the
# per-head size derived as hidden_size // num_attention_heads.
dim = model.config.hidden_size
num_heads = model.config.num_attention_heads
head_dim = dim // num_heads

# Hadamard-mode orthogonal matrices: R has the hidden size, R_v the head size.
R = rotation_utils.get_orthogonal_matrix(dim, mode="hadamard", device=device)
R_v = rotation_utils.get_orthogonal_matrix(head_dim, mode="hadamard", device=device)

# Applied to `model` in place (the return value is unused), so run this cell
# once per prepared model; re-running would apply the rotation again.
qwen_utils.rotate_model(model, R, R_v)

# Regenerate from the same `inputs`. Sampling is disabled explicitly so the
# result is comparable with the earlier runs regardless of the checkpoint's
# default generation settings.
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Drop the prompt tokens from each sequence so only the new tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['The image is an illustration featuring six characters from the anime "Spy x Family." The characters are arranged in a grid format, with each character positioned in a different quadrant of the grid. The characters are depicted in various poses and expressions, creating a dynamic and engaging visual. The background is a light teal color, and the text "SPY×FAMILY" is prominently displayed at the top of the image in bold, black letters. The overall style is reminiscent of anime art, with detailed line work and vibrant colors.']


Parameter containing:
tensor([[ 0.0479, -0.0479, -0.0601,  ...,  0.0820, -0.0250,  0.1182],
        [ 0.0374,  0.0562, -0.2021,  ...,  0.0522, -0.0527,  0.0236],
        [ 0.0339, -0.2217, -0.0058,  ..., -0.1055, -0.1245, -0.0518],
        ...,
        [-0.0112,  0.0099,  0.1602,  ..., -0.0269, -0.0164, -0.0635],
        [-0.0112,  0.0099,  0.1602,  ..., -0.0269, -0.0165, -0.0635],
        [-0.0112,  0.0099,  0.1602,  ..., -0.0269, -0.0165, -0.0635]],
       device='cuda:7', dtype=torch.bfloat16, requires_grad=True)

In [4]:
# Persist the transformed model. The processor is written to the same
# directory so the checkpoint can be loaded on its own with
# AutoProcessor.from_pretrained(save_dir); it is saved with the
# min_pixels / max_pixels it was created with in this notebook.
# NOTE(review): lm_head was cloned from embed_tokens earlier. Confirm the saved
# config has tie_word_embeddings disabled, otherwise reloading may tie the two
# matrices again.
save_dir = "/data/share/Qwen2-VL-2B-Instruct-rotated"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)