In [1]:
import os

from qwen_vl_utils import process_vision_info
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
)


class Sample:
    def __init__(
        self,
        input_text: str,
        label_text: str | None = None,
        image_path: str | None = None,
    ):
        self.input_text = input_text
        self.label_text = label_text
        self.image_path = image_path


def load_model(
    ckpt: str,
    model_class: AutoModelForCausalLM
    | Qwen2VLForConditionalGeneration
    | Qwen2_5_VLForConditionalGeneration,
    processor_class: AutoProcessor | AutoTokenizer,
):
    tokenizer = processor_class.from_pretrained(ckpt)
    # tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    model = model_class.from_pretrained(ckpt)
    return tokenizer, model


def data_collator(processor: AutoProcessor, batch: list[Sample]):
    messages_list = []
    targets = []

    for item in batch:
        image_path = item.image_path
        user_content = []
        if image_path and os.path.exists(image_path):
            user_content.append({"type": "image", "image": image_path})
        user_content.append({"type": "text", "text": item.input_text})
        user_message = [{"role": "user", "content": user_content}]

        messages_list.append(user_message)
        if item.label_text:
            targets.append(item.label_text)

    user_image_inputs, user_video_inputs = process_vision_info(messages_list)
    input_texts = [
        processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        for messages in messages_list
    ]
    inputs = processor(
        text=input_texts,
        images=user_image_inputs,
        videos=user_video_inputs,
        return_tensors="pt",
        padding_side="left",
        padding=True,
        truncation=True,
    )

    return inputs, targets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import Dataset


class BaseDataset(Dataset):
    def __init__(
        self,
        input_text_list: list[str],
        label_text_list: list[str] | None = None,
        image_path_list: list[str] | None = None,
    ):
        self.input_text_list = input_text_list
        self.label_text_list = label_text_list
        self.image_path_list = image_path_list

    def __len__(self):
        return len(self.input_text_list)

    def __getitem__(self, index):
        return Sample(
            self.input_text_list[index],
            self.label_text_list[index] if self.label_text_list else None,
            self.image_path_list[index] if self.image_path_list else None,
        )

In [4]:
import torch

torch.cuda.set_device(5)

# ckpt = "Qwen/Qwen2-VL-2B-Instruct"
model_class = Qwen2VLForConditionalGeneration
# ckpt = "Qwen/Qwen2.5-VL-7B-Instruct"
ckpt = "/home/fanghaotian/src/GRec/ckpt/Instruments/Qwen2-VL-2B-Instruct-seqrec-item2index-1-qwen7B/checkpoint-48696"
# model_class = Qwen2_5_VLForConditionalGeneration
processor, model = load_model(
    ckpt,
    model_class,
    AutoProcessor,
)

model.eval()
model.to("cuda")

Qwen2VLForConditionalGeneration(
  (model): Qwen2VLModel(
    (visual): Qwen2VisionTransformerPretrainedModel(
      (patch_embed): PatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2VLVisionBlock(
          (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (attn): VisionAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): VisionMlp(
            (fc1): Linear(in_features=1280, out_features=5120, bias=True)
            (act): QuickGELUActivation()
            (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          )
        )
      )
      (merger): PatchMerger(
        (ln_q): LayerN

In [5]:
dataset = BaseDataset(
    input_text_list=[
        "Hello, how are you? Response:",
        "What is the weather in Tokyo? Response:",
        "123 Response:",
        "What is the capital of France? Response:",
        "1 + 1 = ? Response:",
        "What is the capital of China? Response:",
    ],
)

# batch = [dataset]
inputs = data_collator(processor, dataset)

print(inputs)

({'input_ids': tensor([[151643, 151644,   8948,    198,   2610,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,   9707,     11,   1246,
            525,    498,     30,   5949,     25, 151645,    198, 151644,  77091,
            198],
        [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   9104,
            304,  26194,     30,   5949,     25, 151645,    198, 151644,  77091,
            198],
        [151643, 151643, 151643, 151643, 151644,   8948,    198,   2610,    525,
            264,  10950,  17847,     13, 151645,    198, 151644,    872,    198,
             16,     17,     18,   5949,     25, 151645,    198, 151644,  77091,
            198],
        [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   6722,
            315,   9625,     30,   5949,

In [6]:
length = len(dataset)
for i in range(length - 5, length):
    batch = data_collator(processor, [dataset[i]])
    print(batch)
    inputs = batch[0]
    # inputs: set(input, target, item_ids)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # use beam search
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=4,
        # max_length=10,
        # prefix_allowed_tokens_fn=prefix_allowed_tokens,
        num_beams=10,
        num_return_sequences=10,
        output_scores=True,
        return_dict_in_generate=True,
        early_stopping=True,
    )
    output_ids = output["sequences"]
    scores = output["sequences_scores"]

    print("Outputs:", "=" * 50)
    # decode all the results
    output_texts = processor.batch_decode(output_ids, skip_special_tokens=True)

    # extract the output behind the "Response:"
    for i, o in enumerate(output_texts):
        o = o.split("Response:")[-1]
        o = o.split("assistant")[-1]
        print(o, "|", batch[1], "|", float(scores[i]))

({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   9104,
            304,  26194,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])

The weather in Tokyo | [] | -58.808467864990234

As of my last | [] | -69.80632019042969

As of now, | [] | -83.30500793457031

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,     16,     17,     18,   5949,
             25, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   9104,
            304,  26194,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Inputs: ==================================================
[]
Outputs: ==================================================

I'm sorry, | [] | -49.234371185302734

The weather in Tokyo | [] | -55.3034553527832

As an AI, | [] | -69.26478576660156

As an AI language | [] | -73.59571075439453

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,     16,     17,     18,   5949,
             25, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, [])
Inputs: ==================================================
[]
Outputs: ==================================================

Hello! How can | [] | -34.297645568847656

I'm sorry, | [] | -39.966915130615234

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   6722,
            315,   9625,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Inputs: ==================================================
[]
Outputs: ==================================================

The capital of France | [] | -6.2154364585876465

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,     16,    488,    220,     16,
            284,    937,   5949,     25, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}, [])
Inputs: ==================================================
[]
Outputs: ==================================================

2 | [] | -33.09654235839844

1 + 1 | [] | -36.219970703125

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   6722,
            315,   5616,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Inputs: ==================================================
[]
Outputs: ==================================================

The capital of China | [] | -11.392071723937988

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   9104,
            304,  26194,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Outputs: ==================================================

The weather in Tokyo | [] | -347652.84375

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,     16,     17,     18,   5949,
             25, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, [])
Outputs: ==================================================

It seems like you | [] | -385329.34375

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   6722,
            315,   9625,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Outputs: ==================================================

The capital of France | [] | -2851.505126953125

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,     16,    488,    220,     16,
            284,    937,   5949,     25, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}, [])
Outputs: ==================================================

1 + 1 | [] | -12083.3037109375

1 + 3 | [] | -4042594.25

1 + 4 | [] | -4329086.0

1 + 9 | [] | -4475847.5

1 + 7 | [] | -4588588.5

1 + = | [] | -5730992.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0
({'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   3838,    374,    279,   6722,
            315,   5616,     30,   5949,     25, 151645,    198, 151644,  77091,
            198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, [])
Outputs: ==================================================

The capital of China | [] | -11114.3544921875

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

 | [] | -1000000000.0

In [7]:
# offload model and processor
import gc

del model, processor
gc.collect()
torch.cuda.empty_cache()