In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Checkpoint to load; the commented-out names are alternative checkpoints.
# model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
# model_name = "./models/Qwen2.5-VL-7B-MUs1"
model_name = "../../LLaMA-Factory/saves/Qwen2.5-VL-7B-Instruct/freeze/train_2025-04-14"
device = "cuda:1"

# Load the weights in bfloat16 and place the whole model on `device`.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Alternative loader: enabling flash_attention_2 is recommended for better
# acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Processor with its default settings, loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained(model_name)

# The default range for the number of visual tokens per image in the model is 4-16384.
# min_pixels and max_pixels can be set to narrow that range (for example to
# 256-1280 tokens) to balance performance and cost:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
import re
import json

def save_data_to_json(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, overwriting the file.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are stored as-is instead of being ``\\u``-escaped.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)


def read_data_from_json(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as in_file:
        return json.load(in_file)

def read_data_from_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON value. Blank or whitespace-only
    lines (for example a trailing empty line) are skipped instead of aborting
    the whole read.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON. The
            message includes the 1-based line number and the file path.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_no, raw_line in enumerate(file, start=1):
            stripped = raw_line.strip()
            if not stripped:
                # json.loads('') raises, so tolerate empty lines explicitly.
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the offending line added.
                raise json.JSONDecodeError(
                    f"{exc.msg} (line {line_no} of {file_path})", exc.doc, exc.pos
                ) from exc
    return records

def save_data_to_jsonl(data, file_path):
    """Write each item of ``data`` as one JSON line to ``file_path``.

    The file is overwritten, encoded as UTF-8, and non-ASCII characters are
    kept unescaped. Every record is terminated by a newline.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )


def extract_think_content(text: str):
    """Return the text inside every ``<think>...</think>`` pair found in ``text``.

    Matching is non-greedy and spans newlines. The result is a list of strings
    in order of appearance; it is empty when no complete pair is present.
    """
    think_re = re.compile(r'<think>(.*?)</think>', re.DOTALL)
    return [found.group(1) for found in think_re.finditer(text)]

def extract_answer_content(text: str):
    """Return the text inside every ``<answer>...</answer>`` pair found in ``text``.

    Matching is non-greedy and spans newlines. The result is a list of strings
    in order of appearance; it is empty when no complete pair is present.
    """
    answer_re = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
    return [found.group(1) for found in answer_re.finditer(text)]

def messages_generate(system_prompt: str, image_list: list, prompt: str):
    """Build a two-turn chat message list: one system turn and one user turn.

    Args:
        system_prompt: Text used as the content of the system message.
        image_list: Image references; each becomes one ``{"type": "image"}``
            entry of the user message, in the given order.
        prompt: Text appended after the images as the final user content entry.

    Returns:
        ``[system_message, user_message]`` as plain dicts.
    """
    user_content = [{"type": "image", "image": path} for path in image_list]
    user_content.append({"type": "text", "text": prompt})
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]


def inference(messages: list, device: str, text=None, temperature=0.6, top_p=0.95, max_new_tokens=512):
    """Generate one completion for ``messages`` with the notebook-level model.

    Relies on the globals ``processor`` and ``model`` and on
    ``process_vision_info`` from ``qwen_vl_utils``.

    Args:
        messages: Chat messages; always used to collect the image/video inputs.
        device: Device the processed inputs are moved to before generation.
        text: Optional pre-rendered prompt. When ``None`` the prompt is built
            from ``messages`` with the chat template and a generation prompt.
            Passing a string lets the caller continue from a custom prefix.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        ``(completion, prompt_text)``: the decoded newly generated text (prompt
        tokens removed, special tokens skipped) and the prompt string that was
        fed to the processor.
    """
    # Render the chat template only when no explicit prompt was supplied.
    prompt_text = text
    if prompt_text is None:
        prompt_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    images, videos = process_vision_info(messages)
    batch = processor(
        text=[prompt_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(device)

    sequences = model.generate(
        **batch,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )

    # Drop the leading prompt tokens so only the continuation is decoded.
    continuations = []
    for prompt_ids, full_ids in zip(batch.input_ids, sequences):
        continuations.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        continuations, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0], prompt_text



def split_thinking_answer(output_text):
    """Split a model output into its first reasoning trace and first answer.

    Returns:
        ``(think_trace, answer)``: the content of the first ``<think>`` block
        and of the first ``<answer>`` block. Either element is ``""`` when the
        corresponding block is not found.
    """
    think_matches = extract_think_content(output_text)
    answer_matches = extract_answer_content(output_text)
    think_trace = "" if think_matches == [] else think_matches[0]
    answer = "" if answer_matches == [] else answer_matches[0]
    return think_trace, answer
    
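# Usage example (commented out): answer one question, then run one budget-forcing round that appends `wait` to the reasoning trace.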
# system_prompt = "你是一名人工智能助手，专门研究超声医学领域。你收到了一个超声选择题，请给出你的思考过程，并放在<think>思考过程</think>标签内，只输出一个选项，把选项答案放在<answer>选项</answer>内。"
# image_list = [
#     "../../Udata0328/images/238564_1.jpeg",
#     "../../Udata0328/images/238564_2.jpeg"
# ]
# prompt = "肝脏在这次超声检查中的形态是怎样的？A: 缩小 B: 正常 C: 稍饱满 D: 增大\n<image>\n<image>"
# device = "cuda:0"
# messages = messages_generate(system_prompt=system_prompt, image_list=image_list, prompt=prompt)
# temperature=0.6
# top_p=0.95
# wait = '等等，'
# output_text, text = inference(messages=messages, device=device, temperature=temperature, top_p=top_p)
# budget_forcing_text = text
# for i in range(1):
#     think_trace = extract_think_content(output_text)
#     answer = extract_answer_content(output_text)
#     print(f'think trace:\n{think_trace[0]}\n')
#     print(f'answer:\n{answer[0]}\n')
#     budget_forcing_text = budget_forcing_text + '<think>' + think_trace[0] + wait
#     output_text, _ = inference(messages=messages, text=budget_forcing_text, device=device, temperature=temperature, top_p=top_p)
#     output_text = '<think>' + think_trace[0] + wait + output_text

# think_trace = extract_think_content(output_text)
# answer = extract_answer_content(output_text)
# print(f'think trace:\n{think_trace[0]}\n')
# print(f'answer:\n{answer[0]}\n')
# print(f'output_text:\n{output_text}')
    

In [None]:
import json
import os
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Evaluation configuration
# ---------------------------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running on another machine.
test_path = "/home/wangsj/learn/VSCodeDataProcess/Udata0328/test-data/Reasoning_test_CSYXLCSJJC_filtered.json"
output_path = "Pass@1_Reasoning_test_CSYXLCSJJC_filtered_k4t6p7l96.jsonl"
image_path_pre = "/home/wangsj/learn/VSCodeDataProcess/Udata0328/"
system_prompt = "你是一名人工智能助手，专门研究超声医学领域。你收到了一个超声选择题，请给出你的思考过程，并放在<think>思考过程</think>标签内，只输出一个选项，把选项答案放在<answer>选项</answer>内。"
device = "cuda:1"
pass_1 = 4                    # number of sampled generations per question
temperature = 0.6
top_p = 0.7
budget_len = 96               # minimum reasoning-trace length, in characters
max_new_tokens_tmp = 1024     # token cap for the reasoning generations
answer_max_new_tokens = 4096  # token cap for the final forced-answer generation
wait = '等等，'                # text appended to a too-short trace to make the model keep reasoning
num_ignore = 2                # the extension loop stops once more than this many rounds have run

# ---------------------------------------------------------------------------
# Evaluation loop
# ---------------------------------------------------------------------------
# Named `total_score` rather than `sum` so the builtin `sum` stays usable in
# the rest of the kernel session.
total_score = 0.0
num = 0

# Load the whole test set first so the file is not kept open during inference.
# The data contains Chinese text, so the encoding is stated explicitly.
with open(test_path, "r", encoding="utf-8") as file:
    data = json.load(file)

for num, mcq in enumerate(tqdm(data), start=1):
    score = 0.0
    prompt = mcq["conversations"][1]["value"]
    image_list = [f"{image_path_pre}{image_path}" for image_path in (mcq.get("images") or [])]
    # This term is replaced by its pinyin spelling before prompting.
    # NOTE(review): the reason is not documented in this notebook — confirm.
    prompt = prompt.replace("阴道", "yindao")
    messages = messages_generate(system_prompt=system_prompt, image_list=image_list, prompt=prompt)
    pass_nlist = []
    label_text = mcq["conversations"][2]["value"]
    label_answer = extract_answer_content(label_text)[0]

    for _ in range(pass_1):
        # First pass: free generation from the chat-template prompt.
        output_text, text = inference(messages=messages, device=device, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens_tmp)
        budget_forcing_text = text
        think_trace, answer = split_thinking_answer(output_text)

        # Budget forcing: while the trace is shorter than `budget_len`, re-prompt
        # with the trace so far plus `wait` so the model continues reasoning.
        num_tmp = 0
        while len(think_trace) < budget_len:
            num_tmp += 1
            forced_prefix = '<think>' + think_trace + wait
            output_text, _unused = inference(messages=messages, text=budget_forcing_text + forced_prefix, device=device, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens_tmp)
            output_text = forced_prefix + output_text
            think_trace, answer = split_thinking_answer(output_text)
            if num_tmp > num_ignore:
                break

        # Final pass: feed the accepted trace followed by '</' so the model
        # closes the think block and produces the answer.
        closing_prefix = '<think>' + think_trace + '</'
        output_text, _unused = inference(messages=messages, text=budget_forcing_text + closing_prefix, device=device, temperature=temperature, top_p=top_p, max_new_tokens=answer_max_new_tokens)
        output_text = closing_prefix + output_text
        think_trace, answer = split_thinking_answer(output_text)
        pass_nlist.append(output_text)

        if answer == label_answer:
            score = score + 1

    # Per-question score is the fraction of the `pass_1` samples that were correct.
    score = score / pass_1
    # A fresh dict per question, so no state carries over between iterations.
    record = {
        "prompt": prompt,
        "predict": output_text,  # the last sampled generation only
        "label": label_text,
        "score": score,
        "pass_nlist": pass_nlist,
    }
    total_score = total_score + score

    if num % 25 == 0:
        print(f"Scores: {total_score} | Total number of questions: {num}")

    # Append after every question so partial results survive an interruption.
    # Appending also means a re-run adds to an existing file rather than replacing it.
    with open(output_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Summary file next to the JSONL output. splitext strips only the final
# extension, so paths that contain other dots still work.
summary_path = os.path.splitext(output_path)[0] + ".txt"
accuracy = total_score / num if num else 0.0  # guard against an empty test set
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(f"Scores: {total_score}\n")
    f.write(f"Total number of questions: {num}\n")
    f.write(f"Accuracy: {accuracy:.2f}")



  7%|▋         | 25/341 [09:09<2:05:03, 23.74s/it]

Scores: 17.5 | Total number of questions: 25


 15%|█▍        | 50/341 [18:38<1:46:16, 21.91s/it]

Scores: 36.0 | Total number of questions: 50


 22%|██▏       | 75/341 [28:38<1:41:49, 22.97s/it]

Scores: 56.0 | Total number of questions: 75


 29%|██▉       | 100/341 [39:54<1:31:20, 22.74s/it]

Scores: 72.0 | Total number of questions: 100


 37%|███▋      | 125/341 [49:29<1:24:42, 23.53s/it]

Scores: 91.5 | Total number of questions: 125


 44%|████▍     | 150/341 [58:51<1:17:52, 24.46s/it]

Scores: 108.5 | Total number of questions: 150


 51%|█████▏    | 175/341 [1:08:02<41:15, 14.91s/it]  

Scores: 129.75 | Total number of questions: 175


 59%|█████▊    | 200/341 [1:13:47<34:11, 14.55s/it]

Scores: 147.75 | Total number of questions: 200


 66%|██████▌   | 225/341 [1:19:34<27:02, 13.99s/it]

Scores: 169.25 | Total number of questions: 225


 73%|███████▎  | 250/341 [1:25:49<20:29, 13.51s/it]

Scores: 191.5 | Total number of questions: 250


 81%|████████  | 275/341 [1:32:12<15:20, 13.94s/it]

Scores: 211.75 | Total number of questions: 275


 88%|████████▊ | 300/341 [1:38:22<09:28, 13.88s/it]

Scores: 230.75 | Total number of questions: 300


 95%|█████████▌| 325/341 [1:45:08<04:45, 17.83s/it]

Scores: 252.25 | Total number of questions: 325


100%|██████████| 341/341 [1:48:43<00:00, 19.13s/it]
