In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

## Qwen2-VL-finetune

### Download model

In [2]:
model_id = 'Qwen/Qwen2.5-VL-7B-Instruct-AWQ'
save_dir = './vlm/'

In [3]:
# !pip install git+https://github.com/huggingface/transformers
# !pip install qwen-vl-utils
# !pip install openai
# !pip install flash-attn --no-build-isolation
# !pip install swanlab  # monitor
# !swanlab login

In [4]:
# from huggingface_hub import snapshot_download

# snapshot_download(repo_id=model_id, local_dir=save_dir, local_dir_use_symlinks=False)

In [None]:
import transformers
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    save_dir, torch_dtype=torch.float16, 
    attn_implementation="flash_attention_2", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(save_dir, trust_remote=True)
processor = AutoProcessor.from_pretrained(save_dir)

model.enable_input_require_grads()   # 开启梯度检查点时，要执行该方法

# print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
from qwen_vl_utils import process_vision_info

def preprocess_func(example):
    MAX_LENGTH = 8192
    input_ids, attention_mask, labels = [], [], []
    url = example["message"][0]["conversation"][0]['url']
    caption = example["message"][0]["conversation"][1]['caption']
    
    messages = [
        {
            "role": "system", 
            "content": "You are a helpful assistant in recognize math equations in either handwritten or printed text."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Recognize the equation in the image, write its LaTeX code bettwen $$\t and \t$$"
                },
                {
                    "type": "image",
                    "image": url,
                    "resized_height": 280,
                    "resized_width": 280,
                },
            ]
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": caption
                }
            ]
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    img_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=img_inputs,
        padding=True,
        return_tensors='pt'
    )
    inputs = {key: value.tolist() for key, value in inputs.items()}
    instruction = inputs
    response = tokenizer(f'{caption}', add_special_tokens=False)
    input_ids = (
        instruction["input_ids"][0] + response['input_ids'] + [tokenizer.pad_token_id]
    )
    attention_mask = instruction['attention_mask'][0] + response['attention_mask'] + [1]
    labels = (
        [-100] * len(instruction['input_ids'][0])
        + response['input_ids']
        + [tokenizer.pad_token_id]
    )

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)

    inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])
    # 由 (1, h, w) 变换为 (h, w)
    inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0)  
    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask, 
        "labels": labels,
        "pixel_values": inputs['pixel_values'], 
        "image_grid_thw": inputs['image_grid_thw']
    }


In [7]:
""" 数据集准备 """
import json
import random
from datasets import Dataset

dataset = './ft_data.json'
with open(dataset, 'r') as f:
    data = json.load(f)

# 设置 seed
random.seed(5525)
# shuffle 
random.shuffle(data)

train_set, test_set = data[:1080], data[1080:]

# with open("ft_data_train.json", "w") as f:
#     json.dump(train_set, f)

# with open("ft_data_test.json", "w") as f:
#     json.dump(test_set, f)

In [8]:
train_data = Dataset.from_json('ft_data_train.json')
train_data = train_data.map(preprocess_func)
print(train_data)

Dataset({
    features: ['message', 'input_ids', 'attention_mask', 'labels', 'pixel_values', 'image_grid_thw'],
    num_rows: 1080
})


In [9]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# 配置LoRA
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # 训练模式
    r=64,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.05,  # Dropout 比例
    bias="none",
)

# 获取LoRA模型
peft_model = get_peft_model(model, config)

In [10]:
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)

# 配置训练参数
args = TrainingArguments(
    output_dir="./output/Qwen2-VL-2B",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    logging_first_step=5,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

In [18]:
import swanlab
from swanlab.integration.transformers import SwanLabCallback

# 设置SwanLab回调
swanlab_callback = SwanLabCallback(
    project="Qwen2-VL-finetune",
    experiment_name="qwen2-vl-coco2014",
    config={
        "model": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
        "dataset": "https://disk.pku.edu.cn/anyshare/en-us/link/AAF10CCC4D539543F68847A9010C607139?_tb=none&expires_at=1970-01-01T08%3A00%3A00%2B08%3A00&item_type=&password_required=false&title=HMER%20Dataset&type=anonymous",
        "github": "https://github.com/Wooonster/HOCR",
        "prompt": "https://github.com/Wooonster/HOCR",
        "train_data_number": len(train_data),
        "lora_rank": 64,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
    },
)

In [20]:
# 配置Trainer
trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=train_data,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

# 开启模型训练
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.4.8                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/root/autodl-tmp/HOCR/finetune/swanlog/run-20250226_154210-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mWonster[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mqwen2-vl-coco2014[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch /root/autodl-tmp/HOCR/finetune/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@Wonster/Qwen2-VL-finetune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@Wonster/Qwen2-VL-finetune/runs/jdhirmsa7fu6q0g17tlfl[0m[0m


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,0.0002


KeyboardInterrupt: 

In [14]:
# 配置测试参数
val_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,  # 训练模式
    r=64,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.05,  # Dropout 比例
    bias="none",
)

# 获取测试模型
val_peft_model = PeftModel.from_pretrained(model, model_id="./ft_model/Qwen2.5-vl-7b-instruct-awq/checkpoint-134", config=val_config)

In [15]:
def predict(messages, model):
    # 准备推理
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # 生成输出
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    
    return output_text[0]

In [21]:
# 读取测试数据
with open("ft_data_test.json", "r") as f:
    test_dataset = json.load(f)

test_image_list = []
for item in test_dataset:
    url = item["message"][0]["conversation"][0]['url']
    caption = item["message"][0]["conversation"][1]['caption']
    
    messages = [
        {
            "role": "system", 
            "content": "You are a helpful assistant in recognize math equations in either handwritten or printed text."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Recognize the equation in the image, write its LaTeX code bettwen $$\t and \t$$"
                },
                {
                    "type": "image",
                    "image": url,
                    "resized_height": 280,
                    "resized_width": 280,
                },
            ]
        }
    ]
    
    response = predict(messages, val_peft_model)
    messages.append({"role": "assistant", "content": f"{response}"})
    print(messages[-1])

    test_image_list.append(swanlab.Image(url, caption=response))

swanlab.log({"Prediction": test_image_list})

# 在Jupyter Notebook中运行时要停止SwanLab记录 需要调用swanlab.finish()
swanlab.finish()

{'role': 'assistant', 'content': '$$x = x_a - x_b$$'}
{'role': 'assistant', 'content': '$$a=1...7$$'}
{'role': 'assistant', 'content': '$$\tx+u'}
{'role': 'assistant', 'content': '$$\\sqrt{1 + z^2}$$'}
{'role': 'assistant', 'content': '$$C_{t}=K$$'}
{'role': 'assistant', 'content': '$$\\text{and } \\text{one goes down from } m$$'}
{'role': 'assistant', 'content': '$$\\frac{1}{\\epsilon}\\int_{-\\infty}^{\\infty}dz$$'}
{'role': 'assistant', 'content': '$$\t\\alpha b - \\alpha^{-2} b^{-2}\n$$'}
{'role': 'assistant', 'content': '$$\tb_{m}=\\lim _{a\\to 0}{b_{m}-a}'}
{'role': 'assistant', 'content': '$$\\Delta^{m}(m)=8x-\\frac{1}{6}(m+1)(m+2)(m-3)$$'}
{'role': 'assistant', 'content': '$$\\sqrt{\\frac{R}{n}}$$'}
{'role': 'assistant', 'content': '$$\\frac{-4}{\\sqrt{360}}$$'}
{'role': 'assistant', 'content': '$$\\sqrt{-8}$$'}
{'role': 'assistant', 'content': '$$\\lim_{R \\to 0} k^2 G(R) = \\infty$$'}
{'role': 'assistant', 'content': '$$\\int d^d x e(x)$$'}
{'role': 'assistant', 'content': '$