### 学术加速

如果是在 AutoDL 租的服务器，下载模型前运行下面 cell 以获取学术加速

In [1]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

## Qwen2.5-VL-finetune

In [12]:
MAX_LENGTH = 8192
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cpu'

### Download model

In [2]:
model_id_3b = 'Qwen/Qwen2.5-VL-3B-Instruct-AWQ'
model_id_7b = 'Qwen/Qwen2.5-VL-7B-Instruct-AWQ'
save_dir_3b = './model/base/vl3b/'  # change to your save path
save_dir_7b = './model/base/vl7b/' 

In [7]:
from huggingface_hub import snapshot_download

# snapshot_download(repo_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', local_dir='./vl3b/', local_dir_use_symlinks=False)
snapshot_download(repo_id=model_id_7b, local_dir=save_dir_7b, local_dir_use_symlinks=False)

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

'/root/autodl-tmp/HOCR/finetune/model/base/vl7b'

In [14]:
import transformers
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
from qwen_vl_utils import process_vision_info


model_7b = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    save_dir_7b, torch_dtype=torch.float16, trust_remote=True, 
    attn_implementation="flash_attention_2"
)
model_7b.to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(save_dir_7b)
processor = AutoProcessor.from_pretrained(save_dir_7b)

model_7b.enable_input_require_grads()   # 开启梯度检查点时(training_args.gradient_checkpointing=True,)要执行该方法

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


NameError: name 'model_7b' is not defined

In [5]:
print(model_7b)

Qwen2_5_VLForConditionalGeneration(
  (visual): Qwen2_5_VisionTransformerPretrainedModel(
    (patch_embed): Qwen2_5_VisionPatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2_5_VLVisionBlock(
        (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
        (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
        (attn): Qwen2_5_VLVisionFlashAttention2(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): Qwen2_5_VLMLP(
          (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
          (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
          (act_fn): SiLU()
        )
      )
    )
    (merger): Qwen2_5_VLPatchMerg

### 数据集加载、划分、映射

In [19]:
def preprocess_func(example):
    # print(f'example.ids: {example,keys()}'
    input_ids, attention_mask, labels = [], [], []
    url = example["message"][0]["conversation"][0]['url']
    caption = example["message"][0]["conversation"][1]['caption']
    
    messages = [
        {
            "role": "system", 
            "content": "You are a helpful assistant in recognize math equations in either handwritten or printed text."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text", "text": "Recognize the equation in the image, write its LaTeX code between $$\n and \n$$"
                },
                {
                    "type": "image",
                    "image": url,
                    "resized_height": 280,
                    "resized_width": 280,
                },
            ]
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": caption
                }
            ]
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    img_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=img_inputs,
        padding=True,
        return_tensors='pt'
    )
    inputs = {key: value.tolist() for key, value in inputs.items()}
    instruction = inputs
    response = tokenizer(f'{caption}', add_special_tokens=False)
    input_ids = (
        instruction["input_ids"][0] + response['input_ids'] + [tokenizer.pad_token_id]
    )
    attention_mask = instruction['attention_mask'][0] + response['attention_mask'] + [1]
    labels = (
        [-100] * len(instruction['input_ids'][0])
        + response['input_ids']
        + [tokenizer.pad_token_id]
    )

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)

    inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])
    # 由 (1, h, w) 变换为 (h, w)
    inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0)  
    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask, 
        "labels": labels,
        "pixel_values": inputs['pixel_values'], 
        "image_grid_thw": inputs['image_grid_thw']
    }

In [20]:
""" 数据集准备 """
import json
import random
from datasets import load_dataset, Dataset

dataset_dir = 'data/ft_data.json'
dataset = load_dataset('json', data_files=dataset_dir)  # load dataset
dataset = dataset['train'].train_test_split(test_size=0.15, shuffle=True, seed=5525)  # split
dataset.save_to_disk('data/ft_dataset')  # save dataset

training_dataset, test_dataset = dataset['train'], dataset['test']
training_dataset = training_dataset.map(preprocess_func)  # mapping training dataset

training_dataset, test_dataset

Saving the dataset (0/1 shards):   0%|          | 0/1019 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/1019 [00:00<?, ? examples/s]

NameError: name 'process_vision_info' is not defined

### 预测、测试函数

In [17]:
from test_funcs import compute_bleu, compute_exprate
from collections import defaultdict


def predict(messages, model):
    # 准备推理
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # 生成输出
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

def test_results(dataset, model, return_list=False):
    # test_dataset = dataset['test']
    test_outputs = defaultdict(list)
    swan_list = []
    
    for item in dataset:
        url = item["message"][0]["conversation"][0]['url']
        caption = item["message"][0]["conversation"][1]['caption']
        # Create the conversation prompt
        messages = [
            {
                "role": "system", 
                "content": "You are a helpful assistant in recognizing math equations in either handwritten or printed text."
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Recognize the equation in the image, write its LaTeX code between $$\\t and \\t$$"
                    },
                    {
                        "type": "image",
                        "image": url,
                        "resized_height": 280,
                        "resized_width": 280,
                    },
                ]
            }
        ]
    
        # Generate a prediction using the model
        response = predict(messages, model)
        # Save the prediction keyed by image URL
        test_outputs[url] = [response, caption]
        swan_list.append(swanlab.Image(url, caption=response)
        
    # Compute evaluation metrics
    compute_bleu(test_outputs)
    compute_exprate(test_outputs)
    
    return swan_list if return_list else None

### 配置 LoRA 参数

In [29]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# 配置LoRA
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.1,  # Dropout 比例
    bias="none",
)

# 获取LoRA模型
peft_model_7b = get_peft_model(model_7b, config)

### 配置预训练参数

In [31]:
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)

# 配置训练参数
training_args = TrainingArguments(
    output_dir="./model/output/Qwen2.5-VL-7B-ft/",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # total batch size = per_device_train_batch_size * gradient_accumulation_steps
    gradient_accumulation_steps=4,
    logging_strategy='steps',
    logging_steps=25,
    logging_first_step=True,
    num_train_epochs=3,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    load_best_model_at_end=True,
)

如果想记录下训练可视化数据，可以用 swanlab，没有在下面 Trainer 里把 `callbacks=[swanlab_callback],  # 没有注释即可` 注释即可

In [32]:
import swanlab
from swanlab.integration.transformers import SwanLabCallback

# 设置SwanLab回调
swanlab_callback = SwanLabCallback(
    project="Qwen2.5-VL-7b-finetune-2",
    experiment_name="qwen2.5-vl-crohme2019",
    config={
        "model": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
        "dataset": "https://disk.pku.edu.cn/anyshare/en-us/link/AAF10CCC4D539543F68847A9010C607139?_tb=none&expires_at=1970-01-01T08%3A00%3A00%2B08%3A00&item_type=&password_required=false&title=HMER%20Dataset&type=anonymous",
        "github": "https://github.com/Wooonster/HOCR",
        "prompt": "Recognize the equation in the image, write its LaTeX code bettwen $$\t and \t$$",
        "train_data_number": len(train_dataset),
        "lora_rank": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
    },
)

开始训练，并且稳定后，在终端 `watch -n 1 nvidia-smi` 观测一下 GPU 显存占用情况，可以记录一下

In [None]:
# 配置Trainer
trainer = Trainer(
    model=peft_model_7b,
    args=args,
    train_dataset=training_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],  # 没有注释即可
)

# 开启模型训练
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


[1m[33mswanlab[0m[0m: Step 1 on key train/loss already exists, ignored.
[1m[33mswanlab[0m[0m: Step 1 on key train/grad_norm already exists, ignored.
[1m[33mswanlab[0m[0m: Step 1 on key train/learning_rate already exists, ignored.
[1m[33mswanlab[0m[0m: Step 1 on key train/epoch already exists, ignored.


In [4]:
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

# 配置测试参数
val_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=16,
    lora_dropout=0.1,  # Dropout 比例
    bias="none",
)

# 获取测试模型
val_peft_model = PeftModel.from_pretrained(model_7b, model_id="./output/Qwen2-VL-7B/checkpoint-134", config=val_config)

NameError: name 'model_7b' is not defined

In [None]:
swan_list = test_results(test_dataset, val_peft_model, return_list=True)  # 没有 swanlab, return_list=False
if swan_list is not None:
    swanlab.log({"Prediction": swan_list})
    # 在 Jupyter Notebook 中运行时要停止SwanLab记录 需要调用swanlab.finish()
    swanlab.finish()

  return F.conv3d(


{'role': 'assistant', 'content': '$$x = x_a - x_b$$'}
{'role': 'assistant', 'content': '$$a=1...7$$'}
{'role': 'assistant', 'content': '$$x+u$$'}
{'role': 'assistant', 'content': '$$\\sqrt{1 + z^2}$$'}
{'role': 'assistant', 'content': '$$C_{t}=K$$'}
{'role': 'assistant', 'content': '$$\\text{and } \\text{one goes down from } m$$'}
{'role': 'assistant', 'content': '$$\\frac{1}{\\epsilon}\\int_{-\\infty}^{\\infty}dz$$'}
{'role': 'assistant', 'content': '$$a b - a ^ { - 2 } b ^ { - 2 }$$'}
{'role': 'assistant', 'content': '$$b_{m}=\\lim _{a\\to 0}{\\frac {b_{m}-a}{a}}$$'}
{'role': 'assistant', 'content': '$$\\Delta ^ { \\prime } ( m ) = 8 x - \\frac { 1 } { 6 } ( m + 1 ) ( m + 2 ) ( m + 3 )$$'}
{'role': 'assistant', 'content': '$$\\sqrt{\\frac{R}{n}}$$'}
{'role': 'assistant', 'content': '$$\\frac{-4}{\\sqrt{360}}$$'}
{'role': 'assistant', 'content': '$$\\sqrt{-8}$$'}
{'role': 'assistant', 'content': '$$\\lim_{R \\to 0} k^2 G(R) = \\infty$$'}
{'role': 'assistant', 'content': '$$\\int d^dx 