## MT5 fine-tune 基本流程

In [3]:
from transformers import (Seq2SeqTrainer, Seq2SeqTrainingArguments,
                          MT5ForConditionalGeneration, AutoTokenizer,
                          DataCollatorForSeq2Seq)

from datasets import DatasetDict, Dataset

import os
import torch
import logging

  from .autonotebook import tqdm as notebook_tqdm


## 准备数据集


数据集组织格式，对应的文本文件：

src_train, valid, test

tgt_train, valid, test

源文件与目标文件按行对齐：两个文件中行号相同的两行互为一对平行句。
```text

src_train :
hello
ok
...

tgt_train:
你好
好的
...
```

使用方式 
```python

tokenized_dataset = get_dataset(src_file_train, tgt_file_train,batch_size=16, tokenizer=tokenizer, src_lang_code="zh", tgt_lang_code="en",max_sentence_length=512)                    # lang_code 对应语言代码
```

In [7]:
from functools import partial


# Source-side corpus paths (one-element list); element 0 is the ".zh" file that is passed to get_dataset below.
src_file = ["/apdcephfs_cq2/share_1567347/hayuxu/data/nmt/zh-en/ELRC_2922/en-zh.zh",]
# Target-side corpus paths (one-element list); element 0 is the ".en" file paired with src_file[0].
tgt_file = ["/apdcephfs_cq2/share_1567347/hayuxu/data/nmt/zh-en/ELRC_2922/en-zh.en",]

def get_src_ref_pre_cor_paras_from_file(*files):
    """Read line-aligned text files and return their non-empty parallel rows.

    The files must be given in the order src, ref, pre, cor. Trailing files may
    be omitted, but an earlier one may not be skipped.

    Args:
        *files: Paths of UTF-8 text files whose i-th lines belong together.

    Returns:
        A list of tuples ``(src, ref, ...)``, one per line index, with the
        trailing newline and trailing spaces removed from every field. Rows in
        which any field is empty are dropped. When the files differ in length,
        rows beyond the shortest file are ignored (a warning is logged).
    """
    file_data = []
    for path in files:
        # Decode explicitly as UTF-8 so reading Chinese text does not depend on
        # the platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            file_data.append([line.rstrip('\n').rstrip(' ') for line in f])

    # zip() below stops at the shortest file; surface that instead of silently
    # pairing a truncated corpus.
    if len({len(lines) for lines in file_data}) > 1:
        logging.warning(
            "Parallel files have different line counts %s; rows beyond the shortest file are ignored.",
            [len(lines) for lines in file_data],
        )

    # Keep only rows in which every field is non-empty.
    return [row for row in zip(*file_data) if all(len(field) > 0 for field in row)]

def get_translate_paras_from_file(src_file, tgt_file):
    """Load a source/target file pair as a list of ``[source, target]`` sentence pairs."""
    # The target file is handed to the generic reader in its "ref" position.
    aligned_rows = get_src_ref_pre_cor_paras_from_file(src_file, tgt_file)
    sentence_pairs = []
    for row in aligned_rows:
        sentence_pairs.append([row[0], row[1]])
    return sentence_pairs

def preprocess_function(examples, src_lang, tgt_lang, tokenizer, max_input_length, max_target_length):
    """Tokenize one batch of parallel sentences into seq2seq model inputs.

    Args:
        examples: Mapping from column name to a list of sentences (one batch).
        src_lang: Column name holding the source sentences.
        tgt_lang: Column name holding the target sentences.
        tokenizer: Callable tokenizer that accepts the ``text_target`` keyword
            and returns a mapping containing ``"input_ids"``.
        max_input_length: Truncation length applied to source sentences.
        max_target_length: Truncation length applied to target sentences.

    Returns:
        The tokenizer's encoding of the source sentences with an added
        ``"labels"`` entry holding the target token ids.
    """
    inputs = list(examples[src_lang])
    targets = list(examples[tgt_lang])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Source and target share one vocabulary. Encode the targets through
    # `text_target`, which replaces the deprecated `as_target_tokenizer()`
    # context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



def get_tokenized_datasets(tokenizer, trans_para, src_lang, tgt_lang, max_input_length, max_target_length, batch_size=None):
    """Tokenize parallel data without splitting it.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_function``.
        trans_para: Either a ``DatasetDict`` whose splits contain the
            ``src_lang`` and ``tgt_lang`` columns, or a sequence of
            two-element ``(source, target)`` pairs. A sequence is wrapped in a
            ``DatasetDict`` with a single ``"train"`` split.
        src_lang: Column name used for the source sentences.
        tgt_lang: Column name used for the target sentences.
        max_input_length: Truncation length for source sentences.
        max_target_length: Truncation length for target sentences.
        batch_size: Forwarded unchanged to ``DatasetDict.map``.

    Returns:
        A ``DatasetDict`` in which the raw text columns have been replaced by
        the output of ``preprocess_function``.
    """
    batch_tokenize_fn = partial(preprocess_function,
                                tokenizer=tokenizer,
                                src_lang=src_lang,
                                tgt_lang=tgt_lang,
                                max_input_length=max_input_length,
                                max_target_length=max_target_length,
                                )
    if isinstance(trans_para, DatasetDict):
        raw_datasets = trans_para
    else:
        columns = {
            src_lang: [src for src, _ in trans_para],
            tgt_lang: [tgt for _, tgt in trans_para],
        }
        raw_datasets = DatasetDict({'train': Dataset.from_dict(columns)})

    # Pick the split whose column names are removed after tokenization:
    # "train" first, then "test", then whichever split comes first, so a
    # DatasetDict holding only e.g. "validation" no longer raises KeyError.
    if "train" in raw_datasets:
        reference_split = raw_datasets["train"]
    elif "test" in raw_datasets:
        reference_split = raw_datasets["test"]
    else:
        reference_split = next(iter(raw_datasets.values()), None)
    # An empty DatasetDict has no columns to remove.
    remove_names = reference_split.column_names if reference_split is not None else None

    tokenized_datasets = raw_datasets.map(batch_tokenize_fn, batched=True, batch_size=batch_size,
                                          remove_columns=remove_names)
    return tokenized_datasets


def get_dataset(src_f, tgt_f, batch_size, tokenizer, src_lang_code, tgt_lang_code, max_sentence_length):
    """Build the tokenized "train" split from a pair of line-aligned text files.

    The same ``max_sentence_length`` is used as the truncation length for both
    the source and the target side.
    """
    sentence_pairs = get_translate_paras_from_file(src_f, tgt_f)
    tokenized = get_tokenized_datasets(
        tokenizer,
        sentence_pairs,
        src_lang_code,
        tgt_lang_code,
        max_input_length=max_sentence_length,
        max_target_length=max_sentence_length,
        batch_size=batch_size,
    )
    return tokenized['train']


### 加载tokenizer

In [5]:
# Local checkpoint directory; both the tokenizer (here) and the model (later cell) are loaded from it.
model_name_or_path = "/apdcephfs_cq2/share_1567347/hayuxu/models/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5TokenizerFast(name_or_path='/apdcephfs_cq2/share_1567347/hayuxu/models/mt5-small', vocab_size=250100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

### 处理数据集

In [8]:
# Read the first source/target file pair and tokenize it; "zh" and "en" become the raw column names,
# and 512 is used as the truncation length for both sides.
tokenized_dataset = get_dataset(src_file[0], tgt_file[0],
                                batch_size=16, tokenizer=tokenizer, 
                                src_lang_code="zh", tgt_lang_code="en",
                                max_sentence_length=512)
# Bare expression so the notebook displays the dataset summary.
tokenized_dataset

Map: 100%|██████████| 100/100 [00:00<00:00, 520.00 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

## 加载模型

In [9]:
# Load the pretrained weights from the same path as the tokenizer and move the model to the GPU.
# NOTE(review): .cuda() fails when no CUDA device is available — confirm a GPU runtime is always intended.
model = MT5ForConditionalGeneration.from_pretrained(model_name_or_path).cuda()
# Bare expression so the notebook displays the module tree.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [15]:
from transformers import DataCollatorForSeq2Seq

## Gotcha: padding must be configured here, in the collator, and nowhere else; setting it elsewhere causes bugs.
# NOTE(review): with padding=True, max_length=256 presumably does not cap the padded batch length — confirm against the collator docs.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, max_length=256, return_tensors="pt")

## 构造评估函数

训练过程中不做评估。这里也有个坑：不建议在训练阶段编写评估逻辑，目前存在一个小 bug。

## 设置训练参数

可以设置学习率、batch_size 等参数。

In [12]:
# Training hyper-parameters for the fine-tuning run.
# NOTE(review): the recorded run of this notebook ended with a CUDA out-of-memory error on an
# 8 GiB GPU; if it recurs, consider lowering per_device_train_batch_size and raising
# gradient_accumulation_steps to keep the effective batch size.
training_args = Seq2SeqTrainingArguments(
    output_dir="/apdcephfs_cq2/share_1567347/hayuxu/models",  # checkpoints are written under this directory
    evaluation_strategy="no",                           # no evaluation during training
    save_strategy="epoch",                              # write a checkpoint at the end of every epoch
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    # NOTE(review): fp16_opt_level is an Apex AMP setting; presumably ignored unless the Apex
    # backend is in use — confirm before relying on "O3".
    fp16_opt_level="O3",
    push_to_hub=False,
    # Maximum number of checkpoints kept on disk. This keyword was previously passed twice
    # (3 and 2), which is a SyntaxError; the later, annotated value is kept.
    save_total_limit=2,
)
# Bare expression so the notebook displays the resolved arguments.
training_args





In [16]:
# Wire the model, training arguments, tokenized training set, tokenizer and padding collator into a trainer.
# No eval_dataset is passed, which matches evaluation_strategy="no" in training_args.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

##  开始训练

In [17]:
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.03 GiB (GPU 0; 8.00 GiB total capacity; 5.25 GiB already allocated; 967.54 MiB free; 5.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF