In [4]:
import os
import json
import random

import numpy as np
import torch
from torch.utils.data import Dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import sacrebleu

## 1. 超参数 & 环境设置

In [5]:
# Batching and sequence-length limits (in tokens) used by the collate
# function and the training arguments further down.
batch_size = 8
max_input_len = 512
max_target_len = 64

# Optimisation schedule.
num_epochs = 4
learning_rate = 2e-5

# Seed handed to seed_everything() below.
seed = 42

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global RNG, and PyTorch on
    both CPU and all CUDA devices, and asks cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed. NumPy requires it to lie in ``[0, 2**32 - 1]``.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running was fixed at start-up and is not changed by this.
    os.environ["PYTHONHASHSEED"] = str(seed)
    # NumPy keeps its own global RNG, independent of `random` and torch.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, so keep it off
    # whenever deterministic results are requested.
    torch.backends.cudnn.benchmark = False

# Apply the configured seed before the model is loaded or any batch is drawn.
seed_everything(seed)

Using device: cuda


## 2. 数据加载 & Dataset

In [7]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed objects.

    Each non-empty line is parsed as one JSON document. Blank or
    whitespace-only lines (for example stray trailing newlines) are skipped
    instead of being handed to the JSON parser.

    Args:
        path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one parsed object per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") would raise; an empty line carries no record.
                continue
            examples.append(json.loads(line))
    return examples


class QADataset(Dataset):
    """Map-style dataset over a sequence of question-answering records.

    Every record must provide "context", "question" and "answer" entries;
    any other entries are dropped when an item is fetched.
    """

    def __init__(self, data):
        # Keep a reference to the caller's records; nothing is copied.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        # Project onto the three fields the collate function consumes.
        return {field: record[field] for field in ("context", "question", "answer")}


# Training split: parsed records first, then the Dataset wrapper around them.
train_data = load_jsonl('./DuReaderQG/train.json')
train_dataset = QADataset(train_data)

# Validation split, built the same way.
valid_data = load_jsonl('./DuReaderQG/dev.json')
valid_dataset = QADataset(valid_data)

# Sanity check: report both split sizes and show the first training example.
print(f"length of train dataset: {len(train_dataset)}")
print(f"length of valid dataset: {len(valid_dataset)}")
print(train_dataset[0])

length of train dataset: 14520
length of valid dataset: 984
{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'question': '仙剑奇侠传3第几集上天界', 'answer': '第35集'}


## 3. Tokenizer 和 model

In [8]:
# Checkpoint directory, relative to the notebook's working directory.
checkpoint = "../mengzi-t5-base"

# Tokenizer and seq2seq model are both restored from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

## 4. collate_fn

In [9]:
def collate_fn(batch_sample):
    """Turn a list of QA examples into one padded, tokenized batch.

    Args:
        batch_sample: List of dicts, each providing "question", "context"
            and "answer" entries.

    Returns:
        The tokenizer output for the "question: ...  context: ..." prompts,
        with an added "labels" entry holding the answer token ids in which
        every pad position has been replaced by -100.
    """
    prompts, answers = [], []
    for example in batch_sample:
        prompts.append(f"question: {example['question']}  context: {example['context']}")
        answers.append(example["answer"])

    # Settings shared by the source-side and target-side tokenizer calls.
    tokenize_kwargs = {"padding": True, "truncation": True, "return_tensors": "pt"}

    batch = tokenizer(prompts, max_length=max_input_len, **tokenize_kwargs)
    with tokenizer.as_target_tokenizer():
        answer_encoding = tokenizer(answers, max_length=max_target_len, **tokenize_kwargs)

    # Replace pad ids with -100 so those positions are ignored during training.
    label_ids = answer_encoding.input_ids
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    batch["labels"] = label_ids
    return batch

## 5. compute_metrics

In [None]:
def compute_metrics(eval_pred):
    """Score generated answers against the references with sacreBLEU.

    Args:
        eval_pred: Pair that unpacks into ``(preds, labels)``, both arrays of
            token ids. ``labels`` uses -100 at ignored positions; ``preds``
            may do so as well.

    Returns:
        Dict of values scaled to [0, 1]: "bleu-1" .. "bleu-4" are the
        corpus-level n-gram precisions reported by sacreBLEU, and "bleu" is
        their arithmetic mean (not sacreBLEU's brevity-penalised score).
    """
    preds, labels = eval_pred
    # NOTE(review): some models hand back a tuple whose first element holds
    # the generated ids; unwrap it so decoding receives a plain array.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an ignore marker, not a vocabulary id, so it cannot be decoded.
    # Map it back to the pad id (removed by skip_special_tokens) on both the
    # predictions and the references.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacreBLEU expects a list of reference streams; there is exactly one here.
    references = [decoded_refs]

    # The answers are Chinese. The default "13a" tokenizer does not segment
    # Chinese text, so whole phrases would be compared as single tokens;
    # "zh" splits Chinese characters before n-grams are counted.
    bleu = sacrebleu.corpus_bleu(decoded_preds, references, tokenize="zh", force=True)
    return {
        "bleu":   np.mean(bleu.precisions) / 100,
        "bleu-1": bleu.precisions[0] / 100,
        "bleu-2": bleu.precisions[1] / 100,
        "bleu-3": bleu.precisions[2] / 100,
        "bleu-4": bleu.precisions[3] / 100,
    }

In [None]:
# text = ["2002年11月8日至14日", "比尔·拉塞尔", "南宁至东莞的距离", "其原创性及文中陈述内容未经本站证实,本地宝对本文及其中全部或者部分内容的真实性、完整性、及时性不作任何保证和承诺,请网友自行核实相关内容。"]
# preds_biased = ["11月8日至14日", "比尔", "到东莞的距离", "其原创性及文中陈述内容未经本站证实,部分内容的真实性、完整性、及时性不作任何保证和承诺。"]
# source = [text]
# input = tokenizer(text,
#                   padding=True,
#                   truncation=True,
#                   max_length = max_input_len,
#                   return_tensors="pt")["input_ids"]
# preds = tokenizer.batch_decode(input, skip_special_tokens=True)
# bleu = sacrebleu.corpus_bleu(preds_biased, source, force=True)
# print(preds_biased)
# print(source)
# print(np.mean(bleu.precisions))
# print(bleu.precisions)

['11月8日至14日', '比尔', '到东莞的距离', '其原创性及文中陈述内容未经本站证实,部分内容的真实性、完整性、及时性不作任何保证和承诺。']
[['2002年11月8日至14日', '比尔·拉塞尔', '南宁至东莞的距离', '其原创性及文中陈述内容未经本站证实,本地宝对本文及其中全部或者部分内容的真实性、完整性、及时性不作任何保证和承诺,请网友自行核实相关内容。']]
33.333333333333336
[33.333333333333336, 50.0, 50.0, 0.0]


## 6. 配置 TrainingArguments & Trainer

In [77]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./outputs",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    # Pass the notebook-level seed on, so changing it in the config cell also
    # changes the seed the Trainer uses, instead of its built-in default.
    seed=seed,
    logging_dir="./logs",
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints and
    # reload the one with the highest "bleu" when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,

    predict_with_generate=True,          # make the Trainer call model.generate
    generation_num_beams=5,
    generation_max_length=max_target_len,

    # Mixed precision is only usable on a GPU. The notebook supports a CPU
    # fallback, so enable fp16 only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
)

In [78]:
trainer = Seq2SeqTrainer(
    # What to train and with which settings.
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Data: raw examples are tokenized batch by batch in the collate function.
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # Scoring hook applied to the evaluation predictions.
    compute_metrics=compute_metrics,
)

Using amp half precision backend
  self.scaler = torch.cuda.amp.GradScaler()


## 7. 训练 & 保存

In [None]:
# Fine-tune the model, then run one more evaluation pass and print its metrics.
# NOTE(review): with load_best_model_at_end=True this final pass presumably
# scores the best checkpoint rather than the last one — confirm.
trainer.train()
metrics = trainer.evaluate()
print("[FINAL EVAL]", metrics)

In [None]:
# Write the model and the tokenizer into the same directory.
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")