*Step.1 導入相關套件*

In [None]:
!pip install rouge-chinese
!pip install datasets

In [7]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer , AutoModelForSeq2SeqLM , DataCollatorForSeq2Seq , Seq2SeqTrainer , Seq2SeqTrainingArguments

*Step.2 載入數據*

In [None]:
# Load the dataset that was previously saved to disk under ./nlpcc_2017.
ds = Dataset.load_from_disk("./nlpcc_2017")
# Show the dataset summary as the cell output.
ds

In [None]:
# Hold out 100 examples as the test split; the fixed seed keeps the split
# reproducible across runs.
# The guard makes the cell safe to re-run: after the first run `ds` is the
# split result rather than a single Dataset, and splitting it again would fail.
if isinstance(ds, Dataset):
    ds = ds.train_test_split(test_size=100, seed=42)
ds

In [None]:
# Inspect the first raw training example.
ds["train"][0]

*Step.3 數據處理*

In [None]:
# Load the tokenizer published with the Langboat/mengzi-t5-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")
# Show the tokenizer configuration as the cell output.
tokenizer

In [17]:
def process_func(exmaples):
    """Tokenize a batch of articles and titles for seq2seq training.

    Args:
        exmaples: Batch mapping with a "content" sequence (source articles)
            and a "title" sequence (target summaries). The misspelled
            parameter name is kept so existing callers are unaffected.

    Returns:
        The tokenizer output for the prompted articles, with an added
        "labels" entry holding the token ids of the titles.
    """
    # Every article is prefixed with the task prompt before tokenization.
    prompted = []
    for body in exmaples["content"]:
        prompted.append("摘要生成: \n" + body)

    # Sources are truncated to 384 tokens, targets to 64 tokens.
    model_inputs = tokenizer(prompted, max_length=384, truncation=True)
    target_enc = tokenizer(
        text_target=exmaples["title"],
        max_length=64,
        truncation=True,
    )

    model_inputs["labels"] = target_enc["input_ids"]
    return model_inputs

In [None]:
# Apply process_func in batched mode, so it receives a mapping of column
# name -> list of values rather than one example at a time.
tokenized_ds = ds.map(process_func , batched=True)

In [None]:
# Sanity check: decode the first training example's input ids back to text
# (it should show the prompt prefix followed by the article).
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

In [None]:
# Sanity check: decode the first training example's label ids back to text
# (it should show the title used as the target summary).
tokenizer.decode(tokenized_ds["train"][0]["labels"])

*Step.4 建立模型*

In [None]:
# Load the seq2seq model weights from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")

*Step.5 評估函數*

In [23]:
import numpy as np
from rouge_chinese import Rouge

# Single scorer instance, reused by compute_metric on every evaluation pass.
rouge = Rouge()

def compute_metric(evalPred):
    """Compute ROUGE-1 / ROUGE-2 / ROUGE-L F1 for generated summaries.

    Args:
        evalPred: Pair ``(predictions, labels)`` of token-id arrays. Both may
            contain ``-100`` at padded/ignored positions, and ``predictions``
            may be a tuple whose first element holds the generated ids.

    Returns:
        dict with keys "rouge-1", "rouge-2" and "rouge-l", each the averaged
        F1 score reported by the module-level ``rouge`` scorer.
    """
    predictions, labels = evalPred
    # Keep only the generated ids when extra outputs come along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id. It must be replaced
    # in BOTH arrays before decoding, otherwise the tokenizer is handed an
    # invalid (negative) id.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def _spaced(text):
        # The scorer compares whitespace-separated tokens, so Chinese text is
        # scored per character by inserting a space between characters.
        spaced = " ".join(text)
        # NOTE(review): rouge scorers of this family are expected to raise on
        # an empty hypothesis/reference; a multi-character placeholder keeps
        # evaluation running and matches no single-character token.
        return spaced if spaced.strip() else "<empty>"

    decode_preds = [_spaced(p) for p in decode_preds]
    decode_labels = [_spaced(l) for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }


*Step.6 訓練參數*

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # 4 samples x 8 accumulation steps = 32 samples per optimizer step per device.
    gradient_accumulation_steps=8,
    logging_steps=8,
    # Evaluate and checkpoint on the same schedule; reloading the best
    # checkpoint requires the two strategies to match.
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="rouge-l",
    # Without this the model left in memory after training is the last epoch,
    # not the one that scored best on `metric_for_best_model`.
    load_best_model_at_end=True,
    # Evaluate with generate() so compute_metric receives generated token ids.
    predict_with_generate=True,
    # Match the 64-token label budget used in preprocessing; otherwise
    # evaluation falls back to the model's default generation length.
    generation_max_length=64,
)

*Step.7 訓練器*

In [None]:
# Assemble the trainer from the model, arguments, tokenized splits and metric.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    # The examples were tokenized without padding, so the collator batches
    # the variable-length inputs and labels.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metric,
)

*Step.8 模型訓練*

In [None]:
# Launch fine-tuning; evaluation and checkpointing follow the strategies
# configured in `args`.
trainer.train()

*Step.9 模型推理*

In [None]:
from transformers import pipeline

In [None]:
# Run inference on whatever device the fine-tuned model already lives on.
# A hard-coded `device=0` fails on machines without a CUDA GPU.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=model.device)

In [None]:
# Summarize the last test article. The prompt prefix must be byte-identical to
# the one used in process_func during training ("摘要生成: \n", with a space after
# the colon); a different prefix gives the model an input format it never saw.
pipe("摘要生成: \n" + ds["test"][-1]["content"], max_length=64, do_sample=True)

In [None]:
# Reference title of the same test example, for comparison with the generated summary.
ds["test"][-1]["title"]