In [24]:
import os
# Set two process-level environment variables in a single call.
# NOTE(review): the names suggest PyTorch/ROCm switches. USE_DISTRIBUTED is
# documented as a PyTorch *build* option, so confirm it has any effect at
# runtime. Environment variables usually have to be set before the library
# that reads them is imported -- confirm this cell is executed first.
os.environ.update(
    {
        "TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1",
        "USE_DISTRIBUTED": "0",
    }
)

In [2]:
from datasets import load_dataset

# Take 5,000 shuffled examples from the "train" split and hold out 2% of them
# for evaluation. Both the shuffle and the train/test split are seeded so that
# re-running the notebook yields the same train and eval examples (the split
# was previously unseeded, which made eval scores incomparable across runs).
datasets = (
    load_dataset("supremezxc/nlpcc_2017", split="train")
    .shuffle(seed=42)
    .select(range(5000))
    .train_test_split(test_size=0.02, seed=42)
)

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")
# Move the model to the "cuda" device as a separate, explicit step.
model = model.to("cuda")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



In [7]:
# Show the raw "data" record of the first training example; the cell output
# is a dict with a 'content' field (article body) and a 'title' field.
datasets["train"][0]["data"]

{'content': '昨天凌晨,新乡原阳县韩董庄乡焦庵村的一个私人家具厂发生了火灾,一个年仅4岁的男孩被大火烧死,而孩子的父母,也就是家具厂老板邵某和妻子被严重烧伤,性命垂危,医生说很可能熬不过两天。这场火灾不是意外,而是有人故意放火,放火的不是仇家,而是邵某的亲生儿子、4岁小男孩的亲哥哥!这一切到底是为什么?睡梦中家中突发大火一死两伤(看视频请点我!)4月12日凌晨5点多,40多岁的邵某和妻儿正在睡梦中,突然,家里燃起大火。邵某的家是一个小型的家具厂,存放着大量的木材、海绵等易燃物,火势很快蔓延开来。当地消防迅速赶到,大火很快被扑灭了。邵某和妻子被救了出来,伤势十分严重,立即被送往医院进行救治,更让人遗憾的是,他们年仅4岁的小儿子被救出时已经停止了呼吸。纵火者竟是亲哥哥纵火后淡定上网打游戏村民说,邵某是安阳滑县人,家里一共四口人,邵某还有一个大儿子,17岁,叫邵明明(化名),可是大家怎么都没有找到他,大清早的他去了哪里?邻居说明明喜欢去网吧,得知这一消息,民警赶紧分头去找,果然在网吧找到了正在玩游戏的明明,民警告诉他家里失火了,令民警没想到的是,明明竟淡定的说:“我知道,火是我放的。”纵火只因玩网游被父亲打骂邵明明为什么在自己家里放火?民警询问得知,前天晚上,因为沉迷上网打游戏,明明挨了爸爸的一顿揍,这让他非常恼火,并怀恨在心。于是趁爸爸妈妈和弟弟都在睡觉的时候,在房间里放了一把火,然后去了附近网吧上网。他没想到,这把火烧死了弟弟,并害的爸爸妈妈住进医院,现在还生死未卜。记者从当地警方了解到,明明小学毕业就辍学了,用村民的话说“这孩子有点憨”,平时什么都不干就喜欢上网打游戏,爸妈说了多少次就是不改。目前,邵明明已经被当地警方刑事拘留。都市君有话说:只因挨了打骂就放火烧家,原因也许不会这么简单,但它的确是悲剧产生的导火索。记得曾有教育专家在谈到棍棒教育时忠告说:你的暴力倾向会在孩子身上发扬光大。可不,简单粗暴的做法就换来了极端的报复,这也给做父母的敲了个警钟。',
 'title': '组图:新乡17岁男孩沉迷网游被父亲打骂,怀恨纵火不料烧死4岁弟弟,事后淡定去网吧;父母也被严重烧伤生命垂危。'}

In [14]:
def process_function(examples):
    """Tokenize a batch of records into model inputs with summary labels.

    Args:
        examples: A batch mapping whose "data" entry is a sequence of records,
            each providing a "content" string and a "title" string.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for the
        prefixed contents (truncated to 384 tokens), with an added "labels"
        entry holding the token ids of the titles (truncated to 64 tokens).
    """
    records = examples["data"]
    task_prefix = "摘要生成: \n"

    # Source side: task prefix followed by the article body.
    sources = [task_prefix + record["content"] for record in records]
    # Target side: the title is used as the reference summary.
    targets = [record["title"] for record in records]

    model_inputs = tokenizer(sources, max_length=384, truncation=True)
    target_encoding = tokenizer(targets, max_length=64, truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


In [15]:
# Apply process_function batch-wise to every split and drop the original
# columns (taken from the train split) so only the tokenizer output remains.
tokenized_datasets = datasets.map(
    process_function,
    remove_columns=datasets["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/4900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [21]:
import numpy as np
from rouge_chinese import Rouge

# Module-level scorer instance; compute_metric calls rouge.get_scores on it.
rouge = Rouge()

def compute_metric(pred):
    """Compute averaged ROUGE F-scores for a batch of predictions.

    Args:
        pred: A ``(predictions, labels)`` pair of token-id arrays. Either
            array may contain ``-100`` filler values.

    Returns:
        A dict with the F-measure for "rouge-1", "rouge-2" and "rouge-l",
        averaged over all prediction/label pairs.
    """
    predictions, labels = pred
    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id and cannot be decoded. The original code
    # only masked it in the labels; predictions can carry the same filler
    # value, so both arrays are mapped to the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Re-join every decoded string character-by-character with spaces
    # (presumably so the scorer sees one token per character -- confirm
    # against the rouge_chinese tokenization rules).
    decode_preds = [" ".join(p) for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    print(scores)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }

In [22]:
from transformers import Seq2SeqTrainingArguments
# Training configuration for the summarization fine-tune.
# Effective train batch size per device is 4 * 8 = 32 via gradient accumulation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    logging_steps=8,
    # Evaluate and checkpoint on the same schedule (once per epoch).
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): this only names the metric used to rank checkpoints; add
    # load_best_model_at_end=True if the best checkpoint should be reloaded
    # when training finishes.
    metric_for_best_model="rouge-l",
    # Evaluate with generate() so ROUGE is computed on generated text.
    predict_with_generate=True,
    # Match the 64-token truncation applied to the labels in process_function;
    # without this, evaluation generation falls back to the model's default
    # generation length, which cuts summaries short and depresses ROUGE.
    generation_max_length=64,
)

In [25]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
# Wire the model, the training configuration, both tokenized splits, the
# padding collator and the ROUGE metric callback into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    # The collator is built from the tokenizer only, as in the original cell.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metric,
)

ModuleNotFoundError: Could not import module 'Seq2SeqTrainer'. Are this object's requirements defined correctly?

In [27]:
from torch._C._distributed_c10d import FakeProcessGroup

ModuleNotFoundError: No module named 'torch._C._distributed_c10d'; 'torch._C' is not a package