In [58]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import os
import random
from transformers import T5Tokenizer, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
import numpy as np
import json

In [59]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator, and PyTorch
    (CPU plus every CUDA device), then puts cuDNN in deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Recorded for child processes only: the running interpreter reads
    # PYTHONHASHSEED at start-up, so this does not change hashing in this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Both CUDA calls are silently ignored when no GPU is available.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning benchmarks several kernels and may select a different one on
    # each run, which defeats the deterministic flag above unless it is disabled.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [62]:
# Hyperparameters shared by the data pipeline and the Trainer below.
batch_size = 4          # examples per batch (DataLoader and per-device Trainer batch)
max_length = 512        # token cap for the "question: ... context: ..." input
target_length = 64      # token cap for the answer used as labels
num_epoch = 4
learning_rate = 2e-5

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device.")

# Local checkpoint directory next to the notebook. os.path.join produces the
# platform's separator, avoiding the invalid "\m" escape of a hand-written
# Windows path while yielding the same string on Windows.
checkpoint = os.path.join(".", "mengzi-t5-base")
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

Didn't find file .\mengzi-t5-base\added_tokens.json. We won't load it.
Didn't find file .\mengzi-t5-base\special_tokens_map.json. We won't load it.
Didn't find file .\mengzi-t5-base\tokenizer_config.json. We won't load it.
loading file .\mengzi-t5-base\spiece.model
loading file None
loading file None
loading file None
loading configuration file .\mengzi-t5-base\config.json
Model config T5Config {
  "_name_or_path": ".\\mengzi-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dt

Using cuda device.


In [63]:
class QADataset(Dataset):
    """Map-style dataset backed by a JSON-lines file (one JSON object per line)."""

    def __init__(self, data_file):
        """Read every example from ``data_file`` into memory.

        Args:
            data_file: Path to a UTF-8 text file with one JSON object per line.
        """
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` into a dict keyed by 0-based example position.

        Blank or whitespace-only lines are skipped instead of being handed to
        ``json.loads`` (which would raise), so a stray empty line in the file
        does not abort loading.

        Args:
            data_file: Path to the JSON-lines file.

        Returns:
            dict mapping ``0 .. n-1`` to the decoded object of each non-blank line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        Data = {}

        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                # len(Data) keeps the keys contiguous even when lines were
                # skipped, so __getitem__ works for every index below __len__.
                Data[len(Data)] = json.loads(line)

        return Data

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the decoded example stored at position ``index``."""
        return self.data[index]


# Load the DuReaderQG train and dev splits into memory.
train_dataset = QADataset('DuReaderQG/train.json')
dev_dataset = QADataset('DuReaderQG/dev.json')

# Sanity check: report both split sizes and show the first training example.
print(
    f"length of train dataset: {len(train_dataset)}",
    f"length of dev dataset: {len(dev_dataset)}",
    train_dataset[0],
    sep="\n",
)

length of train dataset: 14520
length of dev dataset: 984
{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}


In [65]:
def collate_fn(batch_samples):
    """Turn a list of raw QA examples into one tokenized seq2seq batch.

    Each example supplies 'question', 'context' and 'answer'. The model input
    is the text "question: <question> context: <context>"; the answer text
    becomes the labels.

    Args:
        batch_samples: Sequence of example dicts, as yielded by the dataset.

    Returns:
        The tokenizer's batch encoding for the inputs (PyTorch tensors, padded
        to the longest input in the batch and truncated to ``max_length``),
        extended with a "labels" tensor truncated to ``target_length`` in
        which padding positions are replaced by -100.
    """
    prompts = [
        f"question: {sample['question']} context: {sample['context']}"
        for sample in batch_samples
    ]
    answers = [sample['answer'] for sample in batch_samples]

    tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt')

    encoded = tokenizer(prompts, max_length=max_length, **tokenizer_options)

    # Tokenize the answers in target mode.
    with tokenizer.as_target_tokenizer():
        answer_encoding = tokenizer(answers, max_length=target_length, **tokenizer_options)

    label_ids = answer_encoding.input_ids
    # -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so padded
    # label positions do not contribute to the loss.
    pad_positions = label_ids == tokenizer.pad_token_id
    label_ids = label_ids.masked_fill(pad_positions, -100)

    encoded["labels"] = label_ids
    return encoded

# Batch iterators over the two splits; collate_fn tokenizes each batch on the fly.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# The dev loader wraps the dev *dataset* and gets its own name, so `dev_dataset`
# keeps referring to the raw examples (wrapping a DataLoader in a DataLoader,
# or overwriting `dev_dataset`, would break evaluation).
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
print(f"Number of batches on train dataset: {len(train_dataloader)}")
# Peek at one collated batch to confirm the tensor shapes.
print("inputs shape: ", {k: v.shape for k, v in next(iter(train_dataloader)).items()})

Number of batches on train dataset: 3630
inputs shape:  {'input_ids': torch.Size([4, 326]), 'attention_mask': torch.Size([4, 326]), 'labels': torch.Size([4, 14])}


In [66]:
# Load the seq2seq weights from the local checkpoint, then move them to the
# device selected in the configuration cell.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

loading configuration file .\mengzi-t5-base\config.json
Model config T5Config {
  "_name_or_path": ".\\mengzi-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "use_cache": true,
  "vocab_size": 32128
}

loading weights file .\mengzi-t5-base\pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model c

In [67]:
# Training configuration: evaluate and checkpoint once per epoch, keep at most
# two checkpoints, and reload the checkpoint with the lowest eval loss when
# training finishes.
training_args = TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=num_epoch,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_total_limit=2,
    # Spell out the current implicit default ("all installed integrations") so
    # behaviour does not silently change when the library default becomes "none".
    report_to="all",
)

# Trainer builds its own data loaders, so it is given datasets of raw examples
# (not DataLoader objects) plus collate_fn to tokenize each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_fn,
    tokenizer=tokenizer,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [68]:
# Start fine-tuning with the Trainer configured above; as the cell's last
# expression, the call's return value is what the notebook displays.
trainer.train()

***** Running training *****
  Num examples = 14520
  Num Epochs = 4


  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 14520


  0%|          | 0/14520 [00:00<?, ?it/s]

KeyboardInterrupt: 