In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding


data_files = {
    'train': 'data/DuReaderQG/train.json',
    'validation': 'data/DuReaderQG/dev.json'
}
raw_datasets = load_dataset("json", data_files=data_files)
print(raw_datasets)
print(raw_datasets['train'].features)

DatasetDict({
    train: Dataset({
        features: ['context', 'answer', 'question', 'id'],
        num_rows: 14520
    })
    validation: Dataset({
        features: ['context', 'answer', 'question', 'id'],
        num_rows: 984
    })
})
{'context': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'id': Value(dtype='int64', id=None)}


In [None]:
from transformers import AutoModelForSeq2SeqLM

checkpoint = 'langboat/mengzi-t5-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [6]:
raw_datasets['train'][0]

{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
 'answer': '第35集',
 'question': '仙剑奇侠传3第几集上天界',
 'id': 0}

In [7]:
result = tokenizer(
    raw_datasets['train'][0]['question'],
    raw_datasets['train'][0]['context'],
    truncation=True,
    return_tensors='pt'
)
result

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[    7,  1707,  1467,   992,  3979,   707,   100,   379,   645,   647,
          9724,  1252,     1,  3389,  2838,   647,   843,   408, 10694, 17973,
          1763,     3,  1276,    87,    84,  2744,    84,  1419,  6141,     3,
           122,  6409,     9,  2177, 17534,     5,  1707,  1468, 11725,   229,
             3,   408,  5542,   119, 28060,     3, 18440,  3844,     4,  5542,
          2190,  1468,     3,    54,  1069, 12914,    83,  5665,   335,   215,
             9,  4514, 17339,    69,     4,   843,   408,  1200,  3771,  1902,
            10,     3,  6625,  8435,   603,  1100,     3,   266,   119,  1954,
             4,  5542,   145,   711, 27530, 11755,     3, 13737,    21,   304,
          3779,    68,   843,   408,     5, 21769,  7074,     4,   122,  6409,
          5184,   299,   854,     5, 21769,     3,   299,   854,   720,   267,
           448,   756,    58,   807,  2037,    87,  1252,   481,    15,  2125,
             4,   122,  6409,  3412,  

In [8]:
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        raw_datasets['train'][0]['answer'],
        truncation=True,
        return_tensors='pt'
    )['input_ids']
labels



tensor([[3389, 2838,  647,    1]])

In [9]:
label_ids = model.prepare_decoder_input_ids_from_labels(labels)
label_ids

tensor([[   0, 3389, 2838,  647]])

In [15]:
import torch
def tokenize_function(example):
    result = tokenizer(
        example['question'],
        # example['context'],
        truncation=True,
        padding=True,
        return_tensors='pt'
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example['answer'],
            truncation=True,
            padding=True,
            return_tensors='pt'
        )['input_ids']
        result['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)
        end_token_idx = torch.where(labels == tokenizer.eos_token_id)[1]
        for idx, end_idx in enumerate(end_token_idx):
            labels[idx][end_idx + 1:] = -100
        example['labels'] = labels
    return result

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map: 100%|██████████| 14520/14520 [00:00<00:00, 31723.29 examples/s]
Map: 100%|██████████| 984/984 [00:00<00:00, 28364.63 examples/s]


In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['context', 'answer', 'question', 'id', 'labels', 'input_ids', 'attention_mask', 'decoder_input_ids'],
        num_rows: 14520
    })
    validation: Dataset({
        features: ['context', 'answer', 'question', 'id', 'labels', 'input_ids', 'attention_mask', 'decoder_input_ids'],
        num_rows: 984
    })
})

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments('output/model', eval_strategy='epoch')

In [18]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [19]:
trainer.train()



ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
predictions = trainer.predict(tokenized_datasets['validation'])
print(predictions.predictions.shape, predictions.label_ids.shape)