In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tqdm import tqdm
import torch

BATCH_SIZE=16
LR=0.0001
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 构建DataCollator对象
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# 读取自己的语料库，一行代表一个样本
print('-- File reading...')
# 构建TextDataset对象
with tqdm(total=100) as pbar:
    train_dataset = TextDataset(tokenizer=tokenizer, file_path='./dataset/dailydialog/train.txt', block_size=128)
    pbar.update(90)
    eval_dataset = TextDataset(tokenizer=tokenizer, file_path='./dataset/dailydialog/test.txt', block_size=128)
    pbar.update(10)
print('-- Model loading...')
# 读取GPT2预训练模型
with tqdm(total=100) as pbar:
    model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
    pbar.update(100)
print('-- Prepare for training...')
# 定义训练参数
with tqdm(total=100) as pbar:
    training_args = TrainingArguments(
        output_dir='./results',      # 输出目录
        overwrite_output_dir=False,
        num_train_epochs=1,          # 训练轮数
        per_device_train_batch_size=BATCH_SIZE,  # 每个设备的训练批次大小
        save_total_limit=10,             # 最多保存的模型数量
        do_train=True,
        do_eval=True,
        evaluation_strategy='epoch',
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=2,
        learning_rate=LR,
        weight_decay=0.005,
        warmup_steps=100,
        logging_steps=200,
        save_steps=2000,
    )
    # 初始化Trainer对象
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    pbar.update(100)
print('-- Start training...')
# 开始训练
trainer.train()

-- File reading...


100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 237.81it/s]


-- Model loading...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 30.67it/s]


-- Prepare for training...


  0%|                                                                                          | 0/100 [00:00<?, ?it/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate`: Run `pip install --upgrade accelerate`

In [4]:
train_dataset[0]

tensor([ 1279,    66,    29, 13816,   837,  5395,   837,   703,   546,  1016,
          329,   257,  1178, 16800,   706,  8073,  5633,  1279,    80,    29,
          921,   760,   326,   318, 29850,   475,   318,  1107,   407,   922,
          329,   674, 13547,   764,  1279,    64,    29,   198, 25515,   837,
         5395,   837,   703,   546,  1016,   329,   257,  1178, 16800,   706,
         8073,  5633,  1279,    66,    29,   921,   760,   326,   318, 29850,
          475,   318,  1107,   407,   922,   329,   674, 13547,   764,  1279,
           80,    29,  1867,   466,   345,  1612,  5633,   632,   481,  1037,
          514,   284,  8960,   764,  1279,    64,    29,   198, 25515,   837,
         5395,   837,   703,   546,  1016,   329,   257,  1178, 16800,   706,
         8073,  5633,   921,   760,   326,   318, 29850,   475,   318,  1107,
          407,   922,   329,   674, 13547,   764,  1279,    66,    29,  1867,
          466,   345,  1612,  5633,   632,   481,  1037,   514])