In [1]:
import pdb
import jieba
import torch
from torch.nn import CrossEntropyLoss
from transformers import XLNetTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class args():
    lr = 3e-5
    epoch = 10
    batch_size = 16
    max_length = 80
    weight_decay = 0.01
    output_dir = "./results"

In [3]:
def get_dataset():
    import pandas as pd
    from datasets import load_dataset
    data = pd.read_excel("trial.xls")
    pd.DataFrame.to_csv(data, "trial.csv")
    dataset = load_dataset('csv', data_files="trial.csv")
    dataset = dataset['train'].train_test_split(train_size=0.8)
    return dataset

In [4]:
pad_id = 0

In [5]:
class XLNetTokenizer(XLNetTokenizer):
    translator = str.maketrans(" \n", "\u2582\u2583")
    def _tokenize(self, text, *args, **kwargs):
        text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        text = " ".join(text)
        return super()._tokenize(text, *args, **kwargs)
    def _decode(self, *args, **kwargs):
        text = super()._decode(*args, **kwargs)
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
        return text

In [6]:
class Trainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs[0]
        labels = inputs['input_ids']
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # 忽略pad_id的loss,并对所有的非pad_id的loss进行求和
        loss_fct = CrossEntropyLoss(ignore_index=pad_id, reduction='sum')  
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [7]:
model_name = 'mymusise/CPM-Generate-distill'
tokenizer = XLNetTokenizer.from_pretrained(model_name)
global pad_id
pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CpmTokenizer'. 
The class this function is called from is 'XLNetTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
datasets = get_dataset()
cls, sep = tokenizer.cls_token, tokenizer.sep_token
datasets['train'] = datasets['train'].map( 
    lambda e: tokenizer(
        [cls + e['keyword'][i] + sep + e['sentence'][i] + sep for i in range(len(e['keyword']))],
        truncation=True, padding='max_length', max_length=args.max_length
    ), batched=True,
)
datasets['train'].set_format(type='torch', columns=['input_ids'])
datasets['test'] = datasets['test'].map( 
    lambda e: tokenizer(
        [cls + e['keyword'][i] + sep + e['sentence'][i] + sep for i in range(len(e['keyword']))],
        truncation=True, padding='max_length', max_length=args.max_length
    ), batched=True,
)
datasets['test'].set_format(type='torch', columns=['input_ids'])

Using custom data configuration default-bbb762a6384ebf4e


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-bbb762a6384ebf4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2910.69it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 193.42it/s]


Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-bbb762a6384ebf4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 471.64it/s]
  0%|          | 0/2 [00:00<?, ?ba/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.508 seconds.
Prefix dict has been built successfully.
100%|██████████| 2/2 [00:01<00:00,  1.97ba/s]
100%|██████████| 1/1 [00:00<00:00,  7.69ba/s]


In [9]:
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
for p in model.parameters():
    p.requires_grad = True

In [10]:
training_args = TrainingArguments(
    learning_rate=args.lr,
    output_dir=args.output_dir,
    evaluation_strategy='epoch',
    num_train_epochs=args.epoch,
    weight_decay=args.weight_decay,
    per_device_eval_batch_size=args.batch_size,
    per_device_train_batch_size=args.batch_size,
)
trainer = Trainer(
    model=model, 
    args=training_args, 
    eval_dataset=datasets['test'],
    train_dataset=datasets['train'],
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Unnamed: 0, sentence, keyword, Unnamed: 0.1. If Unnamed: 0, sentence, keyword, Unnamed: 0.1 are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1649
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1040


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,859.949200,No log
6,859.949200,No log
7,859.949200,No log
8,859.949200,No log
9,859.949200,No log
10,374.552000,No log


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Unnamed: 0, sentence, keyword, Unnamed: 0.1. If Unnamed: 0, sentence, keyword, Unnamed: 0.1 are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 413
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Unnamed: 0, sentence, keyword, Unnamed: 0.1. If Unnamed: 0, sentence, keyword, Unnamed: 0.1 are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 413
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Unnamed: 0, sentence, keyword, Unnamed: 0.1. If Unnamed: 0, sentence, keyword, Unnamed: 0.1 are not expected 

TrainOutput(global_step=1040, training_loss=605.3508610652043, metrics={'train_runtime': 98.9887, 'train_samples_per_second': 166.585, 'train_steps_per_second': 10.506, 'total_flos': 673235251200000.0, 'train_loss': 605.3508610652043, 'epoch': 10.0})

In [53]:
from transformers import TextGenerationPipeline
model = model.cpu()
text_generater = TextGenerationPipeline(model, tokenizer)
keyword = "晚安"
print('关键词:', keyword)
keyword = tokenizer.cls_token + keyword + tokenizer.sep_token
text = text_generater(keyword, max_length=80, top_k=1, use_cache=True, prefix='')[0]['generated_text']
text = text[len(keyword):]
print('土味情话:', end=' ')
for s in text:
    print(s, end='')
    if s == tokenizer.sep_token or s == '。':
        break

关键词: 晚安
土味情话: 你知道我的爱有多深吗?我的爱就像大海,波澜不惊,只是为了见你一眼,就已经蓄谋已久。