In [1]:
# ! pip install transformers
import json
import os

import torch
from torch import cuda
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, T5ForConditionalGeneration, Text2TextGenerationPipeline
from transformers import TrainingArguments, Trainer, logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Alternative checkpoints, currently disabled:
# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# (t5-small: no Chinese was used for pre-training)

# tokenizer = AutoTokenizer.from_pretrained("mxmax/Chinese_Chat_T5_Base")
# model = AutoModelForSeq2SeqLM.from_pretrained("mxmax/Chinese_Chat_T5_Base")

# Checkpoint in use; model card:
# https://huggingface.co/uer/t5-small-chinese-cluecorpussmall
MODEL_NAME = "uer/t5-small-chinese-cluecorpussmall"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Use the GPU when one is available (e.g. on Google Colab), otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
model.to(device)

cpu


T5ForConditionalGeneration(
  (shared): Embedding(21228, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(21228, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [3]:
### Process training dataset
# Directory that holds the dataset files, relative to the notebook.
path = '../data/'

def read_data(file):
    """Read one dataset file and return its lines.

    Args:
        file: File name, resolved against the module-level ``path`` directory.

    Returns:
        list[str]: Every line of the file, with line endings preserved.
    """
    # The dataset contains Chinese text, so decode explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(os.path.join(path, file), encoding='utf-8') as handle:
        return handle.readlines()

# Load the three dataset splits; each one is the list of raw lines of a file.
train_set, dev_set, test_set = (
    read_data(split_file)
    for split_file in ('train_data.txt', 'dev_data.txt', 'test_data.txt')
)

# Peek at the first two raw training records.
print(train_set[:2])

['{"groundTruth": ["发扬光大", "平易近人", "温文尔雅"], "candidates": [["意气风发", "街谈巷议", "人才辈出", "一脉相传", "后继有人", "发扬光大", "腥风血雨"], ["平易近人", "落落大方", "八仙过海", "彬彬有礼", "史无前例", "盛气凌人", "好自为之"], ["不拘小节", "风流潇洒", "无病呻吟", "言谈举止", "壮志凌云", "关门闭户", "温文尔雅"]], "content": "由实力派演员刘威饰演的清华第三任校长蒋南翔，是我国著名的青年运动家和教育家，他跟清华终身校长梅贻琦一样，都是由清华人自己培养出来的校长。历史上的蒋南翔是著名的“一二九”学生救亡运动的领导人之一，他在清华校长之位14年期间，不但很好的继承了清华建校之初的优秀传统与理念，而且更加的#idiom#，他把清华的教师队伍扩大了将近5倍，将清华本科人数破万，为新中国培养了大量的有用人才。在《天行健》中饰演蒋南翔的刘威是观众所熟悉的著名实力派演员，早在1987年刘威就在《关东大侠》中饰演豪爽仗义的关云天一角而获得了金鸡奖最佳男主角的提名，后来更是因在《唐明皇》中精湛的表演而一举夺得金鹰奖最佳男演员奖。此次《天行健》选定刘威来出演正是看中了他#idiom#的表演方式和对人物深入内心的刻画。至此，《天行健》中涉及的三位清华校长的人选都已经曝光，#idiom#的第一任校长赵文?、稳重坚毅的第二任校长孙逊、亲切务实的第三任校长刘威，再加上梁思成、林徽因、朱自清、闻一多等一批“大师”的加盟，相信作为清华百年校庆重点项目之一的《天行健》一定会带领观众重温那段不能抹去的历史。", "realCount": 3}\n', '{"groundTruth": ["肥头大耳"], "candidates": [["超凡入圣", "骨瘦如柴", "青面獠牙", "虎背熊腰", "成人之美", "肥头大耳", "神不守舍"]], "content": "#idiom#的掌柜只穿一件衬衫，坐在柜台里。几个堂倌穿着脏得发黑的白工作服，因为没有顾客，都散坐在桌子旁。这当儿看到这位不寻常的客人，都露出好奇的神色列宁曾批评他理论上的错误，同时认为他“所写的全部哲学，赶紧迎上前来伺候。聂赫留朵夫要了一瓶矿泉水，在离窗较远的地方挨着一张

In [4]:
def preprocess(data):
    """Turn raw dataset records into model inputs and target idioms.

    Each record provides ``'content'`` (text containing ``#idiom#``
    placeholders), ``'candidates'`` (one list of candidate idioms per
    placeholder) and ``'groundTruth'`` (the correct idiom per placeholder).
    The placeholders are replaced left to right by ``(c1|c2|...)`` built from
    the matching candidate list.

    Args:
        data: Iterable of records, each either a JSON string (one line of the
            dataset file) or an already-parsed dict. The argument is not
            modified, so the function can be called again on the same data.

    Returns:
        tuple[list[str], list[list[str]]]: ``(text_input, idiom_output)``,
        where ``text_input[k]`` is the filled-in text of record ``k`` and
        ``idiom_output[k]`` is its ground-truth idiom list.
    """
    text_input = []
    idiom_output = []
    for raw_record in data:
        if isinstance(raw_record, (str, bytes)):
            if not raw_record.strip():
                # Ignore blank lines (e.g. a trailing newline at end of file).
                continue
            # Parse as JSON rather than eval(): the lines come from a file,
            # and eval would execute arbitrary code and reject JSON literals
            # such as true/false/null.
            record = json.loads(raw_record)
        else:
            record = raw_record

        filled_text = record['content']
        ground_truth = record['groundTruth']
        candidates = record['candidates']

        # One placeholder per ground-truth idiom; count=1 keeps the
        # replacements aligned with the order of the candidate lists.
        for slot in range(len(ground_truth)):
            options = '|'.join(candidates[slot])
            filled_text = filled_text.replace('#idiom#', '(' + options + ')', 1)

        text_input.append(filled_text)
        idiom_output.append(ground_truth)
    return text_input, idiom_output

# Build the (model input, target idioms) lists for every split.
(train_input, train_output), (dev_input, dev_output), (test_input, test_output) = (
    preprocess(split) for split in (train_set, dev_set, test_set)
)

print(train_input[:2], '\n', train_output[:2])

['由实力派演员刘威饰演的清华第三任校长蒋南翔，是我国著名的青年运动家和教育家，他跟清华终身校长梅贻琦一样，都是由清华人自己培养出来的校长。历史上的蒋南翔是著名的“一二九”学生救亡运动的领导人之一，他在清华校长之位14年期间，不但很好的继承了清华建校之初的优秀传统与理念，而且更加的(意气风发|街谈巷议|人才辈出|一脉相传|后继有人|发扬光大|腥风血雨)，他把清华的教师队伍扩大了将近5倍，将清华本科人数破万，为新中国培养了大量的有用人才。在《天行健》中饰演蒋南翔的刘威是观众所熟悉的著名实力派演员，早在1987年刘威就在《关东大侠》中饰演豪爽仗义的关云天一角而获得了金鸡奖最佳男主角的提名，后来更是因在《唐明皇》中精湛的表演而一举夺得金鹰奖最佳男演员奖。此次《天行健》选定刘威来出演正是看中了他(平易近人|落落大方|八仙过海|彬彬有礼|史无前例|盛气凌人|好自为之)的表演方式和对人物深入内心的刻画。至此，《天行健》中涉及的三位清华校长的人选都已经曝光，(不拘小节|风流潇洒|无病呻吟|言谈举止|壮志凌云|关门闭户|温文尔雅)的第一任校长赵文?、稳重坚毅的第二任校长孙逊、亲切务实的第三任校长刘威，再加上梁思成、林徽因、朱自清、闻一多等一批“大师”的加盟，相信作为清华百年校庆重点项目之一的《天行健》一定会带领观众重温那段不能抹去的历史。', '(超凡入圣|骨瘦如柴|青面獠牙|虎背熊腰|成人之美|肥头大耳|神不守舍)的掌柜只穿一件衬衫，坐在柜台里。几个堂倌穿着脏得发黑的白工作服，因为没有顾客，都散坐在桌子旁。这当儿看到这位不寻常的客人，都露出好奇的神色列宁曾批评他理论上的错误，同时认为他“所写的全部哲学，赶紧迎上前来伺候。聂赫留朵夫要了一瓶矿泉水，在离窗较远的地方挨着一张铺有肮脏桌布的小桌坐下。'] 
 [['发扬光大', '平易近人', '温文尔雅'], ['肥头大耳']]


In [6]:
# # Results before fine-tuning
# def postprocess(text):
#     return text.replace(".", "").replace('</>','')

# def answer_fn(text, top_k=50):
#     encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=256, return_tensors="pt").to(device) 
#     out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_length=512,temperature=0.5,do_sample=True,repetition_penalty=3.0 ,top_k=top_k)
#     result = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
#     return postprocess(result[0])

# x1 = """世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得”，就需要各方面密切配合、（凭空捏造|高头大马|通力合作|同舟共济|和衷共济|蓬头垢面|紧锣密鼓）。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、(叫苦连天|量体裁衣|金榜题名|百战不殆|知彼知己|风流才子)、
# 灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。"""

# # y1 = ["高头大马", "叫苦连天"]

# result=answer_fn(x1, top_k=50)
# print("模型生成:",result)
# print('*'*100)

In [None]:
# Tokenize the data
def tokenize(batch):
    """Tokenize one (input text, target idioms) example.

    Args:
        batch: Indexable pair; ``batch[0]`` is the source text and
            ``batch[1]`` is the target to encode.

    Returns:
        tuple: ``(encoded_input, encoded_target)`` — the tokenizer output for
        each element of the pair, both moved to ``device``.
    """
    def _encode(text):
        # Same tokenizer settings for both sides of the pair.
        return tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

    # Build the pair explicitly so the target encoding is returned together
    # with the input encoding.
    encoded_input = _encode(batch[0])
    encoded_target = _encode(batch[1])
    return encoded_input, encoded_target

# Tokenize every (input, target) training pair.
train_dataset = list(map(tokenize, zip(train_input, train_output)))

In [None]:
# Display the first tokenized training example as a sanity check.
train_dataset[0]

In [None]:
# Train
# The arguments below enable periodic evaluation (evaluation_strategy='steps')
# and load_best_model_at_end, both of which need an evaluation dataset, so the
# dev split is tokenized here and handed to the Trainer.
dev_dataset = [tokenize(batch) for batch in zip(dev_input, dev_output)]

# NOTE(review): confirm that the items produced by tokenize() are in a format
# the Trainer's data collator accepts (it presumably expects mapping-like
# examples carrying input_ids / labels) - TODO verify before a full run.

# Define the training parameters
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             
    per_device_train_batch_size=2,   # batch size per device during training
    per_device_eval_batch_size=2,    # batch size for evaluation
    gradient_accumulation_steps=4,   # train on GPU more efficiently
    gradient_checkpointing=True,
    # fp16=True, 
    optim='adafactor',               ## to be revised
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,              # number of steps between logging messages
    save_total_limit=1,              # limit the total amount of checkpoints that can be saved
    save_steps=5000,                 # number of steps between saving checkpoints
    evaluation_strategy='steps',     # evaluation strategy to adopt during training
    eval_steps=1000,                 # number of steps between evaluations
    load_best_model_at_end=True,     # load the best model when training ends
)

# Train the model
trainer = Trainer(
    model=model,                     
    args=training_args,              
    train_dataset=train_dataset,     
    eval_dataset=dev_dataset,        # used by the step-wise evaluation configured above
)

trainer.train()

In [None]:
# Tokenize the dev split and evaluate the trainer on it.
dev_dataset = list(map(tokenize, zip(dev_input, dev_output)))
trainer.evaluate(eval_dataset=dev_dataset)


In [None]:
# def answer_fn(text, top_k=50):
#     encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=256, return_tensors="pt").to(device) 
#     out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_length=512,temperature=0.5,do_sample=True,repetition_penalty=3.0 ,top_k=top_k)
#     result = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
#     return postprocess(result[0]) 
# while True:
#     text = input('Please input []')
#     result=answer_fn(text, top_k=50)
#     print("Summary:",result)
#     print('*'*100)