# T5/mT5-small: Prompt Engineering, Evaluation and Comparison

## Imports and Device Setting

In [1]:
! pip install transformers
! pip install sentencepiece    # for AutoTokenizer
import os
import torch
import re
from torch import cuda, nn, optim
from transformers import BertTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer, logging
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
manual_seed = 585
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
cuda


In [3]:
# path = '../data/'    # change the path as needed
path = '/content/gdrive/My Drive/585data/'

def read_data(file):
    with open (path+file) as t:
        data = t.readlines()
        for i in range(len(data)):
            data[i] = eval(data[i])
    return data

test_set = read_data('test_data.txt')[:3000]

# type(train_set)
print(test_set[:2], '\n', len(test_set))

[{'groundTruth': ['现身说法'], 'candidates': [['旷日持久', '公正廉洁', '苦口婆心', '现身说法', '白日做梦', '深入浅出', '肺腑之言']], 'content': '只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则#idiom#，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。', 'realCount': 1}, {'groundTruth': ['神来之笔', '赞不绝口'], 'candidates': [['画龙点睛', '悔过自新', '拍案叫绝', '鬼斧神工', '神来之笔', '颠倒黑白', '中流砥柱'], ['敬谢不敏', '拍案叫绝', '心悦诚服', '叹为观止', '赞不绝口', '口口声声', '扬眉吐气']], 'content': '亨利的这个#idiom#被法国媒体形容为“空中舞蹈”，亨利自己对球队表现也很满意，“上半场开局一般，但很快觉醒，下半场的进攻让人看到真正的法国，尤其是我们的速度让对方有了麻烦。”而和亨利搭档的本泽马对老大哥#idiom#，“和他在一起配合很容易，他给了我很多信心。”', 'realCount': 2}] 
 3000


## Evaluation Functions

In [4]:
def f1_score(sys, gold):
    tp = 0
    total = 0
    pos = 0
    for s, g in zip(sys, gold):
        total += len(g)
        pos += len(s)
        tp += len(g & s)
    precision = tp / pos if pos != 0 else 0
    recall = tp / total if total != 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return precision, recall, f1, tp
    
def accuracy(sys, gold, tp):
    total = 0
    for s, g in zip(sys, gold):
        total += len(g)
    return tp / total

## Load the Pre-trained T5-small Model

In [5]:
base_t5_cn = T5ForConditionalGeneration.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
base_t5_cn.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(21228, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(21228, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
# Tokenizers
bert_tokenizer = BertTokenizer.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
auto_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")



## Can Prompt Engineering Only Work Well?

In [7]:
def prompt(data):
    '''data shall be the output of `read_data`'''
    text_input = []
    gold_text = []
    for i in range(len(data)):
        # data[i] = eval(data[i])
        input_text = data[i]['content']
        candidates = data[i]['candidates']
        ground_truth = set(data[i]['groundTruth'])
        gold_text.append(ground_truth)

        candidate_str = ''
        for candidate in candidates:
            candidate_str += '('+'|'.join(candidate)+')'
        
        preprocess_idx = -1
        def replace(match):
            nonlocal preprocess_idx
            preprocess_idx += 1
            return 'extra{}'.format(preprocess_idx)
        input_text = re.sub(r'#idiom#', replace, input_text)

        instruction = '请从下列括号中分别选择合适的成语填入空缺处：{}'.format(candidate_str)
        
        text_input.append(instruction+'\n'+input_text)

    return text_input, gold_text

In [8]:
def postprocess(text):
    text = text.replace("0", "").replace("1", "").replace("2", "").replace("3", "").replace("4", "").replace("5", "").replace("6", "").replace("7", "").replace("8", "").replace("9", "")
    return text.replace(".", "").replace(' ','').replace('extra', ',')
    
def answer_fn(model, text, tokenizer, top_k=50):
    encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=256, return_tensors="pt", return_token_type_ids=False).to(device) 
    out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_length=512,temperature=0.5,do_sample=True,repetition_penalty=3.0 ,top_k=top_k)
    result = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(result[0])

### T5-small Pretrained on Chinese





In [9]:
# these codes run super slow, and the example shows a mess, so we only test on a small set
test_input, gold_text = prompt(test_set[:100])
print(test_input[0])
answer_fn(base_t5_cn, test_input[0], bert_tokenizer)

请从下列括号中分别选择合适的成语填入空缺处：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。


'但开,编尽管,先,虽,本经精紧定,一才,西。无字,,于同,果新国身与,金段,至,奇,到,上,吃,、得不知,：免,美,守等,单基网世,有,子文便,值意限心科,班小学月,会,,点，,何格,修（论自分方？位着,送的业下作,,,额马你更社神从旧,前,过问情付和,批,微博终气委发家生总低代,手,华命性为之变成,,,外余热折时卡,指均,我中居,要进,是,官险推真以寸,看则说老最费,,三,股莫主刘传恩,号通梦否,能人工图,,,非初,笔,日全面收效,周显易错实环负内来往黄,,,可奥,后水,二(,计向克口究程银英女物各在其,保正,尚然,盘除将,万准假,年,男！深含左胡难右重称,供虚权波,关底,多表记北出,省既,,品名,,,还双,起,立隐四被元东千,交听,,,依,事西型城两晚茶据古,,,他,,,高用坐注早【,《,镇著,特,固,空了,,,,,西化奖现长群而雅?证,压,这种,必止活尽际常把,明没菜者新零,,别,信,确道,,好米皮筋动,,楼,,专,因,达欢,喜市,吉言客村c尽此尽,农户部整处约张,喝菲懂又爱,离,印,就夫样,打搞端贵致价,员取,,汤,,码路,综南尽,,台,,套源,死,字,简,近到,不,'

In [10]:
gold_text[:10]

[{'现身说法'},
 {'神来之笔', '赞不绝口'},
 {'难分难舍'},
 {'先天不足'},
 {'不寒而栗'},
 {'凶多吉少'},
 {'堂而皇之'},
 {'精神抖擞'},
 {'孤注一掷'},
 {'闭门造车'}]

In [11]:
sys = []
for q in test_input:  
    a = answer_fn(base_t5_cn, q, bert_tokenizer)
    a = a.strip(' .。，#<extra0123456789>').split(',')
    sys.append(set(a))

In [12]:
p, r, f1, tp = f1_score(sys, gold_text)
print('Prompt Engineering on T5-small before fine-tuning: ')
print(f"Accuracy for test set is {accuracy(sys, gold_text, tp)}")
print(f"F1 score for test set is {f1}")

Prompt Engineering on T5-small before fine-tuning: 
Accuracy for test set is 0.0
F1 score for test set is 0


### mT5-small


In [13]:
base_mt5 = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
base_mt5.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [14]:
print(test_input[0])
answer_fn(base_mt5, test_input[0], auto_tokenizer)

请从下列括号中分别选择合适的成语填入空缺处：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。


'<,_id_>。'

In [15]:
sys = []
for q in test_input:  
    a = answer_fn(base_t5_cn, q, bert_tokenizer)
    a = a.replace('_id_', '').strip('<>. ').split(',')
    sys.append(set(a))

In [16]:
p, r, f1, tp = f1_score(sys, gold_text)
print('Prompt Engineering on mT5-small before fine-tuning: ')
print(f"Accuracy for test set is {accuracy(sys, gold_text, tp)}")
print(f"F1 score for test set is {f1}")

Prompt Engineering on mT5-small before fine-tuning: 
Accuracy for test set is 0.0
F1 score for test set is 0


In [17]:
sys[:2]

[{'',
  '(四时城压了男张',
  '?复',
  '。反',
  '一',
  '下无',
  '不',
  '与点和基段',
  '专',
  '东信新盘合李空袁白',
  '为称多',
  '主权约中',
  '之余免知科性',
  '书',
  '事含程物周代',
  '二同寸',
  '亲言高处既',
  '以图班于子上值西',
  '传他管但水离正交积实女费',
  '位论',
  '何',
  '依',
  '修错通',
  '先准微奇将',
  '其',
  '写双欢负楼活',
  '分月格是',
  '创',
  '北致',
  '单',
  '印码神非',
  '发。',
  '古省克向晚',
  '可批开计得',
  '史假头',
  '右',
  '名英',
  '后',
  '听《',
  '品',
  '因深',
  '固尽千',
  '国你到从限学',
  '均菜市立',
  '外口然号种付供梦三端起源股除两',
  '好记账',
  '委小',
  '字',
  '守',
  '尚',
  '工',
  '已达零搞出',
  '年',
  '往用奥',
  '心',
  '息',
  '成过低世定人',
  '我',
  '房户西',
  '打贵',
  '技重奖文倒装么',
  '把',
  '折',
  '数著',
  '新金会',
  '日网问',
  '旧自方',
  '更',
  '有收终',
  '本尽明才',
  '机',
  '来说卡',
  '果马文经吃否生',
  '样被全环家',
  '死',
  '气真胡着',
  '汉价',
  '爱懂动',
  '特',
  '现',
  '疑额作易莫指最初前皮',
  '的',
  '看华老官证',
  '社推进手、',
  '笔',
  '米夫',
  '精',
  '紧总居万',
  '美',
  '而效',
  '至',
  '菲新',
  '藏命显究西西欧波型银',
  '虚隐',
  '要底',
  '身',
  '轻刘恩别左长注',
  '送变意，能等便',
  '酷劲化',
  '险',
  '面关保各博者常短',
  '题客据',
  '黄坐在则还早',
  '！',
  '（',
  '）',
  '：情热业内',
  '？'},
 {'',
 

## Preprocess Test Input to Feed in the Fine-tuned Models

In [18]:
def preprocess(data, tokenizer):
    text_input = []
    idiom_output = []
    for i in range(len(data)):
        input_text = data[i]['content']
        ground_truth = data[i]['groundTruth']
        candidates = data[i]['candidates']

        candidate_str = ''
        for candidate in candidates:
            candidate_str += '('+'|'.join(candidate)+')'
        
        preprocess_idx = -1
        def replace(match):
            nonlocal preprocess_idx
            preprocess_idx += 1
            return 'extra{}'.format(preprocess_idx)
        input_text = re.sub(r'#idiom#', replace, input_text)

        instruction = '请从下列括号中分别选择合适的成语填入空缺处：{}'.format(candidate_str)
        # input_text = input_text.replace('#idiom#', '_')
        output_text = ','.join(ground_truth)
        
        text_input.append(instruction+'\n'+input_text)
        idiom_output.append(output_text)
    
    print(text_input[0], idiom_output[0])    
    input_tok = tokenizer.batch_encode_plus(text_input,
                                            add_special_tokens=False, 
                                            return_token_type_ids=False)
    output_tok = tokenizer.batch_encode_plus(idiom_output, 
                                             add_special_tokens=False,
                                             return_token_type_ids=False)
    return input_tok, output_tok

In [19]:
class IdiomDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]

        target_ids = self.outputs['input_ids'][idx]
        target_attention_mask = self.outputs['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "output_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_output = [torch.LongTensor(example['output_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_output, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}

def to_device(data, device):
    new_data = {}
    for k in data:
        # k = k.to(device)
        new_data[k] = data[k].to(device)
    return new_data

In [20]:
@torch.no_grad()
def fill_idiom(model, loader, tokenizer):

    all_preds = []
    all_labels = []
    model.eval()
    for batch in loader:
        batch = to_device(batch, device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = model.generate(input_ids=input_ids, 
                                 attention_mask=attention_mask, 
                                 return_dict_in_generate=True, 
                                 pad_token_id=tokenizer.pad_token_id, 
                                 max_length=512, 
                                 top_k=15)
        truncated_outputs = []

        decode_texts = tokenizer.batch_decode([l[l != 0] for l in outputs['sequences']])
        gold_texts = tokenizer.batch_decode([l[l != 0] for l in labels])
        # print(decode_texts, gold_texts)
        for gold, decode in zip(gold_texts, decode_texts):
            l = set(gold.replace(' ', '').replace('[CLS]', '').split(','))
            p = set(decode.replace(' ', '').replace('[CLS]', '').split(','))
            # print(l, p)
            all_labels.append(l)
            all_preds.append(p)
        # print(decode_texts)
        # print(gold_texts)
        # break
    
    return all_preds, all_labels


## Evaluate the Fine-tuned Models on Test Set

In [21]:
# for T5-small
test_input, test_output = preprocess(test_set, bert_tokenizer)

请从下列括号中分别选择合适的成语填入空缺处：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。 现身说法


In [22]:
test_dataset = IdiomDataset(test_input, test_output)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn, shuffle=False)

### T5-small Fine-tuned on the 20,000 Training Data

In [23]:
tuned_t5_cn = T5ForConditionalGeneration.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
tuned_t5_cn.load_state_dict(torch.load(path+"T5-small_model_5epoch.pt", map_location=device))
tuned_t5_cn.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(21228, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(21228, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [24]:
tokenizer = bert_tokenizer   # for colate_fn function
sys, gold = fill_idiom(tuned_t5_cn, test_loader, bert_tokenizer)
p, r, f1, tp = f1_score(sys, gold)

In [25]:
print('T5-small trained on 20000 data: ')
print(f"Accuracy for test set is {accuracy(sys, gold, tp)}")
print(f"F1 score for test set is {f1}")

T5-small trained on 20000 data: 
Accuracy for test set is 0.4138309549945115
F1 score for test set is 0.4152554041029877


### T5-small Fine-tuned on 200,000 Training Data (NOT Standard)

In [26]:
tuned_t5_cn_bd = T5ForConditionalGeneration.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
tuned_t5_cn_bd.load_state_dict(torch.load(path+"T5-small_model_3epoch_20wData.pt", map_location=device))
tuned_t5_cn_bd.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(21228, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(21228, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [27]:
sys, gold = fill_idiom(tuned_t5_cn_bd, test_loader, bert_tokenizer)
p, r, f1, tp = f1_score(sys, gold)

In [28]:
print('T5-small trained on 200000 data: ')
print(f"Accuracy for test set is {accuracy(sys, gold, tp)}")
print(f"F1 score for test set is {f1}")

T5-small trained on 200000 data: 
Accuracy for test set is 0.6789242590559824
F1 score for test set is 0.6791106231128191


### mT5-small Fine-tuned on 20,000 Training Data

In [29]:
tuned_mt5 = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
tuned_mt5.load_state_dict(torch.load(path+"mT5-small_model_5epoches.pt", map_location=device))
tuned_mt5.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [30]:
# The tokenizer is changed, so the test data needs to be re-preprocessed
test_input, test_output = preprocess(test_set, auto_tokenizer)
test_dataset = IdiomDataset(test_input, test_output)
 
tokenizer = auto_tokenizer  # for collate_fn function
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn, shuffle=False)

请从下列括号中分别选择合适的成语填入空缺处：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。 现身说法


In [31]:
sys, gold = fill_idiom(tuned_mt5, test_loader, auto_tokenizer)
p, r, f1, tp = f1_score(sys, gold)

In [32]:
sys[:10]

[{'现身说法'},
 {'赞不绝口', '鬼斧神工'},
 {'依依不舍'},
 {'先天不足'},
 {'瞠目结舌'},
 {'十拿九稳'},
 {'堂而皇之'},
 {'昂首阔步'},
 {'山穷水尽'},
 {'照本宣科'}]

In [33]:
print('mT5-small trained on 20000 data: ')
print(f"Accuracy for test set is {accuracy(sys, gold, tp)}")
print(f"F1 score for test set is {f1}")

mT5-small trained on 20000 data: 
Accuracy for test set is 0.4234357848518112
F1 score for test set is 0.42419243986254296


## Can the Fine-tuned Model Understand Similar Instruction(s)?

In [34]:
def prompt_2(data):
    '''data shall be the output of `read_data`'''
    text_input = []
    
    for i in range(len(data)):
        # data[i] = eval(data[i])
        input_text = data[i]['content']
        candidates = data[i]['candidates']

        candidate_str = ''
        for candidate in candidates:
            candidate_str += '('+'|'.join(candidate)+')'
        
        preprocess_idx = -1
        def replace(match):
            nonlocal preprocess_idx
            preprocess_idx += 1
            return 'extra{}'.format(preprocess_idx)
        input_text = re.sub(r'#idiom#', replace, input_text)

        instruction = '请依次选择括号里的成语填空：{}'.format(candidate_str)
        
        text_input.append(instruction+'\n'+input_text)

    return text_input

In [35]:
test_input_2 = prompt_2(test_set)
print(test_input_2[0])
answer_fn(tuned_t5_cn, test_input_2[0], bert_tokenizer)

请依次选择括号里的成语填空：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。


'白日做梦开点和,远所定尽,'

In [36]:
test_input_2 = prompt_2(test_set)
print(test_input_2[0])
answer_fn(tuned_mt5, test_input_2[0], auto_tokenizer)

请依次选择括号里的成语填空：(旷日持久|公正廉洁|苦口婆心|现身说法|白日做梦|深入浅出|肺腑之言)
只要路过的旅客稍有迟疑，或者对他们的宣传单多看几眼，基本上这个旅客就别想轻松脱身了，记者就在9月3日接站时目睹了这样一幕：一个学生接过招生人员递来的宣传单，只是问了一下“你们学校有没有分数要求？”两个招生人员就“白话”开了，一个表示分数都好说，只要有好学的精神；另一个则extra0，大讲自己选择的专业现在收获颇丰；最后在招生人员“我们学校毕业后可以完全解决就业”的忽悠下，这个学生旅客被他们拉上了到校参观的班车。


'现身说法,白日做梦+uzo真ensya'

Seems not... Let's see some more data!

In [37]:
sys = []
for q in test_input_2[:100]:  
    a = answer_fn(tuned_t5_cn, q, bert_tokenizer)
    a = a.strip(' .。，#<>').split(',')
    sys.append(set(a))

In [38]:
p, r, f1, tp = f1_score(sys, gold_text)
print('Use different prompts as what was trained on T5-small after fine-tuning: ')
print(f"Accuracy for test set is {accuracy(sys, gold_text, tp)}")
print(f"F1 score for test set is {f1}")

Use different prompts as what was trained on T5-small after fine-tuning: 
Accuracy for test set is 0.05982905982905983
F1 score for test set is 0.04142011834319526


In [39]:
sys[:10]

[{'分依', '动', '白日做梦'},
 {'', '画龙点睛', '立同价不', '赞不绝口'},
 {'依居不舍之', '无一风有', '无不', '风'},
 {'之可迷动心', '先天不足有'},
 {'心地一人', '骇人听闻动'},
 {'凶多吉少入水高长一'},
 {'', '不心手', '招摇过市可神'},
 {'名言人可行人', '昂首阔步心分', '知地'},
 {'孤注一掷地心地'},
 {'', '光照本宣科定一', '大'}]

In [43]:
sys = []
for q in test_input_2[:100]:  
    a = answer_fn(tuned_mt5, q, auto_tokenizer)
    a = a.replace('_id_', '').strip(' .。，#<>').split(',')
    sys.append(set(a))

In [44]:
p, r, f1, tp = f1_score(sys, gold_text)
print('Use different prompts as what was trained on mT5-small after fine-tuning: ')
print(f"Accuracy for test set is {accuracy(sys, gold_text, tp)}")
print(f"F1 score for test set is {f1}")

Use different prompts as what was trained on mT5-small after fine-tuning: 
Accuracy for test set is 0.1623931623931624
F1 score for test set is 0.08539325842696631


In [45]:
sys[:10]

[{'现身说法', '白日做梦<xD>lerken了下来说一点后来迟疑了一些真相走了出来以下是一本书铺的信息填空并如单了起来的过程'},
 {'拍案叫绝',
  '诚心悦信后当五感慨ensyawność勉вачкаReturn至衷地说lerken以下是口腔音词填空了!”泣为观止dunk力forward义标引来深いただきたい谈以此推手hemiksipanFinishவையும்称惜望度缓keet迟提abilitymezichtelimkupusius谅itingValue精神полните',
  '赞不尽服enschaft敬谢意念惭Feedback扬眉吐气шимиCancel赏道声Receive明华说纷纷!)'},
 {'>bes',
  '依循无间',
  '相随两人之间在一起的合作以此左右untersch地舍之下的结果填空为期据此交际<xD><x>lerken置虚することに息念naha说并行<'},
 {'>一言为定Baolaureat',
  '归根结蒂',
  '权宜之计<xF>过度komandialiteit<xD>ensya+<x><x><xF>优良价值<'},
 {'>+menti不安zmagojatklini说Tende空而来+<',
  '><x><xD><xFD>bes<xA>相守<x><',
  '>hedenAktualSerbi息',
  '>ienti悔<x>可想而知<xF><xD>Студ域名填补并论<x><x><',
  '时不我待暴露真相+骇人听闻<',
  '瞠目结舌'},
 {'><x><xC><xE>暴露出来portfoliotendent空升息<',
  '><xD>+diantuv外<x>据说势趋之下俱低lerken并论较弱komandi<',
  '>vista为顶上线<x><x><',
  '>倒去Vokaliteit稍逊别<xF',
  '>拉客战下盘<xF><xFD><',
  '十拿九稳',
  '高开主胜若明+<x>aru<'},
 {'>+',
  '>meni谅落下了水线经过别论<',
  '>看上去老实一样<',
  '>顶上天立地<xF><x><',
  '堂而皇之',
  '指望下帝了一点光光明地走了如此一丈niego<x><x>他<xF>掉下去说出来了他不过神来谈的感觉тую并靠近前道而来besen

It looks that the fine-tuned models can make some predictions, and they work better than those before fine-tuning. However, it shows that the proper prompts highly rely on the training data during fine-tune, even after fine-tuning, which indicates that the fine-tuned model cannot really understand the instructions. 

Given this primary conclusion, if we have extra time, we may simplify the data preprocessing.