# T5-small Fine-tuning

## Imports and Device Setting

In [52]:
# ! pip install transformers
import os
import torch
from torch import cuda, nn, optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, T5ForConditionalGeneration, Text2TextGenerationPipeline
from transformers import TrainingArguments, Trainer, logging

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
manual_seed = 585
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Load the Pre-trained Model

In [2]:
# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# No Chinese was used for pre-train

# tokenizer = AutoTokenizer.from_pretrained("mxmax/Chinese_Chat_T5_Base")
# model = AutoModelForSeq2SeqLM.from_pretrained("mxmax/Chinese_Chat_T5_Base")

# https://huggingface.co/uer/t5-small-chinese-cluecorpussmall
tokenizer = BertTokenizer.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
model = T5ForConditionalGeneration.from_pretrained("uer/t5-small-chinese-cluecorpussmall")

# To train on Google Colab
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cpu


T5ForConditionalGeneration(
  (shared): Embedding(21228, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(21228, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Preprocess data

In [25]:
# path = '../data/'    # change the path as needed
path = '/content/gdrive/My Drive/585data/'

def read_data(file):
    with open (path+file) as t:
        data = t.readlines()
    return data

train_set = read_data('train_data.txt')[:15000]
dev_set = read_data('dev_data.txt')[:2000]
test_set = read_data('test_data.txt')[:2000]

# type(train_set)
print(train_set[:2], '\n', len(train_set), len(dev_set), len(test_set))

['{"groundTruth": ["发扬光大", "平易近人", "温文尔雅"], "candidates": [["意气风发", "街谈巷议", "人才辈出", "一脉相传", "后继有人", "发扬光大", "腥风血雨"], ["平易近人", "落落大方", "八仙过海", "彬彬有礼", "史无前例", "盛气凌人", "好自为之"], ["不拘小节", "风流潇洒", "无病呻吟", "言谈举止", "壮志凌云", "关门闭户", "温文尔雅"]], "content": "由实力派演员刘威饰演的清华第三任校长蒋南翔，是我国著名的青年运动家和教育家，他跟清华终身校长梅贻琦一样，都是由清华人自己培养出来的校长。历史上的蒋南翔是著名的“一二九”学生救亡运动的领导人之一，他在清华校长之位14年期间，不但很好的继承了清华建校之初的优秀传统与理念，而且更加的#idiom#，他把清华的教师队伍扩大了将近5倍，将清华本科人数破万，为新中国培养了大量的有用人才。在《天行健》中饰演蒋南翔的刘威是观众所熟悉的著名实力派演员，早在1987年刘威就在《关东大侠》中饰演豪爽仗义的关云天一角而获得了金鸡奖最佳男主角的提名，后来更是因在《唐明皇》中精湛的表演而一举夺得金鹰奖最佳男演员奖。此次《天行健》选定刘威来出演正是看中了他#idiom#的表演方式和对人物深入内心的刻画。至此，《天行健》中涉及的三位清华校长的人选都已经曝光，#idiom#的第一任校长赵文?、稳重坚毅的第二任校长孙逊、亲切务实的第三任校长刘威，再加上梁思成、林徽因、朱自清、闻一多等一批“大师”的加盟，相信作为清华百年校庆重点项目之一的《天行健》一定会带领观众重温那段不能抹去的历史。", "realCount": 3}\n', '{"groundTruth": ["肥头大耳"], "candidates": [["超凡入圣", "骨瘦如柴", "青面獠牙", "虎背熊腰", "成人之美", "肥头大耳", "神不守舍"]], "content": "#idiom#的掌柜只穿一件衬衫，坐在柜台里。几个堂倌穿着脏得发黑的白工作服，因为没有顾客，都散坐在桌子旁。这当儿看到这位不寻常的客人，都露出好奇的神色列宁曾批评他理论上的错误，同时认为他“所写的全部哲学，赶紧迎上前来伺候。聂赫留朵夫要了一瓶矿泉水，在离窗较远的地方挨着一张

In [26]:
def preprocess(data):
    text_input = []
    idiom_output = []
    for i in range(len(data)):
        data[i] = eval(data[i])
        input_text = data[i]['content']
        ground_truth = data[i]['groundTruth']
        candidates = data[i]['candidates']
        
        instruction = '请从下列列表中分别选择合适的成语填入空缺处：{}'.format(candidates)
        # candidates_str = '|'.join([c for c in candidates[j]])
        input_text = input_text.replace('#idiom#', '_')
        output_text = '，'.join(ground_truth)
        
        text_input.append(instruction+input_text)
        idiom_output.append(output_text)
        
    input_emb = tokenizer(text_input, return_token_type_ids=False)
    output_emb = tokenizer(idiom_output, return_token_type_ids=False)
    return input_emb, output_emb

In [27]:
train_input, train_output = preprocess(train_set)
dev_input, dev_output = preprocess(dev_set)
test_input, test_output = preprocess(test_set)

In [38]:
print(train_input.keys(), train_output.keys())

dict_keys(['input_ids', 'attention_mask']) dict_keys(['input_ids', 'attention_mask'])


In [43]:
print(train_input['input_ids'][0], '\n', train_input['attention_mask'][0])

[101, 6435, 794, 678, 1154, 1154, 6134, 704, 1146, 1166, 6848, 2885, 1394, 6844, 4638, 2768, 6427, 1856, 1057, 100, 142, 100, 704, 8038, 138, 138, 112, 2692, 3698, 7599, 1355, 112, 117, 112, 6125, 6448, 2350, 6379, 112, 117, 112, 782, 2798, 6777, 1139, 112, 117, 112, 671, 5549, 4685, 837, 112, 117, 112, 1400, 5326, 3300, 782, 112, 117, 112, 1355, 2813, 1045, 1920, 112, 117, 112, 5581, 7599, 6117, 7433, 112, 140, 117, 138, 112, 2398, 3211, 6818, 782, 112, 117, 112, 5862, 5862, 1920, 3175, 112, 117, 112, 1061, 803, 6814, 3862, 112, 117, 112, 2509, 2509, 3300, 4851, 112, 117, 112, 1380, 3187, 1184, 891, 112, 117, 112, 4670, 3698, 1119, 782, 112, 117, 112, 1962, 5632, 711, 722, 112, 140, 117, 138, 112, 679, 2872, 2207, 5688, 112, 117, 112, 7599, 3837, 4045, 3818, 112, 117, 112, 3187, 4567, 1460, 1412, 112, 117, 112, 6241, 6448, 715, 3632, 112, 117, 112, 1896, 2562, 1119, 756, 112, 117, 112, 1068, 7305, 7308, 2787, 112, 117, 112, 3946, 3152, 2209, 7414, 112, 140, 140, 4507, 2141, 1213, 3836

In [46]:
from torch.utils.data import Dataset, DataLoader

class IdiomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        # target_ids = self.inputs['input_ids'][idx]

        target_ids = self.labels['input_ids'][idx]
        target_attention_mask = self.labels['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "label_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_output = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_label, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}

def to_device(data, device):
    new_data = {}
    for k in data:
        new_data[k] = data[k].to(device)
    return new_data

In [50]:
train_dataset = IdiomDataset(train_input, train_output)
train_loader = DataLoader(train_dataset, batch_size=8, collate_fn=collate_fn, shuffle=False)

dev_dataset = IdiomDataset(dev_input, dev_output)
dev_loader = DataLoader(dev_dataset, batch_size=8, collate_fn=collate_fn, shuffle=False)

## Training

In [None]:
epoches = 3
optimizer = optim.Adam(model.parameters(), lr=5e-5)
model.train()

epoch_loss = 0.0
    log_loss = 0.0
    for idx, batch in enumerate(train_loader):
        # print("idx:", idx, ", log_step: ", log_step)
        optimizer.zero_grad()
        batch = to_device(batch, device)
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        log_loss += loss.item()
        if idx % log_step == 0:
            print(f"Train Step: {idx} Loss: {log_loss / log_step}")
            log_loss = 0.0
    return epoch_loss / len(train_loader)

In [None]:
torch.save(model, path+"T5-small_model_3epoch.pt")

## Evaluation

In [54]:

@torch.no_grad()
def evaluate(model:nn.Module, eval_loader:DataLoader):
    eval_loss = 0.0
    correct = 0
    total = 0
    model.eval()
    print("eval_loader len:", len(eval_loader))
    for batch in eval_loader:
        batch = to_device(batch, device)
        output = model(**batch)
        loss = output.loss
        eval_loss += loss.item()
        pred = output.logits.argmax(-1)
        label = batch["labels"]
        correct += torch.where(label!=0, pred==label, 0).sum().item()
        total += torch.sum(label!=0).item()

    eval_acc = correct / total
    eval_loss = eval_loss / len(eval_loader) 
    print(total, correct)
    return eval_acc, eval_loss

ImportError: cannot import name 'pad_sequence' from 'torch.nn.utils' (/Users/qmy/opt/miniconda3/lib/python3.10/site-packages/torch/nn/utils/__init__.py)