# T5 英文文本生成
## 教學目標
使用T5模型根據英文關鍵字生成完整的句子

## 適用對象
 - 已對python的基本語法和有一定瞭解和掌握程度
 - 對深度學習的基本概念有初步的認識

## 執行方法
在 Jupyter notebook 中，選取想要執行的區塊後，使用以下其中一種方法執行
 - 上方工具列中，按下 Cell < Run Cells 執行
 - 使用快捷鍵 Shift + Enter 執行

## 大綱
 - [安裝套件](#安裝套件)
 - [載入T5模型](#載入T5模型)
 - [資料處理](#資料處理)
 - [超參數](#超參數)
 - [訓練](#訓練)
 - [驗證](#驗證)


## 安裝套件
 - transformers (4.37.0) huggingface讀取模型的套件
 - datasets (2.16.1) huggingface讀取資料集的套件
 - torcheval (0.0.7) 各種評價標準

In [1]:
! pip install transformers
! pip install datasets
! pip install torcheval
! pip install pytorch-ignite



In [2]:
import transformers as T
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from ignite.metrics import Rouge
import re

device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from transformers import BertTokenizer, RobertaTokenizer,BertTokenizerFast


## 載入T5模型
 - 使用huggingface裝載模型的架構、參數和tokenizer
 - 保存在路徑./cache/中
 - 用.to(device)把模型裝載入訓練設備(GPU)

In [4]:
t5_model = T.T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", cache_dir="./cache/").to(device)
t5_tokenizer = T.T5Tokenizer.from_pretrained("google/flan-t5-base", cache_dir="./cache/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import jieba

In [6]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [7]:
bert_tokenizer_fast = BertTokenizerFast.from_pretrained("bert-base-chinese",add_special_tokens=False,do_lower_case=True)

In [8]:
gpt2_model = T.GPT2LMHeadModel.from_pretrained("gpt2", cache_dir="./cache/").to(device)
gpt2_tokenizer = T.GPT2Tokenizer.from_pretrained("gpt2", cache_dir="./cache/")
# gpt2_model = T.GPT2ForSequenceClassification.from_pretrained("gpt2", cache_dir="./cache/").to(device)
# gpt2_tokenizer = T.GPT2Tokenizer.from_pretrained("gpt2", cache_dir="./cache/")
gpt2_tokenizer.padding_side = "left"
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [9]:
from transformers import BertTokenizerFast,AutoModelForCausalLM

tokenizer_3 = BertTokenizerFast.from_pretrained('bert-base-chinese')
model_3 = AutoModelForCausalLM.from_pretrained('ckiplab/gpt2-base-chinese').to(device)

## 資料處理
 - 使用 torch.utils.data 中的 Dataset 和 Dataloader 成批次地讀取和預處理資料
 - 使用“/”將每個輸入的關鍵字和每個輸出鏈接起來

In [10]:
# def get_tensor(sample):
#     # 將模型的輸入和ground truth打包成Tensor
#     # t5 tokenizer不能處理中文，所以換成bert的試試看
#     # model_inputs = t5_tokenizer.batch_encode_plus([each["text"] for each in sample], padding=True, truncation=True, return_tensors="pt")
#     # model_outputs = t5_tokenizer.batch_encode_plus([each["summary"] for each in sample], padding=True, truncation=True, return_tensors="pt")
#     # bert
#     # model_inputs = bert_tokenizer.batch_encode_plus([each["text"] for each in sample], padding=True, truncation=True, return_tensors="pt")
#     # model_outputs = bert_tokenizer.batch_encode_plus([each["summary"] for each in sample], padding=True, truncation=True, return_tensors="pt")
#     # bertfast
#     model_inputs = bert_tokenizer_fast.batch_encode_plus([each["text"] for each in sample], padding=True, truncation=True, return_tensors="pt",return_offsets_mapping=True,add_special_tokens=False)
#     model_outputs = bert_tokenizer_fast.batch_encode_plus([each["summary"] for each in sample], padding=True, truncation=True, return_tensors="pt",return_offsets_mapping=True,add_special_tokens=False)
#     return model_inputs["input_ids"].to(device), model_outputs["input_ids"].to(device)

# class CommonGenDataset(Dataset):
#     def __init__(self, split="train") -> None:
#         super().__init__()
#         assert split in ["train", "validation", "test"]
#         # data_df = load_dataset("allenai/common_gen", split=split, cache_dir="./cache/").to_pandas().groupby("concept_set_idx")

#         data_df = load_dataset("hugcyp/LCSTS", split=split, cache_dir="./cache/").to_pandas()
#         self.data = []
#         # for each in data_df:
#         #     targets = "/ ".join([s+"." if not s.endswith(".") else s for s in each[1].target.to_list()])
#         #     concepts = ", ".join(each[1].concepts.to_list()[0])
#         #     self.data.append({"concepts": concepts, "targets": targets})
#         for num in range(0,len(data_df)):
#           self.data.append({"summary":data_df['summary'][num],"text":data_df['text'][num]})

#     def __getitem__(self, index):
#         return self.data[index]

#     def __len__(self):
#         return len(self.data)



In [11]:
def get_tensor_gpt2(sample):
    input_ids = gpt2_tokenizer.batch_encode_plus([each["text"] for each in sample], padding=True, truncation=True, return_tensors="pt")["input_ids"].to(device)
    labels = gpt2_tokenizer.batch_encode_plus([each["summary"] for each in sample], padding=True, truncation=True, return_tensors="pt")["input_ids"].to(device)
    return input_ids, labels
class CommonGenDataset_gpt2(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        data_df = load_dataset("hugcyp/LCSTS", split=split, cache_dir="./cache/").to_pandas()
        self.data = [{"summary":data_df['summary'][num],"text":data_df['text'][num]} for num in range(len(data_df))]

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

## 超參數
 - 學習率 (learning rate): 1e-5
 - 訓練輪數 (epochs): 3
 - 優化器 (optimizer): AdamW
 - 批次大小 (batch size): 8
 - 評量指標 (evaluation matrics)Rouge-2

In [12]:
# lr = 1e-5
# epochs = 1
# optimizer = AdamW(t5_model.parameters(), lr = 1e-5)
# train_batch_size = 8
# validation_batch_size = 8
# common_gen_train = DataLoader(CommonGenDataset(split="train").data[:200], collate_fn=get_tensor, batch_size=train_batch_size, shuffle=True)
# common_gen_validation = DataLoader(CommonGenDataset(split="validation").data[:200], collate_fn=get_tensor, batch_size=validation_batch_size, shuffle=False)
# rouge = Rouge(variants=["L", 2], multiref="best")

In [13]:
lr = 1e-5
epochs = 1
optimizer = AdamW(gpt2_model.parameters(), lr=lr)
train_batch_size = 8
validation_batch_size = 8
common_gen_train_gpt2 = DataLoader(CommonGenDataset_gpt2(split="train").data[:200], collate_fn=get_tensor_gpt2, batch_size=train_batch_size, shuffle=True)
common_gen_validation_gpt2 = DataLoader(CommonGenDataset_gpt2(split="validation").data[:200], collate_fn=get_tensor_gpt2, batch_size=validation_batch_size, shuffle=False)
rouge = Rouge(variants=["L", 2], multiref="best")

## 驗證
驗證程式
 - 將驗證資料輸入模型，用Rouge-2評價輸出的效果
 - Rouge的使用方法參考 https://pytorch.org/ignite/generated/ignite.metrics.Rouge.html

In [14]:
# def evaluate(model):
#     pbar = tqdm(common_gen_validation)
#     pbar.set_description(f"Evaluating")

#     for inputs, targets in pbar:
#         # output = [re.split(r"[/]", each.replace("<pad>", "")) for each in t5_tokenizer.batch_decode(model.generate(inputs, max_length=50))]
#         # targets = [re.split(r"[/]", each.replace("<pad>", "")) for each in t5_tokenizer.batch_decode(targets)]
#         output = [re.split(r"[/]", each.replace("<pad>", "")) for each in bert_tokenizer.batch_decode(model.generate(inputs, max_length=50))]
#         targets = [re.split(r"[/]", each.replace("<pad>", "")) for each in bert_tokenizer.batch_decode(targets)]
#         for i in range(len(output)):
#             sentences = [s.replace('.', ' .').split() for s in output[i]]
#             ground_thruths = [t.replace('.', ' .').split() for t in targets[i]]
#             for s in sentences:
#                 rouge.update(([s], [ground_thruths]))
#     return rouge.compute()


In [15]:
def evaluate_gpt2(model):
    pbar = tqdm(common_gen_validation_gpt2)
    pbar.set_description(f"Evaluating")

    for inputs, targets in pbar:
        output = gpt2_model.generate(inputs, max_length=50, num_return_sequences=1, temperature=1.0)
        output = [gpt2_tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
        targets = [gpt2_tokenizer.decode(ids, skip_special_tokens=True) for ids in targets]
        for i in range(len(output)):
            rouge.update([(output[i].split(), targets[i].split())])
    return rouge.compute()

## 訓練
 - 將資料成批次輸入T5模型，並獲取其損失函數數值，隨後計算梯度優化
 - tqdm用來顯示模型的訓練進度

In [16]:
# for ep in range(epochs):
#     pbar = tqdm(common_gen_train)
#     pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
#     for inputs, targets in pbar:
#         optimizer.zero_grad()
#         loss = t5_model(input_ids=inputs, labels=targets).loss
#         loss.backward()

#         optimizer.step()
#         pbar.set_postfix(loss = loss.item())
#     # torch.save(t5_model, f'./saved_models/ep{ep}.mod')
#     print(f"Rouge-2 score on epoch {ep}:", evaluate(t5_model))

In [None]:
for ep in range(epochs):
    pbar = tqdm(common_gen_train_gpt2)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    for inputs, targets in pbar:
        optimizer.zero_grad()
        # print()
        # print(inputs)
        # print(targets)
        padding_needed = inputs.size(1) - targets.size(1)

        # 定义填充参数
        padding = (0, padding_needed)

        # 在张量后面填充
        padded_targets = torch.nn.functional.pad(targets, padding, value=0)

        pad_token_id = gpt2_tokenizer.pad_token_id
        inputs[inputs == pad_token_id] = gpt2_tokenizer.eos_token_id  # 將填充標記 ID 替換為 EOS（結束）標記 ID

        # # 創建注意力遮罩
        attention_mask = inputs.ne(gpt2_tokenizer.pad_token_id)
        # output = gpt2_model.generate(inputs, max_length=inputs.size(1)+1, num_return_sequences=1, temperature=1.0)
        loss = gpt2_model(input_ids=inputs, labels=padded_targets,attention_mask=attention_mask).loss
        # loss = gpt2_model(**inputs, labels=inputs["input_ids"]).loss
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())
    print(f"Rouge-2 score on epoch {ep}:", evaluate_gpt2(gpt2_model))

Training epoch [1/1]:   4%|▍         | 1/25 [00:49<19:52, 49.70s/it, loss=10.7]

In [None]:
print(f"Rouge-2 score on epoch {ep}:", evaluate(t5_model))