# **T5-base LoRA fine-tuning Evaluate**

From : [JooYoung Song](https://github.com/Song-Joo-Young/ToTTo-Fine-tuning-in-colab/tree/main)

Code Reference :
* ToTTo : https://github.com/google-research-datasets/ToTTo
* BLEURT : https://github.com/google-research/bleurt

## **1.1 Google Drive mound & install library**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets sentencepiece peft accelerate evaluate

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m 

In [3]:
import json
import os

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import T5Tokenizer

# from preprocess_utils import get_highlighted_subtable, linearize_subtable

## **1.2 Preprocessing code (ToTTo dataset)**

In [4]:
# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py

import copy

def _add_adjusted_col_offsets(table):
  """Add adjusted column offsets to take into account multi-column cells."""
  adjusted_table = []
  for row in table:
    real_col_index = 0
    adjusted_row = []
    for cell in row:
      adjusted_cell = copy.deepcopy(cell)
      adjusted_cell["adjusted_col_start"] = real_col_index
      adjusted_cell["adjusted_col_end"] = (
          adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"])
      real_col_index += adjusted_cell["column_span"]
      adjusted_row.append(adjusted_cell)
    adjusted_table.append(adjusted_row)
  return adjusted_table


def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
  """Heuristic to find row headers."""
  row_headers = []
  row = adjusted_table[row_index]
  for i in range(0, col_index):
    if row[i]["is_header"]:
      row_headers.append(row[i])
  return row_headers


def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
  """Heuristic to find column headers."""
  adjusted_cell = adjusted_table[row_index][col_index]
  adjusted_col_start = adjusted_cell["adjusted_col_start"]
  adjusted_col_end = adjusted_cell["adjusted_col_end"]
  col_headers = []
  for r in range(0, row_index):
    row = adjusted_table[r]
    for cell in row:
      if (cell["adjusted_col_start"] < adjusted_col_end and
          cell["adjusted_col_end"] > adjusted_col_start):
        if cell["is_header"]:
          col_headers.append(cell)

  return col_headers


def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
  """Extract out the highlighted part of a table."""
  highlighted_table = []

  adjusted_table = _add_adjusted_col_offsets(table)

  for (row_index, col_index) in cell_indices:
    cell = table[row_index][col_index]
    if with_heuristic_headers:
      row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
                                               col_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
                                               col_index)
    else:
      row_headers = []
      col_headers = []

    highlighted_cell = {
        "cell": cell,
        "row_headers": row_headers,
        "col_headers": col_headers
    }
    highlighted_table.append(highlighted_cell)

  return highlighted_table


def linearize_full_table(table, cell_indices, table_page_title,
                         table_section_title):
  """Linearize full table with localized headers and return a string."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "

  table_str += "<table> "
  adjusted_table = _add_adjusted_col_offsets(table)
  for r_index, row in enumerate(table):
    row_str = "<row> "
    for c_index, col in enumerate(row):

      row_headers = _get_heuristic_row_headers(adjusted_table, r_index, c_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, r_index, c_index)

      # Distinguish between highlighted and non-highlighted cells.
      if [r_index, c_index] in cell_indices:
        start_cell_marker = "<highlighted_cell> "
        end_cell_marker = "</highlighted_cell> "
      else:
        start_cell_marker = "<cell> "
        end_cell_marker = "</cell> "

      # The value of the cell.
      item_str = start_cell_marker + col["value"] + " "

      # All the column headers associated with this cell.
      for col_header in col_headers:
        item_str += "<col_header> " + col_header["value"] + " </col_header> "

      # All the row headers associated with this cell.
      for row_header in row_headers:
        item_str += "<row_header> " + row_header["value"] + " </row_header> "

      item_str += end_cell_marker
      row_str += item_str

    row_str += "</row> "
    table_str += row_str

  table_str += "</table>"
  if cell_indices:
    assert "<highlighted_cell>" in table_str
  return table_str


def linearize_subtable(subtable, table_page_title, table_section_title):
  """Linearize the highlighted subtable and return a string of its contents."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "
  table_str += "<table> "

  for item in subtable:
    cell = item["cell"]
    row_headers = item["row_headers"]
    col_headers = item["col_headers"]

    # The value of the cell.
    item_str = "<cell> " + cell["value"] + " "

    # All the column headers associated with this cell.
    for col_header in col_headers:
      item_str += "<col_header> " + col_header["value"] + " </col_header> "

    # All the row headers associated with this cell.
    for row_header in row_headers:
      item_str += "<row_header> " + row_header["value"] + " </row_header> "

    item_str += "</cell> "
    table_str += item_str

  table_str += "</table>"
  return table_str

## **1.3 Model & tokenizer setting for generation**

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print("Current GPU Index:", torch.cuda.current_device())
    print("Current GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Current GPU Index: 0
Current GPU Name: Tesla T4


In [6]:
from transformers import T5Tokenizer, T5Model

# Pre-Trained T5 Tokenizer
tokenizer=T5Tokenizer.from_pretrained('t5-base')
# Add Special Tokens: Table Tags
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


12

In [7]:
from torch.utils.data import Dataset

class ToTToDataset(Dataset):
    def __init__(self, path_data, tokenizer):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []
        self.attention_mask = []

        # Load Dataset
        with open(path_data, 'r') as f:
            dataset = f.read().splitlines()

        for _data in dataset:
            data = json.loads(_data)

            # Preprocess
            subtable = get_highlighted_subtable(table=data['table'], cell_indices=data['highlighted_cells'], with_heuristic_headers=True)
            cells_linearized = linearize_subtable(subtable=subtable, table_page_title=data['table_page_title'], table_section_title=data['table_section_title'])

            # Encode
            encoded_dict = tokenizer.encode_plus(cells_linearized, max_length=512, truncation=True, padding="max_length", return_attention_mask=True)
            self.data.append(encoded_dict['input_ids'])
            self.attention_mask.append(encoded_dict['attention_mask'])
            self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence'], max_length=512, truncation=True))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.label[idx], dtype=torch.long)
        }
        return item


    def __len__(self):
        return len(self.data)

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Pre-Trained T5 Model
model=T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
# Resize PLM's Embedding Layer
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32112, 768)

In [9]:
dataset_dev = ToTToDataset(path_data="/content/drive/MyDrive/ToTTo_data/totto_dev_data.jsonl", tokenizer=tokenizer)

7700 datas
7700 labels


In [10]:
# batch_size = 24
batch_size = 8

In [11]:
from transformers import DataCollatorForSeq2Seq

# Data collator 인스턴스 생성
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [12]:
from torch.utils.data import DataLoader

# DataLoader 생성
dataloader = DataLoader(dataset_dev, batch_size=8, collate_fn=data_collator)

# 배치 데이터 형식 확인
for batch in dataloader:
    print(batch.keys())
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    if 'labels' in batch:
        print(batch['labels'].shape)
    break  # 첫 번째 배치만 확인하고 반복 중지

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 46])


In [13]:
import torch
import json
from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import T5Config

tokenizer = T5Tokenizer.from_pretrained('t5-base')

tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


12

In [14]:
from transformers import T5ForConditionalGeneration

model=T5ForConditionalGeneration.from_pretrained('t5-base')
model.resize_token_embeddings(len(tokenizer))

Embedding(32112, 768)

In [15]:
from peft import LoraConfig, get_peft_model, TaskType

peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.01,
)

# LoRA T5-small model
model = get_peft_model(model, peft_config)

In [16]:
model_path='/content/drive/MyDrive/ToTTo_T5-base_LoRA/model/epoch3/T5-base_LoRA_Fine-Tuning_lr0.001_epoch3.pth'

# 저장된 state_dict 로드
model.load_state_dict(torch.load(model_path))

# 모델을 평가 모드로 설정
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32112, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32112, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

### **Text Genration**

In [17]:
model.to(device)

# Generation
if os.path.exists('/content/drive/MyDrive/ToTTo_T5-base_LoRA/generation_text/generation_dev_epoch3.txt'):
    os.remove('/content/drive/MyDrive/ToTTo_T5-base_LoRA/generation_text/generation_dev_epoch3.txt')
f=open('/content/drive/MyDrive/ToTTo_T5-base_LoRA/generation_text/generation_dev_epoch3.txt', 'a')

count = 0

with torch.no_grad():
    for batch in dataloader:
        count += 1
        if (count) % 100 == 0:
            print(batch_size*(count), 'generated')
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Generate sequences
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=300,
            num_beams=5,
            early_stopping=True,

        )

        generated_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for sentence in generated_sentences:
            f.write(sentence + '\n')

f.close()

800 generated
1600 generated
2400 generated
3200 generated
4000 generated
4800 generated
5600 generated
6400 generated
7200 generated


## **2.1 Evaluation repository**

In [18]:
!git clone https://github.com/Song-Joo-Young/language.git language_repo

Cloning into 'language_repo'...
remote: Enumerating objects: 3851, done.[K
remote: Counting objects: 100% (835/835), done.[K
remote: Compressing objects: 100% (474/474), done.[K
remote: Total 3851 (delta 404), reused 676 (delta 350), pack-reused 3016[K
Receiving objects: 100% (3851/3851), 6.24 MiB | 18.17 MiB/s, done.
Resolving deltas: 100% (2181/2181), done.


In [19]:
!pip install git+https://github.com/google-research/bleurt.git

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-aihrmi1t
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-aihrmi1t
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456765 sha256=2df24d0d3eb01df3360a4a906b75a245f904751fcb3707b832a6bf6473bc31e9
  Stored in directory: /tmp/pip-ephem-wheel-cache-rb64_d1h/wheels/64/f4/2c/509a6c31b8ebde891a81029fd94f199b1b92f0e7cfc20d417a
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


## **2.2 Setting Up the Directory & Requirement for BLEURT**

In [20]:
%cd language_repo

/content/language_repo


In [21]:
# Downloads the BLEURT-base checkpoint.
! wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip .
! unzip BLEURT-20.zip

--2024-02-11 03:21:36--  https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.136.207, 142.250.148.207, 209.85.200.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.136.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2140294207 (2.0G) [application/octet-stream]
Saving to: ‘BLEURT-20.zip’


2024-02-11 03:22:00 (86.7 MB/s) - ‘BLEURT-20.zip’ saved [2140294207/2140294207]

--2024-02-11 03:22:00--  http://./
Resolving . (.)... failed: No address associated with hostname.
wget: unable to resolve host address ‘.’
FINISHED --2024-02-11 03:22:00--
Total wall clock time: 24s
Downloaded: 1 files, 2.0G in 24s (86.7 MB/s)
Archive:  BLEURT-20.zip
   creating: BLEURT-20/
  inflating: BLEURT-20/bert_config.json  
  inflating: BLEURT-20/saved_model.pb  
   creating: BLEURT-20/variables/
  inflating: BLEURT-20/variables/variables.index  
  inflating: BLEURT-20/variables/v

In [22]:
! pip3 install -r language/totto/eval_requirements.txt

Collecting sacrebleu (from -r language/totto/eval_requirements.txt (line 3))
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu->-r language/totto/eval_requirements.txt (line 3))
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu->-r language/totto/eval_requirements.txt (line 3))
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.0


#### **Evaluation**

In [23]:
# Epoch 3
! bash language/totto/totto_eval.sh --prediction_path /content/drive/MyDrive/ToTTo_T5-base_LoRA/generation_text/generation_dev_epoch3.txt --target_path /content/drive/MyDrive/ToTTo_data/totto_dev_data.jsonl --bleurt_ckpt BLEURT-20

Running with the following variables:
PREDICTION_PATH   : /content/drive/MyDrive/ToTTo_T5-base_LoRA/generation_text/generation_dev_epoch3.txt
TARGET_PATH       : /content/drive/MyDrive/ToTTo_data/totto_dev_data.jsonl 
BLEURT_CKPT       : BLEURT-20 
OUTPUT_DIR        : temp
MODE              : test
Creating Output directory.
Cloning moses for BLEU script.
Cloning into 'temp/mosesdecoder'...
remote: Enumerating objects: 148103, done.[K
remote: Counting objects: 100% (531/531), done.[K
remote: Compressing objects: 100% (234/234), done.[K
remote: Total 148103 (delta 329), reused 445 (delta 293), pack-reused 147572[K
Receiving objects: 100% (148103/148103), 129.88 MiB | 14.51 MiB/s, done.
Resolving deltas: 100% (114355/114355), done.
Writing references.
Writing tables in PARENT format.
Preparing predictions.
Writing predictions.
Running detokenizers.
Computing BLEU (overall)
{
 "name": "BLEU",
 "score": 44.7,
 "signature": "nrefs:3|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.0",
 "