## **T5-base LoRA Fine-tuning on ToTTo**

From : [JooYoung Song](https://github.com/Song-Joo-Young/ToTTo-Fine-tuning-in-colab/tree/main)

Code Reference :
* PEFT : https://huggingface.co/docs/peft/main/en/index
* ToTTo : https://github.com/google-research-datasets/ToTTo
* Prompt-Tuning-on-ToTTo : https://github.com/ChainsmokersAI/Prompt-Tuning-on-ToTTo

In [None]:
# Google Drive Mount
# Later cells read the dataset from, and write checkpoints to,
# /content/drive/MyDrive, so the Drive must be mounted first.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Get Dataset

!wget https://storage.googleapis.com/totto-public/totto_data.zip
!unzip totto_data.zip

--2024-02-10 00:08:44--  https://storage.googleapis.com/totto-public/totto_data.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.135.207, 173.194.202.207, 173.194.203.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.135.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187724372 (179M) [application/zip]
Saving to: ‘totto_data.zip’


2024-02-10 00:08:46 (82.3 MB/s) - ‘totto_data.zip’ saved [187724372/187724372]

Archive:  totto_data.zip
  inflating: totto_data/totto_dev_data.jsonl  
  inflating: totto_data/totto_train_data.jsonl  
  inflating: totto_data/unlabeled_totto_test_data.jsonl  


In [None]:
# Save the dataset to Google Drive (folder where the weights will also be saved later)
# Copy Dataset to your Google Drive
import shutil
import os

source_folder = '/content/totto_data'
destination_folder = '/content/drive/MyDrive/ToTTo_data'

# Remove any previous copy first. NOTE: this deletes everything currently
# stored under destination_folder before the fresh copy is made.
if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)

# Last expression of the cell: the returned destination path is displayed.
shutil.copytree(source_folder, destination_folder)

'/content/drive/MyDrive/ToTTo_data'

### **1. Preprocessing**

In [None]:
# Install the libraries used below. Versions are not pinned; the recorded run
# resolved datasets 2.17.0, peft 0.8.2, accelerate 0.27.0 and evaluate 0.4.1.
!pip install transformers datasets sentencepiece peft accelerate evaluate

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m 

In [None]:
# Load Train Set
# Alternative source (local copy instead of the Drive copy):
# with open('/content/totto_data/totto_train_data.jsonl', 'r') as f:
#
# The file is JSON Lines: one JSON document per physical line. It is read by
# iterating over the file rather than with f.read().splitlines(), because
# str.splitlines() also breaks on characters such as U+2028 that may legally
# appear inside a JSON string. The encoding is stated explicitly so the result
# does not depend on the runtime's locale. No f.close() is needed: the `with`
# block closes the file.
with open('/content/drive/MyDrive/ToTTo_data/totto_train_data.jsonl', 'r', encoding='utf-8') as f:
    data_train = [line.rstrip('\n') for line in f if line.strip()]

# Number of Train Data
len(data_train)

120761

In [None]:
import json

# Sample Data
# Parse the last line of the training file into a dict for inspection.
data_sample=json.loads(data_train[-1])

# Key-Value Set
# Print every top-level field of the record together with its value.
for key, value in data_sample.items():
    # Uncomment the next line to skip the (very long) 'table' field.
    # if key=='table': continue

    print('→', key, '\n \t ', value)

→ table 
 	  [[{'value': 'Rank', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Lane', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Name', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Nationality', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Time', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Notes', 'is_header': True, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'Matt Grevers', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'United States', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '52.16', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'OR', 'is_header': False, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '2', 'is_header': False, 'co

In [None]:
# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py

import copy

def _add_adjusted_col_offsets(table):
  """Return a deep-copied table whose cells carry absolute column offsets.

  Each copied cell gains two keys: "adjusted_col_start", the sum of the
  "column_span" values of the cells before it in the same row, and
  "adjusted_col_end", which is that start plus the cell's own "column_span".
  The input table is not modified.
  """
  annotated_table = []
  for source_row in table:
    offset = 0
    annotated_row = []
    for source_cell in source_row:
      annotated = copy.deepcopy(source_cell)
      span = annotated["column_span"]
      annotated["adjusted_col_start"] = offset
      annotated["adjusted_col_end"] = offset + span
      # The next cell starts where this one ends.
      offset += span
      annotated_row.append(annotated)
    annotated_table.append(annotated_row)
  return annotated_table


def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
  """Return the header cells located left of (row_index, col_index) in its row.

  Only positions 0 .. col_index - 1 of the row are inspected; a cell counts as
  a row header when its "is_header" entry is truthy. Order is left to right.
  """
  target_row = adjusted_table[row_index]
  return [
      target_row[position]
      for position in range(col_index)
      if target_row[position]["is_header"]
  ]


def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
  """Return header cells above (row_index, col_index) that share its columns.

  A cell from an earlier row qualifies when its
  [adjusted_col_start, adjusted_col_end) range overlaps the target cell's range
  and its "is_header" entry is truthy. Results are ordered top to bottom, then
  left to right.
  """
  target = adjusted_table[row_index][col_index]
  span_start = target["adjusted_col_start"]
  span_end = target["adjusted_col_end"]

  def _overlaps_target(candidate):
    # Two half-open column ranges overlap when each starts before the other ends.
    return (candidate["adjusted_col_start"] < span_end and
            candidate["adjusted_col_end"] > span_start)

  return [
      candidate
      for upper_index in range(row_index)
      for candidate in adjusted_table[upper_index]
      if _overlaps_target(candidate) and candidate["is_header"]
  ]


def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
  """Collect the highlighted cells of a table, optionally with their headers.

  Args:
    table: Table as a list of rows, each row a list of cell dicts.
    cell_indices: Iterable of (row_index, col_index) pairs to extract.
    with_heuristic_headers: When true, attach the heuristic row and column
      headers of every extracted cell; otherwise both header lists are empty.

  Returns:
    A list with one dict per index pair, holding the keys "cell" (the cell
    object taken from `table`), "row_headers" and "col_headers".
  """
  adjusted_table = _add_adjusted_col_offsets(table)

  def _lookup_headers(row_index, col_index):
    # Headers come from the offset-annotated copy, never from `table` itself.
    if not with_heuristic_headers:
      return [], []
    found_row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
                                                   col_index)
    found_col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
                                                   col_index)
    return found_row_headers, found_col_headers

  highlighted_table = []
  for row_index, col_index in cell_indices:
    selected_cell = table[row_index][col_index]
    row_headers, col_headers = _lookup_headers(row_index, col_index)
    highlighted_table.append({
        "cell": selected_cell,
        "row_headers": row_headers,
        "col_headers": col_headers,
    })

  return highlighted_table


def linearize_full_table(table, cell_indices, table_page_title,
                         table_section_title):
  """Serialise the whole table into one tagged string.

  Args:
    table: Table as a list of rows, each row a list of cell dicts with a
      "value" entry.
    cell_indices: Collection of [row_index, col_index] lists marking the
      highlighted cells. Membership is tested with a list, so tuple entries
      are not recognised.
    table_page_title: Page title; omitted from the output when falsy.
    table_section_title: Section title; omitted from the output when falsy.

  Returns:
    The linearised table. Every cell is wrapped in <cell> or
    <highlighted_cell> tags and followed by its heuristic column headers and
    then its heuristic row headers; rows are wrapped in <row> tags.

  Raises:
    AssertionError: if `cell_indices` is non-empty but no highlighted cell
      ended up in the output.
  """
  pieces = []
  if table_page_title:
    pieces.append("<page_title> " + table_page_title + " </page_title> ")
  if table_section_title:
    pieces.append(
        "<section_title> " + table_section_title + " </section_title> ")
  pieces.append("<table> ")

  adjusted_table = _add_adjusted_col_offsets(table)
  for r_index, row in enumerate(table):
    pieces.append("<row> ")
    for c_index, cell in enumerate(row):
      row_headers = _get_heuristic_row_headers(adjusted_table, r_index, c_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, r_index, c_index)

      # Highlighted cells get their own tag pair.
      if [r_index, c_index] in cell_indices:
        opening, closing = "<highlighted_cell> ", "</highlighted_cell> "
      else:
        opening, closing = "<cell> ", "</cell> "

      # Cell value first, then column headers, then row headers.
      pieces.append(opening + cell["value"] + " ")
      pieces.extend(
          "<col_header> " + header["value"] + " </col_header> "
          for header in col_headers)
      pieces.extend(
          "<row_header> " + header["value"] + " </row_header> "
          for header in row_headers)
      pieces.append(closing)
    pieces.append("</row> ")

  pieces.append("</table>")
  table_str = "".join(pieces)
  if cell_indices:
    assert "<highlighted_cell>" in table_str
  return table_str


def linearize_subtable(subtable, table_page_title, table_section_title):
  """Serialise a highlighted subtable into one tagged string.

  Args:
    subtable: List of dicts with the keys "cell", "row_headers" and
      "col_headers" (the structure built by get_highlighted_subtable).
    table_page_title: Page title; omitted from the output when falsy.
    table_section_title: Section title; omitted from the output when falsy.

  Returns:
    The optional title segments followed by a <table> ... </table> segment in
    which each cell value is followed by its column headers and then its row
    headers.
  """
  segments = []
  if table_page_title:
    segments.append("<page_title> " + table_page_title + " </page_title> ")
  if table_section_title:
    segments.append(
        "<section_title> " + table_section_title + " </section_title> ")
  segments.append("<table> ")

  for entry in subtable:
    cell, row_headers, col_headers = (
        entry["cell"], entry["row_headers"], entry["col_headers"])

    # Cell value first, then column headers, then row headers.
    segments.append("<cell> " + cell["value"] + " ")
    segments.extend(
        "<col_header> " + header["value"] + " </col_header> "
        for header in col_headers)
    segments.extend(
        "<row_header> " + header["value"] + " </row_header> "
        for header in row_headers)
    segments.append("</cell> ")

  segments.append("</table>")
  return "".join(segments)

In [None]:
# The preprocessing helpers are defined in the cell above; the import below is
# only needed when they live in a separate preprocess_utils.py module.
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

# Show the raw cell dicts that the annotators highlighted.
print('→', 'Highlighted Cells')
for (index_row, index_col) in data_sample['highlighted_cells']:
    print(data_sample['table'][index_row][index_col])

# Turn the highlighted cells (with heuristic headers) into the tagged string
# that serves as model input. `cells_linearized` is reused by a later cell.
print('\n→', 'Linearized (Preprocessed) Cells')
subtable=get_highlighted_subtable(table=data_sample['table'], cell_indices=data_sample['highlighted_cells'], with_heuristic_headers=True)
cells_linearized=linearize_subtable(
    subtable=subtable,
    table_page_title=data_sample['table_page_title'],
    table_section_title=data_sample['table_section_title']
)
print(cells_linearized)

# Show the reference sentence(s) attached to this record.
print('\n→', 'Final (Label) Sentence')
for sentence in data_sample['sentence_annotations']:
    print(sentence['final_sentence'])

→ Highlighted Cells
{'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': 'Camille Lacourt', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': '53.08', 'is_header': False, 'column_span': 1, 'row_span': 1}

→ Linearized (Preprocessed) Cells
<page_title> Swimming at the 2012 Summer Olympics – Men's 100 metre backstroke </page_title> <section_title> Final </section_title> <table> <cell> 4 <col_header> Rank </col_header> </cell> <cell> Camille Lacourt <col_header> Name </col_header> </cell> <cell> 53.08 <col_header> Time </col_header> </cell> </table>

→ Final (Label) Sentence
Lacourt was dropped to a fourth-place time in 53.08.


In [None]:
# Prepare for Training
from transformers import T5Tokenizer

# T5 Tokenizer
tokenizer=T5Tokenizer.from_pretrained('t5-base')

# Vocab Size (before the table tags are added in the next cell)
len(tokenizer)

In [None]:
# Add Special Tokens: Table Tags
# These are the tags emitted by linearize_subtable(). The extra tags that only
# linearize_full_table() produces (<row>, <highlighted_cell>) are not
# registered here.
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})
# When Training, Resize PLM's Embedding Layer
# (done in the fine-tuning section once the model is loaded)
# model.resize_token_embeddings(len(tokenizer))

# Vocab Size (after adding the tags)
len(tokenizer)

In [None]:
# Tokenize Linearized Cells
# Shows how the tokenizer splits the linearized sample built earlier.
print(tokenizer.tokenize(cells_linearized))

### **2. LoRA Finetuning (t5-base)**

In [None]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

In [None]:
# Pre-Trained T5 Tokenizer
# Repeats the tokenizer setup from the preprocessing section so that this
# section does not rely on the exploratory cells above.
tokenizer=T5Tokenizer.from_pretrained('t5-base')
# Add Special Tokens: Table Tags
# Same tag list as in the preprocessing section (the tags produced by
# linearize_subtable()).
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})

In [None]:
# Pre-Trained T5 Model
model=T5ForConditionalGeneration.from_pretrained('t5-base')
# Resize PLM's Embedding Layer
# The tokenizer gained the table tags above, so the embedding matrix must grow
# to len(tokenizer) rows before those token ids can be used.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Original T5-base model
# Prints the module tree before LoRA adapters are attached.
print(model)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function prints nothing; it returns the report.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage formatted with two decimals.

    Raises:
        ZeroDivisionError: if the model has no parameters at all.
    """
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    percentage = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Original T5-base model
# Baseline parameter counts, taken before the model is wrapped with LoRA.
print(print_number_of_trainable_model_parameters(model))

In [None]:
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

# LoRA adapters of rank 8 on the attention modules named "q" and "v".
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # enum instead of the raw "SEQ_2_SEQ_LM" string
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.01,
)

# NOTE(review): the embedding matrix was resized for the table tags before this
# point, and only the LoRA adapters are configured here. The new tag embeddings
# therefore presumably stay at their initial values during training. Confirm
# whether that is intended.

# LoRA T5-base model
# Guard against re-running this cell: wrapping an already wrapped model again
# would nest a second adapter layer. To apply a changed peft_config, reload the
# base model first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
print(print_number_of_trainable_model_parameters(model))

In [None]:
# LoRA T5-base model
# Prints the module tree after wrapping, for comparison with the earlier
# printout of the base model.
print(model)

In [None]:
# Check requires_grad for every parameter
for name, param in model.named_parameters():
    print(name, param.requires_grad)

#### **LoRA Training**

In [None]:
from torch.utils.data import Dataset

class ToTToDataset(Dataset):
    """ToTTo examples tokenized up front for seq2seq training.

    Every non-blank line of the JSON Lines file at ``path_data`` is parsed, its
    highlighted cells are linearized (with heuristic headers) and tokenized as
    model input, and the ``final_sentence`` of its first annotation is
    tokenized as the target.

    Inputs are truncated and padded to 512 tokens. Targets are truncated to 512
    tokens but not padded, so label tensors have varying lengths.

    Args:
        path_data: Path of the ToTTo ``.jsonl`` file.
        tokenizer: Tokenizer that is callable on a string and provides
            ``encode``.
    """

    def __init__(self, path_data, tokenizer):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []
        self.attention_mask = []

        # Load Dataset
        # Stream the file line by line. f.read().splitlines() would hold the
        # whole file in memory and also split on characters such as U+2028,
        # which may legally appear unescaped inside a JSON string. The encoding
        # is explicit so parsing does not depend on the runtime's locale.
        with open(path_data, 'r', encoding='utf-8') as f:
            for _data in f:
                if not _data.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                data = json.loads(_data)

                # Preprocess
                subtable = get_highlighted_subtable(table=data['table'], cell_indices=data['highlighted_cells'], with_heuristic_headers=True)
                cells_linearized = linearize_subtable(subtable=subtable, table_page_title=data['table_page_title'], table_section_title=data['table_section_title'])

                # Encode
                # Calling the tokenizer directly replaces the deprecated
                # encode_plus().
                encoded_dict = tokenizer(cells_linearized, max_length=512, truncation=True, padding="max_length", return_attention_mask=True)
                self.data.append(encoded_dict['input_ids'])
                self.attention_mask.append(encoded_dict['attention_mask'])
                # Only the first annotation is used as the target.
                self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence'], max_length=512, truncation=True))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``torch.long`` tensors."""
        item = {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.label[idx], dtype=torch.long)
        }
        return item


    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

In [None]:
# Build the training set from the Drive copy of the ToTTo train split.
# All examples are tokenized inside the constructor.
dataset_train = ToTToDataset(path_data="/content/drive/MyDrive/ToTTo_data/totto_train_data.jsonl", tokenizer=tokenizer)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# First training run. The directory is "epoch1" so that it matches the
# checkpoint this run writes ("..._epoch1.pth") and the "epoch2" / "epoch3"
# directories used by the later runs.
output_dir = "/content/drive/MyDrive/ToTTo_T5-base_LoRA/model/epoch1"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=2000,
    save_strategy="no",  # checkpoints are written explicitly after training
    report_to="tensorboard",
)

# Create the data collator instance
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create the Seq2SeqTrainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_train,
)

# Switch off the generation cache on the model config for training.
model.config.use_cache = False

#### **Train**

In [None]:
# train model
trainer.train()

# Save the model
trainer.save_model(output_dir)

# Save the model config (if needed)
model.config.save_pretrained(output_dir)

# Save the model's state_dict (safetensors is awkward to load back when the
# model structure changes, so a plain .pth is saved as well)
torch.save(model.state_dict(), f'{output_dir}/T5-base_LoRA_Fine-Tuning_lr{training_args.learning_rate}_epoch1.pth')

Step,Training Loss
2000,1.3929
4000,1.2593
6000,1.2197
8000,1.1851
10000,1.1555
12000,1.1331
14000,1.1155




In [None]:
# Second training run on the same trainer object.
# NOTE(review): `trainer` still holds the training_args created in the earlier
# setup cell, so reassigning output_dir here only affects the save calls below.
# TensorBoard logs presumably keep going to the earlier logging_dir - confirm.
# NOTE(review): calling trainer.train() again presumably starts a fresh
# optimizer and learning-rate schedule rather than continuing the previous
# one - confirm.
output_dir="/content/drive/MyDrive/ToTTo_T5-base_LoRA/model/epoch2"

# train model
trainer.train()

# Save the model
trainer.save_model(output_dir)

# Save the model config (if needed)
model.config.save_pretrained(output_dir)

# Save the model's state_dict (safetensors is awkward to load back when the
# model structure changes, so a plain .pth is saved as well)
torch.save(model.state_dict(), f'{output_dir}/T5-base_LoRA_Fine-Tuning_lr{training_args.learning_rate}_epoch2.pth')

Step,Training Loss
2000,1.1817
4000,1.1728
6000,1.1494
8000,1.1439
10000,1.1285
12000,1.0875
14000,1.069


#### **Runtime Reset**

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which GPU the runtime provides (nothing is printed on CPU-only runtimes).
if torch.cuda.is_available():
    print("Current GPU Index:", torch.cuda.current_device())
    print("Current GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Current GPU Index: 0
Current GPU Name: Tesla T4


In [None]:
# Checkpoint written at the end of the second training run.
model_path='/content/drive/MyDrive/ToTTo_T5-base_LoRA/model/epoch2/T5-base_LoRA_Fine-Tuning_lr0.001_epoch2.pth'

# Load the saved state_dict
# Assumes `model` has already been rebuilt after the runtime reset (tokenizer,
# base model and LoRA wrapping cells re-run), because only weights are restored
# here. map_location makes the load work on whichever device the new runtime
# offers, including a CPU-only runtime for a checkpoint saved from the GPU.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32112, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32112, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [None]:
# Re-imported because this cell is meant to run after a runtime reset.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Third training run, continuing from the reloaded weights.
output_dir="/content/drive/MyDrive/ToTTo_T5-base_LoRA/model/epoch3"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=2000,
    save_strategy="no",  # checkpoints are written explicitly after training
    report_to="tensorboard",
)

# Create the data collator instance
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create the Seq2SeqTrainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_train,
)

# Switch off the generation cache on the model config for training.
model.config.use_cache = False

In [None]:
# train model
trainer.train()

# Save the model
trainer.save_model(output_dir)

# Save the model config (if needed)
model.config.save_pretrained(output_dir)

# Save the model's state_dict (safetensors is awkward to load back when the
# model structure changes, so a plain .pth is saved as well)
torch.save(model.state_dict(), f'{output_dir}/T5-base_LoRA_Fine-Tuning_lr{training_args.learning_rate}_epoch3.pth')

Step,Training Loss
2000,1.131
4000,1.1303
6000,1.1222
8000,1.1019
10000,1.0843
12000,1.07
14000,1.0623


