## **T5-small LoRA Fine-tuning on ToTTo**

Source: [JooYoung Song](https://github.com/Song-Joo-Young/ToTTo-Fine-tuning-in-colab/tree/main)

References:
* PEFT : https://huggingface.co/docs/peft/main/en/index
* ToTTo : https://github.com/google-research-datasets/ToTTo
* Prompt-Tuning-on-ToTTo : https://github.com/ChainsmokersAI/Prompt-Tuning-on-ToTTo

In [1]:
# Mount Google Drive at /content/drive.
# Later cells read the dataset from this mount and write model outputs to it.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Get Dataset
# Download the ToTTo archive and unpack it into ./totto_data
# (train, dev and unlabeled test splits, one .jsonl file each).

!wget https://storage.googleapis.com/totto-public/totto_data.zip
!unzip totto_data.zip

--2024-02-06 07:54:38--  https://storage.googleapis.com/totto-public/totto_data.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 2607:f8b0:4023:c0b::cf, 2607:f8b0:4023:c0d::cf, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187724372 (179M) [application/zip]
Saving to: ‘totto_data.zip’


2024-02-06 07:54:41 (59.4 MB/s) - ‘totto_data.zip’ saved [187724372/187724372]

Archive:  totto_data.zip
  inflating: totto_data/totto_dev_data.jsonl  
  inflating: totto_data/totto_train_data.jsonl  
  inflating: totto_data/unlabeled_totto_test_data.jsonl  


In [None]:
# Store the dataset on Google Drive (the Drive is also used later to store model weights).
# Copy Dataset to your Google Drive
import shutil
import os

source_folder = '/content/totto_data'
destination_folder = '/content/drive/MyDrive/ToTTo_data'

# Merge into the destination instead of deleting it first: re-running this cell
# must never wipe anything else that has been placed in the Drive folder.
# Files with the same name are overwritten; `dirs_exist_ok` needs Python >= 3.8.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

'/content/drive/MyDrive/ToTTo_data'

### **1. Preprocessing**

In [2]:
# Install the Hugging Face stack used below. Versions are not pinned; the run recorded
# in this notebook resolved datasets 2.16.1, peft 0.8.2, accelerate 0.26.1 and evaluate 0.4.1.
# NOTE(review): consider pinning versions so a fresh runtime reproduces this environment.
!pip install transformers datasets sentencepiece peft accelerate evaluate

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m eta 

In [3]:
# Load Train Set (JSON Lines: one JSON document per line)
# with open('/content/totto_data/totto_train_data.jsonl', 'r') as f:
with open('/content/drive/MyDrive/ToTTo_data/totto_train_data.jsonl', 'r', encoding='utf-8') as f:
    # Iterate the file so records are split on real newlines only; str.splitlines()
    # also splits on characters such as U+2028, which may legally appear inside a
    # JSON string. Blank lines are skipped. The `with` block closes the file.
    data_train = [line.rstrip('\n') for line in f if line.strip()]

# Number of Train Data
len(data_train)

120761

In [4]:
import json

# Parse the last training record; it is used as the running example below.
data_sample = json.loads(data_train[-1])

# Print every top-level field of the record with its content.
# (Skip the 'table' field here if the output gets too long.)
for field_name, field_value in data_sample.items():
    print('→', field_name, '\n \t ', field_value)

→ table 
 	  [[{'value': 'Rank', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Lane', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Name', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Nationality', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Time', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Notes', 'is_header': True, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'Matt Grevers', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'United States', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '52.16', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'OR', 'is_header': False, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '2', 'is_header': False, 'co

In [5]:
# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py

import copy

def _add_adjusted_col_offsets(table):
  """Add adjusted column offsets to take into account multi-column cells."""
  adjusted_table = []
  for row in table:
    real_col_index = 0
    adjusted_row = []
    for cell in row:
      adjusted_cell = copy.deepcopy(cell)
      adjusted_cell["adjusted_col_start"] = real_col_index
      adjusted_cell["adjusted_col_end"] = (
          adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"])
      real_col_index += adjusted_cell["column_span"]
      adjusted_row.append(adjusted_cell)
    adjusted_table.append(adjusted_row)
  return adjusted_table


def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
  """Heuristic to find row headers."""
  row_headers = []
  row = adjusted_table[row_index]
  for i in range(0, col_index):
    if row[i]["is_header"]:
      row_headers.append(row[i])
  return row_headers


def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
  """Heuristic to find column headers."""
  adjusted_cell = adjusted_table[row_index][col_index]
  adjusted_col_start = adjusted_cell["adjusted_col_start"]
  adjusted_col_end = adjusted_cell["adjusted_col_end"]
  col_headers = []
  for r in range(0, row_index):
    row = adjusted_table[r]
    for cell in row:
      if (cell["adjusted_col_start"] < adjusted_col_end and
          cell["adjusted_col_end"] > adjusted_col_start):
        if cell["is_header"]:
          col_headers.append(cell)

  return col_headers


def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
  """Extract out the highlighted part of a table.

  Args:
    table: List of rows, each row a list of cell dicts.
    cell_indices: Iterable of (row_index, col_index) pairs selecting cells.
    with_heuristic_headers: When True, attach the heuristic row and column
      headers of every selected cell; when False both header lists are empty.

  Returns:
    A list with one dict per entry of `cell_indices`, in the same order, with
    keys "cell" (the original cell object from `table`), "row_headers" and
    "col_headers" (header cells taken from the offset-adjusted copy).
  """
  # The offset-adjusted table is a deep copy of the whole table, so build it
  # only when the header heuristics actually need it.
  adjusted_table = (
      _add_adjusted_col_offsets(table) if with_heuristic_headers else None)

  highlighted_table = []
  for (row_index, col_index) in cell_indices:
    cell = table[row_index][col_index]
    if with_heuristic_headers:
      row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
                                               col_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
                                               col_index)
    else:
      row_headers = []
      col_headers = []

    highlighted_table.append({
        "cell": cell,
        "row_headers": row_headers,
        "col_headers": col_headers
    })

  return highlighted_table


def linearize_full_table(table, cell_indices, table_page_title,
                         table_section_title):
  """Linearize full table with localized headers and return a string.

  Args:
    table: List of rows, each row a list of cell dicts with at least "value",
      "is_header" and "column_span".
    cell_indices: Iterable of (row_index, col_index) pairs (lists or tuples)
      marking the highlighted cells; may be empty or None.
    table_page_title: Page title; omitted from the output when falsy.
    table_section_title: Section title; omitted from the output when falsy.

  Returns:
    The whole table as one tagged string. Highlighted cells are wrapped in
    <highlighted_cell> tags, all other cells in <cell> tags.

  Raises:
    AssertionError: If highlighted indices were given but none of them
      matched a cell of the table.
  """
  # Normalise the indices to a set of tuples. The previous list membership
  # test `[r, c] in cell_indices` silently missed tuple pairs (which
  # get_highlighted_subtable accepts) and failed on None.
  highlighted = {tuple(index) for index in (cell_indices or ())}

  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "

  table_str += "<table> "
  adjusted_table = _add_adjusted_col_offsets(table)
  for r_index, row in enumerate(table):
    row_str = "<row> "
    for c_index, col in enumerate(row):

      row_headers = _get_heuristic_row_headers(adjusted_table, r_index, c_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, r_index, c_index)

      # Distinguish between highlighted and non-highlighted cells.
      if (r_index, c_index) in highlighted:
        start_cell_marker = "<highlighted_cell> "
        end_cell_marker = "</highlighted_cell> "
      else:
        start_cell_marker = "<cell> "
        end_cell_marker = "</cell> "

      # The value of the cell.
      item_str = start_cell_marker + col["value"] + " "

      # All the column headers associated with this cell.
      for col_header in col_headers:
        item_str += "<col_header> " + col_header["value"] + " </col_header> "

      # All the row headers associated with this cell.
      for row_header in row_headers:
        item_str += "<row_header> " + row_header["value"] + " </row_header> "

      item_str += end_cell_marker
      row_str += item_str

    row_str += "</row> "
    table_str += row_str

  table_str += "</table>"
  # Sanity check: requested highlights must show up in the output.
  if highlighted:
    assert "<highlighted_cell>" in table_str
  return table_str


def linearize_subtable(subtable, table_page_title, table_section_title):
  """Serialise a highlighted subtable into a single tagged string.

  The output is the optional <page_title> and <section_title> segments
  (each emitted only when the title is truthy) followed by a <table> segment
  holding one <cell> entry per subtable item. Inside a cell entry the value
  comes first, then its column headers, then its row headers.
  """
  pieces = []
  if table_page_title:
    pieces.append("<page_title> " + table_page_title + " </page_title> ")
  if table_section_title:
    pieces.append(
        "<section_title> " + table_section_title + " </section_title> ")
  pieces.append("<table> ")

  for entry in subtable:
    cell, row_hdrs, col_hdrs = (
        entry["cell"], entry["row_headers"], entry["col_headers"])

    # Cell value, then column headers, then row headers.
    pieces.append("<cell> " + cell["value"] + " ")
    pieces.extend(
        "<col_header> " + header["value"] + " </col_header> "
        for header in col_hdrs)
    pieces.extend(
        "<row_header> " + header["value"] + " </row_header> "
        for header in row_hdrs)
    pieces.append("</cell> ")

  pieces.append("</table>")
  return "".join(pieces)

In [6]:
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

# 1) Raw highlighted cells of the sample record.
print('→', 'Highlighted Cells')
for row_idx, col_idx in data_sample['highlighted_cells']:
    print(data_sample['table'][row_idx][col_idx])

# 2) The same cells after preprocessing: headers attached, then flattened to one tagged string.
print('\n→', 'Linearized (Preprocessed) Cells')
subtable = get_highlighted_subtable(
    data_sample['table'],
    data_sample['highlighted_cells'],
    with_heuristic_headers=True,
)
cells_linearized = linearize_subtable(
    subtable,
    data_sample['table_page_title'],
    data_sample['table_section_title'],
)
print(cells_linearized)

# 3) Reference sentence(s) attached to the record.
print('\n→', 'Final (Label) Sentence')
for annotation in data_sample['sentence_annotations']:
    print(annotation['final_sentence'])

→ Highlighted Cells
{'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': 'Camille Lacourt', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': '53.08', 'is_header': False, 'column_span': 1, 'row_span': 1}

→ Linearized (Preprocessed) Cells
<page_title> Swimming at the 2012 Summer Olympics – Men's 100 metre backstroke </page_title> <section_title> Final </section_title> <table> <cell> 4 <col_header> Rank </col_header> </cell> <cell> Camille Lacourt <col_header> Name </col_header> </cell> <cell> 53.08 <col_header> Time </col_header> </cell> </table>

→ Final (Label) Sentence
Lacourt was dropped to a fourth-place time in 53.08.


In [7]:
# Prepare for Training
from transformers import T5Tokenizer

# T5 Tokenizer: load the tokenizer published with the 't5-small' checkpoint.
tokenizer=T5Tokenizer.from_pretrained('t5-small')

# Vocab Size (before the table tags are added)
len(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32100

In [8]:
# Add Special Tokens: Table Tags
# Each tag name yields an opening and a closing token, in the same order as the
# hand-written list this replaces (page_title first, row_header last).
tag_names = ['page_title', 'section_title', 'table', 'cell', 'col_header', 'row_header']
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        tag
        for name in tag_names
        for tag in (f'<{name}>', f'</{name}>')
    ]
})
# When training, the model's embedding layer has to be resized to the new vocabulary:
# model.resize_token_embeddings(len(tokenizer))

# Vocab Size (after adding the 12 tags)
len(tokenizer)

32112

In [9]:
# Tokenize Linearized Cells
# Shows that every table tag is kept as a single token after registration.
print(tokenizer.tokenize(cells_linearized))

['<page_title>', '▁Swimming', '▁at', '▁the', '▁2012', '▁Summer', '▁Olympics', '▁', '–', '▁Men', "'", 's', '▁100', '▁', 'metre', '▁back', 'stroke', '</page_title>', '<section_title>', '▁Final', '</section_title>', '<table>', '<cell>', '▁4', '<col_header>', '▁', 'Rank', '</col_header>', '</cell>', '<cell>', '▁Camill', 'e', '▁La', 'court', '<col_header>', '▁Name', '</col_header>', '</cell>', '<cell>', '▁53', '.', '08', '<col_header>', '▁Time', '</col_header>', '</cell>', '</table>']


### **2. LoRA Finetuning (t5-small)**

In [10]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# NOTE(review): `AdamW` and `get_linear_schedule_with_warmup` are not referenced by any
# cell in view (training goes through Seq2SeqTrainer). `transformers.AdamW` is deprecated
# upstream — confirm this import still works on the installed transformers version.
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

In [11]:
# Pre-Trained T5 Tokenizer (fresh instance for the training section)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Add Special Tokens: Table Tags
# Opening tag then closing tag for each name, page_title first and row_header last.
table_tags = [
    f'{bracket}{name}>'
    for name in ('page_title', 'section_title', 'table', 'cell', 'col_header', 'row_header')
    for bracket in ('<', '</')
]
tokenizer.add_special_tokens({'additional_special_tokens': table_tags})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


12

In [12]:
# Pre-Trained T5 Model
model=T5ForConditionalGeneration.from_pretrained('t5-small')
# Resize PLM's Embedding Layer so the 12 added tag tokens get embedding rows (32100 -> 32112).
# NOTE(review): the new rows are presumably freshly initialised, and the parameter listing
# further down shows `shared.weight` frozen once LoRA is applied, so these rows would
# never be trained — confirm this is intended.
model.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32112, 512)

In [13]:
# Original T5-small model: print the module tree before LoRA is applied.
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32112, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32112, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [14]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Walks `model.named_parameters()` and counts elements via `numel()`.
    Despite the name nothing is printed: the three-line report (trainable
    count, total count, trainable percentage with two decimals) is returned
    as a string for the caller to print.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {100 * trainable / total:.2f}%"
    )

# Original T5-small model: every parameter is trainable before LoRA is applied.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 60498432
all model parameters: 60498432
percentage of trainable model parameters: 100.00%


In [15]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA configuration for the sequence-to-sequence model.
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,                        # rank of the low-rank adapter matrices
    lora_alpha=32,              # LoRA scaling factor
    target_modules=["q", "v"],  # adapt only the attention query and value projections
    lora_dropout=0.01,
)

# LoRA T5-small model: wrap the base model, then report how many parameters stay trainable.
model = get_peft_model(model, peft_config)
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 294912
all model parameters: 60793344
percentage of trainable model parameters: 0.49%


In [16]:
# LoRA T5-small model: print the module tree; `q` and `v` now appear as lora.Linear wrappers.
print(model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32112, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32112, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [17]:
# Check requires_grad for every parameter (only the lora_A / lora_B weights should be True).
for name, param in model.named_parameters():
    print(name, param.requires_grad)

base_model.model.shared.weight False
base_model.model.encoder.block.0.layer.0.SelfAttention.q.base_layer.weight False
base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.SelfAttention.k.weight False
base_model.model.encoder.block.0.layer.0.SelfAttention.v.base_layer.weight False
base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.SelfAttention.o.weight False
base_model.model.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight False
base_model.model.encoder.block.0.layer.0.layer_norm.weight False
base_model.model.encoder.block.0.layer.1.DenseReluDense.wi.weight False
base_model.model.encoder.block.0.layer.1.DenseReluDense.wo.weight False
base_model.model.encoder

#### **LoRA Training**

In [18]:
from torch.utils.data import Dataset

class ToTToDataset(Dataset):
    """ToTTo examples tokenised for sequence-to-sequence training.

    Every JSON line of `path_data` becomes one example: the highlighted cells
    (with heuristic headers) are linearised into the source text, and the
    `final_sentence` of the first sentence annotation is the target.

    Args:
        path_data: Path to a ToTTo `.jsonl` file.
        tokenizer: Tokenizer providing `encode_plus` and `encode`.
        max_source_length: Truncation length for the linearised table.
        max_target_length: Truncation length for the target sentence.
        padding: Padding strategy forwarded to the tokenizer for the source.
            The default "max_length" keeps the previous behaviour (every
            source padded to `max_source_length`). Pass False to store
            unpadded sequences and leave padding to a batch collator.
    """

    def __init__(self, path_data, tokenizer, max_source_length=512,
                 max_target_length=512, padding="max_length"):
        self.tokenizer = tokenizer
        self.data = []            # source token ids, one list per example
        self.label = []           # target token ids, one list per example
        self.attention_mask = []  # source attention masks, aligned with self.data

        # Load Dataset. The file is streamed so records are split on real
        # newlines only (str.splitlines() also splits on characters such as
        # U+2028) and the raw text is never held in memory all at once.
        with open(path_data, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end of the file
                data = json.loads(line)

                # Preprocess
                subtable = get_highlighted_subtable(table=data['table'], cell_indices=data['highlighted_cells'], with_heuristic_headers=True)
                cells_linearized = linearize_subtable(subtable=subtable, table_page_title=data['table_page_title'], table_section_title=data['table_section_title'])

                # Encode
                encoded_dict = tokenizer.encode_plus(cells_linearized, max_length=max_source_length, truncation=True, padding=padding, return_attention_mask=True)
                self.data.append(encoded_dict['input_ids'])
                self.attention_mask.append(encoded_dict['attention_mask'])
                # Only the first annotation is used as the target; it is left unpadded.
                self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence'], max_length=max_target_length, truncation=True))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        """Return example `idx` as a dict of long tensors."""
        item = {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.label[idx], dtype=torch.long)
        }
        return item


    def __len__(self):
        """Number of loaded examples."""
        return len(self.data)

In [19]:
# Tokenise the whole training split up front (reads the copy stored on Google Drive).
dataset_train = ToTToDataset(path_data="/content/drive/MyDrive/ToTTo_data/totto_train_data.jsonl", tokenizer=tokenizer)

120761 datas
120761 labels


In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

output_dir = "/content/drive/MyDrive/ToTTo_T5-small_LoRA/model/epoch10"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Optimisation
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=10,
    # Logging: training loss every 2000 steps, reported to TensorBoard
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=2000,
    report_to="tensorboard",
    # NOTE(review): no intermediate checkpoints are written, so an interrupted
    # run loses all progress — confirm this is acceptable for a multi-hour run.
    save_strategy="no",
)

# Create the data collator used to assemble batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create the Seq2SeqTrainer instance.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    data_collator=data_collator,
)

model.config.use_cache = False

##### **Error debugging (Trainer.train())**

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-29-b7e217745f1c> in <cell line: 2>()
*       1 # train model
* ----> 2 trainer.train()

9 frames
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose) \\
*    3212         # The model's main input name, usually `input_ids`, has be passed for padding
*    3213         if self.model_input_names[0] not in encoded_inputs:
* -> 3214             raise ValueError(
*    3215                 "You should supply an encoding or a list of encodings to this method "
*    3216                 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [None]:
from torch.utils.data import DataLoader

# Temporary DataLoader that pushes the training set through the seq2seq collator.
temp_loader = DataLoader(trainer.train_dataset, batch_size=4, collate_fn=data_collator)

# Check the batch layout; only the first batch is inspected.
for batch in temp_loader:
    print(batch.keys())
    for field in ('input_ids', 'attention_mask'):
        print(batch[field].shape)
    if 'labels' in batch:
        print(batch['labels'].shape)
    break

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4, 43])


In [None]:
from torch.utils.data import DataLoader

# Build a shuffled DataLoader with the data collator as collate_fn.
sample_loader = DataLoader(
    trainer.train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Pull a single batch and report the shape of every tensor in it.
sample_batch = next(iter(sample_loader))
print("Batch keys:", sample_batch.keys())
for name, tensor in sample_batch.items():
    print(f"{name}: shape {tensor.shape}")

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
input_ids: shape torch.Size([8, 512])
attention_mask: shape torch.Size([8, 512])
labels: shape torch.Size([8, 62])
decoder_input_ids: shape torch.Size([8, 62])


#### **Train**

In [21]:
# train model
# The recorded run took about 34,800 s (train_runtime) for 10 epochs / 150,960 steps.
trainer.train()

Step,Training Loss
2000,1.7905
4000,1.6221
6000,1.5875
8000,1.555
10000,1.5321
12000,1.5183
14000,1.5103
16000,1.4983
18000,1.4839
20000,1.4712


TrainOutput(global_step=150960, training_loss=1.3879326121416684, metrics={'train_runtime': 34809.3021, 'train_samples_per_second': 34.692, 'train_steps_per_second': 4.337, 'total_flos': 1.6453417090154496e+17, 'train_loss': 1.3879326121416684, 'epoch': 10.0})

In [22]:
# Save the model through the Trainer.
trainer.save_model(output_dir)

# Save the tokenizer as well: it was extended with the table-tag tokens, and the
# resized embedding matrix only lines up with a tokenizer that has the same vocabulary.
tokenizer.save_pretrained(output_dir)

# Save the model config (in case it is needed).
model.config.save_pretrained(output_dir)

# Also save the full state_dict as .pth (safetensors files are awkward to reload
# when the model structure has changed). The path is derived from `output_dir` and the
# training arguments so it cannot drift from the configuration above.
state_dict_path = (
    f'{output_dir}/T5-small_LoRA_Fine-Tuning_'
    f'lr{training_args.learning_rate}_epoch{training_args.num_train_epochs:g}.pth'
)
torch.save(model.state_dict(), state_dict_path)

