## **T5-base Fine-tuning on ToTTo**

From : [JooYoung Song](https://github.com/Song-Joo-Young/ToTTo-Fine-tuning-in-colab/tree/main)

Code Reference :
* ToTTo : https://github.com/google-research-datasets/ToTTo
* Prompt-Tuning-on-ToTTo : https://github.com/ChainsmokersAI/Prompt-Tuning-on-ToTTo

In [1]:
# Google Drive Mount

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Get Dataset

!wget https://storage.googleapis.com/totto-public/totto_data.zip
!unzip totto_data.zip

--2024-02-01 01:51:35--  https://storage.googleapis.com/totto-public/totto_data.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.207, 2607:f8b0:4023:c0d::cf, 2607:f8b0:4023:c06::cf
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187724372 (179M) [application/zip]
Saving to: ‘totto_data.zip’


2024-02-01 01:51:38 (73.9 MB/s) - ‘totto_data.zip’ saved [187724372/187724372]

Archive:  totto_data.zip
  inflating: totto_data/totto_dev_data.jsonl  
  inflating: totto_data/totto_train_data.jsonl  
  inflating: totto_data/unlabeled_totto_test_data.jsonl  


In [None]:
# 드라이브에 데이터셋 저장 추후 가중치도 저장할 폴더
# Copy Dataset to your Google Drive
import shutil
import os

source_folder = '/content/totto_data'
destination_folder = '/content/drive/MyDrive/ToTTo_T5-base'

if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)

shutil.copytree(source_folder, destination_folder)

### **1. Preprocessing**

In [2]:
!pip install transformers datasets sentencepiece

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [3]:
# Load Train Set
# with open('/content/totto_data/totto_train_data.jsonl', 'r') as f:
with open('/content/drive/MyDrive/ToTTo_T5-base/totto_train_data.jsonl', 'r') as f:
    data_train=f.read().splitlines()
    f.close()

# Number of Train Data
len(data_train)

120761

In [4]:
import json

# Sample Data
data_sample=json.loads(data_train[-1])

# Key-Value Set
for key, value in data_sample.items():
    # if key=='table': continue

    print('→', key, '\n \t ', value)

→ table 
 	  [[{'value': 'Rank', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Lane', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Name', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Nationality', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Time', 'is_header': True, 'column_span': 1, 'row_span': 1}, {'value': 'Notes', 'is_header': True, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'Matt Grevers', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'United States', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '52.16', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': 'OR', 'is_header': False, 'column_span': 1, 'row_span': 1}], [{'value': '', 'is_header': False, 'column_span': 1, 'row_span': 1}, {'value': '2', 'is_header': False, 'co

In [5]:
# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py

import copy

def _add_adjusted_col_offsets(table):
  """Add adjusted column offsets to take into account multi-column cells."""
  adjusted_table = []
  for row in table:
    real_col_index = 0
    adjusted_row = []
    for cell in row:
      adjusted_cell = copy.deepcopy(cell)
      adjusted_cell["adjusted_col_start"] = real_col_index
      adjusted_cell["adjusted_col_end"] = (
          adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"])
      real_col_index += adjusted_cell["column_span"]
      adjusted_row.append(adjusted_cell)
    adjusted_table.append(adjusted_row)
  return adjusted_table


def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
  """Heuristic to find row headers."""
  row_headers = []
  row = adjusted_table[row_index]
  for i in range(0, col_index):
    if row[i]["is_header"]:
      row_headers.append(row[i])
  return row_headers


def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
  """Heuristic to find column headers."""
  adjusted_cell = adjusted_table[row_index][col_index]
  adjusted_col_start = adjusted_cell["adjusted_col_start"]
  adjusted_col_end = adjusted_cell["adjusted_col_end"]
  col_headers = []
  for r in range(0, row_index):
    row = adjusted_table[r]
    for cell in row:
      if (cell["adjusted_col_start"] < adjusted_col_end and
          cell["adjusted_col_end"] > adjusted_col_start):
        if cell["is_header"]:
          col_headers.append(cell)

  return col_headers


def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
  """Extract out the highlighted part of a table."""
  highlighted_table = []

  adjusted_table = _add_adjusted_col_offsets(table)

  for (row_index, col_index) in cell_indices:
    cell = table[row_index][col_index]
    if with_heuristic_headers:
      row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
                                               col_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
                                               col_index)
    else:
      row_headers = []
      col_headers = []

    highlighted_cell = {
        "cell": cell,
        "row_headers": row_headers,
        "col_headers": col_headers
    }
    highlighted_table.append(highlighted_cell)

  return highlighted_table


def linearize_full_table(table, cell_indices, table_page_title,
                         table_section_title):
  """Linearize full table with localized headers and return a string."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "

  table_str += "<table> "
  adjusted_table = _add_adjusted_col_offsets(table)
  for r_index, row in enumerate(table):
    row_str = "<row> "
    for c_index, col in enumerate(row):

      row_headers = _get_heuristic_row_headers(adjusted_table, r_index, c_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, r_index, c_index)

      # Distinguish between highlighted and non-highlighted cells.
      if [r_index, c_index] in cell_indices:
        start_cell_marker = "<highlighted_cell> "
        end_cell_marker = "</highlighted_cell> "
      else:
        start_cell_marker = "<cell> "
        end_cell_marker = "</cell> "

      # The value of the cell.
      item_str = start_cell_marker + col["value"] + " "

      # All the column headers associated with this cell.
      for col_header in col_headers:
        item_str += "<col_header> " + col_header["value"] + " </col_header> "

      # All the row headers associated with this cell.
      for row_header in row_headers:
        item_str += "<row_header> " + row_header["value"] + " </row_header> "

      item_str += end_cell_marker
      row_str += item_str

    row_str += "</row> "
    table_str += row_str

  table_str += "</table>"
  if cell_indices:
    assert "<highlighted_cell>" in table_str
  return table_str


def linearize_subtable(subtable, table_page_title, table_section_title):
  """Linearize the highlighted subtable and return a string of its contents."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "
  table_str += "<table> "

  for item in subtable:
    cell = item["cell"]
    row_headers = item["row_headers"]
    col_headers = item["col_headers"]

    # The value of the cell.
    item_str = "<cell> " + cell["value"] + " "

    # All the column headers associated with this cell.
    for col_header in col_headers:
      item_str += "<col_header> " + col_header["value"] + " </col_header> "

    # All the row headers associated with this cell.
    for row_header in row_headers:
      item_str += "<row_header> " + row_header["value"] + " </row_header> "

    item_str += "</cell> "
    table_str += item_str

  table_str += "</table>"
  return table_str

In [6]:
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

print('→', 'Highlighted Cells')
for (index_row, index_col) in data_sample['highlighted_cells']:
    print(data_sample['table'][index_row][index_col])

print('\n→', 'Linearized (Preprocessed) Cells')
subtable=get_highlighted_subtable(table=data_sample['table'], cell_indices=data_sample['highlighted_cells'], with_heuristic_headers=True)
cells_linearized=linearize_subtable(
    subtable=subtable,
    table_page_title=data_sample['table_page_title'],
    table_section_title=data_sample['table_section_title']
)
print(cells_linearized)

print('\n→', 'Final (Label) Sentence')
for sentence in data_sample['sentence_annotations']:
    print(sentence['final_sentence'])

→ Highlighted Cells
{'value': '4', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': 'Camille Lacourt', 'is_header': False, 'column_span': 1, 'row_span': 1}
{'value': '53.08', 'is_header': False, 'column_span': 1, 'row_span': 1}

→ Linearized (Preprocessed) Cells
<page_title> Swimming at the 2012 Summer Olympics – Men's 100 metre backstroke </page_title> <section_title> Final </section_title> <table> <cell> 4 <col_header> Rank </col_header> </cell> <cell> Camille Lacourt <col_header> Name </col_header> </cell> <cell> 53.08 <col_header> Time </col_header> </cell> </table>

→ Final (Label) Sentence
Lacourt was dropped to a fourth-place time in 53.08.


In [7]:
# Prepare for Training
from transformers import T5Tokenizer

# T5 Tokenizer
tokenizer=T5Tokenizer.from_pretrained('t5-base')

# Vocab Size
len(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32100

In [8]:
# Add Special Tokens: Table Tags
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})
# When Training, Resize PLM's Embedding Layer
# model.resize_token_embeddings(len(tokenizer))

# Vocab Size
len(tokenizer)

32112

In [9]:
# Tokenize Linearized Cells
print(tokenizer.tokenize(cells_linearized))

['<page_title>', '▁Swimming', '▁at', '▁the', '▁2012', '▁Summer', '▁Olympics', '▁', '–', '▁Men', "'", 's', '▁100', '▁', 'metre', '▁back', 'stroke', '</page_title>', '<section_title>', '▁Final', '</section_title>', '<table>', '<cell>', '▁4', '<col_header>', '▁', 'Rank', '</col_header>', '</cell>', '<cell>', '▁Camill', 'e', '▁La', 'court', '<col_header>', '▁Name', '</col_header>', '</cell>', '<cell>', '▁53', '.', '08', '<col_header>', '▁Time', '</col_header>', '</cell>', '</table>']


### **2. Finetuning (t5-base)**

In [10]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

In [11]:
# Train Config
device=torch.device('cuda:0')
lr=1e-4
batch_size=8 # 3 for 't5-large' and make 'accumulation_steps' larger
accumulation_steps=3
epochs=5

In [12]:
# Pre-Trained T5 Tokenizer
tokenizer=T5Tokenizer.from_pretrained('t5-base')
# Add Special Tokens: Table Tags
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


12

In [13]:
class ToTToDataset(Dataset):
    def __init__(self, path_data, tokenizer):
        #
        self.data=[]
        self.label=[]

        # Load Dataset
        with open(path_data, 'r') as f:
            dataset=f.read().splitlines()
            f.close()

        for _data in dataset:
            data=json.loads(_data)

            # Preprocess
            subtable=get_highlighted_subtable(table=data['table'], cell_indices=data['highlighted_cells'], with_heuristic_headers=True)
            cells_linearized=linearize_subtable(
                subtable=subtable,
                table_page_title=data['table_page_title'],
                table_section_title=data['table_section_title']
            )

            # Encode
            encoded=tokenizer.encode(cells_linearized)
            if len(encoded)>512:
                # Truncate
                encoded=encoded[:511]+[tokenizer.eos_token_id]
            self.data.append(encoded)
            self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence']))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

    def __len__(self):
        return len(self.data)

In [14]:
def collate_fn(batch):
    """
    Same Sequence Length on Same Batch
    """
    max_len_data=0
    max_len_label=0
    for data, label in batch:
        if len(data)>max_len_data: max_len_data=len(data)
        if len(label)>max_len_label: max_len_label=len(label)

    datas=[]
    attn_masks=[]
    labels=[]
    for data, label in batch:
        data.extend([tokenizer.pad_token_id]*(max_len_data-len(data)))
        datas.append(data)

        attn_mask=[int(e!=tokenizer.pad_token_id) for e in data]
        attn_masks.append(attn_mask)

        label.extend([-100]*(max_len_label-len(label)))
        labels.append(label)

    return torch.tensor(datas), torch.tensor(attn_masks), torch.tensor(labels)

In [15]:
# Pre-Trained T5 Model
model=T5ForConditionalGeneration.from_pretrained('t5-base')
# Resize PLM's Embedding Layer
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32112, 768)

In [16]:
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32112, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32112, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
# dataset_train=ToTToDataset(path_data='/content/totto_data/totto_train_data.jsonl', tokenizer=tokenizer)
dataset_train=ToTToDataset(path_data='/content/drive/MyDrive/ToTTo_T5-base/totto_train_data.jsonl', tokenizer=tokenizer)
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors


120761 datas
120761 labels


In [18]:
# Optim, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=int(epochs*len(dataset_train)/(accumulation_steps*batch_size))
)



In [19]:
from tqdm import tqdm

step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)

    loss_train = 0
    optimizer.zero_grad()

    tqdm_dataloader_train = tqdm(dataloader_train, desc=f'Epoch {epoch + 1}')

    for step, (data, attn_mask, label) in enumerate(tqdm_dataloader_train):
        data = data.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids=data, attention_mask=attn_mask, labels=label)

        loss = outputs[0] / accumulation_steps
        loss.backward()

        loss_train += loss.item()

        if (step + 1) % accumulation_steps == 0:
            step_global += 1

            # Console
            if step_global % 1000 == 0:
                print(f'\n Epoch {epoch + 1}  Step {step_global} Train loss {loss_train:.4f}')
            # Set Loss to 0
            loss_train = 0

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model.state_dict(), f'/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr{lr}_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}.pth')


Epoch 1:  20%|█▉        | 3000/15096 [17:43<1:12:46,  2.77it/s]


 Epoch 1  Step 1000 Train loss 1.1804


Epoch 1:  40%|███▉      | 6000/15096 [35:59<44:31,  3.40it/s]


 Epoch 1  Step 2000 Train loss 1.3387


Epoch 1:  60%|█████▉    | 9000/15096 [54:18<37:02,  2.74it/s]


 Epoch 1  Step 3000 Train loss 1.0142


Epoch 1:  79%|███████▉  | 12000/15096 [1:12:38<14:09,  3.64it/s]


 Epoch 1  Step 4000 Train loss 1.0326


Epoch 1:  99%|█████████▉| 15000/15096 [1:30:50<00:32,  2.92it/s]


 Epoch 1  Step 5000 Train loss 1.1553


Epoch 1: 100%|██████████| 15096/15096 [1:31:25<00:00,  2.75it/s]
Epoch 2:  19%|█▉        | 2904/15096 [36:39<2:14:08,  1.51it/s]


 Epoch 2  Step 6000 Train loss 1.1510


Epoch 2:  39%|███▉      | 5904/15096 [1:13:45<2:06:08,  1.21it/s]


 Epoch 2  Step 7000 Train loss 1.0231


Epoch 2:  59%|█████▉    | 8904/15096 [1:50:49<1:32:17,  1.12it/s]


 Epoch 2  Step 8000 Train loss 0.9752


Epoch 2:  79%|███████▉  | 11904/15096 [2:28:19<36:22,  1.46it/s]


 Epoch 2  Step 9000 Train loss 0.9853


Epoch 2:  99%|█████████▊| 14904/15096 [3:05:42<01:59,  1.60it/s]


 Epoch 2  Step 10000 Train loss 1.1179


Epoch 2: 100%|██████████| 15096/15096 [3:08:00<00:00,  1.34it/s]
Epoch 3:  19%|█▊        | 2808/15096 [49:00<3:25:17,  1.00s/it]


 Epoch 3  Step 11000 Train loss 1.0970


Epoch 3:  38%|███▊      | 5808/15096 [1:41:22<2:43:12,  1.05s/it]


 Epoch 3  Step 12000 Train loss 0.9119


Epoch 3:  58%|█████▊    | 8808/15096 [2:33:45<1:50:47,  1.06s/it]


 Epoch 3  Step 13000 Train loss 0.8262


Epoch 3:  78%|███████▊  | 11808/15096 [3:26:07<57:30,  1.05s/it]


 Epoch 3  Step 14000 Train loss 0.9843


Epoch 3:  98%|█████████▊| 14808/15096 [4:18:33<05:02,  1.05s/it]


 Epoch 3  Step 15000 Train loss 1.0589


Epoch 3: 100%|██████████| 15096/15096 [4:23:35<00:00,  1.05s/it]
Epoch 4:  18%|█▊        | 2712/15096 [49:03<3:41:52,  1.07s/it]


 Epoch 4  Step 16000 Train loss 0.7081


Epoch 4:  38%|███▊      | 5712/15096 [1:43:19<2:49:45,  1.09s/it]


 Epoch 4  Step 17000 Train loss 1.0178


Epoch 4:  58%|█████▊    | 8712/15096 [2:37:35<1:56:31,  1.10s/it]


 Epoch 4  Step 18000 Train loss 0.6478


Epoch 4:  78%|███████▊  | 11712/15096 [3:31:53<1:02:15,  1.10s/it]


 Epoch 4  Step 19000 Train loss 0.8178


Epoch 4:  97%|█████████▋| 14712/15096 [4:26:10<06:56,  1.09s/it]


 Epoch 4  Step 20000 Train loss 0.8123


Epoch 4: 100%|██████████| 15096/15096 [4:33:07<00:00,  1.09s/it]
Epoch 5:  17%|█▋        | 2616/15096 [49:04<3:53:06,  1.12s/it]


 Epoch 5  Step 21000 Train loss 0.7298


Epoch 5:  37%|███▋      | 5616/15096 [1:45:24<3:00:31,  1.14s/it]


 Epoch 5  Step 22000 Train loss 0.8804


Epoch 5:  57%|█████▋    | 8616/15096 [2:41:46<2:00:27,  1.12s/it]


 Epoch 5  Step 23000 Train loss 0.7858


Epoch 5:  77%|███████▋  | 11616/15096 [3:38:10<1:06:00,  1.14s/it]


 Epoch 5  Step 24000 Train loss 0.7195


Epoch 5:  97%|█████████▋| 14616/15096 [4:34:33<09:03,  1.13s/it]


 Epoch 5  Step 25000 Train loss 1.0546


Epoch 5: 100%|██████████| 15096/15096 [4:43:34<00:00,  1.13s/it]


**1 EPOCH 이상 안 돌아가면 나눠서 학습 진행**

* Split the training
    * 1 epoch training at each runtime

#### **Epoch 2**

In [None]:
# EPOCH 2
# Google Drive Mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets sentencepiece

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with

In [None]:
# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py

import copy

def _add_adjusted_col_offsets(table):
  """Add adjusted column offsets to take into account multi-column cells."""
  adjusted_table = []
  for row in table:
    real_col_index = 0
    adjusted_row = []
    for cell in row:
      adjusted_cell = copy.deepcopy(cell)
      adjusted_cell["adjusted_col_start"] = real_col_index
      adjusted_cell["adjusted_col_end"] = (
          adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"])
      real_col_index += adjusted_cell["column_span"]
      adjusted_row.append(adjusted_cell)
    adjusted_table.append(adjusted_row)
  return adjusted_table


def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
  """Heuristic to find row headers."""
  row_headers = []
  row = adjusted_table[row_index]
  for i in range(0, col_index):
    if row[i]["is_header"]:
      row_headers.append(row[i])
  return row_headers


def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
  """Heuristic to find column headers."""
  adjusted_cell = adjusted_table[row_index][col_index]
  adjusted_col_start = adjusted_cell["adjusted_col_start"]
  adjusted_col_end = adjusted_cell["adjusted_col_end"]
  col_headers = []
  for r in range(0, row_index):
    row = adjusted_table[r]
    for cell in row:
      if (cell["adjusted_col_start"] < adjusted_col_end and
          cell["adjusted_col_end"] > adjusted_col_start):
        if cell["is_header"]:
          col_headers.append(cell)

  return col_headers


def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
  """Extract out the highlighted part of a table."""
  highlighted_table = []

  adjusted_table = _add_adjusted_col_offsets(table)

  for (row_index, col_index) in cell_indices:
    cell = table[row_index][col_index]
    if with_heuristic_headers:
      row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
                                               col_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
                                               col_index)
    else:
      row_headers = []
      col_headers = []

    highlighted_cell = {
        "cell": cell,
        "row_headers": row_headers,
        "col_headers": col_headers
    }
    highlighted_table.append(highlighted_cell)

  return highlighted_table


def linearize_full_table(table, cell_indices, table_page_title,
                         table_section_title):
  """Linearize full table with localized headers and return a string."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "

  table_str += "<table> "
  adjusted_table = _add_adjusted_col_offsets(table)
  for r_index, row in enumerate(table):
    row_str = "<row> "
    for c_index, col in enumerate(row):

      row_headers = _get_heuristic_row_headers(adjusted_table, r_index, c_index)
      col_headers = _get_heuristic_col_headers(adjusted_table, r_index, c_index)

      # Distinguish between highlighted and non-highlighted cells.
      if [r_index, c_index] in cell_indices:
        start_cell_marker = "<highlighted_cell> "
        end_cell_marker = "</highlighted_cell> "
      else:
        start_cell_marker = "<cell> "
        end_cell_marker = "</cell> "

      # The value of the cell.
      item_str = start_cell_marker + col["value"] + " "

      # All the column headers associated with this cell.
      for col_header in col_headers:
        item_str += "<col_header> " + col_header["value"] + " </col_header> "

      # All the row headers associated with this cell.
      for row_header in row_headers:
        item_str += "<row_header> " + row_header["value"] + " </row_header> "

      item_str += end_cell_marker
      row_str += item_str

    row_str += "</row> "
    table_str += row_str

  table_str += "</table>"
  if cell_indices:
    assert "<highlighted_cell>" in table_str
  return table_str


def linearize_subtable(subtable, table_page_title, table_section_title):
  """Linearize the highlighted subtable and return a string of its contents."""
  table_str = ""
  if table_page_title:
    table_str += "<page_title> " + table_page_title + " </page_title> "
  if table_section_title:
    table_str += "<section_title> " + table_section_title + " </section_title> "
  table_str += "<table> "

  for item in subtable:
    cell = item["cell"]
    row_headers = item["row_headers"]
    col_headers = item["col_headers"]

    # The value of the cell.
    item_str = "<cell> " + cell["value"] + " "

    # All the column headers associated with this cell.
    for col_header in col_headers:
      item_str += "<col_header> " + col_header["value"] + " </col_header> "

    # All the row headers associated with this cell.
    for row_header in row_headers:
      item_str += "<row_header> " + row_header["value"] + " </row_header> "

    item_str += "</cell> "
    table_str += item_str

  table_str += "</table>"
  return table_str

In [None]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

# Google's Official Preprocess Codes
# https://github.com/google-research/language/blob/master/language/totto/baseline_preprocessing/preprocess_utils.py
# from preprocess_utils import get_highlighted_subtable, linearize_subtable

In [None]:
class ToTToDataset(Dataset):
    def __init__(self, path_data, tokenizer):
        #
        self.data=[]
        self.label=[]

        # Load Dataset
        with open(path_data, 'r') as f:
            dataset=f.read().splitlines()
            f.close()

        for _data in dataset:
            data=json.loads(_data)

            # Preprocess
            subtable=get_highlighted_subtable(table=data['table'], cell_indices=data['highlighted_cells'], with_heuristic_headers=True)
            cells_linearized=linearize_subtable(
                subtable=subtable,
                table_page_title=data['table_page_title'],
                table_section_title=data['table_section_title']
            )

            # Encode
            encoded=tokenizer.encode(cells_linearized)
            if len(encoded)>512:
                # Truncate
                encoded=encoded[:511]+[tokenizer.eos_token_id]
            self.data.append(encoded)
            self.label.append(tokenizer.encode(data['sentence_annotations'][0]['final_sentence']))

        print(len(self.data), 'datas')
        print(len(self.label), 'labels')

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

    def __len__(self):
        return len(self.data)


def collate_fn(batch):
    """
    Same Sequence Length on Same Batch
    """
    max_len_data=0
    max_len_label=0
    for data, label in batch:
        if len(data)>max_len_data: max_len_data=len(data)
        if len(label)>max_len_label: max_len_label=len(label)

    datas=[]
    attn_masks=[]
    labels=[]
    for data, label in batch:
        data.extend([tokenizer.pad_token_id]*(max_len_data-len(data)))
        datas.append(data)

        attn_mask=[int(e!=tokenizer.pad_token_id) for e in data]
        attn_masks.append(attn_mask)

        label.extend([-100]*(max_len_label-len(label)))
        labels.append(label)

    return torch.tensor(datas), torch.tensor(attn_masks), torch.tensor(labels)

In [None]:
# Pre-Trained T5 Tokenizer
tokenizer=T5Tokenizer.from_pretrained('t5-base')
# Add Special Tokens: Table Tags
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<page_title>',
        '</page_title>',
        '<section_title>',
        '</section_title>',
        '<table>',
        '</table>',
        '<cell>',
        '</cell>',
        '<col_header>',
        '</col_header>',
        '<row_header>',
        '</row_header>'
    ]
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


12

In [None]:
# Train Config
device=torch.device('cuda:0')
lr=1e-4
batch_size=8 # 3 for 't5-large' and make 'accumulation_steps' larger
accumulation_steps=3
epochs=1

In [None]:
# Pre-Trained T5 Model
model=T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
# Resize PLM's Embedding Layer
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32112, 768)

In [None]:
# dataset_train=ToTToDataset(path_data='/content/totto_data/totto_train_data.jsonl', tokenizer=tokenizer)
dataset_train=ToTToDataset(path_data='/content/drive/MyDrive/ToTTo_T5-base/totto_train_data.jsonl', tokenizer=tokenizer)
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors


120761 datas
120761 labels


In [None]:
# Optim, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=int(epochs*len(dataset_train)/(accumulation_steps*batch_size))
)



In [None]:
# 저장된 가중치 파일 로드
model = T5ForConditionalGeneration.from_pretrained('t5-base')

model_path = '/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr0.0001_batch24_epoch2.pth'
model.load_state_dict(torch.load(model_path))

model = model.to(device)

In [None]:
from tqdm import tqdm

step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)

    loss_train = 0
    optimizer.zero_grad()

    tqdm_dataloader_train = tqdm(dataloader_train, desc=f'Epoch {epoch + 1}')

    for step, (data, attn_mask, label) in enumerate(tqdm_dataloader_train):
        data = data.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids=data, attention_mask=attn_mask, labels=label)

        loss = outputs[0] / accumulation_steps
        loss.backward()

        loss_train += loss.item()

        if (step + 1) % accumulation_steps == 0:
            step_global += 1

            # Console
            if step_global % 1000 == 0:
                print(f'\n Epoch {epoch + 1}  Step {step_global} Train loss {loss_train:.4f}')
            # Set Loss to 0
            loss_train = 0

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model.state_dict(), f'/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr{lr}_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}.pth')

Epoch 2:  20%|█▉        | 3000/15096 [16:34<1:08:08,  2.96it/s]

	 Epoch 2  	 Step 1000 	 Train loss 0.9454


Epoch 2:  40%|███▉      | 6000/15096 [33:27<37:22,  4.06it/s]

	 Epoch 2  	 Step 2000 	 Train loss 0.9558


Epoch 2:  60%|█████▉    | 9000/15096 [50:33<31:08,  3.26it/s]

	 Epoch 2  	 Step 3000 	 Train loss 0.9974


Epoch 2:  79%|███████▉  | 12000/15096 [1:07:42<18:00,  2.86it/s]

	 Epoch 2  	 Step 4000 	 Train loss 1.0023


Epoch 2:  99%|█████████▉| 15001/15096 [1:24:23<00:38,  2.47it/s]

	 Epoch 2  	 Step 5000 	 Train loss 0.8843


Epoch 2: 100%|██████████| 15096/15096 [1:24:54<00:00,  2.96it/s]


#### **Epoch 3**

In [None]:
# EPOCH 3
# Google Drive Mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 런타임 리셋 후 필요한 코드는 Epoch 2에서 실행
# After resetting the runtime, the required code runs on Epoch 2

In [None]:
# 저장된 가중치 파일 로드
model = T5ForConditionalGeneration.from_pretrained('t5-base')

model_path = '/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr0.0001_batch24_epoch3.pth'
model.load_state_dict(torch.load(model_path))

model = model.to(device)

In [None]:
from tqdm import tqdm

step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)

    loss_train = 0
    optimizer.zero_grad()

    tqdm_dataloader_train = tqdm(dataloader_train, desc=f'Epoch {epoch + 1}')

    for step, (data, attn_mask, label) in enumerate(tqdm_dataloader_train):
        data = data.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids=data, attention_mask=attn_mask, labels=label)

        loss = outputs[0] / accumulation_steps
        loss.backward()

        loss_train += loss.item()

        if (step + 1) % accumulation_steps == 0:
            step_global += 1

            # Console
            if step_global % 1000 == 0:
                print(f'\n Epoch {epoch + 1}  Step {step_global} Train loss {loss_train:.4f}')
            # Set Loss to 0
            loss_train = 0

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model.state_dict(), f'/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr{lr}_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}.pth')

Epoch 3:  20%|█▉        | 3001/15096 [18:06<56:20,  3.58it/s]  

	 Epoch 3  	 Step 1000 	 Train loss 0.9134


Epoch 3:  40%|███▉      | 6000/15096 [35:55<1:35:31,  1.59it/s]

	 Epoch 3  	 Step 2000 	 Train loss 0.9819


Epoch 3:  60%|█████▉    | 9001/15096 [54:07<31:01,  3.27it/s]

	 Epoch 3  	 Step 3000 	 Train loss 0.9991


Epoch 3:  79%|███████▉  | 12000/15096 [1:12:04<14:45,  3.50it/s]

	 Epoch 3  	 Step 4000 	 Train loss 1.1929


Epoch 3:  99%|█████████▉| 15000/15096 [1:30:09<00:28,  3.35it/s]

	 Epoch 3  	 Step 5000 	 Train loss 1.1185


Epoch 3: 100%|██████████| 15096/15096 [1:30:48<00:00,  2.77it/s]


#### **Epoch 4**

In [None]:
# EPOCH 4
# Google Drive Mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 런타임 리셋 후 필요한 코드는 Epoch 2에서 실행
# After resetting the runtime, the required code runs on Epoch 2

In [None]:
# 저장된 가중치 파일 로드
model = T5ForConditionalGeneration.from_pretrained('t5-base')

model_path = '/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr0.0001_batch24_epoch4.pth'
model.load_state_dict(torch.load(model_path))

model = model.to(device)

In [None]:
from tqdm import tqdm

step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)

    loss_train = 0
    optimizer.zero_grad()

    tqdm_dataloader_train = tqdm(dataloader_train, desc=f'Epoch {epoch + 1}')

    for step, (data, attn_mask, label) in enumerate(tqdm_dataloader_train):
        data = data.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids=data, attention_mask=attn_mask, labels=label)

        loss = outputs[0] / accumulation_steps
        loss.backward()

        loss_train += loss.item()

        if (step + 1) % accumulation_steps == 0:
            step_global += 1

            # Console
            if step_global % 1000 == 0:
                print(f'\n Epoch {epoch + 1}  Step {step_global} Train loss {loss_train:.4f}')
            # Set Loss to 0
            loss_train = 0

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model.state_dict(), f'/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr{lr}_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}.pth')

Epoch 4:  20%|█▉        | 3000/15096 [17:45<1:01:23,  3.28it/s]

	 Epoch 4  	 Step 1000 	 Train loss 0.9015


Epoch 4:  40%|███▉      | 6000/15096 [35:44<48:09,  3.15it/s]

	 Epoch 4  	 Step 2000 	 Train loss 0.9649


Epoch 4:  60%|█████▉    | 9000/15096 [53:47<40:44,  2.49it/s]

	 Epoch 4  	 Step 3000 	 Train loss 1.1325


Epoch 4:  79%|███████▉  | 12000/15096 [1:11:46<17:57,  2.87it/s]

	 Epoch 4  	 Step 4000 	 Train loss 0.9925


Epoch 4:  99%|█████████▉| 15000/15096 [1:29:36<00:43,  2.21it/s]

	 Epoch 4  	 Step 5000 	 Train loss 1.0296


Epoch 4: 100%|██████████| 15096/15096 [1:30:09<00:00,  2.79it/s]


#### **Epoch 5**

In [None]:
# EPOCH 5
# Google Drive Mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 런타임 리셋 후 필요한 코드는 Epoch 2에서 실행
# After resetting the runtime, the required code runs on Epoch 2

In [None]:
# 저장된 가중치 파일 로드
model = T5ForConditionalGeneration.from_pretrained('t5-base')

model_path = '/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr0.0001_batch24_epoch5.pth'
model.load_state_dict(torch.load(model_path))

model = model.to(device)

In [None]:
from tqdm import tqdm

step_global=0

for epoch in range(epochs):
    # Train Phase
    model.train()
    model.to(device)

    loss_train = 0
    optimizer.zero_grad()

    tqdm_dataloader_train = tqdm(dataloader_train, desc=f'Epoch {epoch + 1}')

    for step, (data, attn_mask, label) in enumerate(tqdm_dataloader_train):
        data = data.to(device)
        attn_mask = attn_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids=data, attention_mask=attn_mask, labels=label)

        loss = outputs[0] / accumulation_steps
        loss.backward()

        loss_train += loss.item()

        if (step + 1) % accumulation_steps == 0:
            step_global += 1

            # Console
            if step_global % 1000 == 0:
                print(f'\n Epoch {epoch + 1}  Step {step_global} Train loss {loss_train:.4f}')
            # Set Loss to 0
            loss_train = 0

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    # Save Model
    model.to(torch.device('cpu'))
    torch.save(model.state_dict(), f'/content/drive/MyDrive/ToTTo_T5-base/model/T5-base_Fine-Tuning_lr{lr}_batch{int(accumulation_steps*batch_size)}_epoch{epoch+1}.pth')

Epoch 5:  20%|█▉        | 3000/15096 [17:25<1:49:47,  1.84it/s]

	 Epoch 5  	 Step 1000 	 Train loss 1.0678


Epoch 5:  40%|███▉      | 6000/15096 [34:45<43:03,  3.52it/s]

	 Epoch 5  	 Step 2000 	 Train loss 0.9333


Epoch 5:  60%|█████▉    | 9000/15096 [52:21<54:46,  1.85it/s]

	 Epoch 5  	 Step 3000 	 Train loss 0.9322


Epoch 5:  79%|███████▉  | 12000/15096 [1:09:38<18:17,  2.82it/s]

	 Epoch 5  	 Step 4000 	 Train loss 1.0046


Epoch 5:  99%|█████████▉| 15001/15096 [1:27:12<00:27,  3.47it/s]

	 Epoch 5  	 Step 5000 	 Train loss 1.0282


Epoch 5: 100%|██████████| 15096/15096 [1:27:45<00:00,  2.87it/s]
