In [1]:
import warnings
warnings.filterwarnings('ignore')
import datetime
import os

import numpy as np
import pandas as pd

import tensorboard
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import BartConfig, T5Config
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

2023-04-30 00:08:56.619425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-30 00:08:57.334514: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/wsl/lib:
2023-04-30 00:08:57.334606: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/wsl/lib:


## Loading Tokenizer & Model Checkpoint

In [5]:
kobart_checkpoint = 'gogamza/kobart-base-v2'
kot5_checkpoint = 'psyche/KoT5'
checkpoint = kot5_checkpoint

In [8]:
if checkpoint == kobart_checkpoint:
    config = BartConfig.from_pretrained(kobart_checkpoint)
    config['vocab'] = 30000
else:
    config = T5Config.from_pretrained(kot5_checkpoint)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, max_length=512, truncation=True, padding='max_length', vocab=config.vocab_size)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

## Loading Datasets

In [10]:
dataset = Dataset.from_pandas(pd.read_json('data/simplified_data.json'))

len(dataset)

1129363

In [11]:
train_testvalid = dataset.train_test_split(test_size=0.1)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
dataset_dict = DatasetDict({
    'train': train_testvalid['train'],
    'valid': test_valid['train'],
    'test': test_valid['test'],
    })

In [12]:
def tokenize(row):
    form_embeddings = tokenizer(row['form'])
    with tokenizer.as_target_tokenizer():
        correct_form_embeddings = tokenizer(row['corrected_form'])

    return {
        'input_ids': form_embeddings['input_ids'],
        'attention_mask': form_embeddings['attention_mask'],
        'labels': correct_form_embeddings['input_ids'],
    }

In [13]:
replaced_checkpoint = checkpoint.replace('/', '-')
tokenized_dataset_path = f'data/{replaced_checkpoint}_tokenized_dataset'

if not os.path.exists(tokenized_dataset_path):
    tokenized_dataset = dataset_dict.map(tokenize).remove_columns(['form', 'corrected_form'])
    tokenized_dataset.save_to_disk(tokenized_dataset_path)
else:
    tokenized_dataset = load_dataset(tokenized_dataset_path)

Map:   0%|          | 0/1016426 [00:00<?, ? examples/s]

Map:   0%|          | 0/56468 [00:00<?, ? examples/s]

Map:   0%|          | 0/56469 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1016426 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/56468 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/56469 [00:00<?, ? examples/s]

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
tokenized_dataset['train'][0]

{'__index_level_0__': 1124838,
 'input_ids': [19317, 1530, 4589, 13866, 12339, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1],
 'labels': [19317, 25907, 1530, 4589, 13866, 26087, 1]}

In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="tensorboard",
    push_to_hub=False,
)

In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['valid'],
    data_collator=data_collator,
)

In [18]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: form, corrected_form, __index_level_0__. If form, corrected_form, __index_level_0__ are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1016426
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 31764
  Number of trainable parameters = 222903552


IndexError: list index out of range

In [None]:
trainer.evaluate(dataset_dict['valid'])

In [None]:
# To prevent unwanted saves
raise RuntimeError

In [None]:
NOW_STR = datetime.datetime.now().strftime('%y%m%d-%H:%M')
trainer.create_model_card(
    language='Korean',
    tags='Grammar',
    model='KoGrammar',
    finetuned_from=checkpoint
)
trainer.save_model(f"./models/{NOW_STR}")