<a href="https://colab.research.google.com/github/SOL1archive/KoGrammar/blob/main/baseline_train_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Detect whether the notebook runs on Google Colab. Later cells read IN_COLAB
# to decide whether to install packages and mount Google Drive.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  # Only a missing module means "not on Colab"; other failures should surface.
  IN_COLAB = False

IN_COLAB

True

In [None]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install rouge_score
    !pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# On Colab, mount Google Drive at /content/drive/ so that the relative
# drive/MyDrive/... paths used by later cells can be resolved.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import warnings
warnings.filterwarnings('ignore')
import datetime
import os
import gc
from collections import namedtuple
from pprint import pprint
from tqdm import tqdm

import numpy as np
import pandas as pd

import tensorboard
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import torchmetrics

from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import BartConfig, T5Config
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

## Settings

In [None]:
# Run-wide switches and constants read by the training and validation cells.
MANUAL_TRAINING = True  # True: hand-written training loop; False: trainer.train()
MANUAL_VALIDATION = True  # True: hand-written validation loop; False: trainer.evaluate()
NUM_EPOCHS = 1  # passed to Seq2SeqTrainingArguments and used as the manual loop's epoch count
MID_CHECKPOINT_NUM = 5  # manual loop saves the model every (steps per epoch // this) batches
MID_PROCESS_PRINT_NUM = 100  # manual loop prints the loss every (steps per epoch // this) batches

## Loading Tokenizer & Model Checkpoint

In [None]:
# Identifiers of the candidate pretrained seq2seq checkpoints.
kobart_checkpoint = 'gogamza/kobart-base-v2'
kot5_checkpoint = 'psyche/KoT5'
# Left empty and not referenced by any other cell visible in this notebook.
kobart_baseline_checkpoint = ''
# The checkpoint used by the config, tokenizer, model and cache-path cells.
checkpoint = kobart_checkpoint
print(f'Using Checkpoint: {checkpoint}')

Using Checkpoint: gogamza/kobart-base-v2


In [None]:
# Load the model configuration matching the selected checkpoint.
# Each branch loads from `checkpoint` itself, and an unrecognised checkpoint
# fails loudly instead of silently receiving the KoT5 configuration.
if checkpoint == kobart_checkpoint:
    config = BartConfig.from_pretrained(checkpoint)
elif checkpoint == kot5_checkpoint:
    config = T5Config.from_pretrained(checkpoint)
else:
    raise ValueError(f'No configuration class is registered for checkpoint: {checkpoint!r}')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Instantiate the tokenizer and the seq2seq model from the selected checkpoint;
# the model is built with the configuration loaded in the previous cell.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    max_length=512,
    truncation=False,
    padding='max_length',
)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Sanity check: the tokenizer's vocabulary size must equal the vocab_size of
# the model config; abort before training when the two disagree.
if len(tokenizer) != model.config.vocab_size:
    raise RuntimeError(f'Tokenizer vocab size and model vocab size do not match(Tokenizer:{len(tokenizer)} Model: {model.config.vocab_size}). Which would lead to further error in training.')

## Loading Datasets

In [None]:
# Read the JSON corpus into pandas and wrap it as a Hugging Face Dataset.
# The path is relative, so it resolves against the current working directory.
dataset = Dataset.from_pandas(pd.read_json('drive/MyDrive/projects/KoGrammar/data/simplified_data.json'))

# Number of examples in the corpus.
len(dataset)

1129363

In [None]:
# Split the corpus into train / valid / test plus two half-size training
# subsets. Every split and shuffle is seeded so that re-running the notebook
# reproduces the same partitions; a tokenized copy cached from these splits
# therefore stays valid across sessions.
# NOTE(review): a cache that was built from an earlier, unseeded split does
# not correspond to these partitions and should be regenerated.
SPLIT_SEED = 42

# 90% train, 10% held out; the held-out part is halved into valid and test.
train_testvalid = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Halve the training split; the first half is the baseline training set.
train_data = train_testvalid['train'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
dataset_dict = DatasetDict({
    'train': train_testvalid['train'],
    'train_baseline': train_data['train'],
    # Half of each training half, concatenated and then shuffled.
    'train_distil': concatenate_datasets([
        train_data['train'].train_test_split(test_size=0.5, seed=SPLIT_SEED)['train'],
        train_data['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)['train']
    ]).shuffle(seed=SPLIT_SEED),
    'valid': test_valid['train'],
    'test': test_valid['test'],
    })

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['form', 'corrected_form', '__index_level_0__'],
        num_rows: 1016426
    })
    train_baseline: Dataset({
        features: ['form', 'corrected_form', '__index_level_0__'],
        num_rows: 508213
    })
    train_distil: Dataset({
        features: ['form', 'corrected_form', '__index_level_0__'],
        num_rows: 508212
    })
    valid: Dataset({
        features: ['form', 'corrected_form', '__index_level_0__'],
        num_rows: 56468
    })
    test: Dataset({
        features: ['form', 'corrected_form', '__index_level_0__'],
        num_rows: 56469
    })
})

In [None]:
def tokenize(row):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        row: Mapping with a 'form' entry (the text fed to the model) and a
            'corrected_form' entry (the text used as the target). The function
            is applied through ``map(..., batched=True)``, so each entry holds
            a batch of strings.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the encoded
        'form' text and 'labels' holding the token ids of the encoded
        'corrected_form' text. Both sides are truncated and padded to
        512 tokens.

    Notes:
        Targets are encoded with the ``text_target=`` argument rather than the
        deprecated ``tokenizer.as_target_tokenizer()`` context manager.
        NOTE(review): labels are padded by the tokenizer, i.e. with the pad
        token id rather than -100 - confirm whether padded positions are
        meant to contribute to the training loss.
    """
    source_encoding = tokenizer(row['form'], max_length=512, truncation=True, padding='max_length')
    target_encoding = tokenizer(
        text_target=row['corrected_form'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )

    return {
        'input_ids': source_encoding['input_ids'],
        'attention_mask': source_encoding['attention_mask'],
        'labels': target_encoding['input_ids'],
    }

In [None]:
# List the split names available in dataset_dict.
dataset_dict.keys()

dict_keys(['train', 'train_baseline', 'train_distil', 'valid', 'test'])

In [None]:
# Show the working directory against which the relative drive/... paths resolve.
os.getcwd()

'/content'

In [None]:
# Build a per-checkpoint cache path for the tokenized dataset; '/' in the
# checkpoint name is replaced with '-' so it yields a single directory name.
replaced_checkpoint = checkpoint.replace('/', '-')
tokenized_dataset_path = f'drive/MyDrive/projects/KoGrammar/data/{replaced_checkpoint}_tokenized_dataset'

In [None]:
# Check whether a tokenized dataset has already been cached at that path.
os.path.exists(tokenized_dataset_path)

True

In [None]:
# Reuse the tokenized dataset cached on disk when it exists; otherwise tokenize
# every split, drop the raw text columns and write the result to the cache.
if os.path.exists(tokenized_dataset_path):
    tokenized_dataset = load_from_disk(tokenized_dataset_path)
else:
    tokenized_dataset = dataset_dict.map(
        tokenize, batched=True, batch_size=128, num_proc=10
    ).remove_columns(['form', 'corrected_form'])

    tokenized_dataset.save_to_disk(tokenized_dataset_path)

In [None]:
# Display the splits and columns of the tokenized dataset.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1016426
    })
    train_baseline: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 508213
    })
    train_distil: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 508212
    })
    valid: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 56468
    })
    test: Dataset({
        features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 56469
    })
})

## Training

In [None]:
# Collator that returns PyTorch tensors; it is handed to the Seq2SeqTrainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='pt')

In [None]:
# Arguments for the Seq2SeqTrainer that is built in the next cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,  # same value the manual training loop passes to AdamW
    per_device_train_batch_size=64,  # note: the manual loop's DataLoader uses batch_size=10
    per_device_eval_batch_size=64,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    report_to="tensorboard",
    push_to_hub=False,
)

In [None]:
# Build the Seq2SeqTrainer. It is given the tokenized splits: the raw
# `dataset_dict` splits hold only text columns ('form', 'corrected_form') and
# none of the input_ids / attention_mask / labels features the model consumes.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    data_collator=data_collator,
)

In [None]:
# Run the garbage collector and empty PyTorch's CUDA cache before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tune the model, either through the Hugging Face trainer or through the
# hand-written loop below (selected by MANUAL_TRAINING).
if not MANUAL_TRAINING:
    trainer.train()
else:
    total_loss_lt = []  # loss of every optimisation step, across all epochs
    batch_loss_lt = []  # losses since the last progress print

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Keep the dataset on the CPU: the DataLoader uses worker processes, which
    # should not create CUDA tensors. Batches are moved to `device` in the loop.
    trainset = tokenized_dataset['train_baseline'].with_format("torch")
    dataloader = DataLoader(trainset,
                            batch_size=10,
                            shuffle=False,
                            num_workers=4,
                            )
    model.to(device)

    model.train()
    for epoch in range(NUM_EPOCHS):
        total_steps = len(dataloader)
        # max(1, ...) keeps the modulo checks below valid when an epoch has
        # fewer steps than the requested number of checkpoints / prints.
        save_divisor = max(1, total_steps // MID_CHECKPOINT_NUM)
        print_divisor = max(1, total_steps // MID_PROCESS_PRINT_NUM)
        epoch_loss_lt = []  # losses of the current epoch only
        batch_loss_lt = []
        for i, batch in enumerate(tqdm(dataloader)):
            X = {
                    'input_ids': batch['input_ids'].to(device),
                    'attention_mask': batch['attention_mask'].to(device),
                }
            # NOTE(review): labels are padded by the tokenizer in `tokenize`,
            # so padded positions presumably count towards the loss - confirm
            # whether they should be masked with -100.
            y = batch['labels'].to(device)
            outputs = model(**X, labels=y)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Record the step loss in every accumulator, so the full history
            # survives the periodic reset of batch_loss_lt and is still
            # available when the loop is interrupted by hand.
            step_loss = loss.item()
            total_loss_lt.append(step_loss)
            epoch_loss_lt.append(step_loss)
            batch_loss_lt.append(step_loss)
            if i % print_divisor == print_divisor - 1:
                batch_loss_series = pd.Series(batch_loss_lt)
                print(f'batch {i} loss: {step_loss} mean: {batch_loss_series.mean()}')
                batch_loss_lt = []

            if i % save_divisor == save_divisor - 1:
                SAVE_STR = datetime.datetime.now().strftime('%y%m%d-%H:%M') + f'-batch{i}'
                # `create_model_card` has no `model` keyword, so none is passed
                # (the final saving cell calls it the same way).
                trainer.create_model_card(
                    language='Korean',
                    tags='Grammar',
                    finetuned_from=checkpoint
                )
                trainer.save_model(f"drive/MyDrive/projects/KoGrammar/models/{SAVE_STR}")

        # Summarise the whole epoch rather than only the batches seen since
        # the last progress print.
        if epoch_loss_lt:
            epoch_loss_series = pd.Series(epoch_loss_lt)
            print(f'epoch {epoch + 1} loss: {epoch_loss_lt[-1]} mean: {epoch_loss_series.mean()}')

 27%|██▋       | 13499/50822 [3:00:22<8:18:42,  1.25it/s]


KeyboardInterrupt: ignored

In [None]:
# Plot the losses collected in total_loss_lt by the manual training loop.
total_loss_series = pd.Series(total_loss_lt)
total_loss_series.plot.line()

## Validation

In [None]:
# Measure the loss on the validation split, either through the Hugging Face
# trainer or through the hand-written loop below (selected by MANUAL_VALIDATION).
if not MANUAL_VALIDATION:
    # The trainer needs the tokenized split; the raw split has only text columns.
    pprint(trainer.evaluate(tokenized_dataset['valid']))
else:
    loss_lt = []

    # Resolved here as well, so this cell does not depend on the manual
    # training branch having defined `device`.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()
    validset = tokenized_dataset['valid'].with_format("torch", device=device)
    dataloader = DataLoader(validset, batch_size=1, shuffle=True)

    try:
        with torch.no_grad():
            for batch in dataloader:
                X = {
                        'input_ids': batch['input_ids'],
                        'attention_mask': batch['attention_mask'],
                    }
                y = batch['labels']
                outputs = model(**X, labels=y)
                loss = outputs.loss
                loss_lt.append(loss.item())
    except KeyboardInterrupt:
        # Stopping by hand is allowed: the mean below then covers the batches
        # processed so far. Any other error propagates instead of being hidden.
        print(f'Validation interrupted after {len(loss_lt)} batches')

    loss_series = pd.Series(loss_lt)
    print(f'loss: {loss_series.mean()}')

In [None]:
# Qualitative check: draw one random validation example, generate a correction
# for it and print the input, the model output and the reference text.
validset = tokenized_dataset['valid'].with_format("torch", device=device)
test_sample = validset.shuffle().select(range(1))
test_sample_gt = test_sample['labels']
test_sample = test_sample.remove_columns('labels')[0]
# Add a leading batch dimension to the two tensors passed to generate().
test_sample_input = {
    feature: test_sample[feature].unsqueeze(0)
    for feature in ('input_ids', 'attention_mask')
}
output = model.generate(**test_sample_input)
# Decode the three id tensors back to text, dropping special tokens.
input_text, output_text, gt_text = (
    tokenizer.decode(token_ids.squeeze(0), skip_special_tokens=True)
    for token_ids in (test_sample_input['input_ids'], output, test_sample_gt)
)

print(input_text, output_text, gt_text, sep='\n\n')

5만원되면 출금가능

5만 원 되면 출금 가능.

5만 원 되면 출금 가능.


In [None]:
def eval(model, tokenizer, input_seq, target_seq, metric):
    """Generate an output for one input and score it against its reference.

    Note: the name shadows the built-in ``eval``; it is kept unchanged so that
    existing callers keep working.

    Args:
        model: Object exposing ``generate(**input_seq)``; the first generated
            sequence is the one that gets scored.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=True)``.
        input_seq: Mapping of keyword arguments forwarded to ``model.generate``.
        target_seq: Token ids of the reference sentence.
        metric: Object exposing ``compute(predictions=..., references=...)``,
            such as a metric returned by ``evaluate.load``.

    Returns:
        Whatever ``metric.compute`` returns for the single
        (prediction, reference) pair.
    """
    generated_ids = model.generate(**input_seq)
    generated_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    # `decode` (not `decoder`) turns the reference ids back into text.
    target_sentence = tokenizer.decode(target_seq, skip_special_tokens=True)
    # Pass keyword arguments that hold one entry per scored example.
    score = metric.compute(predictions=[generated_sentence], references=[target_sentence])

    return score

In [None]:
# Score generated corrections against the references with BLEU and ROUGE,
# one validation example at a time, then average the per-example scores.
bleu_scores = []
rouge_scores = []
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

for example in tqdm(validset.shuffle()):
    input_sentence = {
                        'input_ids': example['input_ids'].unsqueeze(0),
                        'attention_mask': example['attention_mask'].unsqueeze(0),
                     }

    target_seq = example['labels']
    generated_ids = model.generate(**input_sentence)
    generated_sentence = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    target_sentence = tokenizer.decode(target_seq, skip_special_tokens=True)

    # The metrics take one list entry per example; a bare string is not a
    # valid `predictions` / `references` value.
    if generated_sentence.strip() and target_sentence.strip():
        bleu_score = bleu.compute(predictions=[generated_sentence], references=[[target_sentence]])
    else:
        # An empty prediction or reference has no n-grams to match (and would
        # presumably make BLEU's length ratio divide by zero - TODO confirm),
        # so it is recorded as a zero score.
        bleu_score = {'bleu': 0.0}
    # ROUGE's default tokenizer keeps only lowercase ASCII letters and digits,
    # which would discard Korean text; split on whitespace instead.
    rouge_score = rouge.compute(
        predictions=[generated_sentence],
        references=[target_sentence],
        tokenizer=str.split,
    )

    bleu_scores.append(bleu_score)
    rouge_scores.append(rouge_score)

# Each score is a dict, so average column-wise over a DataFrame; list-valued
# fields are non-numeric and are left out of the mean.
average_bleu_score = pd.DataFrame(bleu_scores).mean(numeric_only=True)
average_rouge_score = pd.DataFrame(rouge_scores).mean(numeric_only=True)
pd.concat([average_bleu_score, average_rouge_score], keys=['bleu', 'rouge'])

  0%|          | 1/56468 [00:00<8:18:17,  1.89it/s]


ValueError: ignored

## Saving

In [None]:
# Deliberate stop to prevent unwanted saves: execution that reaches this cell
# raises here, so the saving cell below only runs when it is run by hand.
raise RuntimeError

In [None]:
# Timestamp (yymmdd-HH:MM) used as the name of the output directory.
NOW_STR = datetime.datetime.now().strftime('%y%m%d-%H:%M')
# Create a model card recording the language, a tag and the base checkpoint.
trainer.create_model_card(
    language='Korean',
    tags='Grammar',
    #model='KoGrammar',
    finetuned_from=checkpoint
)
# Save the trainer's model under the project's models folder on Drive.
trainer.save_model(f"drive/MyDrive/projects/KoGrammar/models/{NOW_STR}")