<a href="https://colab.research.google.com/github/SOL1archive/ClauseSummary/blob/main/main-model-train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# Colab-only package.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; any other failure
    # (e.g. KeyboardInterrupt) should surface instead of being swallowed.
    IN_COLAB = False

# Bare expression: shown as the cell output in the notebook.
'Process in Colab' if IN_COLAB else 'Process in Local'

In [None]:
# Install the third-party packages used by this notebook, but only when
# running on Colab (IN_COLAB is set by the detection cell above).
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install rouge_score
    !pip install torchmetrics
    !pip install rouge
    !pip install --upgrade accelerate

In [None]:
# On Colab, mount Google Drive at /content/drive/ so project files are reachable.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

In [None]:
# This cell should be left out of the version published on GitHub.
# NOTE(review): unlike the other Colab-specific cells, this `%cd` is not
# guarded by IN_COLAB, so it also runs on a local machine.
%cd drive/MyDrive/projects/ClauseSummary

In [None]:
import warnings
warnings.filterwarnings('ignore')
import datetime
import os
import gc
from pprint import pprint
from typing import Callable, Dict, List, Optional, Tuple, Union
from tqdm import tqdm

import numpy as np
import pandas as pd

import tensorboard
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR, CyclicLR
import torchmetrics

from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict, Dataset
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForSeq2Seq
from transformers import BartConfig, T5Config
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
class TokenizeMapWrapper:
    """Callable that tokenizes one column of a row.

    Calling an instance with ``row`` returns
    ``tokenizer(row[feature], **option)``.

    Args:
        tokenizer: Callable invoked on the selected column.
        feature: Key of the column to tokenize.
        option: Keyword arguments forwarded to ``tokenizer``. When ``None``,
            a default of ``max_length=512``, ``truncation=True`` and
            ``padding='max_length'`` is used.
    """

    def __init__(self, tokenizer, feature, option=None):
        self.tokenizer = tokenizer
        self.feature = feature
        # Build the default inside the constructor so every instance owns
        # its own dict.
        self.option = option if option is not None else dict(
            max_length=512,
            truncation=True,
            padding='max_length',
        )

    def __call__(self, row):
        text = row[self.feature]
        return self.tokenizer(text, **self.option)

    def __repr__(self):
        return '{}(tokenizer={})'.format(type(self).__name__, self.tokenizer)

class Seq2SeqTokenizeMapWrapper(TokenizeMapWrapper):
    """Tokenize a source column and a target column for seq2seq training.

    Calling an instance with ``row`` returns a dict with ``input_ids`` and
    ``attention_mask`` taken from the tokenized source column, and ``labels``
    taken from the ``input_ids`` of the tokenized target column.

    Args:
        tokenizer: Hugging Face tokenizer used for both columns.
        feature: Key of the source-text column.
        target: Key of the target-text column.
        option: Keyword arguments forwarded to the tokenizer for both columns
            (see ``TokenizeMapWrapper`` for the default).
    """

    def __init__(self, tokenizer, feature, target, option=None):
        super().__init__(tokenizer, feature, option)
        self.target = target

    def seq2seq_tokenize(self, row):
        """Return model inputs and labels for ``row``."""
        source_encoding = self.tokenizer(row[self.feature], **self.option)
        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding labels.
        target_encoding = self.tokenizer(text_target=row[self.target], **self.option)

        # NOTE(review): with the default padding='max_length', padded label
        # positions keep the pad token id rather than -100 -- confirm whether
        # they are meant to contribute to the training loss.
        return {
            'input_ids': source_encoding['input_ids'],
            'attention_mask': source_encoding['attention_mask'],
            'labels': target_encoding['input_ids'],
        }

    def __call__(self, row):
        return self.seq2seq_tokenize(row)

## Setting

- 학습 환경에 맞게 조정하기 (특히 **경로 설정**)

In [None]:
# Run-level switches and counters for this notebook.
# NOTE(review): MANUAL_TRAINING is only read by the commented-out training
# cell, and MANUAL_VALIDATION is not read anywhere in the visible cells.
MANUAL_TRAINING = True
MANUAL_VALIDATION = True
# Number of passes over the training set.
NUM_EPOCHS = 1
# Steps-per-epoch is divided by this to get the mid-epoch checkpoint interval.
MID_CHECKPOINT_NUM = 2
# Steps-per-epoch is divided by this to get the progress-print interval.
MID_PROCESS_PRINT_NUM = 50

In [None]:
# Candidate pretrained checkpoints; `checkpoint` selects the one used by the
# rest of the notebook.
t5_large_summary_checkpoint = 'lcw99/t5-large-korean-text-summary'
t5_base_summary_checkpoint = 'eenzeenee/t5-base-korean-summarization'
kobart_summary_checkpoint = 'gogamza/kobart-summarization'
checkpoint = t5_base_summary_checkpoint
print(f'Using Checkpoint: {checkpoint}')

In [None]:
# Raw JSON dataset, and the per-checkpoint cache directory for its tokenized
# form ("/" in the checkpoint name is replaced so it is a single path segment).
original_dataset_path = './data/dataset-term-summary.json'
tokenized_dataset_path = f'./data/{checkpoint.replace("/", "-")}-tokenized-dataset'

In [None]:
# Timestamp used to give every run its own model directory.
# No ':' between hour and minute: colons are not valid in Windows file names,
# so the previous 'HH:MM' form produced an unusable save path there.
SAVE_STR = datetime.datetime.now().strftime('%y%m%d-%H%M')
model_save_path = f"./model/{SAVE_STR}"

## Load Tokenizer & Model Checkpoint

In [None]:
# Pick the config class from the checkpoint name: BART checkpoints get a
# BartConfig, everything else is loaded as T5.
config_class = BartConfig if 'bart' in checkpoint.lower() else T5Config
config = config_class.from_pretrained(checkpoint)

In [None]:
# Load the tokenizer and the seq2seq model for the selected checkpoint.
# NOTE(review): max_length / truncation / padding are normally call-time
# tokenizer arguments; confirm they have any effect when passed to
# from_pretrained (the map wrappers pass their own options on every call).
tokenizer = AutoTokenizer.from_pretrained(checkpoint, 
                                          max_length=512, 
                                          truncation=False, 
                                          padding='max_length',
                                          #vocab=config.vocab_size
                                          )
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

In [None]:
# Abort early when the tokenizer and the model disagree on vocabulary size.
# NOTE(review): some checkpoints pad the embedding matrix beyond the tokenizer
# size; confirm strict equality is the intended check for every candidate.
if len(tokenizer) != model.config.vocab_size:
    raise RuntimeError(f'Tokenizer vocab size and model vocab size do not match(Tokenizer:{len(tokenizer)} Model: {model.config.vocab_size}). Which would lead to further error in training.')

## Load Dataset

In [None]:
# Reuse the cached tokenized splits when they exist; otherwise tokenize the
# raw JSON dataset, split it 80/20 into train/test and cache the result.
if os.path.exists(tokenized_dataset_path):
    tokenized_dataset_dict = load_from_disk(tokenized_dataset_path)
else:
    raw_frame = pd.read_json(original_dataset_path, encoding='utf-8')
    dataset = Dataset.from_pandas(raw_frame[['text', 'summary']])
    tokenizer_wrapper = Seq2SeqTokenizeMapWrapper(tokenizer, 'text', 'summary')

    tokenized_dataset = dataset.map(
        tokenizer_wrapper,
        batched=True,
        batch_size=128,
        num_proc=10,
    )
    # Only the token ids are needed downstream; drop the raw text columns.
    tokenized_dataset = tokenized_dataset.remove_columns(['text', 'summary'])

    tokenized_dataset_dict = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True)
    tokenized_dataset_dict.save_to_disk(tokenized_dataset_path)

## Training

In [None]:
# Batch collator for seq2seq examples, returning PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='pt')

In [None]:
# Hyper-parameters for the Hugging Face trainer; metrics go to TensorBoard and
# nothing is pushed to the Hub.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    report_to="tensorboard",
    push_to_hub=False,
)

In [None]:
# Trainer wrapping the model; also used further down to save checkpoints and
# model cards.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset_dict['train'],
    # training_args requests evaluation every epoch, which needs an evaluation
    # set; without one the trainer cannot run its evaluation step.
    eval_dataset=tokenized_dataset_dict['test'],
    data_collator=data_collator,
)

In [None]:
# Run the garbage collector and release PyTorch's cached GPU memory before
# training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# model.train()

# if not MANUAL_TRAINING:
#     trainer.train()
# else:
#     total_loss = []
#     epoch_loss = []
#     batch_loss = []
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     trainset = tokenized_dataset_dict['train'].with_format('torch', device=device)
#     dataloader = DataLoader(trainset, batch_size=1, shuffle=False) # TODO: Batch size 조절
    
#     # TODO: Write a code for **Hyperparameter Tuning**
#     optimizer = AdamW(model.parameters(), lr = training_args.learning_rate, weight_decay = training_args.weight_decay)
#     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=NUM_EPOCHS * len(dataloader))

#     for epoch in range(NUM_EPOCHS):
#         total_steps = len(dataloader)
#         save_divisor = total_steps // MID_CHECKPOINT_NUM
#         print_divisor = total_steps // MID_PROCESS_PRINT_NUM
#         for i, batch in enumerate(tqdm(dataloader)):
#             X = {
#                     'input_ids': batch['input_ids'],
#                     'attention_mask': batch['attention_mask'],
#                 }
#             y = batch['labels']
            
#             outputs = model(**X, labels=y)
#             loss = outputs.loss
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()
#             scheduler.step()

#             batch_loss.append(loss.item())
#             if i % print_divisor == print_divisor - 1:
#                 epoch_loss += batch_loss
#                 batch_loss_series = pd.Series(batch_loss)
#                 print(f'\tbatch {i}\tloss: {loss.item()}\tmean: {batch_loss_series.mean()}')
#                 batch_loss = []

#             if i % save_divisor == save_divisor - 1:
#                 trainer.create_model_card(
#                     language='Korean',
#                     tags='Grammar',
#                     finetuned_from=checkpoint
#                 )
#                 trainer.save_model(model_save_path + f'-epoch-{epoch + 1}' + '-batch-{i + 1}')

#         total_loss += epoch_loss
#         batch_loss_series = pd.Series(epoch_loss)
#         epoch_loss = []
#         print(f'epoch {epoch + 1} loss: {loss.item()} mean: {batch_loss_series.mean()}')

In [None]:
# total_loss_series = pd.Series(total_loss)
# total_loss_series.plot.line()

In [None]:
# Show which checkpoint the following experiments run on.
print(checkpoint)

In [None]:
## finding the best parameters
def mean(A):
    """Return the arithmetic mean of the values in ``A``.

    Args:
        A: Sized collection of numbers.

    Returns:
        ``sum(A) / len(A)``, or ``nan`` when ``A`` is empty, so that a run
        with no recorded values does not crash on a division by zero.
    """
    if len(A) == 0:
        return float('nan')
    # Use the builtin instead of a hand-written loop that shadowed `sum`.
    return sum(A) / len(A)

# Personally, it may pay off more to tune the learning rate or its schedule
# than the L2-norm (weight decay) coefficient.

# Grid search over learning rate x weight decay. Each configuration is trained
# with the manual loop below and one line per configuration is appended to
# ./results/experiments.csv.
learning_rates = [1e-5, 5e-5]
weight_decays = [0.03, 0.05, 0.07]

# The data pipeline does not depend on the configuration, so build it once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trainset = tokenized_dataset_dict['train'].with_format('torch', device=device)
dataloader = DataLoader(trainset, batch_size=1, shuffle=False) # TODO: tune the batch size
model.to(device)

# Snapshot the starting weights so every configuration starts from the same
# model instead of continuing from the previous configuration's weights.
initial_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

# Make sure the log location exists and the CSV starts with a header row.
experiments_path = './results/experiments.csv'
os.makedirs('./results', exist_ok=True)
if not os.path.exists(experiments_path):
    with open(experiments_path, 'w') as f:
        f.write("Checkpoint, optimizer, scheduler, learning_rate, weight_decay, top5_loss\n")

for learning_rate in learning_rates:
    for decay in weight_decays:
        gc.collect()
        torch.cuda.empty_cache()

        model.load_state_dict(initial_state)
        # from_pretrained leaves the model in eval mode; enable dropout etc.
        model.train()

        training_args = Seq2SeqTrainingArguments(
            output_dir="./results",
            evaluation_strategy = "epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=64,
            per_device_eval_batch_size=64,
            num_train_epochs=NUM_EPOCHS,
            weight_decay=decay,
            report_to="tensorboard",
            push_to_hub=False,
        )
        # Suffix that keeps the checkpoints of different configurations apart.
        run_tag = f'-lr={learning_rate}-decay={decay}'
        total_loss = []
        epoch_loss = []
        batch_loss = []

        # TODO: Write a code for **Hyperparameter Tuning**
        optimizer = AdamW(model.parameters(), lr = training_args.learning_rate, weight_decay = training_args.weight_decay)
        optimizer_name = "AdamW"
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=NUM_EPOCHS * len(dataloader)) # TODO: tune the scheduler
        scheduler_name = "linear_schedule"

        for epoch in range(NUM_EPOCHS):
            total_steps = len(dataloader)
            # Clamp to 1 so a loader with fewer steps than the divisor does
            # not cause a modulo-by-zero below.
            save_divisor = max(1, total_steps // MID_CHECKPOINT_NUM)
            print_divisor = max(1, total_steps // MID_PROCESS_PRINT_NUM)
            for i, batch in enumerate(tqdm(dataloader)):
                X = {
                        'input_ids': batch['input_ids'],
                        'attention_mask': batch['attention_mask'],
                    }
                y = batch['labels']

                outputs = model(**X, labels=y)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

                batch_loss.append(loss.item())
                if i % print_divisor == print_divisor - 1:
                    epoch_loss += batch_loss
                    batch_loss_series = pd.Series(batch_loss)
                    print(f'\tbatch {i}\tloss: {loss.item()}\tmean: {batch_loss_series.mean()}')
                    batch_loss = []

                if i % save_divisor == save_divisor - 1:
                    # NOTE(review): the 'Grammar' tag looks carried over from
                    # another project -- confirm it is wanted for this model.
                    trainer.create_model_card(
                        language='Korean',
                        tags='Grammar',
                        finetuned_from=checkpoint
                    )
                    # The batch part was previously a plain string, so the
                    # literal text '{i + 1}' ended up in the path.
                    trainer.save_model(model_save_path + run_tag + f'-epoch-{epoch + 1}-batch-{i + 1}')

            # Keep the losses recorded after the last progress print as well.
            epoch_loss += batch_loss
            batch_loss = []
            total_loss += epoch_loss
            epoch_loss = []

        # for recording: mean of the five smallest per-batch losses
        total_loss.sort()
        top5_loss = mean(total_loss[:5]) if total_loss else float('nan')
        text = "%s, %s, %s, %f, %f, %f\n"%(checkpoint, optimizer_name, scheduler_name, training_args.learning_rate, training_args.weight_decay, top5_loss)
        with open(experiments_path, 'a') as f:
            f.write(text)
        trainer.create_model_card(
            language='Korean',
            finetuned_from=checkpoint
        )
        # Use this configuration's decay (not the whole list) so every
        # configuration is saved to its own directory.
        trainer.save_model(model_save_path + run_tag)

In [None]:
# text = "Checkpoint, optimizer, scheduler, learning_rate, weight_decay, top5_loss\n"
# with open('./results/experiments.csv', 'w') as f:
#     f.write(text)

## Validation

In [None]:
def generate_seq(model, tokenizer, input):
    """Generate a sequence with ``model`` and decode it to text.

    Args:
        model: Object whose ``generate`` method accepts ``input`` as keyword
            arguments.
        tokenizer: Object whose ``decode`` method turns token ids into text.
        input: Mapping of keyword arguments for ``model.generate``.

    Returns:
        The decoded text with special tokens skipped.
    """
    # Drop dimension 0 of the generated ids before decoding.
    output_ids = model.generate(**input).squeeze(0)
    return tokenizer.decode(output_ids, skip_special_tokens=True)

def generate_input_target(model, tokenizer, input, label):
    """Decode the input and the label, and generate the model's output text.

    Args:
        model: Model handed to ``generate_seq``.
        tokenizer: Tokenizer used for decoding.
        input: Mapping with at least ``input_ids``; passed to ``generate_seq``.
        label: Target token ids.

    Returns:
        Dict with ``input_text``, ``generated_text`` and ``target_text``.
    """
    def decode(token_ids):
        # Both the input ids and the label have dimension 0 removed first.
        return tokenizer.decode(token_ids.squeeze(0), skip_special_tokens=True)

    source = decode(input['input_ids'])
    generated = generate_seq(model, tokenizer, input)
    target = decode(label)

    return {
        'input_text': source,
        'generated_text': generated,
        'target_text': target
    }

def generate_from_data(model, tokenizer, data):
    """Run ``generate_input_target`` on one dataset example.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for decoding.
        data: Mapping with ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        The dict produced by ``generate_input_target``.
    """
    # Only these two fields are forwarded as model input.
    model_input = {key: data[key] for key in ('input_ids', 'attention_mask')}
    return generate_input_target(model, tokenizer, model_input, data['labels'])

def eval(model, tokenizer, input_seq, label, metric: Callable, options: Optional[Dict] = None):
    """Score the model's output for one example with ``metric``.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for decoding.
        input_seq: Model input passed to ``generate_input_target``.
        label: Target token ids.
        metric: Called as ``metric(generated_text, target_text, **options)``.
        options: Extra keyword arguments for ``metric``; ``None`` means none.

    Returns:
        Whatever ``metric`` returns.
    """
    # `None` replaces the former shared mutable default (`dict()`).
    metric_kwargs = {} if options is None else options
    texts = generate_input_target(model, tokenizer, input_seq, label)

    return metric(texts['generated_text'], texts['target_text'], **metric_kwargs)

def eval_from_data(model, tokenizer, dataset, metric: Callable, options: Optional[Dict] = None):
    """Score every example of ``dataset`` with ``metric``.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for decoding.
        dataset: Iterable of mappings with ``input_ids``, ``attention_mask``
            and ``labels``.
        metric: Metric callable handed to ``eval``.
        options: Extra keyword arguments for ``metric``; ``None`` means none.

    Returns:
        ``pd.Series`` holding one score per example, in dataset order.
    """
    # `None` replaces the former shared mutable default (`dict()`).
    metric_kwargs = {} if options is None else options
    scores = [
        eval(
            model,
            tokenizer,
            {
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask'],
            },
            data['labels'],
            metric,
            metric_kwargs,
        )
        for data in dataset
    ]

    return pd.Series(scores)

def eval_bleu(model, tokenizer, tokenized_testset):
    """Compute a smoothed sentence-level BLEU score per test example.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for decoding.
        tokenized_testset: Iterable of examples accepted by
            ``generate_from_data``.

    Returns:
        ``pd.DataFrame`` with a single ``BLEU`` column. Examples for which
        scoring raises ``ValueError`` are skipped.
    """
    smoothing = SmoothingFunction().method1
    scores = []
    for example in tqdm(tokenized_testset):
        texts = generate_from_data(model, tokenizer, example)
        # NOTE(review): reference and hypothesis are passed as plain strings,
        # not token lists -- confirm this granularity is intended.
        try:
            score = sentence_bleu(
                [texts['target_text']],
                texts['generated_text'],
                smoothing_function=smoothing,
            )
        except ValueError:
            continue
        else:
            scores.append(score)

    return pd.DataFrame({'BLEU': scores})

def eval_rogue(model, tokenizer, tokenized_testset):
    """Compute ROUGE-2 precision, recall and F1 per test example.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer used for decoding.
        tokenized_testset: Iterable of examples accepted by
            ``generate_from_data``.

    Returns:
        ``pd.DataFrame`` with ``Precision``, ``Recall`` and ``F1`` columns.
        Examples for which scoring raises ``ValueError`` are skipped.
    """
    # Output column name -> key inside the 'rouge-2' result.
    columns = (('Precision', 'p'), ('Recall', 'r'), ('F1', 'f'))
    scorer = Rouge()
    collected = {column: [] for column, _ in columns}

    for example in tqdm(tokenized_testset):
        texts = generate_from_data(model, tokenizer, example)
        try:
            result = scorer.get_scores(texts['generated_text'], texts['target_text'])
        except ValueError:
            continue

        rouge_2 = result[0]['rouge-2']
        for column, key in columns:
            collected[column].append(rouge_2[key])

    return pd.DataFrame(collected)

In [None]:
# Evaluate BLEU on the held-out split and store the per-example scores.
# The split lives in `tokenized_dataset_dict`; `tokenized_dataset` is the
# unsplit dataset and is not even defined when the cache was loaded from disk.
model.eval()
bleu_testset = tokenized_dataset_dict['test'].with_format('torch', device=model.device)
# The generation helpers squeeze dimension 0, so feed them one-example batches
# of tensors.
bleu_loader = DataLoader(bleu_testset, batch_size=1, shuffle=False)
bleu_df = eval_bleu(model, tokenizer, bleu_loader)
# Take the part after the last '/' so names without a slash also work.
bleu_df.to_csv(f"./results/{checkpoint.rsplit('/', 1)[-1]}bleu.csv", index=False)
bleu_df.describe()

In [None]:
# Evaluate ROUGE-2 on the held-out split and store the per-example scores.
# The split lives in `tokenized_dataset_dict`; `tokenized_dataset` is the
# unsplit dataset and is not even defined when the cache was loaded from disk.
model.eval()
rouge_testset = tokenized_dataset_dict['test'].with_format('torch', device=model.device)
# The generation helpers squeeze dimension 0, so feed them one-example batches
# of tensors.
rouge_loader = DataLoader(rouge_testset, batch_size=1, shuffle=False)
rouge_df = eval_rogue(model, tokenizer, rouge_loader)
# Take the part after the last '/' so names without a slash also work.
rouge_df.to_csv(f"./results/{checkpoint.rsplit('/', 1)[-1]}rouge.csv", index=False)
rouge_df.describe()