<a href="https://colab.research.google.com/github/Tiabet/Complete_story/blob/master/KoBART_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install transformers
!pip install accelerate
!pip install datasets
!pip install evaluate
!pip install rouge
!pip install konlpy
!pip install pytorch-lightning

In [1]:
import json
import pandas as pd

import logging
import sys

from datasets import Dataset
from torch.utils.data import DataLoader
from tokenizers.processors import TemplateProcessing
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
import torch
import torch.nn.functional as F
import torchmetrics
from torch.optim.lr_scheduler import CyclicLR
from transformers import BartForConditionalGeneration, AutoTokenizer

DATA

In [2]:
def jsonlload(fname):
    with open(fname, "r", encoding="utf-8") as f:
        lines = f.read().strip().split("\n")
        j_list = [json.loads(line) for line in lines]

    return j_list


def jsonldump(j_list, fname):
    with open(fname, "w", encoding='utf-8') as f:
        for json_data in j_list:
            f.write(json.dumps(json_data, ensure_ascii=False)+'\n')

In [3]:
def StoryDataLoader(fname, tokenizer, batch_size, max_length, mode="train"):
    """
    Build Data Loader

    """

    dataset = Dataset.from_json(fname, mode)

    if not tokenizer.cls_token:
        tokenizer.cls_token = tokenizer.bos_token
    if not tokenizer.sep_token:
        tokenizer.sep_token = tokenizer.eos_token

    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single=f"{tokenizer.cls_token} $0 {tokenizer.sep_token}",
        pair=f"{tokenizer.cls_token} $A {tokenizer.sep_token} $B:1 {tokenizer.sep_token}:1",
        special_tokens=[(tokenizer.cls_token, tokenizer.cls_token_id), (tokenizer.sep_token, tokenizer.sep_token_id)],
    )

    def preprocess_function(examples):
        processed = {}
        tokenizer_input = tokenizer(
            examples["input"]["sentence1"],
            examples["input"]["sentence3"],
            padding="max_length",
            max_length=max_length,
            truncation=True
        )
        processed["input_ids"] = tokenizer_input["input_ids"],
        processed["attention_mask"] = tokenizer_input["attention_mask"]

        if mode == "train":
            tokenizer_output = tokenizer(
                examples["output"],
                padding="max_length",
                max_length=max_length,
                truncation=True
            )
            processed["decoder_input_ids"] = tokenizer_output["input_ids"]
            processed["decoder_attention_mask"] = tokenizer_output["attention_mask"]

        return processed

    dataset = dataset.map(
        preprocess_function,
        remove_columns=dataset.column_names
    ).with_format("torch")
    dataloader = DataLoader(dataset, shuffle=(True if mode=="train" else False), batch_size=batch_size)

    return dataloader


MODULE

In [80]:
import os
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch.optim.lr_scheduler import CyclicLR
from transformers import BartForConditionalGeneration
from typing import Any, Dict

class StoryModule(pl.LightningModule):
    """
    Attributes:
        model: BART model
        total_steps: total training steps for lr scheduling
        max_learning_rate: Max LR
        min_learning_rate: Min LR
        warmup_rate: warmup step rate
        model_save_dir: path to save model
    """

    def __init__(
        self,
        model: BartForConditionalGeneration,
        total_steps: int,
        max_learning_rate: float,
        min_learning_rate: float,
        warmup_rate: float,
        model_save_dir: str,
    ):
        super().__init__()

        self.model = model
        self.validation_step_outputs = []
        self.total_steps = total_steps
        self.max_learning_rate = max_learning_rate
        self.min_learning_rate = min_learning_rate
        self.warmup_rate = warmup_rate
        self.model_save_dir = model_save_dir

        self.save_hyperparameters(
            {
                **model.config.to_dict(),
                "total_steps": total_steps,
                "max_learning_rate": self.max_learning_rate,
                "min_learning_rate": self.min_learning_rate,
                "warmup_rate": self.warmup_rate,
            }
        )

    def training_step(self, batch, batch_idx):
        output = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            decoder_input_ids=batch["decoder_input_ids"],
            decoder_attention_mask=batch["decoder_attention_mask"],
            return_dict=True,
        )

        labels = batch["decoder_input_ids"][:, 1:].reshape(-1)
        logits = output["logits"][:, :-1].reshape([labels.shape[0], -1])

        loss = F.cross_entropy(logits, labels, ignore_index=self.model.config.pad_token_id)

        metrics = {"loss": loss}
        self.log_dict(metrics, prog_bar=True, logger=True, on_step=True)

        return metrics

    def validation_step(self, batch, batch_idx):
        output = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            decoder_input_ids=batch["decoder_input_ids"],
            decoder_attention_mask=batch["decoder_attention_mask"],
            return_dict=True,
        )

        labels = batch["decoder_input_ids"][:, 1:].reshape(-1)
        logits = output["logits"][:, :-1].reshape([labels.shape[0], -1])

        loss = F.cross_entropy(logits, labels, ignore_index=self.model.config.pad_token_id)

        metrics = {"val_loss": loss}
        self.validation_step_outputs.append(metrics)
        self.log_dict(metrics, prog_bar=True, logger=True, on_epoch=True)

        return metrics

    def test_step(self, *args, **kwargs):
        return self.validation_step(*args, **kwargs)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(params=self.model.parameters(), lr=self.max_learning_rate)
        scheduler = CyclicLR(
            optimizer,
            base_lr=self.min_learning_rate,
            max_lr=self.max_learning_rate,
            step_size_up=int(self.total_steps * self.warmup_rate),
            step_size_down=self.total_steps - int(self.total_steps * self.warmup_rate),
            mode='triangular',
            cycle_momentum=False
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "name": "Learning Rate"},
        }

    def on_validation_epoch_end(self):

#        outputs = torch.stack(self.validation_step_outputs)

        #if self.trainer.is_global_zero:
        #    val_losses = [output["val_loss"].mean() for output in outputs]
#
 #           val_loss_mean = sum(val_losses) / len(val_losses)
#
 #           self.model.save_pretrained(
  #              os.path.join(
   #                 self.model_save_dir,
    #                f"model-{self.current_epoch:02d}epoch-{self.global_step}steps-{val_loss_mean:.4f}loss"
     #           )
      #      )
       #     self.validation_step_outputs.clear()



SyntaxError: ignored

UTILS

In [75]:
def get_logger(name: str) -> logging.Logger:
    """Return logger for logging

    Args:
        name: logger name
    """
    logger = logging.getLogger(name)
    logger.propagate = False
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))
        logger.addHandler(handler)
    return logger


Train

In [76]:
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from transformers import BartForConditionalGeneration, AutoTokenizer


# Define your configuration parameters here
output_dir = "/content/drive/MyDrive"
model_path = "gogamza/kobart-base-v2"
tokenizer_path = "gogamza/kobart-base-v2"
batch_size = 4
valid_batch_size = 4
max_seq_len = 512
accumulate_grad_batches = 1
epochs = 10
max_learning_rate = 2e-4
min_learning_rate = 1e-5
warmup_rate = 0.1
gpus = 1  # Set this to 0 if you want to run on CPU
logging_interval = 100
evaluate_interval = 500
seed = 42

# Create the output directory
print(f'[+] Save output to "{output_dir}"')

# Set Random Seed
pl.seed_everything(seed)
print(f"[+] Set Random Seed to {seed}")

print(f"[+] GPU: {gpus}")

print(f'[+] Load Tokenizer"')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

print(f'[+] Load Dataset')
train_dataloader = StoryDataLoader("/content/drive/MyDrive/nikluge-sc-2023-test.jsonl", tokenizer, batch_size, max_seq_len)
valid_dataloader = StoryDataLoader("/content/drive/MyDrive/nikluge-sc-2023-dev.jsonl", tokenizer, valid_batch_size, max_seq_len)
total_steps = len(train_dataloader) * epochs


INFO:lightning_fabric.utilities.seed:Global seed set to 42


[+] Save output to "/content/drive/MyDrive"
[+] Set Random Seed to 42
[+] GPU: 1
[+] Load Tokenizer"


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Using cls_token, but it is not set yet.
Using sep_token, but it is not set yet.


[+] Load Dataset


In [77]:

if model_path:
    print(f'[+] Load Model from "{model_path}"')
    model = BartForConditionalGeneration.from_pretrained(model_path)

print(f"[+] Load Pytorch Lightning Module")
lightning_module = StoryModule(
    model,
    total_steps,
    max_learning_rate,
    min_learning_rate,
    warmup_rate,
    output_dir
)

print(f"[+] Start Training")
train_loggers = [TensorBoardLogger(output_dir, "", "logs")]

# If evaluate_interval passed float F, check the validation set 1/F times during a training epoch
if evaluate_interval == 1:
    evaluate_interval = 1.0
trainer = pl.Trainer(
    logger=train_loggers,
    max_epochs=epochs,
    log_every_n_steps=logging_interval,
    val_check_interval=evaluate_interval,
    accumulate_grad_batches=accumulate_grad_batches,
    callbacks=[LearningRateMonitor(logging_interval="step")],
    accelerator="gpu",
)


[+] Load Model from "gogamza/kobart-base-v2"


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


[+] Load Pytorch Lightning Module
[+] Start Training


In [78]:
trainer.fit(lightning_module, train_dataloader, valid_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 123 M 
-------------------------------------------------------
123 M     Trainable params
0         Non-trainable params
123 M     Total params
495.440   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

TypeError: ignored