In [None]:
# Install the notebook's third-party dependencies into the running kernel.
# lightning and transformers are pinned to exact versions; loguru is not,
# so the installed loguru version can differ between runs.
%pip install lightning==2.0.8
%pip install loguru
%pip install transformers==4.32.1

In [None]:
import argparse
import numpy as np
import pandas as pd
import lightning as L

from torch.utils.data import Dataset, DataLoader


class KoBARTSummaryDataset(Dataset):
    """Map-style dataset yielding fixed-length summarization examples.

    Rows come from a tab-separated file that must provide a ``news`` column
    (source document) and a ``summary`` column (target text).

    Args:
        file: Path or buffer handed to ``pandas.read_csv`` (tab separator).
        tokenizer: Object exposing ``encode(text)``, ``pad_token_id`` and
            ``eos_token_id``.
        max_len: Length every returned sequence is padded or truncated to.
        ignore_index: Filler written into label positions past the target
            (-100 is the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``).
    """

    def __init__(self, file, tokenizer, max_len, ignore_index=-100):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.docs = pd.read_csv(file, sep='\t')
        self.len = self.docs.shape[0]

        self.pad_index = self.tokenizer.pad_token_id
        self.ignore_index = ignore_index

    def _fit_to_max_len(self, inputs, fill_value):
        """Return ``inputs`` as an int64 array of exactly ``max_len`` items.

        Longer sequences are cut from the right; shorter ones are extended
        on the right with ``fill_value``.
        """
        # int64 is spelled out because ``np.int_`` is only 32 bits wide on
        # some platforms, while the model expects 64-bit token ids.
        tokens = np.asarray(inputs[:self.max_len], dtype=np.int64)
        missing = self.max_len - len(tokens)
        if missing > 0:
            filler = np.full(missing, fill_value, dtype=np.int64)
            tokens = np.concatenate([tokens, filler])
        return tokens

    def add_padding_data(self, inputs):
        """Pad with the tokenizer's pad id (or truncate) to ``max_len``."""
        return self._fit_to_max_len(inputs, self.pad_index)

    def add_ignored_data(self, inputs):
        """Pad with ``ignore_index`` (or truncate) to ``max_len``."""
        return self._fit_to_max_len(inputs, self.ignore_index)

    def __getitem__(self, idx):
        """Build the encoder input, decoder input and labels for row ``idx``.

        Returns:
            dict with ``input_ids``, ``decoder_input_ids`` and ``labels``,
            each a 1-D int64 numpy array of length ``max_len``.
        """
        instance = self.docs.iloc[idx]
        input_ids = self.add_padding_data(
            self.tokenizer.encode(instance['news']))

        eos = self.tokenizer.eos_token_id
        # Labels are the summary tokens followed by EOS ...
        label_ids = list(self.tokenizer.encode(instance['summary'])) + [eos]
        # ... and the decoder input is the same sequence shifted right by
        # one position, with EOS used as the start token.
        dec_input_ids = [eos] + label_ids[:-1]

        # NOTE(review): a summary of max_len tokens or more loses its
        # trailing EOS to truncation here - confirm this is acceptable.
        return {'input_ids': input_ids,
                'decoder_input_ids': self.add_padding_data(dec_input_ids),
                'labels': self.add_ignored_data(label_ids)
                }

    def __len__(self):
        """Number of rows read from the file."""
        return self.len


class KobartSummaryModule(L.LightningDataModule):
    """Data module that serves a train TSV and a test TSV to the trainer.

    The test file backs both the validation and the test dataloaders.

    Args:
        train_file: Path of the training TSV.
        test_file: Path of the TSV used for validation and testing.
        tok: Tokenizer forwarded to ``KoBARTSummaryDataset``.
        max_len: Sequence length forwarded to ``KoBARTSummaryDataset``.
        batch_size: Batch size of every dataloader.
        num_workers: Worker count of every dataloader.
    """

    def __init__(self, train_file,
                 test_file, tok,
                 max_len=512,
                 batch_size=8,
                 num_workers=4):
        super().__init__()
        self.train_file_path = train_file
        self.test_file_path = test_file
        self.tok = tok
        self.max_len = max_len
        self.batch_size = batch_size
        self.num_workers = num_workers

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with ``--num_workers``."""
        extended = argparse.ArgumentParser(add_help=False,
                                           parents=[parent_parser])
        extended.add_argument('--num_workers',
                              default=4,
                              type=int,
                              help='num of worker for dataloader')
        return extended

    def setup(self, stage):
        """Create the train and test datasets; ``stage`` is not consulted."""
        def load(path):
            return KoBARTSummaryDataset(path, self.tok, self.max_len)

        self.train = load(self.train_file_path)
        self.test = load(self.test_file_path)

    def _loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using this module's settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._loader(self.train, True)

    def val_dataloader(self):
        """Unshuffled loader over the test dataset, used for validation."""
        return self._loader(self.test, False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return self._loader(self.test, False)


In [None]:
import torch
import lightning as L

from collections import defaultdict
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


class KoBARTConditionalGeneration(L.LightningModule):
    """Lightning module that fine-tunes ``digit82/kobart-summarization``.

    Args:
        hparams: Hyperparameters stored through ``save_hyperparameters``;
            ``configure_optimizers`` reads ``lr`` from them.
        **kwargs: Accepted but not used.
    """

    def __init__(
            self,
            hparams,
            **kwargs):
        super().__init__()
        self.save_hyperparameters(hparams)
        self.model = BartForConditionalGeneration.from_pretrained('digit82/kobart-summarization')
        self.model.train()
        self.bos_token = '<s>'
        self.eos_token = '</s>'

        self.tokenizer = PreTrainedTokenizerFast.from_pretrained('digit82/kobart-summarization')
        self.pad_token_id = self.tokenizer.pad_token_id

        # Validation losses, grouped by dataloader index until epoch end.
        self.outputs = defaultdict(list)

    def configure_optimizers(self):
        """Return AdamW plus a per-step linear warmup/decay schedule.

        Warmup covers the first 10% of the trainer's estimated stepping
        batches; the schedule then decays over the remainder.
        """
        param_optimizer = list(self.model.named_parameters())
        # Biases and normalization weights are exempt from weight decay.
        # The 'LayerNorm.*' patterns are BERT-style names; Hugging Face BART
        # names its norms '*_layer_norm' and 'layernorm_embedding', so those
        # patterns are listed too - otherwise the norm weights get decayed.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight',
                    'layer_norm.weight', 'layernorm_embedding.weight']

        def is_exempt(name):
            return any(nd in name for nd in no_decay)

        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not is_exempt(n)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if is_exempt(n)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.lr, correct_bias=False)

        total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * 0.1),
            num_training_steps=total_steps,
        )

        # NOTE(review): no metric named 'loss' is logged by this module
        # ('train_loss' / 'val_loss' are); 'monitor' is presumably ignored
        # for this scheduler type - confirm before relying on it.
        lr_scheduler = {'scheduler': scheduler,
                        'monitor': 'loss', 'interval': 'step',
                        'frequency': 1}
        return [optimizer], [lr_scheduler]

    def forward(self, inputs):
        """Run the wrapped model on a batch dict and return its output.

        Args:
            inputs: Mapping with tensor entries ``input_ids``,
                ``decoder_input_ids`` and ``labels``.

        Returns:
            The model output requested with ``return_dict=True``; the
            training and validation steps read its ``loss``.
        """
        # Every position that is not the pad token is attended to.
        attention_mask = inputs['input_ids'].ne(self.pad_token_id).float()
        decoder_attention_mask = inputs['decoder_input_ids'].ne(self.pad_token_id).float()

        return self.model(input_ids=inputs['input_ids'],
                          attention_mask=attention_mask,
                          decoder_input_ids=inputs['decoder_input_ids'],
                          decoder_attention_mask=decoder_attention_mask,
                          # .long() converts the dtype in place on the
                          # current device; .type(torch.LongTensor) would
                          # cast to the CPU tensor type and drag the labels
                          # off the accelerator on every step.
                          labels=inputs['labels'].long(), return_dict=True)

    def training_step(self, batch, batch_idx):
        """Compute, log (``train_loss``) and return the batch loss."""
        loss = self(batch).loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Record the batch loss for aggregation at epoch end."""
        loss = self(batch).loss
        self.outputs[dataloader_idx].append({"loss": loss})

    def on_validation_epoch_end(self):
        """Log the mean of all recorded batch losses as ``val_loss``."""
        losses = [entry["loss"]
                  for recorded in self.outputs.values()
                  for entry in recorded]
        self.outputs.clear()
        if not losses:
            # torch.stack raises on an empty list; with no recorded batch
            # there is nothing to report.
            return
        self.log("val_loss", torch.stack(losses).mean(), prog_bar=True)


In [None]:
import argparse
import lightning as L

from lightning.pytorch.callbacks import ModelCheckpoint
from loguru import logger
from transformers import PreTrainedTokenizerFast

In [None]:
import torch
# Show whether PyTorch reports CUDA as available in this runtime.
print(torch.cuda.is_available())

In [None]:
# Root parser; later cells pass it to the add_model_specific_args helpers,
# which wrap it through argparse's `parents` mechanism.
parser = argparse.ArgumentParser(description='KoBART Summarization')


class ArgsBase:
    """Holder for the command-line options shared by a training run."""

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with training options.

        Options are registered in the order of the table below, so help
        output keeps a stable layout.
        """
        option_table = [
            ('--train_file', dict(type=str,
                                  default='/content/drive/MyDrive/data/train.tsv',
                                  help='train file')),
            ('--test_file', dict(type=str,
                                 default='/content/drive/MyDrive/data/test.tsv',
                                 help='test file')),
            ('--batch_size', dict(type=int, default=8, help='')),
            ('--checkpoint', dict(type=str,
                                  default='/content/drive/MyDrive/data/checkpoint',
                                  help='')),
            ('--max_len', dict(type=int, default=512, help='max seq len')),
            ('--max_epochs', dict(type=int, default=10, help='train epochs')),
            ('--lr', dict(type=float, default=3e-5,
                          help='The initial learning rate')),
            ('--accelerator', dict(type=str, default='gpu',
                                   choices=['gpu', 'cpu'],
                                   help='select accelerator')),
            ('--num_gpus', dict(type=int, default=1, help='number of gpus')),
            ('--gradient_clip_val', dict(type=float, default=1.0,
                                         help='gradient_clipping')),
        ]

        extended = argparse.ArgumentParser(add_help=False,
                                           parents=[parent_parser])
        for flag, spec in option_table:
            extended.add_argument(flag, **spec)
        return extended

In [None]:
# Build the CLI from a fresh root parser every time this cell runs. Feeding
# an already-extended `parser` back into add_model_specific_args would
# register each option twice and raise a conflicting-option error, so the
# cell would fail when re-executed.
parser = argparse.ArgumentParser(description='KoBART Summarization')
parser = ArgsBase.add_model_specific_args(parser)
parser = KobartSummaryModule.add_model_specific_args(parser)
tokenizer = PreTrainedTokenizerFast.from_pretrained('digit82/kobart-summarization')
# An explicit empty argument list makes argparse fall back to the defaults
# instead of reading the kernel's own sys.argv.
args = parser.parse_args([])
logger.info(args)

dm = KobartSummaryModule(args.train_file,
                         args.test_file,
                         tokenizer,
                         batch_size=args.batch_size,
                         max_len=args.max_len,
                         num_workers=args.num_workers)
# dm.setup() is not called by hand: trainer.fit() invokes it, and a manual
# call beforehand made both TSV files get read twice.

model = KoBARTConditionalGeneration(args)
# Keep the three checkpoints with the lowest val_loss, plus the latest one.
checkpoint_callback = ModelCheckpoint(monitor='val_loss',
                                      dirpath=args.checkpoint,
                                      filename='model_chp/{epoch:02d}-{val_loss:.3f}',
                                      verbose=True,
                                      save_last=True,
                                      mode='min',
                                      save_top_k=3)

trainer = L.Trainer(max_epochs=args.max_epochs,
                    accelerator=args.accelerator,
                    devices=args.num_gpus,
                    gradient_clip_val=args.gradient_clip_val,
                    callbacks=[checkpoint_callback]
                    )

trainer.fit(model, dm)

In [None]:
import argparse
from transformers.models.bart import BartForConditionalGeneration

# Settings for exporting a trained checkpoint. `parser` and `args` are
# rebound here, replacing the training-time objects of the same names.
# --hparams is parsed but not read again in this cell.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--hparams",
    type=str,
    default='/content/drive/MyDrive/data/checkpoint/hparams.yaml',
)
parser.add_argument(
    "--model_binary",
    type=str,
    default='/content/drive/MyDrive/data/checkpoint/model_chp/epoch=01-val_loss=1.627.ckpt',
)
parser.add_argument(
    "--output_dir",
    type=str,
    default='/content/drive/MyDrive/data/kobart_summary',
)
# Empty argument list: use the defaults above, ignore the kernel's argv.
args = parser.parse_args([])

# Restore the Lightning module from the checkpoint, then write only its
# wrapped `model` attribute to the output directory.
inf = KoBARTConditionalGeneration.load_from_checkpoint(args.model_binary)
inf.model.save_pretrained(args.output_dir)