<a href="https://colab.research.google.com/github/Tiabet/BaekJoon/blob/main/KoBART_Baseline_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture

!pip install transformers
!pip install datasets

In [1]:
from tqdm import tqdm
import json
import pandas as pd
import torch
from transformers import BartForConditionalGeneration, AutoTokenizer
from datasets import Dataset
from torch.utils.data import DataLoader
from tokenizers.processors import TemplateProcessing

In [2]:
def StoryDataLoader(fname, tokenizer, batch_size, max_length, mode="train"):
    """
    Build a torch ``DataLoader`` over a JSON story dataset.

    Each record must provide ``input.sentence1`` and ``input.sentence3``; the
    two sentences are tokenized as a pair and padded/truncated to
    ``max_length``. When ``mode == "train"`` the ``output`` field is tokenized
    the same way and exposed as the decoder fields.

    Args:
        fname: Path of the JSON file handed to ``Dataset.from_json``.
        tokenizer: Tokenizer exposing a ``_tokenizer`` backend. It is modified
            in place: a missing ``cls_token``/``sep_token`` is aliased to
            ``bos_token``/``eos_token`` and the backend post-processor is
            replaced with a CLS/SEP template.
        batch_size: Number of examples per batch.
        max_length: Length every encoded sequence is padded/truncated to.
        mode: ``"train"`` adds the decoder fields and shuffles the data; any
            other value yields encoder fields only, without shuffling.

    Returns:
        A ``DataLoader`` whose batches hold ``input_ids`` and
        ``attention_mask`` (plus ``decoder_input_ids`` and
        ``decoder_attention_mask`` in train mode) in torch format.
    """

    # `mode` is forwarded as the second positional argument, exactly as before
    # (presumably the split name in `datasets` -- TODO confirm).
    dataset = Dataset.from_json(fname, mode)

    if not tokenizer.cls_token:
        tokenizer.cls_token = tokenizer.bos_token
    if not tokenizer.sep_token:
        tokenizer.sep_token = tokenizer.eos_token

    cls_token, sep_token = tokenizer.cls_token, tokenizer.sep_token
    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single=f"{cls_token} $0 {sep_token}",
        pair=f"{cls_token} $A {sep_token} $B:1 {sep_token}:1",
        special_tokens=[(cls_token, tokenizer.cls_token_id), (sep_token, tokenizer.sep_token_id)],
    )

    is_train = mode == "train"

    def preprocess_function(examples):
        """Tokenize one record into the fields consumed by the model."""
        tokenizer_input = tokenizer(
            examples["input"]["sentence1"],
            examples["input"]["sentence3"],
            padding="max_length",
            max_length=max_length,
            truncation=True
        )
        processed = {
            # No trailing comma here: a stray one used to wrap the ids in a
            # 1-tuple, giving `input_ids` an extra leading dimension that
            # `attention_mask` did not have.
            "input_ids": tokenizer_input["input_ids"],
            "attention_mask": tokenizer_input["attention_mask"],
        }

        if is_train:
            tokenizer_output = tokenizer(
                examples["output"],
                padding="max_length",
                max_length=max_length,
                truncation=True
            )
            processed["decoder_input_ids"] = tokenizer_output["input_ids"]
            processed["decoder_attention_mask"] = tokenizer_output["attention_mask"]

        return processed

    # Drop the raw columns so only the tokenized fields reach the DataLoader.
    dataset = dataset.map(
        preprocess_function,
        remove_columns=dataset.column_names
    ).with_format("torch")
    dataloader = DataLoader(dataset, shuffle=is_train, batch_size=batch_size)

    return dataloader


In [3]:
def jsonlload(fname):
    """Read a JSON-lines file and return its records as a list.

    Blank or whitespace-only lines are skipped, so an empty file yields ``[]``
    instead of raising ``json.JSONDecodeError``.

    Args:
        fname: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        List of the decoded JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fname, "r", encoding="utf-8") as f:
        # Stream the file line by line rather than read() + split(), so the
        # whole content is not duplicated in memory before parsing.
        j_list = [json.loads(line) for line in f if line.strip()]

    return j_list


def jsonldump(j_list, fname):
    """Write every item of ``j_list`` to ``fname`` as one JSON document per line.

    The target file is overwritten, encoded as UTF-8, and non-ASCII characters
    are written verbatim instead of being ``\\u``-escaped.
    """
    with open(fname, "w", encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in j_list
        )

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
# Folder that the cells below load the tokenizer, the model weights and the
# test set from, and into which the answer file is written.
# The path sits under /content/drive -- presumably Google Drive has to be
# mounted first; no mount call is visible in this notebook (TODO confirm).
# NOTE(review): the name shadows the built-in `dir()`; it is left unchanged
# because later cells reference it.
dir = '/content/drive/MyDrive/'

In [6]:
# Tokenizer and model weights are both read from the same directory.
tokenizer = AutoTokenizer.from_pretrained(dir)
model = BartForConditionalGeneration.from_pretrained(dir)

# mode="infer": encoder inputs only, and the loader does not shuffle.
dataloader = StoryDataLoader(
    f"{dir}nikluge-sc-2023-test.jsonl",
    tokenizer=tokenizer,
    batch_size=16,
    max_length=512,
    mode="infer",
)

model.to(device)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [7]:
# Number of batches to decode. 1 reproduces the previous smoke-test behaviour
# (only the first batch is generated); set to None to run the whole test set.
MAX_BATCHES = 1

model.eval()

total_summary_tokens = []

# Gradient tracking is disabled only for the duration of the loop, so it is
# not left switched off for anything executed after this cell.
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(dataloader)):
        if MAX_BATCHES is not None and batch_idx >= MAX_BATCHES:
            break

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        summary_tokens = model.generate(
            input_ids,
            attention_mask=attention_mask,
            decoder_start_token_id=tokenizer.bos_token_id,
            max_length=256,  # upper bound on the generated sequence length
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_beams=5,
            use_cache=True,
        )
        total_summary_tokens.extend(summary_tokens.cpu().tolist())

# NOTE(review): skip_special_tokens=False keeps special tokens in the decoded
# text that is written to the answer file -- confirm this is intended.
decoded = [tokenizer.decode(tokens, skip_special_tokens=False) for tokens in tqdm(total_summary_tokens)]

# `dir` may or may not end with "/"; normalise so the paths never contain "//".
base_dir = dir.rstrip("/")
test_path = base_dir + "/nikluge-sc-2023-test.jsonl"
answer_path = base_dir + "/nikluge-sc-2023-test-answer.jsonl"

j_list = jsonlload(test_path)
assert len(decoded) <= len(j_list), "more predictions than test records"

# Predictions are matched to records by position, which relies on the loader
# yielding examples in file order. Records beyond the decoded batches are
# written back unchanged.
for record, oup in zip(j_list, decoded):
    record["output"] = oup

jsonldump(j_list, answer_path)


  0%|          | 1/939 [00:09<2:23:02,  9.15s/it]
100%|██████████| 16/16 [00:00<00:00, 617.09it/s]
