# Generation-based MRC 문제를 풀어보기

### Requirements

In [1]:
!pip install tqdm==4.64.1 -q
!pip install datasets==2.7.0 -q
!pip install transformers==4.24.0 -q
!pip install sentencepiece==0.1.97 -q
!pip install nltk -q

In [2]:
import nltk
# Fetch the "punkt" tokenizer data. nltk.sent_tokenize is called later in this
# notebook (presumably it relies on this resource; confirm for the installed nltk version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kingstar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## 데이터 및 평가 지표 불러오기

In [3]:
from datasets import load_dataset

# Load the "squad_kor_v1" dataset; its "train" and "validation" splits are used below.
# NOTE(review): the variable `datasets` has the same name as the imported package,
# so a later `import datasets` in this notebook would rebind it.
datasets = load_dataset("squad_kor_v1")

Found cached dataset squad_kor_v1 (/home/kingstar/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from datasets import load_metric

# Load the 'squad' metric; compute_metrics below calls metric.compute(...) on it.
# NOTE(review): the recorded cell output shows a warning pointing at this call,
# possibly a deprecation notice for load_metric; confirm.
metric = load_metric('squad')

  metric = load_metric('squad')


## Pre-trained 모델 및 토크나이저 불러오기

In [5]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

In [6]:
# Identifier of the pre-trained checkpoint passed to every from_pretrained() call below.
model_name = "paust/pko-t5-small"

In [7]:
# Load the configuration, tokenizer and seq2seq model of the same checkpoint.
# cache_dir=None is still passed explicitly to each call.
config = AutoConfig.from_pretrained(model_name, cache_dir=None)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=None, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config, cache_dir=None)

Downloading (…)lve/main/config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/383M [00:00<?, ?B/s]

## 설정하기

In [8]:
# --- Sequence lengths and tokenizer padding ---
max_source_length = 384   # max token length of the "question + context" input
max_target_length = 128   # max length of the target / generated sequence at decoding time
padding = "max_length"    # padding strategy handed to the tokenizer

# --- Data subset sizes and preprocessing parallelism ---
max_train_samples = 5000
max_val_samples = 500
preprocessing_num_workers = 12

# --- Training / evaluation hyperparameters ---
num_train_epochs = 3
train_batch_size = 24
eval_batch_size = 3
num_beams = 3             # beam count used when generating answers

## 전처리하기

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples into seq2seq model inputs.

    Args:
        examples: A batch (dict of columns) with "question", "context",
            "answers" and "id" entries. Each element of "answers" must expose
            a non-empty "text" list; only its first entry is used as the
            generation target.

    Returns:
        The tokenizer encoding of the source prompts, extended with two keys:
        "labels" (token ids of the target answers) and "example_id" (the ids
        of the examples, in batch order).
    """
    # NOTE(review): a literal "</s>" is appended to both prompts and targets.
    # The tokenizer may already append its own EOS token, which could yield a
    # duplicated EOS. Left unchanged here because generarate_answer builds its
    # prompt the same way; confirm before changing either.
    inputs = [
        f'question: {q}  context: {c} </s>'
        for q, c in zip(examples['question'], examples['context'])
    ]
    targets = [f'{a["text"][0]} </s>' for a in examples['answers']]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize the targets through `text_target=` instead of the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)

    # NOTE(review): with the `padding` setting used here, label padding
    # positions keep the tokenizer's pad id and are not masked with -100, so they
    # presumably contribute to the loss. Masking them would also require
    # compute_metrics to map -100 back before decoding; verify before changing.
    model_inputs["labels"] = labels["input_ids"]
    # Keep the source example ids so predictions can be matched to references.
    model_inputs["example_id"] = list(examples["id"])
    return model_inputs

In [10]:
# Column names of the raw train split; passed as `remove_columns` to map() below.
column_names = datasets['train'].column_names

In [11]:
# Take the first `max_train_samples` training examples and run preprocess_function
# over them in batches, dropping the raw columns and bypassing the map cache.
train_dataset = (
    datasets["train"]
    .select(range(max_train_samples))
    .map(
        preprocess_function,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=False,
    )
)

                

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]



In [12]:
# Take the first `max_val_samples` validation examples and run preprocess_function
# over them in batches, dropping the raw columns and bypassing the map cache.
eval_dataset = (
    datasets["validation"]
    .select(range(max_val_samples))
    .map(
        preprocess_function,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=False,
    )
)


                

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]



#5:   0%|          | 0/1 [00:00<?, ?ba/s]



 



  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 



 



#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]



#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]



## Fine-tuning하기

In [13]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [14]:
# The collator is configured to pad labels with the tokenizer's own pad token id.
# NOTE(review): preprocess_function tokenizes labels with the `padding` setting
# defined earlier ("max_length"), so the collator likely has no label padding
# left to do; confirm.
label_pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=None
)

In [15]:
def postprocess_text(preds, labels):
    """Strip each text and re-join its sentences with newline separators.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` pair of lists holding the processed strings.
    """
    def _one_sentence_per_line(texts):
        # Strip surrounding whitespace, split into sentences, join with "\n".
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Score generated answers with the SQuAD metric loaded in `metric`.

    Predictions are matched to references through the "example_id" column of
    `eval_dataset` (written by preprocess_function) instead of by position in
    a freshly re-selected slice of the validation split.

    Args:
        eval_preds: A ``(predictions, label_ids)`` pair as passed by the
            trainer. ``predictions`` may itself be a tuple, in which case its
            first element is used. The label ids are not needed for the
            metric and are ignored.

    Returns:
        The result of ``metric.compute`` for the decoded predictions.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in `eval_dataset`, i.e. ids cannot be aligned reliably.
    """
    import numpy as np  # local import: numpy is only imported further down in the notebook

    preds, _ = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Defensive: -100 is not a valid token id, so map any such filler value
    # to the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Only predictions need post-processing; the decoded labels were never
    # used for the metric, so they are no longer decoded at all.
    decoded_preds, _ = postprocess_text(decoded_preds, [])

    example_ids = eval_dataset["example_id"]
    if len(example_ids) != len(decoded_preds):
        raise ValueError(
            f"Got {len(decoded_preds)} predictions for {len(example_ids)} evaluation examples; "
            "cannot align predictions with example ids."
        )

    # Look up the gold answers by id in the raw validation split.
    validation = datasets["validation"]
    answers_by_id = dict(zip(validation["id"], validation["answers"]))

    formatted_predictions = [
        {"id": example_id, "prediction_text": text}
        for example_id, text in zip(example_ids, decoded_preds)
    ]
    references = [
        {"id": example_id, "answers": answers_by_id[example_id]}
        for example_id in example_ids
    ]

    return metric.compute(predictions=formatted_predictions, references=references)

In [16]:
# Training arguments: train and evaluate, with predict_with_generate=True.
# Batch sizes and epoch count come from the settings cell.
args = Seq2SeqTrainingArguments(
    output_dir='outputs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
)

In [17]:
# Wire model, data, collator and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [18]:
# Run fine-tuning and keep the returned training summary for inspection.
train_result = trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id. If example_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 627
  Number of trainable parameters = 95628672
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtraintogpb[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
# Display the training summary (global step, training loss, runtime metrics).
train_result

TrainOutput(global_step=627, training_loss=0.2230048012315181, metrics={'train_runtime': 579.4062, 'train_samples_per_second': 25.889, 'train_steps_per_second': 1.082, 'total_flos': 2413842923520000.0, 'train_loss': 0.2230048012315181, 'epoch': 3.0})

## 평가하기

In [20]:
# Evaluate on the trainer's eval_dataset. max_length and num_beams are forwarded
# for answer generation; the returned metric names carry the "eval" prefix.
metrics = trainer.evaluate(
    max_length=max_target_length, num_beams=num_beams, metric_key_prefix="eval"
)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id. If example_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 3


In [21]:
# Display the evaluation metrics (loss, exact match, F1, runtime statistics).
metrics

{'eval_loss': 0.03034132905304432,
 'eval_exact_match': 51.0,
 'eval_f1': 58.29095238095239,
 'eval_runtime': 45.1989,
 'eval_samples_per_second': 11.062,
 'eval_steps_per_second': 3.695,
 'epoch': 3.0}

In [22]:
def generarate_answer(sample):
    """Generate an answer for a single QA example with the fine-tuned model.

    The prompt is built in the same "question: ...  context: ... </s>" format
    used by preprocess_function and is printed before generation.

    Args:
        sample: A mapping with "question" and "context" string entries.

    Returns:
        The decoded answer text, with its sentences joined by newlines.
    """
    inputs = f'question: {sample["question"]}  context: {sample["context"]} </s>'
    print(inputs)
    encoded = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors='pt')
    # Move the tensors to wherever the model lives instead of hard-coding
    # "cuda:0", so this also works on CPU-only machines or another GPU index.
    encoded = encoded.to(model.device)
    outputs = model.generate(**encoded, max_length=max_target_length, num_beams=num_beams)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return "\n".join(nltk.sent_tokenize(pred))

import numpy as np

# Fix the seed so the same five validation examples are drawn on every run.
np.random.seed(seed=7777)
sample_indices = np.random.randint(0, len(datasets["validation"]), 5)

for sample_index in sample_indices:
    answer = generarate_answer(datasets["validation"][int(sample_index)])
    print(answer)
    print("=" * 8)

question: 유아인이 배우로서 처음으로 부산국제영화제에 참석한 년도는?  context: 2006년 1월 스크린 데뷔작인 독립영화 《우리에게 내일은 없다》의 촬영을 시작했다. 이 영화를 연출한 노동석 감독은 오디션을 볼 당시 유아인에게 극 중 캐릭터에 대해 묻자 창 밖을 한참 바라보며 “슬프죠”라는 한 마디만을 던진 모습이 인상적이었다며 캐스팅의 이유를 밝혔다. 유아인은 이 영화에서 진짜 총을 구해 현실로부터 자신을 구해내려는 소년 ‘종대’ 역할을 맡았는데, 인터뷰에서 "종대처럼 사건에 휘말린 적도 없고 불우한 환경에서 자라지도 않았지만 제가 종대와 비슷한 시기에 느꼈던 불안이나 두려움 등이 연기를 하는 데 큰 도움이 됐습니다. 종대도 청춘이고 저도 청춘이니까요"라며 연기를 한 소회를 밝혔다. 2007년 5월 《우리에게 내일은 없다》 언론시사회에서는 작품에 대해 “배우라는 앞날에 대한 꿈을 꾸고 그림을 그렸다면 그 그림 속에 꼭 있어야 할 영화”라며 본인의 영화 데뷔작에 대한 애정을 드러낸다. 또한 배우로서 고유한 소년성을 갖게해 준 ‘첫 활시위’ 같은 작품이라고 설명한다. 2006년 10월 유아인은 이 영화를 통해 배우로서 처음으로 부산국제영화제 개막식과 GV에 참석한다. </s>
2007년
question: 김희선이 4년만에 브라운관에 컴백한 인기 드라마 <야마토 나데시코>를 원작으로 한 로맨스 드라마는?  context: 김희선은 2000년대에 접어들면서 스크린으로 활동 무대를 옮겨 드라마 출연을 한동안 중단하였다. 영화 《와니와 준하》(2001), 《화성으로 간 사나이》(2003)에 출연했지만 번번히 이렇다 할 흥행을 거두지 못한 채 2003년 일본의 인기 드라마 《야마토 나데시코》를 원작으로 한 로맨스 드라마 《요조숙녀》로 4년여만에 브라운관에 컴백하였다. 하지만 이 작품은 진부한 설정과 스토리로 기대 이상의 주목은 받지 못했다. 이듬해, 2004년에는 한류를 겨냥한 멜로 드라마 《슬픈 연가》에서 출연하였지만 남자주인공 중 한 명인 송승헌이 병역

### **콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다. 모델 라이선스 : MIT License

