# RE task 기초 베이스라인 1

- Task : KLUE-RE
- 담당자: [김보석](https://github.com/BOSOEK) 님
- 최종수정일: 21-09-15
- 본 자료는 가짜연구소 3기 KLUE 로 모델 평가하기 크루 활동으로 작성됨


In [None]:
!pip install datasets
!pip install sklearn
!pip install transformers



In [None]:
import torch
import torch.nn as nn
import sklearn.metrics

from tqdm import tqdm
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader

### 데이터 셋 가져오기

In [None]:
# Load the 're' configuration of the 'klue' dataset.
dataset = load_dataset('klue', 're')
# Bare expression: shows the loaded splits as the cell's output.
dataset

Reusing dataset klue (/root/.cache/huggingface/datasets/klue/re/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90)


DatasetDict({
    train: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
        num_rows: 32470
    })
    validation: Dataset({
        features: ['guid', 'sentence', 'subject_entity', 'object_entity', 'label', 'source'],
        num_rows: 7765
    })
})

### 데이터 살짝 맛보기
input은 sentence, output은 label

In [None]:
# Layout of a single training example
dataset['train'][0]

{'guid': 'klue-re-v1_train_00000',
 'label': 0,
 'object_entity': {'end_idx': 18,
  'start_idx': 13,
  'type': 'PER',
  'word': '조지 해리슨'},
 'sentence': '〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.',
 'source': 'wikipedia',
 'subject_entity': {'end_idx': 26,
  'start_idx': 24,
  'type': 'ORG',
  'word': '비틀즈'}}

# 토큰 추가
**sentence**에 관계를 이루는 두 단어(subject, object)를 표시하기 위해 special token을 추가합니다.

In [None]:
# Subject entity of the first training example.
dataset['train'][0]['subject_entity']

{'end_idx': 26, 'start_idx': 24, 'type': 'ORG', 'word': '비틀즈'}

In [None]:
def add_Token(dataset):
    """Wrap the object and subject entity of every example in marker tokens.

    The object span is surrounded by ``<o>``/``</o>`` and the subject span by
    ``<s>``/``</s>``. ``start_idx`` and ``end_idx`` are used as inclusive
    character offsets into ``sentence``.

    Args:
        dataset: Iterable of examples, each providing ``sentence``, ``label``
            and ``object_entity``/``subject_entity`` mappings that contain
            ``start_idx`` and ``end_idx``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists.
    """
    marked_sentences = []
    label_list = []

    for example in dataset:
        text = example['sentence']

        obj = example['object_entity']
        obj_span = (int(obj['start_idx']), int(obj['end_idx']), '<o>', '</o>')
        subj = example['subject_entity']
        subj_span = (int(subj['start_idx']), int(subj['end_idx']), '<s>', '</s>')

        # The entity that starts earlier is tagged first; on a tie the
        # subject is treated as the leading one.
        if obj_span[0] < subj_span[0]:
            leading, trailing = obj_span, subj_span
        else:
            leading, trailing = subj_span, obj_span

        lead_start, lead_end, lead_open, lead_close = leading
        trail_start, trail_end, trail_open, trail_close = trailing

        pieces = [
            text[:lead_start],
            lead_open, text[lead_start:lead_end + 1], lead_close,
            text[lead_end + 1:trail_start],
            trail_open, text[trail_start:trail_end + 1], trail_close,
            text[trail_end + 1:],
        ]

        # Store the marked-up sentence together with its label.
        marked_sentences.append(''.join(pieces))
        label_list.append(example['label'])

    return marked_sentences, label_list

In [None]:
# Keep only the marked-up sentences and the labels of the train and validation splits.
train_sentences, train_labels = add_Token(dataset['train'])
val_sentences, val_labels = add_Token(dataset['validation'])

In [None]:
# Check that the entity markers were inserted.
for sentence in train_sentences[:5]:
    print(sentence, '\n')

〈Something〉는 <o>조지 해리슨</o>이 쓰고 <s>비틀즈</s>가 1969년 앨범 《Abbey Road》에 담은 노래다. 

호남이 기반인 바른미래당·<o>대안신당</o>·<s>민주평화당</s>이 우여곡절 끝에 합당해 민생당(가칭)으로 재탄생한다. 

K리그2에서 성적 1위를 달리고 있는 <s>광주FC</s>는 지난 26일 <o>한국프로축구연맹</o>으로부터 관중 유치 성과와 마케팅 성과를 인정받아 ‘풀 스타디움상’과 ‘플러스 스타디움상’을 수상했다. 

균일가 생활용품점 (주)<s>아성다이소</s>(대표 <o>박정부</o>)는 코로나19 바이러스로 어려움을 겪고 있는 대구광역시에 행복박스를 전달했다고 10일 밝혔다. 

<o>1967</o>년 프로 야구 드래프트 1순위로 <s>요미우리 자이언츠</s>에게 입단하면서 등번호는 8번으로 배정되었다. 



# klue/bert-base 토크나이저 가져오기

In [None]:
# Tokenizer of the pretrained 'klue/bert-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

### token 추가하기
**sentence**에 추가한 subject, object 토큰들을 토크나이저에 special token으로 등록해야, 해당 토큰들이 여러 조각으로 쪼개지지 않고 하나의 토큰으로 처리된다.

In [None]:
# Register the four entity markers with the tokenizer as additional special tokens.
new_enrollment_tokens = {'additional_special_tokens': ['<o>', '</o>', '<s>', '</s>']}
enrollment_tokens = tokenizer.add_special_tokens(new_enrollment_tokens)

학습에 필요한 값 설정

In [None]:
batch_size = 8  # examples per DataLoader batch
num_labels = 30  # number of classes given to the classification model
learning_rate = 1e-5  # learning rate passed to the optimizer
weight_decay = 0.0  # weight-decay coefficient passed to the optimizer

### 학습 데이터셋과 데이터로더 만들기

In [None]:
class KlueReDataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized sentences and their labels.

    All sentences are tokenized once, up front, and padded/truncated to
    ``max_length``. Indexing yields a dict of tensors that holds every field
    the tokenizer produced plus a ``labels`` entry, so a ``DataLoader`` can
    collate it into batches keyed by those names.

    Args:
        tokenizer: Callable applied to the whole list of sentences with
            ``max_length``, ``padding`` and ``truncation`` keyword arguments;
            its result must support ``.items()`` and per-sample indexing.
        sentences: List of input sentences.
        labels: Integer class labels, parallel to ``sentences``.
        max_length: Length every encoded sentence is padded/truncated to.
    """

    def __init__(self, tokenizer, sentences, labels, max_length=128):
        self.encodings = tokenizer(sentences,
                                   max_length=max_length,
                                   padding='max_length',
                                   truncation=True)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields and label of sample ``idx`` as tensors."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


# Backward-compatible alias for the name this class was originally defined under.
makeDataset = KlueReDataset

# Tokenize the train and validation splits into datasets.
train_dataset, val_dataset = (
    KlueReDataset(tokenizer, split_sentences, split_labels)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
    )
)

# Wrap both datasets in shuffling loaders that share the same batch size.
train_loader, val_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=True)
    for split in (train_dataset, val_dataset)
)

# klue/bert-base 모델을 로드하기

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Same checkpoint the tokenizer was loaded from.
model_name = 'klue/bert-base'

# Pretrained encoder with a classification head of `num_labels` outputs, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

### Embedding resize

지금의 BERT Embedding Layer에는 추가했던 subject, object의 4개 토큰 정보가 반영되지 않았기 때문에, 이대로 사용하면 **index error**가 발생!

따라서 Embedding Layer의 입력 크기(vocab size)를 32000에서 32004로 resize해준다!

In [None]:
# Resize the model's token embeddings to the tokenizer's current length,
# which now includes the added marker tokens.
model.resize_token_embeddings(len(tokenizer))

Embedding(32004, 768)

학습 도중 Loss, Accuracy 계산 및 저장을 간단하게 하기 위해 AverageMeter 클래스를 이용합니다.

In [None]:
class AverageMeter:
    """Running, sample-weighted average of a scalar metric.

    Attributes are ``val`` (latest value), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        # Every statistic starts at zero until the first update.
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

### model fine-tuning
BERT-base 모델을 fine-tuning합니다.

구글 코랩에서 epoch 당 25~30분 정도가 소요됩니다.(ㄷㄷ)

In [None]:
def train_model(data_loader, model, criterion, optimizer, train=True):
    """Run one pass over ``data_loader`` and report the mean loss and accuracy.

    Args:
        data_loader: Sized iterable of batches; each batch is indexed by
            ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``labels``.
        model: Module called with the three input tensors as keyword
            arguments; its output must expose ``logits``.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped after every batch when ``train`` is true;
            unused otherwise.
        train: When true, back-propagate and update the weights. When false,
            only evaluate, with gradient tracking switched off.

    Returns:
        Dict with the sample-weighted average ``loss`` and ``acc`` as plain
        Python floats.
    """
    loss_save = AverageMeter()
    acc_save = AverageMeter()

    # Send batches to wherever the model lives instead of relying on a global.
    device = next(model.parameters()).device

    # Gradients are only needed when the weights are actually being updated.
    with torch.set_grad_enabled(train):
        # tqdm renders the progress bar.
        for batch in tqdm(data_loader, total=len(data_loader)):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'token_type_ids': batch['token_type_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labels = batch['labels'].to(device)

            if train:
                optimizer.zero_grad()

            outputs = model(**inputs)
            logits = outputs['logits']
            loss = criterion(logits, labels)

            if train:
                loss.backward()
                optimizer.step()

            preds = torch.argmax(logits, dim=1)
            n_samples = labels.shape[0]
            acc = (preds == labels).sum().item() / n_samples

            # .item() stores a float, so the meter does not keep a chain of
            # loss tensors (and their autograd history) alive across batches.
            loss_save.update(loss.item(), n_samples)
            acc_save.update(acc, n_samples)

    return {
        'loss': loss_save.avg,
        'acc': acc_save.avg,
    }
        

# Number of full passes over the training data.
epochs = 1

# Set up the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

for epoch in range(epochs):
    print(f'< Epoch {epoch+1} / {epochs} >')
    
    # Train
    model.train()
    train_results = train_model(train_loader, model, criterion, optimizer)
    train_loss, train_acc = train_results['loss'], train_results['acc']
    
    # Validation: same loop with train=False, run without gradient tracking
    with torch.no_grad():
        model.eval()
        
        val_results = train_model(val_loader, model, criterion, optimizer, False)
        val_loss, val_acc = val_results['loss'], val_results['acc']
    
    
    # Per-epoch summary of both splits
    print(f'train_loss: {train_loss:.4f}, train_acc: {train_acc:.4f}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}')
    print('=' * 100)

< Epoch 1 / 1 >


100%|██████████| 4059/4059 [27:03<00:00,  2.50it/s]
100%|██████████| 971/971 [02:04<00:00,  7.81it/s]

train_loss: 0.5199, train_acc: 0.8220, val_loss: 0.7974, val_acc: 0.7378





### 결과 확인

validation 데이터의 앞 20개로 확인한 결과, 3개를 제외하고 모두 정답이었다.

In [None]:
# Spot-check the first 20 validation sentences against their gold labels.
# Evaluation mode is set once, and no gradients are tracked during inference.
model.eval()
with torch.no_grad():
    for sentence, gold_label in zip(val_sentences[:20], val_labels[:20]):
        val = tokenizer(sentence, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
        val_input = {
            'input_ids': val['input_ids'].to(device),
            'token_type_ids': val['token_type_ids'].to(device),
            'attention_mask': val['attention_mask'].to(device),
        }
        output = model(**val_input)
        # Predicted class = index of the largest logit.
        label = torch.argmax(output['logits'], dim=1)
        print('label : ' + str(gold_label) + ', 추측 값 : ' + str(label))

label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([10], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 18, 추측 값 : tensor([18], device='cuda:0')
label : 17, 추측 값 : tensor([17], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 10, 추측 값 : tensor([10], device='cuda:0')
label : 10, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 6, 추측 값 : tensor([0], device='cuda:0')
label : 3, 추측 값 : tensor([3], device='cuda:0')
label : 8, 추측 값 : tensor([8], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 29, 추측 값 : tensor([29], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 0, 추측 값 : tensor([0], device='cuda:0')
label : 18, 추측 값 : tensor([18], device='cuda:0')


### 모델 평가하기
F1 score의 성능만 일단 측정한다.

In [None]:
def calc_f1_score(preds, labels):
    """Return the micro-averaged F1 score of ``preds`` against ``labels``, in percent.

    Args:
        preds: Iterable of predicted class ids.
        labels: Iterable of gold class ids, parallel to ``preds``.

    Returns:
        ``sklearn.metrics.f1_score(..., average='micro', zero_division=1)``
        multiplied by 100.
    """
    # Pair the inputs up first so both lists stop at the shorter one.
    paired = list(zip(preds, labels))
    preds_relation = [pred for pred, _ in paired]
    labels_relation = [label for _, label in paired]

    score = sklearn.metrics.f1_score(
        labels_relation,
        preds_relation,
        average='micro',
        zero_division=1,
    )
    return score * 100

In [None]:
# Collect predictions over the whole validation set, then score them.
with torch.no_grad():
    model.eval()

    label_all = []
    pred_all = []
    for batch in tqdm(val_loader):
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'token_type_ids': batch['token_type_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
        }
        # Labels are only collected into a Python list, so they stay off the device.
        labels = batch['labels']

        outputs = model(**inputs)
        logits = outputs['logits']

        preds = torch.argmax(logits, dim=1)

        label_all.extend(labels.tolist())
        pred_all.extend(preds.cpu().tolist())

    # calc_f1_score takes (preds, labels) in that order.
    f1_score = calc_f1_score(pred_all, label_all)

100%|██████████| 971/971 [02:00<00:00,  8.09it/s]


F1 score : 73.7797

In [None]:
# Bare expression: shows the value of f1_score as the cell's output.
f1_score

73.77978106889891