In [None]:
!pip install konlpy
!pip install rouge

In [None]:
# 학습 데이터 다운로드
!gdown https://drive.google.com/uc?id=13l621lx2nSnXpFpzh78UUEyds_DAzyn6

# 테스트 데이터 다운로드
!gdown https://drive.google.com/uc?id=10LwhiPlgjOZbtF0Bv5395wYIm23y_QfT

In [None]:
import pandas as pd

In [None]:
# Read each split, drop rows that contain missing values, then keep only the
# leading rows (20000 for training, 3000 for testing).
train = pd.read_json('summ_train.json').dropna().iloc[:20000]
test = pd.read_json('summ_test.json').dropna().iloc[:3000]

# Show how many rows survived in each split, one count per line.
print(len(train), len(test), sep = '\n')

In [None]:
# Preview the first rows of the loaded training frame.
train.head()

In [None]:
# Inspect the first remaining raw document. Positional access is used because
# `dropna()` keeps the original index labels, so label 0 is not guaranteed to exist.
train['documents'].iloc[0]

In [None]:
def preprocessing_data(data):
  """Flatten raw summarization records into a tabular DataFrame.

  Args:
    data: Mapping-like object (for example a DataFrame) whose 'documents'
      entry iterates over per-article dicts. Each dict is read through the
      keys 'media_name', 'id', 'text', 'abstractive' and 'extractive';
      'text' is walked as a list of lists of dicts carrying a 'sentence' key.

  Returns:
    A pandas DataFrame with the columns 'media', 'id', 'article_original'
    (flat list of sentence strings), 'abstractive' (first entry of the
    article's 'abstractive' list) and 'extractive'. Articles whose
    'extractive' list contains None are left out. When no article
    survives, an empty frame with the same columns is returned.
  """
  columns = ['media', 'id', 'article_original', 'abstractive', 'extractive']
  rows = []

  for doc in data['documents']:
    extractive = doc['extractive']
    # Reject incompletely annotated articles first, before touching any other
    # field, and without assuming the list has exactly three entries.
    if any(index is None for index in extractive):
      continue

    # 'text' is nested (outer list -> inner list -> sentence dict); flatten it.
    sentences = [s['sentence'] for sent in doc['text'] for s in sent]

    rows.append([doc['media_name'], doc['id'], sentences,
                 doc['abstractive'][0], extractive])

  # Naming the columns in the constructor keeps the schema for an empty result;
  # assigning `.columns` afterwards raises a length-mismatch error in that case.
  return pd.DataFrame(rows, columns = columns)

In [None]:
# Flatten both raw splits into tabular frames with preprocessing_data.
train_data = preprocessing_data(train)
test_data = preprocessing_data(test)

In [None]:
# Preview the first rows of the flattened training frame.
train_data.head()

In [None]:
# Collapse each article's list of sentences into one space-separated string
# and store it in a new 'news' column on both frames.
train_data['news'] = train_data['article_original'].apply(' '.join)
test_data['news'] = test_data['article_original'].apply(' '.join)

In [None]:
# Show the joined article text stored under index label 0.
train_data['news'][0]

In [None]:
import os
import logging
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from tqdm import tqdm

In [None]:
class KoBARTSummaryDataset(Dataset):
  """Dataset that converts rows of a summarization frame into fixed-length id arrays.

  Every row is read through its 'news' (source text) and 'abstractive'
  (target summary) fields, both encoded with the supplied tokenizer.
  """

  def __init__(self, df, tokenizer, max_len, ignore_index = -100):
    """Keep the frame, the tokenizer and the length settings.

    Args:
      df: DataFrame providing the 'news' and 'abstractive' columns.
      tokenizer: Object exposing `encode`, `pad_token_id` and `eos_token_id`.
      max_len: Length every returned sequence is padded or cut to.
      ignore_index: Filler value written into the padded label positions.
    """
    super().__init__()
    self.docs = df
    self.len = df.shape[0]
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.ignore_index = ignore_index
    self.pad_index = tokenizer.pad_token_id

  def _resize(self, inputs, filler):
    """Cut `inputs` to `max_len`, or extend it with `filler` when it is shorter."""
    missing = self.max_len - len(inputs)
    if missing <= 0:
      return inputs[:self.max_len]

    tail = np.array([filler] * missing)
    return np.concatenate([inputs, tail])

  def add_padding_data(self, inputs):
    """Return `inputs` at exactly `max_len` items, padded with the pad token id."""
    return self._resize(inputs, self.pad_index)

  def add_ignore_data(self, inputs):
    """Return `inputs` at exactly `max_len` items, padded with `ignore_index`."""
    return self._resize(inputs, self.ignore_index)

  def __getitem__(self, idx):
    """Encode row `idx` into encoder ids, decoder input ids and labels.

    Returns:
      Dict with the integer arrays 'input_ids', 'decoder_input_ids' and
      'labels', each `max_len` long.
    """
    row = self.docs.iloc[idx]

    # Encoder side: the article text, padded with the pad token.
    source_ids = self.add_padding_data(self.tokenizer.encode(row['news']))

    # Target side: the summary followed by the end-of-sequence id.
    target_ids = self.tokenizer.encode(row['abstractive'])
    target_ids.append(self.tokenizer.eos_token_id)

    # Decoder input is the target shifted right by one, led by the EOS id.
    shifted_ids = [self.tokenizer.eos_token_id]
    shifted_ids.extend(target_ids[:-1])
    shifted_ids = self.add_padding_data(shifted_ids)

    # Label padding uses ignore_index instead of the pad token.
    target_ids = self.add_ignore_data(target_ids)

    return {
      'input_ids' : np.array(source_ids, dtype = np.int_),
      'decoder_input_ids' : np.array(shifted_ids, dtype = np.int_),
      'labels' : np.array(target_ids, dtype = np.int_),
    }

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return self.len

In [None]:
class KoBartConditionalGeneration(nn.Module):
  """Module wrapping the pretrained 'gogamza/kobart-base-v1' BART model.

  The wrapper derives the encoder and decoder attention masks from the
  padding id, so callers only hand over a dict of id tensors.
  """

  def __init__(self):
    """Load the pretrained model and tokenizer and remember the pad token id."""
    super().__init__()
    self.model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1')
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
    self.pad_token_id = self.tokenizer.pad_token_id

  def forward(self, inputs):
    """Run the wrapped model on one batch.

    Args:
      inputs: Dict holding the tensors 'input_ids', 'decoder_input_ids'
        and 'labels'.

    Returns:
      Whatever the wrapped model returns when called with `return_dict = True`.
    """
    encoder_ids = inputs['input_ids']
    decoder_ids = inputs['decoder_input_ids']

    # Mask value is 1.0 where the id differs from the pad id and 0.0 elsewhere.
    encoder_mask = encoder_ids.ne(self.pad_token_id).float()
    decoder_mask = decoder_ids.ne(self.pad_token_id).float()

    return self.model(
      input_ids = encoder_ids,
      attention_mask = encoder_mask,
      decoder_input_ids = decoder_ids,
      decoder_attention_mask = decoder_mask,
      labels = inputs['labels'],
      return_dict = True,
    )

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Notebook-level tokenizer used for encoding and decoding text; the wrapper
# below loads its own tokenizer from the same checkpoint to read the pad id.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
model = KoBartConditionalGeneration()
# Move the model's parameters to the selected device.
model.to(device)

In [None]:
# Training configuration used by the cells below.
batch_size = 16      # samples per batch for both DataLoaders
max_len = 512        # fixed token length each Dataset pads or cuts sequences to
num_workers = 4      # worker count passed to both DataLoaders
lr = 3e-5            # learning rate handed to AdamW
max_epochs = 10      # number of passes of the training loop
warmup_ratio = 0.1   # fraction of the total steps used as the scheduler's T_0 period

In [None]:
# Wrap the preprocessed frames in Datasets under their own names. Rebinding
# `train_data` / `test_data` would throw the DataFrames away (a later cell
# still calls `test_data.loc[...]`) and would make this cell fail when re-run,
# because it would then wrap a Dataset instead of a DataFrame.
train_dataset = KoBARTSummaryDataset(train_data, tokenizer, max_len)
test_dataset = KoBARTSummaryDataset(test_data, tokenizer, max_len)

# Only the training loader shuffles; evaluation keeps the frame order.
train_loader = DataLoader(train_dataset, batch_size = batch_size, num_workers = num_workers, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, num_workers = num_workers, shuffle = False)

In [None]:
optimizer = AdamW(model.parameters(), lr = lr)

# The scheduler is stepped once per batch, so its period is measured in batches.
total_steps = len(train_loader) * max_epochs

# T_0 has to be a positive integer: clamp it so that a short run (fewer than
# 1 / warmup_ratio total steps) does not truncate to 0 and raise at construction.
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0 = max(1, int(total_steps * warmup_ratio)), T_mult = 1, eta_min = 0)

In [None]:
best_loss = np.inf

for epoch in range(max_epochs):
  print(epoch + 1, '수행 중')
  model.train()

  for batch in tqdm(train_loader, total = len(train_loader)):
    batch = {k : v.to(device) for k, v in batch.items()}

    outputs = model(batch)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

  model.eval()
  total_loss = 0.0

  with torch.no_grad():
    for batch in tqdm(test_loader, total = len(test_loader)):
      batch = {k : v.to(device) for k, v in batch.items()}

      outputs = model(batch)
      loss = outputs.loss
      total_loss += loss.item()

  avg_loss = total_loss / len(test_loader)
  print(f'Epoch : {epoch + 1}, Loss : {avg_loss}')

  if avg_loss < best_loss:
    print(f'Validation loss improved from {best_loss:.4f} to {avg_loss:.4f}.체크포인트를 저장합니다.')
    best_loss = avg_loss
    torch.save(model.state_dict(), 'best_model.pt')

In [None]:
# Create a fresh wrapper instance. The class defined in this notebook is
# spelled `KoBartConditionalGeneration`; the all-caps `KoBART...` spelling is
# not defined anywhere and raises NameError.
model_wrapper = KoBartConditionalGeneration().to(device)

# Load the best checkpoint. `map_location` lets a checkpoint that was saved
# from the GPU be restored on whichever device is available now.
model_wrapper.load_state_dict(torch.load('best_model.pt', map_location = device))

# Switch the model to evaluation mode.
model_wrapper.eval()

In [None]:
# Build the encoder input explicitly; `input_ids` is not defined anywhere else
# at notebook scope. The article is taken from the frame held by the test
# Dataset (row 25, the row whose summary is scored in the final cell).
sample_text = test_loader.dataset.docs.iloc[25]['news']

# Encode the same way the Dataset does and cap the length at max_len, then add
# a batch dimension and move the tensor to the model's device.
input_ids = tokenizer.encode(sample_text)[:max_len]
input_ids = torch.tensor([input_ids]).to(device)

# No gradients are needed for generation.
with torch.no_grad():
  output = model_wrapper.model.generate(input_ids, eos_token_id=1, max_length=512,num_beams = 5)
output = tokenizer.decode(output[0],skip_special_tokens=True)
print(output)

In [None]:
from konlpy.tag import Okt
from rouge import Rouge

def calculate_rouge(reference_sentence, hypothesis_sentence):
  """Score a predicted summary against a reference with ROUGE over Okt morphemes.

  Args:
    reference_sentence: Gold summary text.
    hypothesis_sentence: Model-generated summary text.

  Returns:
    The value returned by `Rouge.get_scores` for this single pair; callers in
    this notebook read it as `scores[0]['rouge-l']['f']`.
  """
  okt = Okt()

  def tokenize_and_concat(text):
    # Split the text into morphemes and join them with spaces so that each
    # morpheme is a separate whitespace-delimited token.
    return ' '.join(okt.morphs(text))

  tokenize_reference = tokenize_and_concat(reference_sentence)
  tokenize_hypothesis = tokenize_and_concat(hypothesis_sentence)

  rouge = Rouge()
  # get_scores expects (hypothesis, reference); passing them the other way
  # round swaps the reported precision and recall.
  scores = rouge.get_scores(tokenize_hypothesis, tokenize_reference)
  return scores

In [None]:
# Usage example: score two candidate predictions against the same reference
# and print the ROUGE-L F-score of each.
label= "고양이가매트위에앉아있다."
model1_prediction= "매트위에고양이가앉아있다."
model2_prediction= "고양이가매트위에앉아있다."

for prediction in (model1_prediction, model2_prediction):
  rouge_scores = calculate_rouge(label, prediction)
  print(rouge_scores[0]['rouge-l']['f'])

In [None]:
# Summary text to score for test row 25.
output = '배우 배수지가 매니지먼트 숲과 전속계약을 체결해 배우 배수지의 장점과 매력을 극대화할 수 있는 작품 선택부터 국내외 활동, 가수로서의 솔로 활동까지 활발하게 이루어질 수 있도록 지원할 예정이다.'

# Read the reference summary from the DataFrame held by the test Dataset, so
# the lookup does not depend on what `test_data` is bound to at this point
# (a Dataset object has no `.loc`).
label = test_loader.dataset.docs.loc[25]['abstractive']
rouge_scores = calculate_rouge(label, output)

print('모델의 예측 :', output)
print('정답 문장 :', label)
print('Rouge Scores :', rouge_scores[0]['rouge-l']['f'])