In [None]:
!pip install rouge
!pip install konlpy

In [None]:
# Download the training data
!gdown https://drive.google.com/uc?id=13l621lx2nSnXpFpzh78UUEyds_DAzyn6

# Download the test data
!gdown https://drive.google.com/uc?id=10LwhiPlgjOZbtF0Bv5395wYIm23y_QfT

In [None]:
import pandas as pd

In [None]:
# Read both downloaded JSON files into DataFrames (train first, then test).
train, test = (pd.read_json(path) for path in ('summ_train.json', 'summ_test.json'))

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Remove rows containing missing values, then keep at most the first
# 20,000 training rows and print how many remain.
train = train.dropna().head(20000)
print(len(train))

# Same clean-up for the test split, capped at the first 3,000 rows.
test = test.dropna().head(3000)
print(len(test))

In [None]:
# Preview the training frame again after dropna() and truncation.
train.head()

In [None]:
# Preview the test frame again after dropna() and truncation.
test.head()

In [None]:
# Inspect the first remaining document. Use positional access: dropna() keeps
# the original index labels, so label 0 is not guaranteed to still exist.
train['documents'].iloc[0]

In [None]:
def preprocessing(data):
  """Flatten raw summarization records into one row per usable article.

  Args:
    data: Mapping or DataFrame whose ``'documents'`` entry iterates over dicts
      with the keys ``'media_name'``, ``'id'``, ``'text'`` (a list of
      paragraphs, each a list of dicts carrying a ``'sentence'`` string),
      ``'abstractive'`` (a list whose first element is used as the summary)
      and ``'extractive'`` (a list of selected sentence indices).

  Returns:
    A DataFrame with the columns ``media``, ``id``, ``article_original``
    (flat list of sentences), ``abstractive`` and ``extractive``. Documents
    whose ``extractive`` list contains ``None`` are left out. The columns are
    present even when no document survives the filter.
  """
  columns = ['media', 'id', 'article_original', 'abstractive', 'extractive']

  rows = []
  for doc in data['documents']:
    extractive = doc['extractive']

    # Filter first, so a document that is going to be skipped can never raise
    # while its remaining fields are being read.
    if any(index is None for index in extractive):
      continue

    # Flatten paragraphs -> sentences while keeping the original order.
    sentences = [s['sentence'] for paragraph in doc['text'] for s in paragraph]

    rows.append([doc['media_name'],
                 doc['id'],
                 sentences,
                 doc['abstractive'][0],
                 extractive])

  # Passing the column names to the constructor keeps the schema stable for an
  # empty result; assigning `.columns` afterwards fails on a 0-column frame.
  return pd.DataFrame(rows, columns = columns)

In [None]:
# Flatten both splits into tabular form (train is processed before test).
train_data, test_data = preprocessing(train), preprocessing(test)

In [None]:
# Preview the flattened training table.
train_data.head()

In [None]:
# Preview the flattened test table.
test_data.head()

In [None]:
# Join each article's sentence list into one space-separated string.
train_data['news'] = train_data['article_original'].map(' '.join)
test_data['news'] = test_data['article_original'].map(' '.join)

In [None]:
# Show the joined article text of the row labelled 0.
train_data['news'][0]

In [None]:
import os
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from transformers import T5ForConditionalGeneration, T5TokenizerFast

In [None]:
class T5SummaryDataset(Dataset):
  """Map-style dataset producing fixed-length id arrays for T5 summarization.

  Args:
    data: DataFrame-like object exposing ``shape`` and ``iloc``; each row must
      provide a ``'news'`` (source text) and an ``'abstractive'`` (reference
      summary) field.
    tokenizer: Object exposing ``pad_token_id`` and
      ``encode(text, max_length=..., truncation=...)`` returning token ids.
    max_len: Length every input and label sequence is padded or cut to.
    ignore_index: Fill value used to pad the label sequence. The default of
      -100 is the value the Hugging Face / PyTorch cross-entropy loss skips,
      so padded label positions do not contribute to the loss.
  """

  def __init__(self, data, tokenizer, max_len, ignore_index = -100):
    super().__init__()

    self.tokenizer = tokenizer
    self.docs = data
    self.max_len = max_len
    self.len = self.docs.shape[0]

    self.pad_index = self.tokenizer.pad_token_id
    self.ignore_index = ignore_index

  def _fit_to_max_len(self, inputs, fill_value):
    """Return `inputs` as a 1-D int array of exactly `max_len` items.

    Shorter sequences are right-padded with `fill_value`; longer ones are cut.
    """
    # reshape(-1) also copes with 0-dim (single token) and (1, n) inputs.
    ids = np.asarray(inputs, dtype = np.int_).reshape(-1)

    if len(ids) < self.max_len:
      pad = np.full(self.max_len - len(ids), fill_value, dtype = np.int_)
      ids = np.concatenate([ids, pad])
    else:
      ids = ids[:self.max_len]

    return ids

  def add_padding_data(self, inputs):
    """Pad/cut encoder input ids to `max_len` using the tokenizer's pad id."""
    return self._fit_to_max_len(inputs, self.pad_index)

  def add_ignore_data(self, inputs):
    """Pad/cut label ids to `max_len` using `ignore_index`.

    Padding labels with the pad token id would make the model learn to emit
    padding; `ignore_index` masks those positions out of the loss instead.
    """
    return self._fit_to_max_len(inputs, self.ignore_index)

  def __getitem__(self, idx):
    """Return ``{'input_ids', 'label_ids'}`` int arrays for the row at position `idx`."""
    instance = self.docs.iloc[idx]

    # The task prefix must match the one used when prompting at inference time.
    input_text = "summarize : " + instance['news']
    input_ids = self.tokenizer.encode(input_text, max_length = self.max_len, truncation = True)
    label_ids = self.tokenizer.encode(instance['abstractive'], max_length = self.max_len, truncation = True)

    return {'input_ids' : self.add_padding_data(input_ids),
            'label_ids' : self.add_ignore_data(label_ids)}

  def __len__(self):
    """Number of rows in the wrapped data."""
    return self.len

In [None]:
class T5ConditionalGeneration(nn.Module):
  """Wrapper module pairing the 'paust/pko-t5-base' model with its tokenizer.

  `forward` takes a batch dict holding 'input_ids' and 'label_ids', derives the
  attention mask from the non-padding positions of 'input_ids', and returns
  whatever the wrapped model returns when called with labels.
  """

  def __init__(self):
    super().__init__()

    self.tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')
    self.model = T5ForConditionalGeneration.from_pretrained('paust/pko-t5-base')
    self.pad_token_id = self.tokenizer.pad_token_id

  def forward(self, inputs):
    token_ids = inputs['input_ids']
    # 1.0 where a real token is present, 0.0 on padding positions.
    non_pad_mask = (token_ids != self.pad_token_id).float()

    return self.model(
        input_ids = token_ids,
        attention_mask = non_pad_mask,
        labels = inputs['label_ids'],
        return_dict = True,
    )

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the wrapper and move its weights to the chosen device.
model = T5ConditionalGeneration()
model.to(device)

In [None]:
# Training hyperparameters.
batch_size = 8
max_len = 512        # token length every input / label sequence is padded or cut to
num_workers = 4      # DataLoader worker processes
lr = 3e-5
max_epochs = 10
warmup_ratio = 0.1   # fraction of the total steps; consumed when the scheduler is built

tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')

# Shuffle the training samples so batches are not served in file order.
train_dataset = T5SummaryDataset(train_data, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers = num_workers)

# Evaluation keeps a fixed order. DataLoader's third positional parameter is
# `shuffle`, so every option is passed by keyword to avoid setting it by accident.
test_dataset = T5SummaryDataset(test_data, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, num_workers = num_workers)

In [None]:
# AdamW over every parameter of the wrapper module.
optimizer = AdamW(model.parameters(), lr = lr)

# One scheduler step is taken per batch in the training loop, so the horizon is
# counted in batches. NOTE(review): despite its name, warmup_ratio is used here
# as the restart period (T_0) of a cosine schedule with warm restarts, not as a
# warmup phase - confirm this is intended. T_0 becomes 0 if
# total_steps * warmup_ratio < 1.
total_steps = len(train_loader) * max_epochs
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0 = int(total_steps * warmup_ratio), T_mult = 1, eta_min = 0)

In [None]:
# Train for max_epochs; after each epoch measure the mean loss on test_loader
# and keep the weights of the best epoch in 'best_model.pt'.
# Starts at infinity so the first epoch always produces a checkpoint.
best_loss = np.inf

for epoch in range(max_epochs):
  print(epoch + 1, '수행 중')
  model.train()

  # ---- optimisation pass over the training batches ----
  for batch in tqdm(train_loader, total = len(train_loader)):
    # Move every tensor of the batch to the model's device.
    batch = {k : v.to(device) for k, v in batch.items()}

    optimizer.zero_grad()
    outputs = model(batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    # The learning-rate schedule advances once per batch, not once per epoch.
    scheduler.step()

  # ---- evaluation pass (no gradients) ----
  # NOTE(review): model selection below is driven by test_loader, i.e. the test
  # split doubles as the validation set - confirm this is acceptable.
  model.eval()
  total_loss = 0
  with torch.no_grad():
    for batch in tqdm(test_loader, total = len(test_loader)):
      batch = {k : v.to(device) for k, v in batch.items()}

      outputs = model(batch)
      loss = outputs.loss
      total_loss += loss.item()

  # Mean of the per-batch losses.
  avg_loss = total_loss / len(test_loader)
  print(f'Epoch : {epoch + 1} / Loss : {avg_loss}')

  # Checkpoint only when the evaluation loss improves on the best seen so far.
  if avg_loss < best_loss:
    print(f'Validation loss improved from {best_loss:.4f} to {avg_loss:.4f}.체크포인트를 저장합니다.')
    best_loss = avg_loss
    # Saves the state dict of the wrapper module (not of model.model directly).
    torch.save(model.state_dict(), 'best_model.pt')

In [None]:
# Rebuild the wrapper and restore the best weights saved by the training loop.
# The checkpoint holds the wrapper's own state_dict, so it is loaded into the
# wrapper with nn.Module.load_state_dict; map_location lets a checkpoint saved
# on GPU be loaded on whichever device is in use now.
model_wrapper = T5ConditionalGeneration().to(device)
model_wrapper.load_state_dict(torch.load('best_model.pt', map_location = device))
model_wrapper.eval()

In [None]:
# Arbitrarily pick the test sample labelled 25 and store its article in `text`.
text = test_data.loc[25, 'news']
# Use exactly the task prefix the dataset class prepends during training
# ("summarize : "), so the prompt is tokenized the same way as training inputs.
text = "summarize : " + text

# Round-trip through the tokenizer (encode, then decode) and show the result.
input_ids = tokenizer.encode(text)
tokenizer.decode(input_ids)

In [None]:
# Encode the prompt directly into a (1, seq_len) tensor on the model's device.
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

# Feed the source text to the model and generate a summary with beam search.
output = model_wrapper.model.generate(
    input_ids,
    eos_token_id=1,
    max_length=512,
    num_beams=5,
)

# Decode the generated ids back to text, dropping special tokens.
output = tokenizer.decode(output[0], skip_special_tokens=True)
print(output)