# **💁🏻🗨️💁🏻‍♂️대화 요약 Baseline code**
> **Dialogue Summarization** 경진대회에 오신 여러분 환영합니다! 🎉    
> 본 대회에서는 최소 2명에서 최대 7명이 등장하여 나누는 대화를 요약하는 BART 기반 모델의 baseline code를 제공합니다.     
> 주어진 데이터를 활용하여 일상 대화에 대한 요약을 효과적으로 생성하는 모델을 만들어봅시다!

- base 모델
- num_epochs = 100
- early stopping cnt = 10
- loss 함수로 rouge 사용

In [2]:
import pandas as pd
import os
import re
import json
import yaml
from glob import glob
from tqdm import tqdm
from pprint import pprint
import torch
import pytorch_lightning as pl
from rouge import Rouge # 모델의 성능을 평가하기 위한 라이브러리입니다.

from torch.utils.data import Dataset , DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

import wandb # 모델 학습 과정을 손쉽게 Tracking하고, 시각화할 수 있는 라이브러리입니다.

In [3]:
# Use the GPU when torch can see one; otherwise run on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Marker strings that appear in the data (speaker tags #Person1#..#Person7# and
# masked-field placeholders such as #PhoneNumber#). They are registered as
# additional special tokens on the tokenizer.
_speaker_tags = [f'#Person{idx}#' for idx in range(1, 8)]
_masked_field_tags = [
    '#PhoneNumber#',
    '#Address#',
    '#PassportNumber#',
    '#CardNumber#',
    '#Email#',
    '#DateOfBirth#',
]
special_tokens_dict = {'additional_special_tokens': _speaker_tags + _masked_field_tags}

tokenizer.add_special_tokens(special_tokens_dict)
print(tokenizer.special_tokens_map)

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['#Email#', '#Person5#', '#DateOfBirth#', '#PassportNumber#', '#Person2#', '#Person7#', '#Address#', '#Person1#', '#Person6#', '#CardNumber#', '#Person3#', '#PhoneNumber#', '#Person4#']}


In [5]:
# --- Run configuration -------------------------------------------------------
model_id = 'facebook/bart-base'

epochs = 100
batch_size = 50
num_workers = 0
log_interval = 300
dig_max_len = 1000   # length limit handed to the dataset for dialogues
sum_max_len = 200    # length limit handed to the dataset for summaries

# --- Data ---------------------------------------------------------------------
train_df = pd.read_csv('/data/ephemeral/home/back_translation/train.csv')
valid_df = pd.read_csv('/data/ephemeral/home/back_translation/dev.csv')
test_df = pd.read_csv('/data/ephemeral/home/back_translation/test.csv')

# String forms of the tokenizer's built-in special tokens; these are the
# substrings that get blanked out of decoded text before scoring.
remove_tokens = [
    f"{special}"
    for special in (
        tokenizer.bos_token,
        tokenizer.eos_token,
        tokenizer.unk_token,
        tokenizer.sep_token,
        tokenizer.pad_token,
        tokenizer.cls_token,
        tokenizer.mask_token,
    )
]

In [7]:
model = BartForConditionalGeneration.from_pretrained(model_id).to(device)

# The tokenizer was extended with additional special tokens, so its vocabulary
# is now larger than the checkpoint's embedding table. Grow the embeddings to
# match; otherwise the new token ids index past the end of the table.
# (Calling this again later with the same size is harmless.)
model.resize_token_embeddings(len(tokenizer))

# Build the optimizer only after the resize so it tracks the final parameter
# tensors, and bind it to a name so it can actually be passed to training code
# instead of being created and thrown away.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.01
)

In [8]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes a whole dataframe up front.

    The ``translated_dialogue`` column is always encoded; when ``is_train`` is
    true the ``translated_summary`` column is encoded as well and served as the
    label.

    Args:
        df: DataFrame holding ``translated_dialogue`` and, for training /
            validation, ``translated_summary``.
        tokenizer: Callable tokenizer whose result exposes ``.input_ids``.
        input_len: Truncation length (in tokens) for dialogues.
        summ_len: Truncation length (in tokens) for summaries.
        is_train: When true, items are ``(input_ids, labels)`` pairs;
            otherwise only ``input_ids`` is returned.

    Note:
        ``input_len`` / ``summ_len`` used to be stored but ignored in favour of
        hard-coded limits of 512 / 100. They are now honoured, so callers must
        keep them within what the model can accept.
    """

    def __init__(self, df, tokenizer, input_len, summ_len, is_train=True):
        self.tokenizer = tokenizer
        self.df = df
        self.source_len = input_len
        self.summ_len = summ_len
        self.is_train = is_train

        # Dialogues are needed in every mode; summaries only when labels exist.
        self.input_ids = self._encode('translated_dialogue', self.source_len)
        if self.is_train:
            self.labels = self._encode('translated_summary', self.summ_len)

    def _encode(self, column, max_length):
        """Tokenize one dataframe column into a single padded id tensor."""
        encoded = self.tokenizer(
            self.df[column].tolist(),
            return_tensors='pt',
            padding=True,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            return_token_type_ids=False,
        )
        return encoded.input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        if self.is_train:
            return self.input_ids[idx], self.labels[idx]
        return self.input_ids[idx]

In [9]:
# Training and validation sets carry both the dialogue and its reference summary.
train_dataset = CustomDataset(train_df[['translated_dialogue', 'translated_summary']], tokenizer, dig_max_len, sum_max_len)
valid_dataset = CustomDataset(valid_df[['translated_dialogue', 'translated_summary']], tokenizer, dig_max_len, sum_max_len)
# The test set has inputs only. It must be built from test_df: the previous
# code read train_df here, so "test" inference silently ran on training data.
test_dataset = CustomDataset(test_df[['translated_dialogue']], tokenizer, dig_max_len, sum_max_len, is_train=False)

# Only the training loader shuffles; evaluation and test keep row order so
# outputs can be matched back to the dataframe rows.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [14]:
class ROUGELoss(torch.nn.Module):
    """Turn a batch-averaged ROUGE F-score into a ``1 - score`` value.

    Each row of token ids is rendered as a space-separated string of the ids
    themselves (every id in the row is included), and ROUGE-1/2/L F-scores are
    computed on those strings.

    Note:
        The returned tensor is built from plain Python floats, so it carries
        no gradient history. ``smoothing`` is stored but not used by
        ``forward``.
    """

    _METRICS = ('rouge-1', 'rouge-2', 'rouge-l')

    def __init__(self, smoothing=1e-5):
        super().__init__()
        self.smoothing = smoothing
        self.rouge = Rouge()

    @staticmethod
    def _rows_to_text(batch):
        """Render each row of ids as one whitespace-joined string."""
        return [' '.join(str(token) for token in row) for row in batch.tolist()]

    def _mean_f(self, hypothesis, reference):
        """Average the three F-scores for one pair; 0.0 if scoring raises ValueError."""
        try:
            result = self.rouge.get_scores(hypothesis, reference)[0]
        except ValueError:  # e.g. an empty hypothesis or reference
            return 0.0
        return sum(result[name]['f'] for name in self._METRICS) / 3

    def forward(self, predictions, targets):
        """Return ``1 - mean(per-sample average ROUGE F)`` for two id batches."""
        hypotheses = self._rows_to_text(predictions)
        references = self._rows_to_text(targets)
        per_sample = [
            self._mean_f(hyp, ref) for hyp, ref in zip(hypotheses, references)
        ]
        return 1 - torch.tensor(per_sample).mean()

def ids_to_words(tokenizer, preds, labels):
    """Decode prediction and label id batches and blank out special tokens.

    Both batches are decoded with ``tokenizer.batch_decode``; then every
    string listed in the module-level ``remove_tokens`` is replaced by a
    single space in each decoded sentence.

    Returns:
        A ``(predictions, labels)`` tuple of lists of cleaned strings.
    """

    def _blank_out(sentences):
        cleaned = list(sentences)
        for marker in remove_tokens:
            cleaned = [text.replace(marker, " ") for text in cleaned]
        return cleaned

    pred_text = tokenizer.batch_decode(preds, clean_up_tokenization_spaces=True)
    label_text = tokenizer.batch_decode(labels, clean_up_tokenization_spaces=True)
    return _blank_out(pred_text), _blank_out(label_text)

def compute_metrics(replaced_predictions, replaced_labels):
    """Return the averaged ROUGE F-score for each metric.

    Args:
        replaced_predictions: Decoded, cleaned prediction strings.
        replaced_labels: Decoded, cleaned reference strings.

    Returns:
        Dict mapping each metric name reported by ``Rouge.get_scores`` to its
        averaged ``"f"`` value.
    """
    averaged = Rouge().get_scores(replaced_predictions, replaced_labels, avg=True)

    f_scores = {}
    for metric, parts in averaged.items():
        f_scores[metric] = parts["f"]
    return f_scores

In [None]:
def train(model, train_dataloader, criterion, optimizer, log_interval, train_step, epoch):
    model.train()
    train_loss = 0.0
    ce_loss = []
    rouge_losses = []
    rouge_loss_fn = ROUGELoss()

    for idx, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        input_ids = batch[0].to(device)