In [None]:
%pip install transformers==4.32.1
%pip install sentence-transformers==2.2.2

In [None]:
import torch
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration

# The tokenizer and the baseline model are loaded from the same published
# checkpoint id; the fine-tuned weights are loaded from a local directory.
BASE_CHECKPOINT = 'digit82/kobart-summarization'
FINE_TUNED_DIR = '/content/drive/MyDrive/data/kobart_summary'

tokenizer = PreTrainedTokenizerFast.from_pretrained(BASE_CHECKPOINT)
# `model` is used by summarization(), `model2` by summarization2().
model = BartForConditionalGeneration.from_pretrained(FINE_TUNED_DIR)
model2 = BartForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)


# Longest encoder input (BOS + text tokens + EOS) passed to generate().
MAX_INPUT_TOKENS = 1026


def _summarize_with(bart_model, text):
    """Generate a summary of ``text`` with ``bart_model``.

    Shared implementation behind ``summarization`` and ``summarization2``;
    the two public functions differ only in which model they pass in.

    Args:
        bart_model: Object exposing ``generate(input_ids, ...)``.
        text: Source document as a string. Newlines are replaced by spaces
            before tokenization.

    Returns:
        The decoded summary string, with special tokens stripped.
    """
    flat_text = text.replace('\n', ' ')
    raw_input_ids = tokenizer.encode(flat_text)

    # Truncate the raw ids *before* wrapping them, so the sequence still ends
    # with EOS. Slicing the wrapped sequence would cut the EOS token off.
    budget = MAX_INPUT_TOKENS - 2
    if len(raw_input_ids) > budget:
        print(f'input truncated from {len(raw_input_ids) + 2} to {MAX_INPUT_TOKENS} tokens')
        raw_input_ids = raw_input_ids[:budget]
    input_ids = [tokenizer.bos_token_id] + raw_input_ids + [tokenizer.eos_token_id]

    summary_ids = bart_model.generate(
        torch.tensor([input_ids]),
        num_beams=4,
        max_length=1024,
        eos_token_id=1,  # presumably equals tokenizer.eos_token_id -- TODO confirm
    )
    # Index the single batch row instead of squeeze(): squeeze() collapses a
    # one-token summary to a 0-d tensor whose tolist() is a bare int.
    return tokenizer.decode(summary_ids[0].tolist(), skip_special_tokens=True)


def summarization(text):
    """Summarize ``text`` with the fine-tuned model (global ``model``)."""
    return _summarize_with(model, text)


def summarization2(text):
    """Summarize ``text`` with the baseline model (global ``model2``)."""
    return _summarize_with(model2, text)

In [None]:
from numpy import dot
from numpy.linalg import norm

def cal_cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    inner = dot(a, b)
    scale = norm(a) * norm(b)
    return inner / scale


def get_RDASS(d, r, p):
    """Average the prediction's cosine similarity to the reference and to the document.

    Args:
        d: Embedding of the source document.
        r: Embedding of the reference summary.
        p: Embedding of the predicted summary.

    Returns:
        The mean of cos(p, r) and cos(p, d).
    """
    sim_to_reference = cal_cos_sim(p, r)
    sim_to_document = cal_cos_sim(p, d)
    return (sim_to_reference + sim_to_document) / 2

In [None]:
import pandas as pd

# Evaluation set: a tab-separated file whose `news` and `summary` columns are
# read during scoring.
df = pd.read_csv(
    "/content/drive/MyDrive/data/test.tsv",
    sep='\t',
)

In [None]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; cal_average_RDASS uses it to embed the document,
# the reference summary and the predicted summary.
s_bert = SentenceTransformer('jhgan/ko-sroberta-nli')


def cal_average_RDASS(summarize_method):
    """Return the mean RDASS score of ``summarize_method`` over the global ``df``.

    For every row, the ``news`` text is summarized, the document, reference
    summary and predicted summary are embedded with ``s_bert``, and the
    resulting ``get_RDASS`` scores are averaged over all rows.

    Args:
        summarize_method: Callable taking the document text and returning
            the predicted summary string.

    Returns:
        The sum of per-row RDASS scores divided by ``len(df)``.
    """
    RDASS_sum = 0

    # Iterate over the column values themselves rather than `df.loc[i]` with a
    # positional counter: `.loc` is a label lookup and breaks (KeyError or the
    # wrong row) when `df` does not carry the default 0..n-1 index.
    rows = zip(df['news'], df['summary'])
    for docs, reference in tqdm(rows, total=len(df)):
        predict = summarize_method(docs)

        embedding_docs = s_bert.encode(docs)
        embedding_reference = s_bert.encode(reference)
        embedding_predict = s_bert.encode(predict)

        RDASS_sum += get_RDASS(embedding_docs,
                               embedding_reference,
                               embedding_predict)

    return RDASS_sum / len(df)

In [None]:
# Score the baseline summarizer (summarization2) first, then the fine-tuned
# one (summarization); each call runs a full pass over the test set.
original_score = cal_average_RDASS(summarization2)
fine_tuning_score = cal_average_RDASS(summarization)

# The printed labels are Korean for "original model" and "after fine-tuning".
print(f'기존 모델 : {original_score}')
print(f'파인 튜닝 후 : {fine_tuning_score}')