In [None]:
import logging
import os

import pandas as pd
import textstat
import torch

from evaluate import load
from nltk.translate import meteor_score
from nltk import word_tokenize
from torchmetrics.text.rouge import ROUGEScore

from functions import *

In [None]:
# Ask the project helper about CUDA; presumably returns (availability flag, torch device) — confirm in functions.py.
cuda_avbl, device = test_cuda_avbl()

# Model identifier handed to get_tokenizer/get_model.
# Options noted by the author: gpt2, gpt2-medium, gpt2-large, gpt2-xl.
MODEL = 'gpt2'

# Special tokens handed to get_tokenizer/get_model.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",  # beginning of a sequence
    eos_token="<|EOS|>",  # end of a sequence
    unk_token="<|UNK|>",  # stands in for unknown tokens
    pad_token="<|PAD|>",  # filler tokens for short sentences
    sep_token="<|SEP|>",  # separates sentences
)

# Upper bound on the tokenized prompt length that Summarizer checks against.
MAX_LENGTH = 1024

# Not referenced in the visible cells; presumably the training split fraction — confirm.
TRAIN_SIZE = 0.8

In [None]:
# Evaluation data; the cells below read its "Title", "Full Text" and "One Sentence" columns.
# NOTE(review): the path is an empty placeholder — point it at the test workbook before running.
df_testing = pd.read_excel("")

In [None]:
# Build the tokenizer and the model through the project helpers, both with the
# same special tokens. load_model_path presumably points at saved fine-tuned
# weights — confirm against functions.get_model.
tokenizer = get_tokenizer(MODEL, special_tokens=SPECIAL_TOKENS)
model = get_model(
    MODEL,
    cuda_avbl,
    tokenizer,
    special_tokens=SPECIAL_TOKENS,
    # os.path.join uses the platform separator; the former r"model\pytorch_model.bin"
    # literal only resolved to model/pytorch_model.bin on Windows.
    load_model_path=os.path.join("model", "pytorch_model.bin"),
)

In [None]:
class Summarizer():
    '''
    Summarizes text with a language model by prompting it with "TL;DR:".

    The prompt is "<|BOS|>" + cleaned text + "TL;DR:"; the tokens the model
    generates after that prompt are returned as the summary.

    Attributes:
        model (model class): NLP model for summarization.
        tokenizer (tokenizer class): tokenizer for preparing inputs for model.
        cuda_avbl (bool): Is Cuda for data processing available?
        max_context (int or None): Token budget shared by the prompt and the
            generated summary. None defers to the notebook-level MAX_LENGTH.
        in_text (str): Cleaned input text of the most recent request.
        max_len (int): Summary length limit of the most recent request.
        encoded_in (torch.Tensor): Prompt ids of the most recent request, shape (1, in_ids_len).
        in_ids_len (int): Number of prompt tokens after any truncation.
        smry (str): Full decoded model output (prompt and summary) of the most recent request.
    '''

    def __init__(self, model, tokenizer, cuda_avbl, max_context=None):
        '''
        Constructor for Summarizer class.

        Parameters:
            model (model class): NLP model for summarization.
            tokenizer (tokenizer class): tokenizer for preparing inputs for model.
            cuda_avbl (bool): Is Cuda for data processing available?
            max_context (int, optional): Maximum number of tokens for prompt plus
                summary. Defaults to None, which uses the notebook-level MAX_LENGTH.
        '''

        self.model = model
        self.tokenizer = tokenizer
        self.cuda_avbl = cuda_avbl
        self.max_context = max_context
        logging.info("Summarizer instantiated")


    def __prep_text(self):
        '''
        Prepares inputs for NLP model.

        Cleans self.in_text, builds and tokenizes the prompt and, when prompt plus
        self.max_len generated tokens would not fit into the context budget, drops
        tokens from the middle of the prompt. The result is stored in
        self.encoded_in and self.in_ids_len.

        Raises:
            ValueError: If self.max_len leaves no room for a truncated prompt.
        '''

        self.in_text = self.in_text.strip().replace("\n", " ")

        prompt = "<|BOS|>" + self.in_text + "TL;DR:"
        tokenized_prompt = self.tokenizer.encode(prompt)

        # Looked up lazily so the class only needs MAX_LENGTH when no explicit
        # budget was given.
        context = MAX_LENGTH if self.max_context is None else self.max_context
        prompt_budget = context - self.max_len

        # Compare against the room left after reserving max_len tokens for the
        # summary; comparing against the full context let prompts through that
        # overflowed once generation started.
        if len(tokenized_prompt) > prompt_budget:
            half = int(prompt_budget // 2)
            if half < 1:
                # A zero half would make [-0:] keep the whole prompt; a negative
                # one would slice from the wrong ends.
                raise ValueError(
                    f"max_len={self.max_len} leaves no room for a prompt "
                    f"within a context of {context} tokens")
            # Keep the head and the tail so the trailing "TL;DR:" cue survives.
            tokenized_prompt = tokenized_prompt[:half] + tokenized_prompt[-half:]

        self.encoded_in = torch.tensor(tokenized_prompt).unsqueeze(0)
        self.in_ids_len = len(tokenized_prompt)

        if self.cuda_avbl:
            device = torch.device("cuda")
            self.encoded_in = self.encoded_in.to(device)

        logging.info("Model inputs prepared")


    def get_summary(self, in_text, max_len, min_len):
        '''
        Generates summaries with NLP model.

        Parameters:
            in_text (str): Input text to summarize.
            max_len (int): Maximum length of generated summary.
            min_len (int): Minimum length of generated summary.

        Returns:
            output (str): The generated summary from data.

        Raises:
            ValueError: If max_len leaves no room for the prompt (see __prep_text).
        '''

        self.in_text = in_text
        self.max_len = max_len
        self.__prep_text()
        self.model.eval()

        sample_outputs = self.model.generate(
            inputs=self.encoded_in,
            max_length=self.in_ids_len+max_len,     # max length of prompt plus generated text
            min_length=self.in_ids_len+min_len,     # min length of prompt plus generated text
            do_sample=True,                         # sample instead of always taking the most probable token
            early_stopping=True,                    # beam-search stopping flag; presumably without effect as no num_beams is set — confirm
            temperature=0.5,                        # scales probabilities: lower is more conservative, higher more diverse
            top_k=30,                               # number of most probable tokens to keep
            top_p=0.7,                              # keep only the most probable tokens up to this cumulative probability
            repetition_penalty=2.0,                 # discourages sentences that repeat themselves
            num_return_sequences=1                  # number of returned sequences
        )

        logging.info("Summary generated")

        # Exactly one sequence is requested above.
        sequence = sample_outputs[0]
        self.smry = self.tokenizer.decode(sequence, skip_special_tokens=True)

        # Take the summary from the tokens that follow the prompt rather than
        # splitting the decoded text on "TL;DR:": the split picked up input text
        # whenever the input itself contained the marker and failed when the
        # marker was absent.
        output = self.tokenizer.decode(sequence[self.in_ids_len:], skip_special_tokens=True)

        return output.strip()

In [None]:
def calc_rouge_score(original, generate):
    '''
    Scores a generated summary against its reference with ROUGE-1 and ROUGE-Lsum.

    Parameters:
        original (str): Reference summary.
        generate (str): NLP-Model generated summary.

    Returns:
        list: F-measure, precision and recall for rouge1, followed by the same
            three values for rougeLsum, each rounded to two decimals.
    '''

    scorer = ROUGEScore(use_stemmer=True, accumulate="best", rouge_keys=("rouge1", "rougeLsum"))

    # The generated text is passed first, the reference second.
    results = scorer(generate, original)

    # Order in which the values are reported to the caller.
    report_order = (
        "rouge1_fmeasure",
        "rouge1_precision",
        "rouge1_recall",
        "rougeLsum_fmeasure",
        "rougeLsum_precision",
        "rougeLsum_recall",
    )

    return [round(results[key].item(), 2) for key in report_order]

In [None]:
def calc_bert_score(original, generate):
    '''
    Calculates Bert Score for a generated summary and a reference text.

    The metric object is loaded on the first call and reused afterwards, so
    scoring many rows does not repeat the load for every row.

    Parameters:
        original (str): Reference summary.
        generate (str): NLP-Model generated summary.

    Returns:
        list: Bert F1, precision and recall (in that order), each rounded to
            two decimals.
    '''

    # Cache the loaded metric on the function itself: load("bertscore") is
    # loop-invariant, and re-loading presumably also re-initialises the
    # underlying scoring model on each call.
    bertscore = getattr(calc_bert_score, "_metric", None)
    if bertscore is None:
        bertscore = load("bertscore")
        calc_bert_score._metric = bertscore

    bert_results = bertscore.compute(predictions=[generate], references=[original], lang="en")

    # One prediction was submitted, so each result list has a single entry.
    bert_precision = round(bert_results["precision"][0], 2)
    bert_recall = round(bert_results["recall"][0], 2)
    bert_f1 = round(bert_results["f1"][0], 2)

    return [bert_f1, bert_precision, bert_recall]

In [None]:
def calc_meteor_score(original, generate):
    '''
    Scores a generated summary against its reference with METEOR.

    Parameters:
        original (str): Reference summary.
        generate (str): NLP-Model generated summary.

    Returns:
        list: One-element list holding the Meteor Score rounded to two decimals.
    '''

    # Both texts are word-tokenized first; the reference goes in as first argument.
    reference_tokens = word_tokenize(original)
    candidate_tokens = word_tokenize(generate)

    raw_score = meteor_score.single_meteor_score(reference_tokens, candidate_tokens)

    return [round(raw_score, 2)]

In [None]:
def calc_flesch_score(generate):
    '''
    Rates the readability of a generated text with the Flesch reading ease index.

    Parameters:
        generate (str): NLP-Model generated summary.

    Returns:
        list: One-element list holding the Flesch reading ease rounded to two decimals.
    '''

    # Select English before scoring.
    textstat.set_lang("en")

    return [round(textstat.flesch_reading_ease(generate), 2)]

In [None]:
def get_scores(original, generate):
    '''
    Runs all metric functions and flattens their results into one list.

    Parameters:
        original (str): Reference summary.
        generate (str): NLP-Model generated summary.

    Returns:
        scores (list): Rouge values, then Bert values, then the Meteor value,
            then the Flesch value.
    '''

    # Metrics are evaluated in the order in which they appear in the result;
    # Flesch only looks at the generated text.
    return [
        *calc_rouge_score(original, generate),
        *calc_bert_score(original, generate),
        *calc_meteor_score(original, generate),
        *calc_flesch_score(generate),
    ]

In [None]:
# Column order matches the list returned by get_scores.
score_columns = ["rouge1_fmeasure", "rouge1_precision", "rouge1_recall", "rougeLsum_fmeasure", "rougeLsum_precision", "rougeLsum_recall", "bert_f1", "bert_precision", "bert_recall", "meteor", "flesch"]
gpt_summarizer = Summarizer(model, tokenizer, cuda_avbl)

# Collect one score list per test row and build the frame once afterwards,
# instead of enlarging a DataFrame row by row. If the loop is interrupted,
# the rows scored so far remain available in score_rows.
score_rows = []
for _, row in df_testing.iterrows():

    # Reference text; row["Abstract"] is the alternative target from earlier runs.
    target = row["One Sentence"]
    print(f"Target:\n{target}")

    # Input is title followed by full text. Earlier runs used Title + Abstract
    # as input, or max_len=200 / min_len=100 for longer summaries.
    generated = gpt_summarizer.get_summary(row["Title"] + row["Full Text"], max_len=50, min_len=20)
    print(f"Generated:\n{generated}")

    new_row = get_scores(target, generated)
    score_rows.append(new_row)

    print(f"{new_row}\n")

df_results = pd.DataFrame(score_rows, columns=score_columns)

In [None]:
# Column-wise mean of the ROUGE-1 and BERTScore values over all scored rows
# (the rougeLsum, meteor and flesch columns are left out of this summary).
df_results[["rouge1_precision", "rouge1_recall", "rouge1_fmeasure", "bert_precision", "bert_recall", "bert_f1"]].mean()