In [20]:
from transformers import AutoModelForSeq2SeqLM, AutoModel, AutoTokenizer
import nltk
import json
import pandas as pd
from datasets import *
import numpy as np
import torch

# Fetch the NLTK "punkt" data package; compute_rouge in this notebook calls
# nltk.sent_tokenize, which presumably relies on it (no-op when already present).
nltk.download('punkt')
# Fixed seed constant. NOTE(review): not referenced by any of the visible cells —
# confirm whether it is still needed.
SEED = 42

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# Name of the dataset split to evaluate; it is interpolated into the file name
# of the per-model results CSV read by the evaluation loop.
split = "test"

In [22]:
# Metric objects used by the evaluation code: `rouge` inside compute_rouge and
# `bert` directly in the evaluation loop. `load_metric` is not imported by name,
# so it comes from the `from datasets import *` star import.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# pinned version before upgrading.
rouge = load_metric("rouge")
bert = load_metric("bertscore")

In [23]:
def compute_rouge(decoded_preds, decoded_labels, prediction_lens):
    """Score generated texts against reference texts with ROUGE.

    Args:
        decoded_preds: iterable of generated texts (strings).
        decoded_labels: iterable of reference texts (strings), aligned with
            ``decoded_preds``.
        prediction_lens: per-prediction lengths supplied by the caller; only
            their mean is reported. (An earlier version derived them inside
            this function by counting non-pad token ids per prediction.)

    Returns:
        dict mapping every key returned by ``rouge.compute`` to its mid
        F-measure expressed as a percentage, plus ``"gen_len"`` holding the
        mean of ``prediction_lens``. All values are rounded to 4 decimals.
    """

    def _one_sentence_per_line(text):
        # The ROUGE metric expects sentences to be separated by newlines.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    preds_by_line = list(map(_one_sentence_per_line, decoded_preds))
    refs_by_line = list(map(_one_sentence_per_line, decoded_labels))

    raw_scores = rouge.compute(
        predictions=preds_by_line,
        references=refs_by_line,
        use_stemmer=True,
    )

    summary = {}
    for metric_name, aggregate in raw_scores.items():
        # Keep only the mid F-measure, scaled from a fraction to a percentage.
        summary[metric_name] = round(aggregate.mid.fmeasure * 100, 4)

    # Mean generated length, rounded like every other entry.
    summary["gen_len"] = round(np.mean(prediction_lens), 4)
    return summary

In [24]:
# Model checkpoint identifiers to evaluate. The evaluation loop keeps only the
# part after the last "/" to locate each model's results CSV and to label the
# printed scores.
model_checkpoints = [
    "cjvt/t5-sl-small",
    "ls1906/t5-sl-small-finetuned-assistant",
    "vh-student/gpt-sl-oasst1-pairs",
]

In [25]:
# Evaluate every model's pre-generated replies against the reference replies
# and print ROUGE and BERTScore summaries per model.
for model_checkpoint in model_checkpoints:
    # Results are stored per model; the file name uses only the part of the
    # checkpoint id after the last "/".
    model_name = model_checkpoint.split("/")[-1]
    data_path = f"../../data/results/prompt_reply_pairs_1_generated_{split}_{model_name}.csv"
    data = pd.read_csv(data_path, sep=";")

    # pandas parses empty CSV cells as NaN (a float), which would crash the
    # `.strip()` call in compute_rouge and break the BERTScore input. Turn the
    # text columns into real strings instead of dropping rows, so an empty
    # generation still counts as a (bad) prediction in the averages.
    decoded_labels = data["reply"].fillna("").astype(str).to_list()
    decoded_preds = data["generated"].fillna("").astype(str).to_list()
    prediction_lens = data["token_prediction_len"].to_list()

    print(model_name.upper())

    # ROUGE
    result_rouge = compute_rouge(decoded_preds, decoded_labels, prediction_lens)
    print(result_rouge)

    # BERTSCORE: the metric returns one value per example; report the means.
    result_bert = bert.compute(predictions=decoded_preds, references=decoded_labels, lang="sl")
    result_bert = {
        "Precision": np.array(result_bert["precision"]).mean(),
        "Recall": np.array(result_bert["recall"]).mean(),
        "F1": np.array(result_bert["f1"]).mean(),
    }
    print(result_bert)
    print()

T5-SL-SMALL
{'rouge1': 0.3381, 'rouge2': 0.0651, 'rougeL': 0.2428, 'rougeLsum': 0.3114, 'gen_len': 3.4974}
{'Precision': 0.6246253614609059, 'Recall': 0.440937814724751, 'F1': 0.5145365306108426}
T5-SL-SMALL-FINETUNED-ASSISTANT
{'rouge1': 17.2265, 'rouge2': 4.0796, 'rougeL': 11.4772, 'rougeLsum': 15.7305, 'gen_len': 73.2593}
{'Precision': 0.6693199212489984, 'Recall': 0.6336666443225665, 'F1': 0.6491875979411297}
GPT-SL-OASST1-PAIRS
{'rouge1': 16.8979, 'rouge2': 2.625, 'rougeL': 9.8226, 'rougeLsum': 15.6079, 'gen_len': 162.5594}
{'Precision': 0.6103136431987469, 'Recall': 0.6375802568716881, 'F1': 0.6223735224894988}
