In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoModel, AutoTokenizer
import nltk
import json
import pandas as pd
from datasets import *
import numpy as np
import torch

# Fetch the NLTK "punkt" package; presumably required by nltk.sent_tokenize,
# which compute_rouge uses to split texts into sentences.
nltk.download('punkt')
# Fixed seed constant. NOTE(review): not referenced in the cells visible here —
# confirm it is actually consumed somewhere, or remove it.
SEED = 42

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Name of the dataset split being evaluated; it is interpolated into the
# results CSV filename that the evaluation loop reads.
split = "test"

In [9]:
# Metric objects used further down: `rouge` inside compute_rouge and `bert`
# (BERTScore) in the evaluation loop.
# NOTE(review): load_metric is presumably provided by `from datasets import *` — verify.
rouge = load_metric("rouge")
bert = load_metric("bertscore")

In [10]:
def compute_rouge(decoded_preds, decoded_labels, prediction_lens):
    """Score generated texts against references with ROUGE.

    Args:
        decoded_preds: generated texts (strings), one per example.
        decoded_labels: reference texts (strings), aligned with ``decoded_preds``.
        prediction_lens: per-prediction lengths supplied by the caller; only
            their mean is reported.

    Returns:
        dict with one entry per key returned by the ROUGE metric (its mid
        F-measure multiplied by 100) plus ``"gen_len"`` (mean of
        ``prediction_lens``); every value is rounded to 4 decimals.
    """
    def _sentence_per_line(text):
        # Rouge expects a newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    preds_by_line = list(map(_sentence_per_line, decoded_preds))
    refs_by_line = list(map(_sentence_per_line, decoded_labels))

    raw_scores = rouge.compute(predictions=preds_by_line, references=refs_by_line, use_stemmer=True)

    # Keep only the mid F-measure of each aggregate, expressed as a percentage.
    scores = {}
    for metric_name, aggregate in raw_scores.items():
        scores[metric_name] = round(aggregate.mid.fmeasure * 100, 4)

    # Mean generated length, computed from the lengths handed in by the caller.
    scores["gen_len"] = round(np.mean(prediction_lens), 4)

    return scores

In [11]:
# Checkpoint identifiers to evaluate. Only the part after the last "/" is used
# by the evaluation loop, to locate each model's results CSV on disk.
model_checkpoints = [
    "cjvt/t5-sl-small",
    "ls1906/t5-sl-small-finetuned-assistant",
    # "cjvt/t5-sl-large",
    "vh-student/t5-sl-large-oasst-pairs",
    "vh-student/t5-sl-large-oasst-context",
    "cjvt/gpt-sl-base",
    "vh-student/gpt-sl-oasst1-pairs",
    "vh-student/gpt-sl-oasst1-context"
]

## First generated response

In [6]:
# For every checkpoint: load its generated replies from disk, then print ROUGE
# and mean BERTScore of the generated text against the reference reply.
for model_checkpoint in model_checkpoints:
    # read local data depending on the model that it was generated with
    model_name = model_checkpoint.split("/")[-1]
    data_path = f"../../data/results/prompt_reply_pairs_1_generated_{split}_{model_name}.csv"
    data = pd.read_csv(data_path, sep=";")

    # read_csv parses empty fields (and literals such as "NA" or "null") as
    # float NaN. A NaN would crash `.strip()` inside compute_rouge and is not a
    # valid input for BERTScore, so coerce both text columns to plain strings.
    decoded_labels = data["reply"].fillna("").astype(str).to_list()
    decoded_preds = data["generated"].fillna("").astype(str).to_list()
    prediction_lens = data["token_prediction_len"].to_list()

    print(model_name.upper())

    # ROUGE
    result_rouge = compute_rouge(decoded_preds, decoded_labels, prediction_lens)
    print(result_rouge)

    # BERTSCORE: the metric returns per-example lists; report their means.
    result_bert = bert.compute(predictions=decoded_preds, references=decoded_labels, lang="sl")
    result_bert = {
        "Precision": np.mean(result_bert["precision"]),
        "Recall": np.mean(result_bert["recall"]),
        "F1": np.mean(result_bert["f1"]),
    }
    print(result_bert)
    print()

T5-SL-SMALL
{'rouge1': 0.3379, 'rouge2': 0.0652, 'rougeL': 0.2437, 'rougeLsum': 0.3107, 'gen_len': 3.4974}
{'Precision': 0.6246253625246194, 'Recall': 0.440937813881116, 'F1': 0.5145365299567198}

T5-SL-SMALL-FINETUNED-ASSISTANT
{'rouge1': 17.2297, 'rouge2': 4.0728, 'rougeL': 11.4752, 'rougeLsum': 15.7352, 'gen_len': 73.2593}
{'Precision': 0.6693199219275744, 'Recall': 0.63366664484831, 'F1': 0.6491875981428684}

T5-SL-LARGE-OASST-PAIRS
{'rouge1': 14.7096, 'rouge2': 3.6051, 'rougeL': 10.3875, 'rougeLsum': 13.3069, 'gen_len': 60.1304}
{'Precision': 0.6684416730118834, 'Recall': 0.6245725867237247, 'F1': 0.64394908616542}

T5-SL-LARGE-OASST-CONTEXT
{'rouge1': 17.9929, 'rouge2': 4.1677, 'rougeL': 14.1615, 'rougeLsum': 16.783, 'gen_len': 50.6636}
{'Precision': 0.7225160922187016, 'Recall': 0.658557513531338, 'F1': 0.6862731942471684}

GPT-SL-BASE
{'rouge1': 15.0177, 'rouge2': 2.4444, 'rougeL': 9.1292, 'rougeLsum': 13.5454, 'gen_len': 162.5594}
{'Precision': 0.5951044166271503, 'Recall': 0.