In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoModel, AutoTokenizer
import nltk
import json
import pandas as pd
from datasets import *
import numpy as np
import torch

# Seed constant, declared with the rest of the notebook-level configuration.
# NOTE(review): no cell visible in this part of the notebook reads it — confirm it is used further down.
SEED = 42

# Fetch the "punkt" resource for NLTK (a no-op when it is already up to date, as the cell
# output shows); `nltk.sent_tokenize` is called on every text in `compute_rouge`.
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Name of the dataset split being evaluated; it is interpolated into the results CSV file name.
split = "test"

In [3]:
# Metric objects shared by every evaluation cell: `rouge` is used inside `compute_rouge`,
# `bert` is called directly in the per-model loop.
# NOTE(review): calling `load_metric` emits a warning (see the cell output); presumably it is
# deprecated in newer `datasets` releases — confirm before upgrading the dependency.
rouge, bert = load_metric("rouge"), load_metric("bertscore")

  rouge = load_metric("rouge")


In [4]:
def compute_rouge(decoded_preds, decoded_labels, prediction_lens):
    """Score generated texts against reference texts with the module-level `rouge` metric.

    Args:
        decoded_preds: generated texts (strings), one per example.
        decoded_labels: reference texts (strings), aligned with `decoded_preds`.
        prediction_lens: per-example generation lengths, supplied by the caller;
            only their mean is reported.

    Returns:
        dict mapping every key returned by `rouge.compute` to its mid-aggregate
        F-measure multiplied by 100, plus ``"gen_len"`` (mean of `prediction_lens`).
        All values are rounded to 4 decimal places.
    """

    def _one_sentence_per_line(texts):
        # The ROUGE metric expects a newline after each sentence.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    aggregated = rouge.compute(
        predictions=_one_sentence_per_line(decoded_preds),
        references=_one_sentence_per_line(decoded_labels),
        use_stemmer=True,
    )

    scores = {}
    for rouge_type, aggregate in aggregated.items():
        # Keep only the mid F-measure, expressed on a 0-100 scale.
        scores[rouge_type] = round(aggregate.mid.fmeasure * 100, 4)

    # Mean generated length, from the lengths the caller already computed.
    scores["gen_len"] = round(np.mean(prediction_lens), 4)
    return scores

In [5]:
# Checkpoint identifiers (`<owner>/<name>`) of the models whose generations are evaluated.
# Only the part after the last "/" is used to locate each model's results CSV.
_t5_checkpoints = [
    "cjvt/t5-sl-small",
    "ls1906/t5-sl-small-finetuned-assistant",
    # The large T5 variants are currently disabled:
    # "cjvt/t5-sl-large",
    # "vh-student/t5-sl-large-oasst-pairs",
    # "vh-student/t5-sl-large-oasst-context",
]
_gpt_checkpoints = [
    "cjvt/gpt-sl-base",
    "vh-student/gpt-sl-oasst1-pairs",
    "vh-student/gpt-sl-oasst1-context",
]

# Evaluation order: T5 models first, then GPT models.
model_checkpoints = [*_t5_checkpoints, *_gpt_checkpoints]

## First generated response

In [6]:
# Score every model's first generated response against the reference reply and print
# the ROUGE and BERTScore summaries, one block per model.
for model_checkpoint in model_checkpoints:
    # Each model's generations are stored in their own CSV, keyed by the checkpoint's short name.
    model_name = model_checkpoint.split("/")[-1]
    data_path = f"../../data/results/prompt_reply_pairs_1_generated_{split}_{model_name}.csv"
    data = pd.read_csv(data_path, sep=";")

    # pandas parses empty CSV fields as NaN (a float). An empty generation or reply would
    # otherwise crash `pred.strip()` in compute_rouge and the BERTScore tokenizer, so the
    # text columns are coerced to plain strings, with missing values scored as "".
    decoded_labels = data["reply"].fillna("").astype(str).to_list()
    decoded_preds = data["generated"].fillna("").astype(str).to_list()
    prediction_lens = data["token_prediction_len"].to_list()

    print(model_name.upper())

    # ROUGE
    result_rouge = compute_rouge(decoded_preds, decoded_labels, prediction_lens)
    print(result_rouge)

    # BERTSCORE: the metric returns per-example scores; report their means.
    bert_scores = bert.compute(predictions=decoded_preds, references=decoded_labels, lang="sl")
    result_bert = {
        "Precision": np.array(bert_scores["precision"]).mean(),
        "Recall": np.array(bert_scores["recall"]).mean(),
        "F1": np.array(bert_scores["f1"]).mean(),
    }
    print(result_bert)
    print()

T5-SL-SMALL
{'rouge1': 0.3373, 'rouge2': 0.0651, 'rougeL': 0.243, 'rougeLsum': 0.3124, 'gen_len': 3.4974}
{'Precision': 0.6246253633865944, 'Recall': 0.44093781444965263, 'F1': 0.5145365309715271}

T5-SL-SMALL-FINETUNED-ASSISTANT
{'rouge1': 17.2219, 'rouge2': 4.0704, 'rougeL': 11.476, 'rougeLsum': 15.7317, 'gen_len': 73.2593}
{'Precision': 0.6693199218603281, 'Recall': 0.6336666446221181, 'F1': 0.6491875983446073}

GPT-SL-BASE
{'rouge1': 15.0181, 'rouge2': 2.4436, 'rougeL': 9.1311, 'rougeLsum': 13.5434, 'gen_len': 162.5594}
{'Precision': 0.5951044182532873, 'Recall': 0.6274400653044383, 'F1': 0.6096532132686713}

GPT-SL-OASST1-PAIRS
{'rouge1': 16.8995, 'rouge2': 2.627, 'rougeL': 9.8213, 'rougeLsum': 15.6109, 'gen_len': 162.5594}
{'Precision': 0.6103136447759775, 'Recall': 0.637580257342412, 'F1': 0.6223735225873116}

GPT-SL-OASST1-CONTEXT
{'rouge1': 14.7758, 'rouge2': 2.3965, 'rougeL': 8.8418, 'rougeLsum': 13.6492, 'gen_len': 202.3118}
{'Precision': 0.6056029972235631, 'Recall': 0.6519

## RRHF