In [None]:
import json
import random

import evaluate  # Huggingface wrapper around Google's Rouge implementation
from nltk.tokenize import WhitespaceTokenizer
from rouge_score import rouge_scorer  # Google's Rouge implementation

# Auxiliary functions
In addition to ROUGE, we include Precision, Recall and F1 computed with strict or relaxed matching operators, as well as alternative ways of marking the boundaries between triples in the text given to ROUGE. In the experiments, we only tested and evaluated ROUGE with space separators (the default configuration of Google's implementation).

In [None]:
# Wrapper around an (s, p, o) tuple whose equality can be switched between
# strict and relaxed matching. The two class-level flags act as global
# configuration: `full_equality` is read by __eq__, and
# `verbalize_with_random_separators` is read by the code that turns triples
# into text
class Triple:
    # True: elements must be equal. False: elements may also contain each other
    full_equality = True
    # True: verbalized triples are separated by random characters, not spaces
    verbalize_with_random_separators = True

    def __init__(self, s, p, o):
        # Subject, predicate and object, kept in that order
        self.elements = (s, p, o)

    def __eq__(self, other):
        """Compare with another Triple; anything else is never equal.

        Identical elements always match. When Triple.full_equality is False,
        two triples also match if, position by position, one element is
        contained in the other (``in`` on the elements).
        """
        if not isinstance(other, Triple):
            return False

        pairs = list(zip(self.elements, other.elements))
        if all(mine == theirs for mine, theirs in pairs):
            return True
        if Triple.full_equality:
            return False

        # Relaxed mode: every element must contain, or be contained in, its
        # counterpart
        return all(mine in theirs or theirs in mine for mine, theirs in pairs)

    def __getitem__(self, index):
        """Return the subject (0), predicate (1) or object (2)."""
        return self.elements[index]

    def __hash__(self):
        # Hash of the exact elements. In relaxed mode two triples can compare
        # equal and still hash differently, so sets/dicts only reflect strict
        # equality
        return hash(self.elements)

In [None]:
# Returns a random printable, non-whitespace unicode char not within the
# provided set
def get_random_char(forbidden_chars):
    """Draw a random character usable as a separator token.

    Code points are drawn uniformly from U+0020..U+10FFFF and rejected until
    one is found that is printable, is not whitespace and is not in
    ``forbidden_chars``.

    The printable check matters because the raw range also contains lone
    surrogates (which cannot be encoded as UTF-8), control characters and
    unassigned code points. The whitespace check matters because the
    separator is placed between spaces and would disappear under
    whitespace-based tokenization.

    Args:
        forbidden_chars: container of characters that must not be returned.

    Returns:
        A one-character string.
    """
    unicode_start_range = 0x0020  # Start of printable characters range
    unicode_end_range = 0x10FFFF  # End of unicode range

    # Rejection sampling in a loop (instead of recursion) so that a long run
    # of rejected candidates cannot hit the recursion limit
    while True:
        candidate = chr(random.randint(unicode_start_range, unicode_end_range))

        # str.isprintable() is False for surrogates, controls, unassigned
        # code points and every separator except the ASCII space, which
        # isspace() then rules out
        if not candidate.isprintable() or candidate.isspace():
            continue

        if candidate not in forbidden_chars:
            return candidate
    
# Adds to forbidden_chars every character contained in the subject, predicate
# and object of each triple, so that random separators can later be picked
# from outside that set. The set is modified in place and also returned
def get_chars_from_triples(forbidden_chars, triples_list):
    for sample_triples in triples_list:
        for subj, pred, obj in sample_triples:
            # set.update() iterates each string and adds its characters
            forbidden_chars.update(subj, pred, obj)

    return forbidden_chars

In [None]:
# Turns the two sets of triples into text following the Triple's configuration
def verbalize_triples(generated_triples, ground_truth):
    """Verbalize every sample's triples as one line of text.

    Each triple becomes ``"{s} {p} {o}"``. Consecutive triples of a sample are
    joined by a single space, or, when
    Triple.verbalize_with_random_separators is set, by a fresh random
    character surrounded by spaces. Every random separator is used only once
    and never appears in any triple, so no n-gram can match across two
    different triples. Every line starts with a single space.

    Separators are placed only BETWEEN triples, for ground truth and generated
    lines alike, so that a trailing separator never adds an extra token to
    one side only.

    Args:
        generated_triples: one iterable of (s, p, o) per sample.
        ground_truth: one iterable of (s, p, o) per sample.

    Returns:
        (lines_gt, lines_generated): two lists of strings, one entry per
        sample. A generated sample containing a malformed triple is
        verbalized as " ".
    """
    use_random_separators = Triple.verbalize_with_random_separators

    # Characters already present in the triples (and, later, in previously
    # used separators) must not be used as random separators
    forbidden_chars = set()
    if use_random_separators:
        forbidden_chars = get_chars_from_triples(forbidden_chars, ground_truth)
        forbidden_chars = get_chars_from_triples(forbidden_chars, generated_triples)

    def next_separator():
        if not use_random_separators:
            return " "
        random_char = get_random_char(forbidden_chars)
        forbidden_chars.add(random_char)  # never reuse a separator
        return f" {random_char} "

    def verbalize(triples):
        parts = [" "]
        for i, (s, p, o) in enumerate(triples):
            if i > 0:
                parts.append(next_separator())
            parts.append(f"{s} {p} {o}")
        return "".join(parts)

    lines_gt = [verbalize(triples) for triples in ground_truth]

    lines_generated = []
    for triples in generated_triples:
        try:
            line_generated = verbalize(triples)
        except (TypeError, ValueError):
            # One of the triples could not be unpacked into (s, p, o)
            # (missing object usually): treat the sample as empty
            line_generated = " "
        lines_generated.append(line_generated)

    return lines_gt, lines_generated

In [None]:
# Rouge and P/R/F1 functions

def evaluate_text_huggingface_impl(ground_truth, generated_triples):
    """Score the verbalized triples with the 'rouge' metric loaded via `evaluate`.

    Returns the "rouge1", "rouge2" and "rougeL" entries of the computed
    result, in that order.
    """
    references, predictions = verbalize_triples(generated_triples, ground_truth)

    rouge_metric = evaluate.load('rouge')
    scores = rouge_metric.compute(predictions=predictions, references=references)

    return scores["rouge1"], scores["rouge2"], scores["rougeL"]


def evaluate_text_google_impl(ground_truth, generated_triples, custom_tokenizer):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F1 over all samples.

    Both sets of triples are verbalized with verbalize_triples and every
    generated line is scored against its ground-truth line with
    rouge_scorer.RougeScorer.

    Args:
        ground_truth: one iterable of triples per sample.
        generated_triples: one iterable of triples per sample.
        custom_tokenizer: passed as the scorer's ``tokenizer`` argument
            (None is what the callers in this file pass when they do not
            want a custom one).

    Returns:
        (avg_rouge_1, avg_rouge_2, avg_rouge_L). All 0.0 when there are no
        samples to score.
    """
    lines_gt, lines_generated = verbalize_triples(generated_triples, ground_truth)

    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types,
                                      use_stemmer=False,
                                      split_summaries=False,
                                      # For example, from NLTK. This ensures that
                                      # no regexes or other filtering rules are used
                                      #
                                      # Their only requirement is that the class has
                                      # a tokenize() method that does anything and
                                      # returns a list of strings
                                      #
                                      # use_stemmer won't have any effect if we are
                                      # using a custom tokenizer
                                      tokenizer=custom_tokenizer)

    f1_by_type = {rouge_type: [] for rouge_type in rouge_types}
    for line_g, line_gt in zip(lines_generated, lines_gt):
        scores = scorer.score(line_gt, line_g)
        for rouge_type in rouge_types:
            # Each score unpacks into (precision, recall, F1); only F1 is kept
            _precision, _recall, f1 = scores[rouge_type]
            f1_by_type[rouge_type].append(f1)

    def average(values):
        # No samples at all: report 0.0 instead of dividing by zero
        return sum(values) / len(values) if values else 0.0

    return (average(f1_by_type['rouge1']),
            average(f1_by_type['rouge2']),
            average(f1_by_type['rougeL']))


def calculate_tp_fp_fn(ground_truth_triples, gen_triples):
    """Count true positives, false positives and false negatives of one sample.

    A generated item that matches at least one ground-truth item is a true
    positive, otherwise it is a false positive. A ground-truth item that no
    generated item matches is a false negative.

    Items are matched with ``==`` (for Triple objects that is Triple.__eq__,
    which already honours Triple.full_equality). The only special case is
    relaxed mode on plain strings, i.e. when subjects, predicates or objects
    are compared on their own: two strings then match when either one
    contains the other.

    Args:
        ground_truth_triples: the expected items of the sample.
        gen_triples: the generated items of the sample.

    Returns:
        (true_positive, false_positive, false_negative). When there is no
        ground truth, (0.0, 0.0, 0.0) is returned.
    """
    if len(ground_truth_triples) == 0:
        return 0.0, 0.0, 0.0

    # We allow partial equalities and it's a list of singletons
    # (we are comparing subjects, predicates or objects separately)
    relaxed_singletons = (not Triple.full_equality
                          and isinstance(ground_truth_triples[0], str))

    # We don't do it via set operators since we want to use the custom
    # equality functions, not hashes
    def matches(gen_item, gt_item):
        if relaxed_singletons:
            return gen_item in gt_item or gt_item in gen_item
        return gen_item == gt_item

    true_positive = 0
    false_positive = 0
    for gen_item in gen_triples:
        # Each generated item is counted exactly once, whichever matching
        # rule is in effect
        if any(matches(gen_item, gt_item) for gt_item in ground_truth_triples):
            true_positive += 1
        else:
            false_positive += 1

    # Items that we missed: ground-truth entries without any generated match.
    # The difference in list lengths is not enough, since a sample with as
    # many wrong generations as expected items would report no misses
    false_negative = sum(
        1
        for gt_item in ground_truth_triples
        if not any(matches(gen_item, gt_item) for gen_item in gen_triples)
    )

    return true_positive, false_positive, false_negative

In [None]:
# Returns triple comparison results
def evaluate_triples(ground_truth, generated_triples):
    """Macro-averaged precision, recall and F1 over all samples.

    For every (ground truth, generated) pair of samples the counts returned
    by calculate_tp_fp_fn are turned into precision, recall and F1, and the
    three scores are then averaged over the samples.

    Args:
        ground_truth: one collection of expected items per sample.
        generated_triples: one collection of generated items per sample.

    Returns:
        (precision, recall, f1). All 0.0 when there are no samples.
    """
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for ground_truth_triples, gen_triples in zip(ground_truth, generated_triples):
        # Only zephyr causes this: It returns a list of tuples of arrays.
        # Such a sample is scored as if nothing had been generated
        try:
            for t in gen_triples:
                hash(t)
        except TypeError:
            gen_triples = []

        true_positive, false_positive, false_negative = calculate_tp_fp_fn(ground_truth_triples, gen_triples)

        # max(...) keeps the denominators away from zero, so a sample without
        # any counts scores 0 instead of raising
        precision = true_positive / max(1, true_positive + false_positive)
        recall = true_positive / max(1, true_positive + false_negative)
        f1 = 2 * (precision * recall) / max(1e-10, precision + recall)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Nothing to average (no samples): report zeros instead of dividing by zero
    if not precision_scores:
        return 0.0, 0.0, 0.0

    p = sum(precision_scores) / len(precision_scores)
    r = sum(recall_scores) / len(recall_scores)
    f1 = sum(f1_scores) / len(f1_scores)

    return p, r, f1

In [None]:
def get_subjects(triples):
    """Return the first element of every triple of every sample, as one flat list."""
    subjects = []
    for sample in triples:
        subjects.extend(triple[0] for triple in sample)
    return subjects


def get_predicates(triples):
    """Return the second element of every triple of every sample, as one flat list."""
    predicates = []
    for sample in triples:
        predicates.extend(triple[1] for triple in sample)
    return predicates


def get_objects(triples):
    """Return the third element of every triple of every sample, as one flat list."""
    objects = []
    for sample in triples:
        objects.extend(triple[2] for triple in sample)
    return objects


def is_valid_triple(t):
    """Tell whether t is a tuple with exactly three elements."""
    if not isinstance(t, tuple):
        return False
    return len(t) == 3


def clean_and_turn_to_triple(t):
    """Build a Triple from a 3-element sequence, normalising its elements.

    Elements that are not strings are converted with str(); every element is
    then lowercased.
    """
    s, p, o = t
    normalised = [
        element.lower() if isinstance(element, str) else str(element).lower()
        for element in (s, p, o)
    ]

    return Triple(*normalised)


# Returns, for every sample, the list with the index-th element of each of its
# triples (0 = subject, 1 = predicate, 2 = object)
def _elements_per_sample(triples_per_sample, index):
    return [[t[index] for t in triples] for triples in triples_per_sample]


# Given a results file from LLM_testing, writes to the CSV file the comparison results for
# every possible configuration (full or relaxed triple equality, random separators in rouge or not...)
def eval_results_file(path, LLM_name, n_samples, out_csv_file):
    """Evaluate one results file and append one row to the CSV file.

    The file is read line by line. A line starting with "Ground truth: "
    holds the expected triples of a sample and a line starting with
    "Generated triples: " holds the generated ones, both as Python literals.
    Any other line is ignored.

    The row written is: LLM name, n_samples, number of invalid generations,
    strict and relaxed precision, strict and relaxed F1 for whole triples,
    subjects, predicates and objects, and ROUGE-1/2/L with space separators.
    All scores are rounded to 3 decimals.

    Side effect: Triple.full_equality and
    Triple.verbalize_with_random_separators are reassigned along the way and
    are left as False and True respectively.

    Args:
        path: path of the results file to read.
        LLM_name: value of the first CSV column.
        n_samples: value of the second CSV column.
        out_csv_file: writable text file the row is written to.
    """
    ground_truth = []
    generated_triples = []
    invalid_generations = 0

    with open(path, 'r') as file:
        for line in file:
            line = line.strip()

            if line.startswith("Ground truth: "):
                # NOTE(security): eval() executes whatever expression the line
                # contains. Only run this on results files you trust;
                # ast.literal_eval would be the safe choice if the lines only
                # ever hold literals
                gt_triples = eval(line[len("Ground truth: "):])
                ground_truth.append([clean_and_turn_to_triple(t) for t in gt_triples])

            elif line.startswith("Generated triples: "):
                try:
                    # NOTE(security): same caveat as above, and this line is
                    # raw model output
                    gen_triples_raw = [
                        gen_triple_raw
                        for gen_triple_raw in eval(line[len("Generated triples: "):])
                        if is_valid_triple(gen_triple_raw)
                    ]

                    gt = [clean_and_turn_to_triple(t) for t in gen_triples_raw]
                    generated_triples.append(gt)
                    if len(gt) == 0:
                        invalid_generations += 1

                # Deliberately broad: evaluating arbitrary generated text can
                # fail in many ways, and every failure counts as an invalid
                # generation. Only happens with webNLG (it sometimes
                # generates triples without objects)
                except Exception:
                    invalid_generations += 1
                    generated_triples.append([])

    # Subjects, predicates and objects are kept grouped by sample, because
    # evaluate_triples pairs the i-th ground-truth entry with the i-th
    # generated entry and expects each entry to be the items of ONE sample.
    # Flattening them across samples would make it compare single strings
    # character by character
    subjects_generated = _elements_per_sample(generated_triples, 0)
    subjects_ground_truth = _elements_per_sample(ground_truth, 0)

    predicates_generated = _elements_per_sample(generated_triples, 1)
    predicates_ground_truth = _elements_per_sample(ground_truth, 1)

    objects_generated = _elements_per_sample(generated_triples, 2)
    objects_ground_truth = _elements_per_sample(ground_truth, 2)

    Triple.full_equality = True
    p_fe, r_fe, f1_fe = evaluate_triples(ground_truth, generated_triples)
    p_s_fe, r_s_fe, f1_s_fe = evaluate_triples(subjects_ground_truth, subjects_generated)
    p_p_fe, r_p_fe, f1_p_fe = evaluate_triples(predicates_ground_truth, predicates_generated)
    p_o_fe, r_o_fe, f1_o_fe = evaluate_triples(objects_ground_truth, objects_generated)

    Triple.full_equality = False
    p_relaxed, r_relaxed, f1_relaxed = evaluate_triples(ground_truth, generated_triples)
    p_s_relaxed, r_s_relaxed, f1_s_relaxed = evaluate_triples(subjects_ground_truth, subjects_generated)
    p_p_relaxed, r_p_relaxed, f1_p_relaxed = evaluate_triples(predicates_ground_truth, predicates_generated)
    p_o_relaxed, r_o_relaxed, f1_o_relaxed = evaluate_triples(objects_ground_truth, objects_generated)

    Triple.verbalize_with_random_separators = False
    avg_rouge_1_google_space_seps, avg_rouge_2_google_space_seps, avg_rouge_L_google_space_seps = (
        evaluate_text_google_impl(ground_truth,
                                  generated_triples,
                                  None))

    # The three configurations below are computed but not written to the CSV
    Triple.verbalize_with_random_separators = True
    avg_rouge_1_google_random_seps, avg_rouge_2_google_random_seps, avg_rouge_L_google_random_seps = (
        evaluate_text_google_impl(ground_truth,
                                  generated_triples,
                                  None))

    Triple.verbalize_with_random_separators = False
    avg_rouge_1_google_space_seps_custom_tokenizer, avg_rouge_2_google_space_seps_custom_tokenizer, avg_rouge_L_google_space_seps_custom_tokenizer = (
        evaluate_text_google_impl(ground_truth,
                                  generated_triples,
                                  WhitespaceTokenizer()))

    Triple.verbalize_with_random_separators = True
    avg_rouge_1_google_random_seps_custom_tokenizer, avg_rouge_2_google_random_seps_custom_tokenizer, avg_rouge_L_google_random_seps_custom_tokenizer = (
        evaluate_text_google_impl(ground_truth,
                                  generated_triples,
                                  WhitespaceTokenizer()))

    # Same column order as the CSV header
    scores = [
        p_fe, p_relaxed,
        f1_fe, f1_relaxed,
        f1_s_fe, f1_s_relaxed,
        f1_p_fe, f1_p_relaxed,
        f1_o_fe, f1_o_relaxed,
        avg_rouge_1_google_space_seps,
        avg_rouge_2_google_space_seps,
        avg_rouge_L_google_space_seps,
    ]
    row = [f"{LLM_name}", f"{n_samples}", f"{invalid_generations}"]
    row.extend(f"{round(score, 3)}" for score in scores)

    out_csv_file.write(",".join(row) + "\n")

# Evaluate the results
The results will be saved to `results_llm_testing/results_llms.csv`, containing the F1 and Rouge scores
using different configurations:
- Strict F1 (omitted): Calculate F1 scores by matching verbalized triples (`"{s} {p} {o}"`) via a strict string equality
- Relaxed F1 (omitted): Calculate the same score, but allowing subjects, predicates and objects to be partially contained within the ground truth or vice versa
- Space separators: Separate the verbalized triples of every sample via spaces (`"{s1} {p1} {o1} {s2} {p2} {o2} ..."`)
- Random separators (omitted): Instead of spaces, add random unicode separators when verbalizing triples. This makes it impossible for rouge-2 and rouge-L to go beyond
  the boundaries of a single triple (otherwise, for example, given (s1,p1,o1) and (s2,p2,o2), it could match (p1,o1,s2))
- Custom tokenizer (omitted): Instead of using the tokenizer of the Rouge implementation (https://github.com/google-research/google-research/blob/master/rouge/tokenize.py), which converts to lowercase and removes non-alphanumeric characters, use NLTK's `WhitespaceTokenizer()` (Note: we also convert both the generated and the ground truth triples to lowercase). The tokenizer can be changed in the `eval_results_file` function

The Rouge implementation being used is Google's (which is also Huggingface's)

In [None]:
import Datasets

# Load the WebNLG dataset through the project-local Datasets module and keep
# its test samples as a dict; gemini_results_to_txt looks up the ground truth
# of a sample in it by the sample's text
# NOTE(review): assumes test_samples yields (text, triples) pairs — confirm
webnlg_dataset = Datasets.WebNLGDataset()
test_samples = dict(webnlg_dataset.test_samples)

def gemini_results_to_txt(gemini_results, n_samples):
    """Dump Gemini's results in the text format read by eval_results_file.

    For every (text, triples) entry of gemini_results three lines are
    written ("Sample: ", "Ground truth: " and "Generated triples: "),
    followed by a blank line. The ground truth is looked up in the
    module-level test_samples dict by the sample's text, and every generated
    triple is converted to a tuple before being written.

    Args:
        gemini_results: mapping from sample text to its generated triples.
        n_samples: number used in the name of the output file.
    """
    out_path = f"results_llm_testing/results_gemini_webnlg_{n_samples}_samples.txt"
    with open(out_path, 'w') as file:
        for text, triples in gemini_results.items():
            expected = test_samples[text]
            generated = [tuple(triple) for triple in triples]
            file.write(f"Sample: {text}\nGround truth: {expected}\nGenerated triples: {generated}\n\n")

# Convert the raw Gemini results of every configuration (5, 8 and 16) into the
# text format used by the other result files. Each JSON file is opened in a
# `with` block so that its handle is closed as soon as it has been parsed
for n_samples in (5, 8, 16):
    with open(f"results_llm_testing/gemini_results_{n_samples}.json", 'r') as gemini_file:
        gemini_results_to_txt(json.load(gemini_file), n_samples)

In [None]:
# Column names of the CSV, in the order in which eval_results_file writes
# the values of a row
results_header = (
    "LLM",
    "Examples provided",
    "Invalid outputs",
    "Precision (strict)",
    "Precision (relaxed)",
    "F1 (strict)",
    "F1 (relaxed)",
    "F1 (subjects strict)",
    "F1 (subjects relaxed)",
    "F1 (predicates strict)",
    "F1 (predicates relaxed)",
    "F1 (objects strict)",
    "F1 (objects relaxed)",
    "Avg. Rouge-1 (space separators)",
    "Avg. Rouge-2 (space separators)",
    "Avg. Rouge-L (space separators)",
)

# (results file, LLM name, examples provided), one entry per CSV row and in
# the order in which the rows are written
evaluation_runs = (
    ('results_llm_testing/results_gemma_2b_webnlg_5_samples.txt', "Gemma-2 (2B)", 5),
    ('results_llm_testing/results_gemma_2b_webnlg_8_samples.txt', "Gemma-2 (2B)", 8),
    ('results_llm_testing/results_gemma_2b_webnlg_16_samples.txt', "Gemma-2 (2B)", 16),

    ('results_llm_testing/results_llama_3b_webnlg_5_samples.txt', "Llama-3.2 (3B)", 5),
    ('results_llm_testing/results_llama_3b_webnlg_8_samples.txt', "Llama-3.2 (3B)", 8),
    ('results_llm_testing/results_llama_3b_webnlg_16_samples.txt', "Llama-3.2 (3B)", 16),

    ('results_llm_testing/results_phi_3b_webnlg_5_samples.txt', "Phi-3.5 (3.8B)", 5),
    ('results_llm_testing/results_phi_3b_webnlg_8_samples.txt', "Phi-3.5 (3.8B)", 8),
    ('results_llm_testing/results_phi_3b_webnlg_16_samples.txt', "Phi-3.5 (3.8B)", 16),

    ('results_llm_testing/results_llama_8b_webnlg_5_samples.txt', "Llama-3.1 (8B)", 5),
    ('results_llm_testing/results_llama_8b_webnlg_8_samples.txt', "Llama-3.1 (8B)", 8),
    ('results_llm_testing/results_llama_8b_webnlg_16_samples.txt', "Llama-3.1 (8B)", 16),

    ('results_llm_testing/results_gemma_9b_webnlg_5_samples.txt', "Gemma-2 (9B)", 5),
    ('results_llm_testing/results_gemma_9b_webnlg_8_samples.txt', "Gemma-2 (9B)", 8),
    ('results_llm_testing/results_gemma_9b_webnlg_16_samples.txt', "Gemma-2 (9B)", 16),

    ('results_llm_testing/results_gemini_webnlg_5_samples.txt', "Gemini", 5),
    ('results_llm_testing/results_gemini_webnlg_8_samples.txt', "Gemini", 8),
    ('results_llm_testing/results_gemini_webnlg_16_samples.txt', "Gemini", 16),
)

with open("results_llm_testing/results_llms.csv", 'w') as out_csv_file:
    # Write the CSV headers
    out_csv_file.write(",".join(results_header) + "\n")

    # One row per results file
    for results_path, llm_name, examples_provided in evaluation_runs:
        eval_results_file(results_path, llm_name, examples_provided, out_csv_file)