## [MAUVE & ROUGE] ASQA & ELI5

In [None]:
import json

# Function to trim prediction
def trim_prediction(pred):
    return pred.split("</s>")[0]

# Load JSON data from file
with open("results_no_indent.json", "r") as file:
    results = json.load(file)

# Apply the function to modify 'pred' values
for entry in results:
    entry["pred"] = [trim_prediction(p) for p in entry["pred"]]

# Save the modified JSON back to a file
with open("results_no_indent_modified.json", "w") as file:
    json.dump(results, file, indent=4)

print("Updated JSON saved as 'results_no_indent_modified.json'.")

In [18]:
import copy
import json
# from metrics import load_file

import random

with open("results_no_indent_modified.json", "r") as file:
    dummy_data = json.load(file)


data = []
for item in dummy_data:
    data.append({
        "input": " | ".join([" ".join(triple) for triple in item['input']]),
        "output": item["pred"][0].lower(),
        "golds": item["label"][0].lower()
    })
            


In [34]:
dummy_data[45]

{'input': [['<S> Frankfurt am main| <P> Country| <O> Germany',
   '<S> Rüdiger wittig| <P> Affiliation| <O> Goethe university frankfurt',
   '<S> Rüdiger wittig| <P> Field of study| <O> Ecology',
   '<S> Rüdiger wittig| <P> Field of study| <O> Geobotany',
   '<S> Goethe university frankfurt| <P> City| <O> Frankfurt am main']],
 'pred': ['Rüdiger Wittig is a German geobotanist and ecologist who works at Goethe University Frankfurt in Frankfurt am Main, Germany.'],
 'label': ['Rüdiger Wittig is a professor of geobotany and ecology at the Goethe University Frankfurt in Frankfurt am Main, Germany.']}

In [27]:
data[25]

{'input': '<S> Ban nang long railway station| <P> Province| <O> Nakhon si thammarat <S> Ban nang long railway station| <P> District| <O> Cha-uat district <S> Ban nang long railway station| <P> Location| <O> Nang long subdistrict',
 'output': 'ban nang long railway station is a railway station in nang long subdistrict, cha-uat district, nakhon si thammarat province.',
 'golds': 'ban nang long railway station is a railway station located in nang long subdistrict, cha-uat district, nakhon si thammarat.'}

In [3]:
import os
from metrics import *

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
from tqdm import tqdm



In [6]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

normalized_data = copy.deepcopy(data)

# Normalize output and reference texts (trims to first 100 tokens)
references = [' '.join((' '.join(item['input']) + " " + ' '.join(item['golds'])).split()[:100]).rstrip() for item in normalized_data]
predictions = [' '.join((' '.join(item['input']) + " " + item['output']).split()[:100]).rstrip() for item in normalized_data]

# Compute and display metrics
print("ROUGE Score:", compute_rouge([{"output": d["output"], "label": d["golds"]} for d in normalized_data]))
print("MAUVE Score:", mauve_score(predictions, references))

# Compute accuracy-based evaluation
metric_result_1 = []
for i in tqdm(range(len(data))):
    golds = data[i]["golds"] if isinstance(data[i]["golds"], list) else [data[i]["golds"]]
    result = match(data[i]["output"], golds)
    metric_result_1.append(result)

# Print final evaluation metric
print("Match Accuracy:", np.mean(metric_result_1))

ROUGE Score: 71.83422263040993
Loading tokenizer
Tokenizing text...
Loading tokenizer
Loading model
Featurizing tokens


Featurizing p: 100%|██████████| 41/41 [00:33<00:00,  1.22it/s]


Tokenizing text...
Featurizing tokens


Featurizing q: 100%|██████████| 41/41 [00:32<00:00,  1.26it/s]


seed = 25
performing clustering in lower dimension = 128
Clustering 656 points in 129D to 33 clusters, redo 5 times, 500 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 499 (0.23 s, search 0.12 s): objective=131.689 imbalance=1.214 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 499 (0.32 s, search 0.19 s): objective=131.309 imbalance=1.571 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 499 (0.41 s, search 0.26 s): objective=136.535 imbalance=1.716 nsplit=0       
Outer iteration 3 / 5
  Iteration 499 (0.49 s, search 0.32 s): objective=130.337 imbalance=1.431 nsplit=0       
Objective improved: keep new clusters
Outer iteration 4 / 5
  Iteration 499 (0.58 s, search 0.38 s): objective=134.022 imbalance=1.814 nsplit=0       
kmeans time: 0.58 s
total discretization time: 0.75 seconds
MAUVE Score: 76.08182172960719


100%|██████████| 328/328 [00:00<00:00, 25962.10it/s]

Match Accuracy: 0.08536585365853659





In [7]:
data[0]

{'input': '<S> Richard james pears| <P> Nationality| <O> English <S> Richard james pears| <P> Occupation| <O> Professional footballer',
 'output': 'richard james pears was an english professional footballer.',
 'golds': 'richard james pears is an english former professional footballer.'}

In [8]:
import json
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Smoothing function to avoid zero scores
smoothing = SmoothingFunction().method1

# List to store BLEU scores
bleu_scores = []

for entry in data:
    # Process 'pred' field
    processed_pred = entry["output"][0]

    # Tokenize reference and prediction
    references = [entry["golds"][0].split()]  # Tokenized ground truth
    candidate = processed_pred.split()  # Tokenized prediction

    # Compute BLEU score
    bleu_score = sentence_bleu(references, candidate, smoothing_function=smoothing)
    bleu_scores.append(bleu_score)

# Compute final average BLEU score
final_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

# Print the final BLEU score
print(f"Final BLEU Score: {final_bleu_score:.4f}")


Final BLEU Score: 0.1442


In [13]:
nltk.download('wordnet')  # Required for METEOR to handle synonyms


[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...


True

In [12]:
import json
import torch
from bert_score import score as bert_score
# from bleurt import score as bleurt_score

# Load JSON data from file
# with open("results.json", "r") as file:
#     results = json.load(file)

# Extract predictions and ground truths
predictions = [entry["output"][0] for entry in data]
references = [entry["golds"][0] for entry in data]

# Compute BERTScore
P, R, F1 = bert_score(predictions, references, lang="en", model_type="microsoft/deberta-xlarge-mnli")

# Compute BLEURT
# bleurt_evaluator = bleurt_score.BleurtScorer("BLEURT-20")
# bleurt_scores = bleurt_evaluator.score(references=references, candidates=predictions)

# Compute Final Scores
average_bert_f1 = torch.mean(F1).item()
# average_bleurt = sum(bleurt_scores) / len(bleurt_scores) if bleurt_scores else 0.0

# Print Results
print(f"Final BERTScore (F1): {average_bert_f1:.4f}")
# print(f"Final BLEURT Score: {average_bleurt:.4f}")


Final BERTScore (F1): 0.9265


In [14]:
import json
import nltk
from nltk.translate.meteor_score import meteor_score


# Compute METEOR scores
meteor_scores = []

for entry in data:
    # Extract reference (ground truth) and candidate (prediction)
    reference = entry["output"][0]  # Ground truth
    candidate = entry["golds"][0]  # Model's prediction

    # Compute METEOR score
    score = meteor_score([reference.split()], candidate.split())
    meteor_scores.append(score)

# Compute final METEOR score (average over all samples)
final_meteor_score = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# Print the final METEOR score
print(f"Final METEOR Score: {final_meteor_score:.4f}")


Final METEOR Score: 0.4055


In [22]:
data[0]

{'input': '<S> Richard james pears| <P> Nationality| <O> English <S> Richard james pears| <P> Occupation| <O> Professional footballer',
 'output': 'richard james pears was an english professional footballer.',
 'golds': 'richard james pears is an english former professional footballer.'}