In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from datasets import load_dataset
from tigerscore import TIGERScorer
scorer = TIGERScorer(model_size="7b", quantized=True)
dataset = load_dataset("TIGER-Lab/MetricInstruct")
num_few_examples = 10
tasks = dataset["train_mix"]['task'][0:num_few_examples]
insts = dataset["train_mix"]['instruction'][0:num_few_examples]
input_contexts = dataset["train_mix"]['input_context'][0:num_few_examples]
hypo_output = dataset["train_mix"]['hypo_output'][0:num_few_examples]
results = scorer.score(tasks, insts, input_contexts, hypo_output)
scores = [result["score"] for result in results]
print(results)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
TIGERScore Batch Scoring: 100%|██████████| 2/2 [01:00<00:00, 30.43s/it]

[{'num_errors': 2, 'score': -5.0, 'errors': {'error_0': {'location': 'the future head of the European Championships', 'aspect': 'Accuracy', 'explanation': "The model incorrectly translates 'EM-Cheforganisator' as 'head of the European Championships'. The correct translation should be 'head organizer of the European Football Championship'. This error changes the meaning of the sentence.", 'severity': 'Major', 'score_reduction': '4'}, 'error_1': {'location': 'is to participate in the DFB Presidency', 'aspect': 'Terminology', 'explanation': "The model uses 'participate in' instead of 'work for' when translating 'mitarbeiten' in the context of 'DFB-Präsidium'. While 'participate in' is not incorrect, 'work for' is a more accurate translation in this context. This is a minor error as it does not significantly change the meaning of the sentence.", 'severity': 'Minor', 'score_reduction': '1'}}, 'raw_output': 'You are evaluating errors in a model-generated output for a(an) translation task.\nT




In [None]:
import logging
logging.getLogger

In [3]:
scorer.tokenizer.padding_side

'right'

In [None]:
torch.cuda.is_available()

In [2]:
for error in errors:
    print(error)
    print("#####"*10)

{'error_0': {'location': 'the future head of the European Championships', 'aspect': 'Accuracy', 'explanation': "The model incorrectly translates 'EM-Cheforganisator' as 'head of the European Championships'. The correct translation should be 'head organizer of the European Football Championship'. This error changes the meaning of the sentence.", 'severity': 'Major', 'score_reduction': '4'}, 'error_1': {'location': 'is to participate in the DFB Presidency', 'aspect': 'Terminology', 'explanation': "The model uses 'participate in the DFB Presidency' instead of 'work for the DFB Presidential Board'. While 'participate' is not incorrect, 'work for' is a more accurate translation of 'mitarbeiten' in this context, as it implies a more active role than 'participate'. This error does not significantly change the meaning of the sentence.", 'severity': 'Minor', 'score_reduction': '1'}}
##################################################
{'error_0': {'location': 'base', 'aspect': 'Terminology', 'exp

In [None]:
print(dataset['train_mix'][0]['errors'])

In [None]:
from tqdm import tqdm
list([decode_tigerscore_output(x) for x in tqdm(dataset['train_mix']['errors'])])

In [None]:
import regex as re
def decode_tigerscore_output(output):
    """Decode the output of TIGERScore model into structured error explanations.

    Args:
        output (str):
            the output of TIGERScore model.
    Returns:
        errors (List[Dict]):
            structured error explanations for each error in the output.
            Each error explanation is a dictionary with the following fields:
                - error_location (str): the words that are wrong in the output
                - error_aspect (str): the aspect of the error
                - error_explanation (str): explanation why it's an error, and the correction suggestions
                - error_severity (str): severity of the error ("Major" or "Minor")
                - score_reduction (float): reduction of score (between 0.5 and 5 given the severity of the error)
            There can be multiple errors in each input.
    """
    result = {}
    result['num_errors'] = re.search(r"(?<=The model-generated output contains )\d+(?= errors)", output).group(0)
    result['score'] = re.search(r"(?<=, with a total score reduction of )\d+", output).group(0)
    result['errors'] = {}
    error_locations = re.findall(r"(?<=Error location \d+: ).*?(?=\n)", output)
    error_aspects = re.findall(r"(?<=Error aspect \d+: ).*?(?=\n)", output)
    error_explanations = re.findall(r"(?<=Explanation \d+: ).*?(?=\n)", output)
    error_severities = re.findall(r"(?<=Severity \d+: ).*?(?=\n)", output)
    score_reductions = re.findall(r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", output)
    assert len(error_locations) == len(error_aspects) == len(error_explanations) == len(error_severities) == len(score_reductions), \
        "The number of errors does not match."
    for i in range(len(error_locations)):
        error = {}
        error['location'] = error_locations[i]
        error['aspect'] = error_aspects[i]
        error['explanation'] = error_explanations[i]
        error['severity'] = error_severities[i]
        error['score_reduction'] = score_reductions[i]
        result['errors'][f"error_{i}"] = error
    return result