## TIGERScore Usage Examples

In [None]:
import os
import json
# Expose only GPU 0 to this process. The variable is assigned before
# `tigerscore` is imported below; keep that ordering (presumably so that any
# CUDA initialisation triggered by the import already sees the restriction).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# set up scorer
from tigerscore import TIGERScorer
# Default backend: load the TIGERScore-7B checkpoint on the GPU.
scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B") # on GPU
# Alternative backends: uncomment exactly one line to use it instead of the default above.
# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B", quantized=True) # 4 bit quantization on GPU
# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B", use_vllm=True) # VLLM on GPU, about 5 instances per second
# scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B-GGUF", use_llamacpp=True) # 4 bit quantization on CPU

In [4]:
# Instruction-following example: score one hypothesis letter against its task.
instruction = "Write an apology letter."
input_context = "Reason: You canceled a plan at the last minute due to illness."
# One paragraph per literal; adjacent literals are joined into a single string,
# so the value is identical to writing the letter on one line.
hypo_output = (
    "Hey [Recipient],\n\n"
    "I'm really glad for ditching our plan. I suddenly got an opportunity for a vacation so I took it. I know this might have messed up your plans and I love that.\n\n"
    "Despite being under the weather, I would rather go for an adventure. I hope you can understand my perspective and I hope this incident doesn't change anything between us.\n\n"
    "We can reschedule our plan for another time. Sorry again for the trouble.\n\n"
    "Peace out,\n[Your Name]\n\n"
    "---"
)
# The scorer is called with three parallel lists (here: one example each).
results = scorer.score(
    [instruction],
    [hypo_output],
    [input_context],
)
print(json.dumps(results, indent=4))

TIGERScore Batch Scoring: 100%|██████████| 1/1 [00:15<00:00, 15.06s/it]

[
    {
        "num_errors": 3,
        "score": -12.0,
        "errors": {
            "error_0": {
                "location": "\"I'm really glad for ditching our plan.\"",
                "aspect": "Inappropriate language or tone",
                "explanation": "The phrase \"ditching our plan\" is informal and disrespectful. It should be replaced with a more respectful and apologetic phrase like \"cancelling our plan\".",
                "severity": "Major",
                "score_reduction": "4.0"
            },
            "error_1": {
                "location": "\"I suddenly got an opportunity for a vacation so I took it.\"",
                "aspect": "Lack of apology or remorse",
                "explanation": "This sentence shows no remorse for cancelling the plan at the last minute. It should be replaced with a sentence that expresses regret for the inconvenience caused.",
                "severity": "Major",
                "score_reduction": "4.0"
            },
         




In [2]:
# MathQA example: a one-line arithmetic task with no extra input context.
instruction, input_context = "Calculate 2 + 3.", ""
hypo_output = "2 + 3 = 15, so the answer is 12."
# The scorer is called with three parallel lists (here: one example each).
results = scorer.score(
    [instruction],
    [hypo_output],
    [input_context],
)
print(json.dumps(results, indent=4))

TIGERScore Batch Scoring: 100%|██████████| 1/1 [00:08<00:00,  8.54s/it]

[
    {
        "num_errors": 2,
        "score": -10.0,
        "errors": {
            "error_0": {
                "location": "\"15\"",
                "aspect": "Incorrect calculation",
                "explanation": "The model incorrectly calculated 2 + 3 as 15 instead of 5. The model should ensure that basic arithmetic operations are performed correctly.",
                "severity": "Major",
                "score_reduction": "5.0"
            },
            "error_1": {
                "location": "\"so the answer is 12\"",
                "aspect": "Logical conflict",
                "explanation": "The model contradicts itself by stating the incorrect result (15) and then providing the correct answer (5). The model should ensure consistency in its responses.",
                "severity": "Major",
                "score_reduction": "5.0"
            }
        },
        "raw_output": "You are evaluating errors in a model-generated output for a given instruction.\nInstruction:




## Fast inference with vLLM!

In [4]:
# Batch example: score a random sample of MetricInstruct with the vLLM backend.
import json  # imported here as well so the cell does not rely on an earlier cell

from datasets import load_dataset
from tigerscore import TIGERScorer

SAMPLE_SIZE = 500    # number of examples to score
SHUFFLE_SEED = 42    # fixed seed so the same subset is drawn on every run
PREVIEW_COUNT = 3    # number of result records to print

dataset = load_dataset("TIGER-Lab/MetricInstruct")
# Shuffle only the split that is actually used, and do it reproducibly.
dataset = dataset["train"].shuffle(seed=SHUFFLE_SEED)
# Never ask for more rows than the split contains.
dataset = dataset.select(range(min(SAMPLE_SIZE, len(dataset))))
instruction = dataset["instruction"]
input_context = dataset["input_context"]
hypo_output = dataset["hypo_output"]

# scoring
# NOTE(review): if a scorer created in an earlier cell is still alive it may be
# holding GPU memory that the vLLM backend needs; confirm, or restart the
# kernel before running this cell.
scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B", use_vllm=True) # VLLM on GPU
results = scorer.score(instruction, hypo_output, input_context)

# Show a short preview instead of dumping every record; `results` still holds
# the complete list for further analysis.
print(f"Scored {len(results)} examples; showing the first {min(PREVIEW_COUNT, len(results))}:")
print(json.dumps(results[:PREVIEW_COUNT], indent=4))

Processed prompts: 100%|██████████| 500/500 [01:57<00:00,  4.25it/s]

[
    {
        "num_errors": 0,
        "score": -0.0,
        "errors": {},
        "raw_output": " The model-generated output contains 0 errors, with a total score reduction of 0."
    },
    {
        "num_errors": 0,
        "score": -0.0,
        "errors": {},
        "raw_output": " The model-generated output contains 0 errors, with a total score reduction of 0."
    },
    {
        "num_errors": 1,
        "score": -2.0,
        "errors": {
            "error_0": {
                "location": "Fan's trying to turn off the music to sleep.",
                "aspect": "Relevance",
                "explanation": "The summary incorrectly states that the fan is trying to turn off the music, while the source states that Sam is asking the fan to turn down the music. To correct this error, the summary should accurately reflect the source and state that Sam is asking the fan to turn down the music.",
                "severity": "Major",
                "score_reduction": "2"
           


