# Statistical Evaluation

Evaluate model output with statistical NLP metrics (BLEU and ROUGE).

Install `prettytable` to render the results as a nicely formatted table.

In [None]:
!pip3 install prettytable==3.10.2 sacrebleu==2.4.2 rouge-score==0.1.2

In [None]:
!pip3 show prettytable sacrebleu rouge-score 

In [None]:
from prettytable import PrettyTable
import sacrebleu
from rouge_score import rouge_scorer

# Evaluation cases scored below. Each case is a dict with:
#   "input"         - the user prompt
#   "output"        - the model's response being evaluated
#   "golden_answer" - the reference answer that BLEU compares "output" against
#   "contexts"      - list of context strings that ROUGE compares "output"
#                     against (may be empty)
evaluation_data = [
    {
        "input": "What should I do in New York City in July?",
        "output": "Check out Times Square, go to an outdoor concert, and visit the Statue of Liberty.",
        "golden_answer": "Explore Central Park, attend outdoor concerts, and visit rooftop bars.",
        "contexts": [
            "Times Square is known for its Broadway theaters, bright lights, and bustling atmosphere.",
            "Outdoor concerts in Central Park are popular summer events attracting many visitors.",
            "The Statue of Liberty is a symbol of freedom and a must-see landmark in NYC."
        ]
    },
    {
        "input": "Can you help me with my math homework?",
        "output": "I'm designed to assist with travel queries. For math help, try using online resources like Khan Academy or Mathway.",
        "golden_answer": "I am a travel assistant chatbot, so I cannot help you with your math homework.",
        "contexts": []
    },
    {
        "input": "What’s the capital of France?",
        "output": "The capital of France is Paris.",
        "golden_answer": "Paris is the capital of France.",
        "contexts": [
            "Paris, known as the City of Light, is the most populous city of France.",
            "European capitals: Paris, France; Berlin, Germany; Madrid, Spain",
        ]
    }
]

# Statistical evaluators
def evaluate_bleu(output, golden_answer):
    """Score ``output`` against ``golden_answer`` with BLEU.

    Args:
        output: The generated text to evaluate.
        golden_answer: The single reference text to compare against.

    Returns:
        The sacrebleu corpus BLEU score divided by 100, so that it is
        normalized to be between 0 and 1.
    """
    # corpus_bleu takes a list of hypotheses and a list of reference
    # streams; here there is one hypothesis and one reference stream.
    hypotheses = [output]
    reference_streams = [[golden_answer]]
    result = sacrebleu.corpus_bleu(hypotheses, reference_streams)
    return result.score / 100

def evaluate_rouge(output, contexts):
    """Score ``output`` against the retrieved ``contexts`` with ROUGE-L.

    The context strings are joined with newlines into a single reference
    text, and ``output`` is scored against it as the prediction.

    Args:
        output: The generated text to evaluate.
        contexts: List of context strings; may be empty.

    Returns:
        The ROUGE-L F-measure, or 0.0 when there are no contexts to
        compare against.
    """
    # With no contexts the reference text is empty and nothing can overlap.
    if not contexts:
        return 0.0

    context_text = "\n".join(contexts)
    # Only ROUGE-L is reported, so do not compute ROUGE-1/ROUGE-2 as well.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(context_text, output)
    return scores['rougeL'].fmeasure


def calculate_average(scores):
    """Return the arithmetic mean of ``scores``.

    Args:
        scores: A sized collection of numbers (e.g. a list of floats).

    Returns:
        The mean of the values, or 0.0 when ``scores`` is empty so that
        callers do not hit a ZeroDivisionError on an empty evaluation set.
    """
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# truncate strings for easier printing in table
def truncate_string(s, max_length=10):
    """Shorten ``s`` for display in a table cell.

    Args:
        s: The string to shorten.
        max_length: Number of leading characters to keep.

    Returns:
        ``s`` unchanged when it has at most ``max_length`` characters;
        otherwise its first ``max_length`` characters followed by '...'.
    """
    if len(s) <= max_length:
        return s
    head = s[:max_length]
    return head + '...'

def create_table(data):
    """Build a table of per-case BLEU and ROUGE scores with their averages.

    Args:
        data: A sized collection of evaluation cases. Each case is a dict
            with the keys "input", "output", "golden_answer" and "contexts".

    Returns:
        A PrettyTable with one row per case (truncated texts, number of
        contexts, BLEU and ROUGE scores), then a blank separator row and an
        "Average" row. When ``data`` is empty only the header is returned,
        since there is nothing to score or average.
    """
    table = PrettyTable()
    table.field_names = ["Input", "Output", "Golden Answer", "# Contexts", "BLEU", "ROUGE"]

    # No cases: skip the separator and average rows (averaging would
    # otherwise divide by zero).
    if not data:
        return table

    bleu_scores = []
    rouge_scores = []
    for case in data:
        bleu = evaluate_bleu(case["output"], case["golden_answer"])
        rouge = evaluate_rouge(case["output"], case["contexts"])
        bleu_scores.append(bleu)
        rouge_scores.append(rouge)
        table.add_row([
            truncate_string(case["input"]),
            truncate_string(case["output"]),
            truncate_string(case["golden_answer"]),
            len(case["contexts"]),
            f"{bleu:.4f}",
            f"{rouge:.4f}",
        ])

    # Add a blank row for visual separation
    table.add_row(["", "", "", "", "", ""])

    # Add the average scores to the bottom of the table
    average_bleu = calculate_average(bleu_scores)
    average_rouge = calculate_average(rouge_scores)
    table.add_row(["Average", "", "", "", f"{average_bleu:.4f}", f"{average_rouge:.4f}"])

    return table

# Score every case in evaluation_data, build the results table
# (one row per case followed by an average row), and print it.
result_table = create_table(evaluation_data)
print(result_table)
