In [1]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# For ROUGE, either use 'rouge_score' or 'evaluate' (both shown below).
# 2A) Using the 'rouge_score' library directly:
from rouge_score import rouge_scorer

# 2B) Or using Hugging Face's 'evaluate':
# import evaluate
# rouge_evaluator = evaluate.load('rouge')

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases
# (3.9 and later) load it from the 'punkt_tab' resource, while older releases
# load it from 'punkt', so fetch both to work with either version.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def _cell_text(value):
    """Return a CSV cell as text, mapping a missing cell (NaN/None) to ''."""
    # str(float('nan')) is the literal "nan", which would otherwise be scored
    # as if it were a real answer, so missing cells are detected explicitly.
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return str(value)


def calculate_bleu_and_rouge(csv_file_path):
    """
    Calculate aggregated BLEU and ROUGE for model outputs against human references.

    The input table must have the columns:
        [Video_file, Q_ID, Prompt_name, Model_Name, Output, Cayla_Resp, Nami_Resp, Natalia_Resp]
    'Output' is the hypothesis; the three '*_Resp' columns are the references.

    Args:
        csv_file_path: Path (or anything accepted by ``pd.read_csv``) of the
            CSV file, or an already-loaded ``pandas.DataFrame``.

    Returns:
        Tuple ``(bleu_score, avg_rouge1, avg_rouge2, avg_rougeL)``:
        corpus-level BLEU over all scored rows, and the ROUGE-1 / ROUGE-2 /
        ROUGE-L F-measures averaged first over each row's references and then
        over rows. All four are 0.0 when no row could be scored.

    Raises:
        KeyError: If one of the 'Output' / '*_Resp' columns is missing.

    Notes:
        * A reference that is missing or tokenizes to nothing is ignored; a
          row with no usable reference is skipped entirely.
        * A missing 'Output' cell is scored as an empty hypothesis.
        * The results are also printed to stdout.
    """
    # Accept an already-loaded DataFrame as well as a path.
    if isinstance(csv_file_path, pd.DataFrame):
        df = csv_file_path
    else:
        df = pd.read_csv(csv_file_path)

    reference_columns = ('Cayla_Resp', 'Nami_Resp', 'Natalia_Resp')
    rouge_types = ('rouge1', 'rouge2', 'rougeL')

    scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)

    # corpus_bleu expects parallel lists: one tokenized hypothesis per row and,
    # for each of them, a list of tokenized references.
    hypotheses = []
    list_of_references = []

    # Running sum of the per-row (reference-averaged) ROUGE F-measures.
    rouge_totals = dict.fromkeys(rouge_types, 0.0)

    for _, row in df.iterrows():
        model_output = _cell_text(row['Output'])

        # Keep the raw text (for ROUGE) and the tokens (for BLEU) of the same
        # set of usable references so both metrics see identical references.
        reference_texts = []
        reference_tokens = []
        for column in reference_columns:
            text = _cell_text(row[column])
            tokens = nltk.word_tokenize(text.lower())
            if tokens:
                reference_texts.append(text)
                reference_tokens.append(tokens)

        # Nothing to compare against: leave the row out of both metrics.
        if not reference_tokens:
            continue

        hypotheses.append(nltk.word_tokenize(model_output.lower()))
        list_of_references.append(reference_tokens)

        # rouge_score scores one reference at a time, so score each usable
        # reference separately and average the F-measures for this row.
        row_scores = [scorer.score(text, model_output) for text in reference_texts]
        for rouge_type in rouge_types:
            f_sum = sum(score[rouge_type].fmeasure for score in row_scores)
            rouge_totals[rouge_type] += f_sum / len(row_scores)

    scored_rows = len(hypotheses)

    if scored_rows > 0:
        # --- BLEU (corpus-level), smoothed so missing higher-order n-gram
        # matches do not zero out the score ---
        smoothie = SmoothingFunction().method4
        bleu_score = corpus_bleu(list_of_references, hypotheses, smoothing_function=smoothie)

        # --- ROUGE (averaged across rows) ---
        avg_rouge1 = rouge_totals['rouge1'] / scored_rows
        avg_rouge2 = rouge_totals['rouge2'] / scored_rows
        avg_rougeL = rouge_totals['rougeL'] / scored_rows
    else:
        # No scorable rows: do not hand an empty corpus to corpus_bleu, and
        # avoid dividing by zero for ROUGE.
        bleu_score = 0.0
        avg_rouge1 = avg_rouge2 = avg_rougeL = 0.0

    print("===== BLEU & ROUGE Results =====")
    print(f"BLEU (corpus-level): {bleu_score:.4f}")
    print(f"ROUGE-1 (avg F-measure): {avg_rouge1:.4f}")
    print(f"ROUGE-2 (avg F-measure): {avg_rouge2:.4f}")
    print(f"ROUGE-L (avg F-measure): {avg_rougeL:.4f}")

    return bleu_score, avg_rouge1, avg_rouge2, avg_rougeL



# calculate_bleu_and_rouge loads the CSV itself, so hand it the file path
# rather than a DataFrame that has already been read.
# NOTE(review): the leading './/' makes this path relative to the current
# working directory — confirm an absolute '/localdisk4/...' was not intended.
csv_file = ".//localdisk4/panwla/Projects/park_vlm/Kangaroo/kangaroo_results.csv"
calculate_bleu_and_rouge(csv_file)


ModuleNotFoundError: No module named 'rouge_score'