In [1]:
import math
from collections import Counter

def get_ngrams(sentence, n):
    """Return every contiguous run of ``n`` tokens in ``sentence``, in order.

    sentence: sequence of tokens (must support ``len`` and slicing)
    n: number of tokens per n-gram

    Each n-gram is returned as a tuple. A sentence with fewer than ``n``
    tokens yields an empty list.
    """
    window_count = len(sentence) - n + 1
    return [tuple(sentence[start:start + n]) for start in range(window_count)]

def modified_precision(candidate, references, n):
    """
    Compute the clipped n-gram counts behind BLEU's modified precision.

    candidate: list of tokens
    references: list of reference token lists (may be empty)
    n: n-gram order

    Returns a pair (clipped, total). `total` is the number of n-grams in the
    candidate. `clipped` is the number of those that are matched, where each
    distinct n-gram is credited at most as many times as it occurs in the
    single reference that contains it most often. The precision itself is
    clipped / total; the division is left to the caller because `total` is 0
    when the candidate has fewer than n tokens.
    """
    cand_ngrams = get_ngrams(candidate, n)
    cand_count = Counter(cand_ngrams)

    # Counter union keeps the per-key maximum, so folding every reference in
    # yields the largest count of each n-gram within any one reference.
    # Starting from an empty Counter means an empty `references` clips every
    # n-gram to 0 instead of raising from max() on an empty sequence.
    max_ref_count = Counter()
    for ref in references:
        max_ref_count |= Counter(get_ngrams(ref, n))

    # Counter intersection keeps the per-key minimum: the clipped counts.
    clipped_count = cand_count & max_ref_count

    return sum(clipped_count.values()), len(cand_ngrams)

def brevity_penalty(candidate, references):
    """
    Compute the brevity penalty (BP).
    BP = 1, if len(c) > len(r)
         exp(1 - len(r)/len(c)), otherwise
    where r is the reference length closest to the candidate length. When two
    reference lengths are equally close, the shorter one is used, so the
    result does not depend on the order of `references`.

    candidate: list of tokens
    references: list of reference token lists (must not be empty)

    Returns a float in [0, 1]. An empty candidate scores 0.0.
    Raises ValueError if `references` is empty.
    """
    ref_lens = [len(ref) for ref in references]
    if not ref_lens:
        raise ValueError("brevity_penalty() requires at least one reference")

    c_len = len(candidate)
    if c_len == 0:
        # The formula divides by len(c); an empty candidate gets the maximal
        # penalty instead of a ZeroDivisionError.
        return 0.0

    # Sort key is (distance, length): ties on distance go to the shorter
    # reference.
    closest_r = min(ref_lens, key=lambda r: (abs(r - c_len), r))

    if c_len > closest_r:
        return 1.0
    return math.exp(1 - closest_r / c_len)

def compute_bleu2(candidate, references):
    """
    Score a candidate with BLEU-2: the brevity penalty times the geometric
    mean of the modified unigram and bigram precisions, i.e.
        BLEU-2 = BP * exp(0.5 * (log(p1) + log(p2)))

    candidate: list of tokens
    references: list of reference token lists

    Returns 0 when either precision is zero (including when the candidate is
    too short to contain a unigram or a bigram).
    """
    def precision(order):
        # Ratio of clipped to total n-grams; 0 when the candidate has none.
        matched, total = modified_precision(candidate, references, order)
        if total > 0:
            return matched / total
        return 0

    unigram_p = precision(1)
    bigram_p = precision(2)

    # log(0) is undefined, so a zero precision short-circuits to a zero score.
    if not (unigram_p and bigram_p):
        return 0

    penalty = brevity_penalty(candidate, references)
    log_mean = 0.5 * (math.log(unigram_p) + math.log(bigram_p))
    return penalty * math.exp(log_mean)

# ------------------------------------------------
# Worked example: one candidate, two references
# ------------------------------------------------
C = "a cat is on the mat".split()
R1 = "the cat is on the mat".split()
R2 = "the cat sits on the mat".split()

# Score the candidate against both references and report the result.
BLEU_2 = compute_bleu2(C, [R1, R2])
print("BLEU-2 score:", BLEU_2)


BLEU-2 score: 0.816496580927726
