In [1]:
import math
import string
import re

In [2]:
# Constants shared by the BLEU computation
N = 4            # highest n-gram order included in the score
w_n = 1 / N      # uniform weight applied to each order's log-precision
epsilon = 1e-5   # precisions at or below this value are left out of the log-sum
BP = 1           # brevity-penalty multiplier applied to the final score (fixed at 1 here)

# Question 1

### Preprocessing Step

In [3]:
# Removing the punctuations from the text
def remPunctuations(text):
    """Return `text` with every character in `string.punctuation` deleted."""
    # The third argument of maketrans lists the characters to drop entirely.
    dropPunctuation = str.maketrans("", "", string.punctuation)
    return text.translate(dropPunctuation)

In [4]:
# Lowercasing of text
def toLower(text):
    """Return a lowercased copy of `text`."""
    lowered = text.lower()
    return lowered

In [5]:
# Splitting the text into tokens on whitespace
def splitText(text):
    """Return the list of whitespace-separated tokens in `text`.

    Calling `split()` with no argument splits on any run of whitespace and
    discards leading/trailing whitespace, so no empty tokens are produced.
    """
    tokens = text.split()
    return tokens

### BLEU Metric

In [6]:
# Given the text array and n, it will find all the n-grams for the given text
def nGramGenerator(text, n):
    """Count every contiguous n-gram in a token sequence.

    Args:
        text: Sequence of string tokens (e.g. the output of `splitText`).
        n: Size of the n-grams; expected to be a positive integer.

    Returns:
        A dict mapping each n-gram to the number of times it occurs. An
        n-gram is keyed by its tokens joined with a single space. The dict
        is empty when `text` holds fewer than `n` tokens.
    """
    nGramCount = {}

    for i in range(len(text) - n + 1):
        # Join with a space instead of concatenating directly: without a
        # separator, distinct n-grams such as ("no", "where") and
        # ("now", "here") would collapse into the same key "nowhere".
        nGramString = " ".join(text[i:i + n])

        # Increase the frequency of the n-gram in the dictionary
        nGramCount[nGramString] = nGramCount.get(nGramString, 0) + 1

    return nGramCount

In [7]:
# Computing the modified n-gram precision given the 2 texts
def generate_p_array(text1, text2):
    """Return the modified n-gram precisions of `text1` measured against `text2`.

    Both texts are stripped of punctuation, lowercased and split on
    whitespace. For every n from 1 up to the token count of the shorter
    text, each n-gram count of `text1` is clipped to that n-gram's count in
    `text2`; the clipped counts are summed and divided by the number of
    n-grams in `text1`.

    Returns:
        A list whose index n holds the precision for n-grams of size n.
        Index 0 is a placeholder holding 0.
    """
    tokens1 = splitText(toLower(remPunctuations(text1)))
    tokens2 = splitText(toLower(remPunctuations(text2)))
    maxOrder = min(len(tokens1), len(tokens2))

    # Placeholder at index 0 so that p[n] lines up with the n-gram size.
    p = [0]

    for n in range(1, maxOrder + 1):
        counts1 = nGramGenerator(tokens1, n)
        counts2 = nGramGenerator(tokens2, n)

        # Credit each n-gram of text1 at most as often as it occurs in text2.
        clipped = sum(
            min(cnt1, counts2[nGram])
            for nGram, cnt1 in counts1.items()
            if nGram in counts2
        )

        # len(tokens1) - n + 1 is the total number of n-grams in text1.
        p.append(clipped / (len(tokens1) - n + 1))

    return p

In [8]:
# Computing the BLEU score using the modified n-gram precision
def bleu_score_compute(p):
    """Combine modified n-gram precisions into a BLEU score.

    Evaluates BP * exp(sum of w_n * log(p[i])) for i = 1..N, using the
    module-level constants N, w_n, BP and epsilon. Precisions that are not
    greater than epsilon are left out of the sum.

    Args:
        p: List where p[i] is the modified precision for n-grams of size i;
            p[0] is not read. It may hold fewer than N + 1 entries, which
            happens when one of the compared texts has fewer than N tokens.

    Returns:
        The BLEU score as a float. Returns 0.0 when no precision exceeds
        epsilon, i.e. when nothing contributes to the score.
    """
    log_sum = 0
    contributed = False

    # Slice instead of indexing p[i] directly so that a precision list
    # shorter than N + 1 entries does not raise IndexError.
    for precision in p[1:N + 1]:
        # log(0) is undefined, so (near-)zero precisions are skipped.
        if precision > epsilon:
            log_sum += w_n * math.log(precision)
            contributed = True

    # With nothing contributing, exp(0) would report a perfect score of BP
    # for texts that share no n-grams at all; report 0 instead.
    if not contributed:
        return 0.0

    return math.exp(log_sum) * BP

# Question 2

### Computing BLEU Score

In [9]:
# Sentence pairs to score; in each pair the first sentence's n-grams are
# measured against the second sentence.
inputs = [
    [
        "The boys were playing happily on the ground.",
        "The boys were playing football on the field.",
    ],
]

In [10]:
# For each sentence pair, compute the modified n-gram precisions of the first
# sentence against the second, then print the resulting BLEU score.
for inputPair in inputs:
    p = generate_p_array(inputPair[0], inputPair[1])
    print(bleu_score_compute(p))

0.4111336169005197


# Question 3

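We take the minimum in the calculation of $p_n$ so that, for each n-gram in the machine-translated text, the credited count never exceeds the number of times that n-gram occurs in the ground truth. Without this clipping, a translation could inflate its precision simply by repeating a matching n-gram (e.g., "the the the the" against a ground truth that contains "the" only once).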

So, the truncated count is always less than or equal to the n-gram's count in the ground truth.

# Question 4

In [11]:
# Sentence pairs used to probe the weaknesses of the BLEU score; in each
# pair the first sentence's n-grams are measured against the second sentence.
inputs = [
    ["I have three apples and two oranges", "I have some apples and some oranges"],
    ["I have three apples and two oranges", "Have three apples I and two oranges"],
    ["a a a a a ", "a a a a a a a a a"],
    ["I had the cookie", "I had the cake"],
    ["I had the cookie", "I had a cookie"],
]

In [12]:
# For each sentence pair, compute the modified n-gram precisions of the first
# sentence against the second, then print the resulting BLEU score.
for inputPair in inputs:
    p = generate_p_array(inputPair[0], inputPair[1])
    print(bleu_score_compute(p))

0.6985342056580097
0.7186082239261684
1.0
0.7071067811865475
0.7071067811865475


The potential disadvantages of BLEU score are:
*   **It does not capture the meaning of the sentences**: This can be seen in the 4th and 5th pairs of sentences. Both pairs receive the same score, even though the 4th pair changes the meaning ("cookie" vs. "cake") while the 5th pair barely changes it ("the" vs. "a").
*   **It does not check the sentence structure properly**: The second pair receives a higher score than the first pair, even though the reordered sentence in the second pair ("Have three apples I and two oranges") is ungrammatical and hard to understand.

