# NLP Assignment 1
## Team Members -
### Shrenik Ganguli : CS23MTECH14014
### Trishita Saha      : CS23MTECH14016

## Question 1

In [1]:
#Import necessary packages
import math
import string
from collections import Counter

#Function for text preprocessing
def text_pre_processing(text):
    """
    Normalises a sentence into a list of lower-case word tokens.
    Parameters:
    - text: Sentence as a string
    Returns:
    - List of words with every character of string.punctuation removed
    """
    # A single translation table deletes all punctuation characters in one pass
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    # split() with no argument breaks on any run of whitespace
    return lowered.translate(strip_punctuation).split()

#Function for modified n-gram precision
def modified_ngram_precision(x, y, n):
    """
    Calculates the modified n-gram precision between two lists of words.

    Each distinct n-gram is credited min(count in x, count in y) times, and
    the sum of these clipped counts is divided by the number of n-grams in x.

    Parameters:
    - x: List of words (reference)
    - y: List of words (candidate)
    - n: Size of n-gram
    Returns:
    - Modified n-gram precision, or 0.0 when x contains no n-gram of size n
    """
    # Generate n-grams as tuples so they are hashable and can be counted
    x_ngrams = [tuple(x[i:i + n]) for i in range(len(x) - n + 1)]
    y_ngrams = [tuple(y[j:j + n]) for j in range(len(y) - n + 1)]

    # x shorter than n has no n-grams: return 0.0 instead of dividing by zero
    if not x_ngrams:
        return 0.0

    # Counter intersection keeps, for every n-gram, the minimum of both counts
    clipped_matches = Counter(x_ngrams) & Counter(y_ngrams)
    min_grams = sum(clipped_matches.values())

    # NOTE(review): the result is normalised by the n-gram count of x, as
    # before. The original BLEU definition (Papineni et al., 2002) divides by
    # the candidate's n-gram count - confirm which one the assignment intends.
    return min_grams / len(x_ngrams)


#Function for BLEU score calculation
def bleu_score(x, y, N=4, BP=1):
    """
    Calculates the BLEU score between two lists of words.

    The score is BP * exp(sum over n = 1..N of (1 / N) * log(p_n)), where p_n
    is the modified n-gram precision of size n.

    Parameters:
    - x: List of words (reference)
    - y: List of words (candidate)
    - N: Maximum n-gram size (default = 4)
    - BP: Brevity penalty (default = 1)

    Returns:
    - BLEU score; 0 as soon as some n-gram size has no match at all
    """
    weight = 1 / N  # Uniform weight for every n-gram size
    log_score = 0

    for n in range(1, N + 1):
        # A reference shorter than n has no n-grams of that size, so there is
        # nothing to match; score 0 instead of asking for an undefined precision
        if len(x) < n:
            return 0

        p = modified_ngram_precision(x, y, n)

        # log(0) is undefined: a size with no common n-gram makes the whole
        # score 0. Returning here avoids using a magic value of the
        # accumulator (which a genuine log-sum could also take) as a flag.
        if p == 0:
            return 0

        log_score += weight * math.log(p)

    return BP * math.exp(log_score)

## Question 2

In [2]:
#Implementation of BLEU score on the example sentence pair
x = "The boys were playing happily on the ground."
y = "The boys were playing football on the field."
# Print the raw sentences now; the score is appended to the same line below
print(f"The BLEU score for x = '{x}' and y = '{y}' is:", end=" ")

# From here on x and y hold the lists of words, not the original strings
x = text_pre_processing(x)
y = text_pre_processing(y)

BLEU = bleu_score(x,y)
print(BLEU)

The BLUE score for x = 'The boys were playing happily on the ground.' and y = 'The boys were playing football on the field.' is: 0.4111336169005197


## Question 3




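> The minimum in the numerator clips the match count of every n-gram, ensuring that an n-gram is never credited more times than it occurs in the reference sentence. This prevents overestimation of n-gram matches: without clipping, a candidate could inflate its score simply by repeating a matching n-gram (e.g. the candidate "the the the the" would be credited with four matches even though the reference "the cat sat" contains "the" only once). Clipping therefore maintains a balanced comparison between the candidate and reference sentences for accurate evaluation.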





## Question 4

In [6]:
#Example sentences: (x, y) pairs scored one after the other
sentence_pairs = [
    ("I want to do research in NLP.", "NLP is my research area."),
    ("The quick brown fox jumps over the lazy dog all day long.", "The fast brown fox jumps over the sleepy dog."),
    ("A.R Rahman is a musical legend", "A.R Rahman is a legendary musician"),
    ("The man arrived late because of the train", "The train arrived late because of the man"),
    ("I love IIT Hyderabad.", "Everyone loves IIT Hyderabad."),
]

for index, pair in enumerate(sentence_pairs, start=1):
    x, y = pair
    # Show the raw sentences before they are replaced by their word lists
    print(f"The BLEU score for sentence pair {index}:")
    print(f"x = '{x}'")
    print(f"y = '{y}'")
    x = text_pre_processing(x)
    y = text_pre_processing(y)

    # Calculate and print BLEU score
    BLEU = bleu_score(x, y)
    print(f"BLEU Score: {BLEU}\n")


# Potential disadvantages of BLEU Score:
# '\033[1m' and '\033[0m' are the ANSI codes that switch bold text on and off
print("\n"+'\033[1m'+"Potential disadvantages of BLEU Score:"+'\033[0m')
print("i)  Sensitivity to sentence length (as seen in example 2)")
print("ii) Emphasis on n-gram precision over semantic meaning (as seen in example 1)")
print("iii)BLEU does not understand the variants of the words and can’t take the word order into account. (as seen in ")
print("    example 3 & 4)")
print("iv) Limited applicability across languages with diverse word orders or morphologies")
print("v)  BLEU does not understand the significance of the words in the text, and penalizes relatively less important ")
print("    words as much as words more important to the statement. This problem is more significant in smaller sentences ")
print("    due to brevity penalty formula.")

The BLEU score for sentence pair 1:
x = 'I want to do research in NLP.'
y = 'NLP is my research area.'
BLEU Score: 0

The BLEU score for sentence pair 2:
x = 'The quick brown fox jumps over the lazy dog all day long.'
y = 'The fast brown fox jumps over the sleepy dog.'
BLEU Score: 0.3448444257953326

The BLEU score for sentence pair 3:
x = 'A.R Rahman is a musical legend'
y = 'A.R Rahman is a legendary musician'
BLEU Score: 0.5081327481546147

The BLEU score for sentence pair 4:
x = 'The man arrived late because of the train'
y = 'The train arrived late because of the man'
BLEU Score: 0.6434588841607617

The BLEU score for sentence pair 5:
x = 'I love IIT Hyderabad.'
y = 'Everyone loves IIT Hyderabad.'
BLEU Score: 0


[1mPotential disadvantages of BLEU Score:[0m
i)  Sensitivity to sentence length (as seen in example 2)
ii) Emphasis on n-gram precision over semantic meaning (as seen in example 1)
iii)BLEU does not understand the variants of the words and can’t take the word order into