# Exercise 1 and 2: Zero-probability N-grams
Student's name: SETH Rattanak </br>
Lecturer: Dr. Dona Valy

### Exercise 1

In [2]:
text = '<s> I am Sam </s> <s> Sam I am </s> <s> Sam I like </s> <s> Sam I do like </s> <s> do I like Sam </s>'
tokens = text.split()
print(len(tokens))
print(tokens)

27
['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'Sam', 'I', 'like', '</s>', '<s>', 'Sam', 'I', 'do', 'like', '</s>', '<s>', 'do', 'I', 'like', 'Sam', '</s>']


In [3]:
from collections import Counter

unigram = [char for char in tokens]
uni_counts = Counter(unigram)
print(uni_counts)

bigrams = [' '.join([t1, t2]) for t1, t2 in zip(tokens[:-1], tokens[1:])]
bi_counts = Counter(bigrams) # it represents a table of two words count
print(bi_counts)

Counter({'<s>': 5, 'I': 5, 'Sam': 5, '</s>': 5, 'like': 3, 'am': 2, 'do': 2})
Counter({'</s> <s>': 4, '<s> Sam': 3, 'Sam I': 3, 'I am': 2, 'Sam </s>': 2, 'I like': 2, 'like </s>': 2, '<s> I': 1, 'am Sam': 1, 'am </s>': 1, 'I do': 1, 'do like': 1, '<s> do': 1, 'do I': 1, 'like Sam': 1})


In [4]:
vocab = list(Counter(tokens).keys())
print(vocab)
"""
    Unsmoothed or MLE estimate
    This is a bi-gram probability finding
    e.g pr(do|<s>) = find_probability('do', '<s>')
"""
def find_probability(find_word, given_word):
    join_word_count = bi_counts.get(' '.join([given_word, find_word]))
    if not join_word_count :
        join_word_count = 0
    total_given_word = uni_counts.get(given_word)
    pr =  join_word_count / total_given_word
    print("pr(%s|%s) = %d / %d = %.3f" % (find_word, given_word, join_word_count,  total_given_word, pr))
    
    return pr

"""
    Add-one estimate (Laplace Smoothing)
"""
def laplace_smoothing(find_word, given_word):
    join_word_count = bi_counts.get(' '.join([given_word, find_word]))
    if not join_word_count :
        join_word_count = 0
    total_given_word = uni_counts.get(given_word)
    pr =  (join_word_count + 1)*1.0 / (total_given_word + len(vocab))
    print("pr(%s|%s) = %d / %d = %.3f" % (find_word, given_word, join_word_count+1,  total_given_word + len(vocab), pr))
    
    return pr



['<s>', 'I', 'am', 'Sam', '</s>', 'like', 'do']


In [5]:
"""
    1. Find the following bigram probabilities estimated by this model
    a) P(do|<s>)
    b) P(do|Sam)
    c) P(Sam|<s>)
    d) P(Sam|do)
    e) P(I|Sam)
    f) P(I|do)
    g) P(like|I)
    h) P(Sam|like)
"""
print("Unsmoothed")
find_probability('do', '<s>')
find_probability('do', 'Sam')
find_probability('Sam', '<s>')
find_probability('Sam', 'do')
find_probability('I', 'Sam')
find_probability('I', 'do')
find_probability('like', 'I')
find_probability('Sam', 'like')

print("\nSmoothed with Laplace Smoothing")
laplace_smoothing('do', '<s>')
laplace_smoothing('do', 'Sam')
laplace_smoothing('Sam', '<s>')
laplace_smoothing('Sam', 'do')
laplace_smoothing('I', 'Sam')
laplace_smoothing('I', 'do')
laplace_smoothing('like', 'I')
laplace_smoothing('Sam', 'like')

Unsmoothed
pr(do|<s>) = 1 / 5 = 0.200
pr(do|Sam) = 0 / 5 = 0.000
pr(Sam|<s>) = 3 / 5 = 0.600
pr(Sam|do) = 0 / 2 = 0.000
pr(I|Sam) = 3 / 5 = 0.600
pr(I|do) = 1 / 2 = 0.500
pr(like|I) = 2 / 5 = 0.400
pr(Sam|like) = 1 / 3 = 0.333

Smoothed with Laplace Smoothing
pr(do|<s>) = 2 / 12 = 0.167
pr(do|Sam) = 1 / 12 = 0.083
pr(Sam|<s>) = 4 / 12 = 0.333
pr(Sam|do) = 1 / 9 = 0.111
pr(I|Sam) = 4 / 12 = 0.333
pr(I|do) = 2 / 9 = 0.222
pr(like|I) = 3 / 12 = 0.250
pr(Sam|like) = 2 / 10 = 0.200


0.2

In [6]:
"""2. Calculate the probabilities and perplexity of the following
sequences according to this model
a) <s> do Sam I like
b) <s> Sam do I like
c) I do like Sam </s>"""

def compute_probability_sentence(sentence: str, is_Laplace_Smoothing: bool = False):
    print("Calculate bigram probability:")
    print('Given sentence: ', sentence)
    list_word = sentence.split()

    # initialise value
    total_probability = 1
    total_perplexity = 1.0
    tmpPerplex = 1.0

    for idx in range(0, len(list_word)-1):
        pr = 0
        if not is_Laplace_Smoothing:
            pr = find_probability(list_word[idx + 1], list_word[idx])
        else:
            pr = laplace_smoothing(list_word[idx + 1], list_word[idx]) # Laplace Smoothing (adding one) to prevent zero happen
            
        total_probability = total_probability * pr

        # prevent zero value happening due to leading to NONE or UNDEFINE
        if pr == 0:
            tmpPerplex = None
            continue
        if tmpPerplex:
            tmpPerplex = tmpPerplex * (1 / pr)
    
        # print('P(%s %s) = %.2f' %(list_word[idx], list_word[idx + 1], pr))
    if tmpPerplex:
        total_perplexity = tmpPerplex**(1/len(list_word)) # tmpPerplex^(1/n)
    else:
        total_perplexity = tmpPerplex
    print("Perplexity of this sentence: ", total_perplexity)
    print("Total probability of this sentence is: ", total_probability, '\n')
    return {"total_probability": total_probability, "perplexity": total_perplexity}

    
print("Unsmoothed\n")
compute_probability_sentence('<s> do Sam I like')
compute_probability_sentence('<s> Sam do I like')
compute_probability_sentence('I do like Sam </s>')

print("\nSmoothed with Laplace Smoothing (adding one)\n")
compute_probability_sentence('<s> do Sam I like', True)
compute_probability_sentence('<s> Sam do I like', True)
compute_probability_sentence('I do like Sam </s>', True)


Unsmoothed

Calculate bigram probability:
Given sentence:  <s> do Sam I like
pr(do|<s>) = 1 / 5 = 0.200
pr(Sam|do) = 0 / 2 = 0.000
pr(I|Sam) = 3 / 5 = 0.600
pr(like|I) = 2 / 5 = 0.400
Perplexity of this sentence:  None
Total probability of this sentence is:  0.0 

Calculate bigram probability:
Given sentence:  <s> Sam do I like
pr(Sam|<s>) = 3 / 5 = 0.600
pr(do|Sam) = 0 / 5 = 0.000
pr(I|do) = 1 / 2 = 0.500
pr(like|I) = 2 / 5 = 0.400
Perplexity of this sentence:  None
Total probability of this sentence is:  0.0 

Calculate bigram probability:
Given sentence:  I do like Sam </s>
pr(do|I) = 1 / 5 = 0.200
pr(like|do) = 1 / 2 = 0.500
pr(Sam|like) = 1 / 3 = 0.333
pr(</s>|Sam) = 2 / 5 = 0.400
Perplexity of this sentence:  2.3714406097793117
Total probability of this sentence is:  0.013333333333333334 


Smoothed with Laplace Smoothing (adding one)

Calculate bigram probability:
Given sentence:  <s> do Sam I like
pr(do|<s>) = 2 / 12 = 0.167
pr(Sam|do) = 1 / 9 = 0.111
pr(I|Sam) = 4 / 12 = 0.333

{'total_probability': 0.001851851851851852, 'perplexity': 3.519482028935523}

### Exercise 2
The same question as Exercise 1, but this time use add-k
(k=0.1) smoothing instead of Laplace smoothing and in
addition use linear interpolation to compute the probability
of each bigram:
$$\hat{P}(w_i|w_{i-1})= \lambda_1 P(w_i) + \lambda_2 P(w_i|w{i-1})$$
where $\lambda_1=0.25$ and $\lambda_2=0.75$

In [7]:
from typing import Final
# Global declaration
LAMBDA_1: Final[float] = 0.25
LAMBDA_2: Final[float] = 0.75
K: Final[float] = 0.1

In [21]:
"""
    k-smoothing: K=0.1
    This is a bi-gram probability finding
    e.g pr(do|<s>) = add_k_smoothing('do', '<s>')
"""
def add_k_smoothing(find_word, given_word, is_print: bool = True):
    join_word_count = bi_counts.get(' '.join([given_word, find_word]))
    if not join_word_count :
        join_word_count = 0
    total_given_word = uni_counts.get(given_word)
    pr =  (join_word_count + K) / (total_given_word + K*len(vocab))
    if is_print: print("pr(%s|%s) = %.3f / %.3f = %.3f" % (find_word, given_word, (join_word_count + K),  (total_given_word + K*len(vocab)), pr))
    
    return pr

def add_k_smoothing_uni_gram(w: str):
    total_count = len(tokens)
    vocab_size = len(vocab)
    pr = (uni_counts.get(w) + K) / (total_count + K * vocab_size)
    return pr

def linear_interpolation(find_word, given_word):
    first_operation: float = (LAMBDA_1 * add_k_smoothing_uni_gram(find_word))
    second_operation: float = (LAMBDA_2 * add_k_smoothing(find_word, given_word, False))
    p_head = first_operation + second_operation
    print(f"pr_head({find_word}|{given_word})= {first_operation} + {second_operation} = {p_head}")
    return p_head

add_k_smoothing_uni_gram('do')

0.07581227436823106

In [22]:
# Solution 1
print(f"Add K-Smoothing which k={K}")
add_k_smoothing('do', '<s>')
add_k_smoothing('do', 'Sam')
add_k_smoothing('Sam', '<s>')
add_k_smoothing('Sam', 'do')
add_k_smoothing('I', 'Sam')
add_k_smoothing('I', 'do')
add_k_smoothing('like', 'I')
add_k_smoothing('Sam', 'like')

Add K-Smoothing which k=0.1
pr(do|<s>) = 1.100 / 5.700 = 0.193
pr(do|Sam) = 0.100 / 5.700 = 0.018
pr(Sam|<s>) = 3.100 / 5.700 = 0.544
pr(Sam|do) = 0.100 / 2.700 = 0.037
pr(I|Sam) = 3.100 / 5.700 = 0.544
pr(I|do) = 1.100 / 2.700 = 0.407
pr(like|I) = 2.100 / 5.700 = 0.368
pr(Sam|like) = 1.100 / 3.700 = 0.297


0.2972972972972973

### Use interpolation to compute the probability of each bigram

In [23]:
print("Linear Interpolation with K-Smoothing which is K=0.1\n")
linear_interpolation('do', '<s>')
linear_interpolation('do', 'Sam')
linear_interpolation('Sam', '<s>')
linear_interpolation('Sam', 'do')
linear_interpolation('I', 'Sam')
linear_interpolation('I', 'do')
linear_interpolation('like', 'I')
linear_interpolation('Sam', 'like')

Linear Interpolation with K-Smoothing which is K=0.1

pr_head(do|<s>)= 0.018953068592057764 + 0.14473684210526316 = 0.16368991069732092
pr_head(do|Sam)= 0.018953068592057764 + 0.013157894736842105 = 0.03211096332889987
pr_head(Sam|<s>)= 0.04602888086642599 + 0.4078947368421053 = 0.4539236177085313
pr_head(Sam|do)= 0.04602888086642599 + 0.027777777777777776 = 0.07380665864420377
pr_head(I|Sam)= 0.04602888086642599 + 0.4078947368421053 = 0.4539236177085313
pr_head(I|do)= 0.04602888086642599 + 0.3055555555555556 = 0.35158443642198156
pr_head(like|I)= 0.027978339350180507 + 0.2763157894736842 = 0.3042941288238647
pr_head(Sam|like)= 0.04602888086642599 + 0.22297297297297297 = 0.26900185383939895


0.26900185383939895