In [1]:
# Example document together with its hand-prepared token list
# (lower-cased, punctuation dropped)
text = "The new table is red. The blue table is broken."
words_processed = [
    'the', 'new', 'table', 'is', 'red',
    'the', 'blue', 'table', 'is', 'broken',
]
# A set keeps a single copy of every distinct token, i.e. the vocabulary
vocabulary = set(words_processed)
print("Document:", text)
print("Pre-processed words:", words_processed)
print("Document size:", len(words_processed))
print("Vocabulary:", vocabulary)
print("Vocabulary size:", len(vocabulary))

Document: The new table is red. The blue table is broken.
Pre-processed words: ['the', 'new', 'table', 'is', 'red', 'the', 'blue', 'table', 'is', 'broken']
Document size: 10
Vocabulary: {'red', 'is', 'table', 'new', 'blue', 'the', 'broken'}
Vocabulary size: 7


## 4.2 Unigrams (1-Grams)
### 4.2.1 Compute unigrams

In [2]:
import nltk
from nltk import FreqDist  # Import the FreqDist function from NLTK
# Term frequencies of every token in the document
tf = FreqDist(words_processed)
print(tf, "\n")
# Alphabetical order makes the printed dictionary easier to read
vocabulary = sorted(vocabulary)
# Unigram counts keyed by word, following the sorted vocabulary
unigrams = {word: tf[word] for word in vocabulary}
print(unigrams)

<FreqDist with 7 samples and 10 outcomes> 

{'blue': 1, 'broken': 1, 'is': 2, 'new': 1, 'red': 1, 'table': 2, 'the': 2}


### 4.2.2 Unigram probability

In [3]:
# Corpus size N: number of tokens, repetitions included
total_words = len(words_processed)
# P(w_n) = count(w_n) / N for every word in the vocabulary
unigram_probabilities = {
    word: count / total_words for word, count in unigrams.items()
}
print("Unigram probabilities:", unigram_probabilities)

Unigram probabilities: {'blue': 0.1, 'broken': 0.1, 'is': 0.2, 'new': 0.1, 'red': 0.1, 'table': 0.2, 'the': 0.2}


### 4.2.3 Sentence probability

In [4]:
from collections import defaultdict
# Probability lookup that answers 0 for any word missing from the corpus
pw = defaultdict(lambda: 0, unigram_probabilities)
print(pw, "\n")
# A sentence's unigram probability is the product of its word probabilities
p_text1 = 1
for token in ("the", "new", "table", "is", "red"):
    p_text1 *= pw[token]
# "black" never occurs in the corpus, so it drives the whole product to zero
p_text2 = 1
for token in ("the", "black", "table"):
    p_text2 *= pw[token]
print("P(the new table is red)= %f" % p_text1)
print("P(the black table)= %f" % p_text2)

defaultdict(<function <lambda> at 0x000002B13E4C3EB0>, {'blue': 0.1, 'broken': 0.1, 'is': 0.2, 'new': 0.1, 'red': 0.1, 'table': 0.2, 'the': 0.2}) 

P(the new table is red)= 0.000080
P(the black table)= 0.000000


In [5]:
V = len(vocabulary)  # Vocabulary size
total_words = len(words_processed)  # Corpus size N
l = 0.001  # Lambda used by add-lambda smoothing
# Every smoothed estimate shares the same denominator: N + lambda * V
smoothed_total = total_words + (l*V)
# An unknown word has count 0, so only lambda is left in the numerator
p_unknown = l / smoothed_total
print("P(unknown)=%f\n" % p_unknown)
# Smoothed P(w_n) = (count(w_n) + lambda) / (N + lambda * V)
unigram_probabilities_addl = {
    word: (count + l) / smoothed_total for word, count in unigrams.items()
}
print("Unigram probabilities (Add-lambda smoothing):\n",
      unigram_probabilities_addl, "\n")
# Lookup that falls back to p_unknown for words missing from the corpus
plw = defaultdict(lambda: p_unknown, unigram_probabilities_addl)
pl_text1 = 1
for token in ("the", "new", "table", "is", "red"):
    pl_text1 *= plw[token]
pl_text2 = 1
for token in ("the", "black", "table"):
    pl_text2 *= plw[token]
print("P(the new table is red)= %f" % pl_text1)
print("P(the black table)= %f" % pl_text2)

P(unknown)=0.000100

Unigram probabilities (Add-lambda smoothing):
 {'blue': 0.10002997901468971, 'broken': 0.10002997901468971, 'is': 0.1999600279804137, 'new': 0.10002997901468971, 'red': 0.10002997901468971, 'table': 0.1999600279804137, 'the': 0.1999600279804137} 

P(the new table is red)= 0.000080
P(the black table)= 0.000004


## 4.3 Bigrams (2-Grams)
### 4.3.1 Compute bigrams

In [6]:
from nltk.util import ngrams
text = "The new table is red. The blue table is broken."
words_processed = [
    'the', 'new', 'table', 'is', 'red',
    'the', 'blue', 'table', 'is', 'broken',
]
# Pairs of adjacent tokens, in document order
bigrams = ngrams(words_processed, 2)
bigram_sequence = list(bigrams)
for bigram in bigram_sequence:
    print(bigram)
# Collapsing the sequence into a set removes repeated bigrams
bigrams_unique = set(bigram_sequence)
print("\nUnique bigrams:\n", bigrams_unique)

('the', 'new')
('new', 'table')
('table', 'is')
('is', 'red')
('red', 'the')
('the', 'blue')
('blue', 'table')
('table', 'is')
('is', 'broken')

Unique bigrams:
 {('the', 'blue'), ('blue', 'table'), ('new', 'table'), ('red', 'the'), ('the', 'new'), ('is', 'red'), ('table', 'is'), ('is', 'broken')}


In [7]:
# Adjacent token pairs of the current token list
bigrams = ngrams(words_processed, 2)
# Tally the pairs, then expose the tally as (bigram, count) items
bigram_counter = FreqDist(bigrams)
bigram_freq = bigram_counter.items()
print(bigram_freq)

dict_items([(('the', 'new'), 1), (('new', 'table'), 1), (('table', 'is'), 2), (('is', 'red'), 1), (('red', 'the'), 1), (('the', 'blue'), 1), (('blue', 'table'), 1), (('is', 'broken'), 1)])


In [8]:
text = "The new table is red. The blue table is broken."
# Same document with <s> / </s> marking where each sentence starts and ends
text2 = "<s> The new table is red. </s> <s> The blue table is broken. </s>"
words_processed = [
    '<s>', 'the', 'new', 'table', 'is', 'red', '</s>',
    '<s>', 'the', 'blue', 'table', 'is', 'broken', '</s>',
]
# Term frequency of every token, boundary markers included
tf = FreqDist(words_processed)
# Distinct tokens, sorted alphabetically for presentation
vocabulary = sorted(set(words_processed))
# Unigram counts; tokens that never occurred are reported as 0
ugf = defaultdict(lambda: 0, {word: tf[word] for word in vocabulary})
print("Unigram counts:", ugf, "\n")
# Adjacent token pairs and how often each one occurs
bigrams = ngrams(words_processed, 2)
bigram_freq = FreqDist(bigrams).items()
print("Bigram counts:", bigram_freq, "\n")
# Bigram counts; pairs that never occurred are reported as 0
bgf = defaultdict(lambda: 0, bigram_freq)


# Create function to compute bigram probability
def p_big(bigram, bigram_frequencies, unigram_frequencies):
    """Return the maximum-likelihood bigram probability P(w2 | w1).

    The estimate is count(w1, w2) / count(w1).

    Args:
        bigram: A 2-tuple ``(w1, w2)``.
        bigram_frequencies: Mapping from bigram tuples to their counts.
        unigram_frequencies: Mapping from words to their counts.

    Returns:
        0 when the bigram or its first word has no recorded count,
        otherwise the ratio of the two counts.
    """
    # .get() instead of indexing: indexing a defaultdict would insert every
    # queried unseen bigram into the count table, and a plain dict would
    # raise KeyError.
    bigram_count = bigram_frequencies.get(bigram, 0)
    history_count = unigram_frequencies.get(bigram[0], 0)
    # A zero history count would otherwise divide by zero.
    if bigram_count == 0 or history_count == 0:
        return 0
    return bigram_count / history_count


# Sentence probability = product of the bigram probabilities along the
# sentence, boundary markers included
p_text1 = 1
for pair in [('<s>', 'the'), ('the', 'new'), ('new', 'table'),
             ('table', 'is'), ('is', 'red'), ('red', '</s>')]:
    p_text1 *= p_big(pair, bgf, ugf)
p_text2 = 1
for pair in [('<s>', 'the'), ('the', 'black'),
             ('black', 'table'), ('table', '</s>')]:
    p_text2 *= p_big(pair, bgf, ugf)
print("P(<s> the new table is red </s>)= %f" % p_text1)
print("P(<s> the black table </s>)= %f" % p_text2)

Unigram counts: defaultdict(<function <lambda> at 0x000002B103081870>, {'</s>': 2, '<s>': 2, 'blue': 1, 'broken': 1, 'is': 2, 'new': 1, 'red': 1, 'table': 2, 'the': 2}) 

Bigram counts: dict_items([(('<s>', 'the'), 2), (('the', 'new'), 1), (('new', 'table'), 1), (('table', 'is'), 2), (('is', 'red'), 1), (('red', '</s>'), 1), (('</s>', '<s>'), 1), (('the', 'blue'), 1), (('blue', 'table'), 1), (('is', 'broken'), 1), (('broken', '</s>'), 1)]) 

P(<s> the new table is red </s>)= 0.250000
P(<s> the black table </s>)= 0.000000


In [9]:
text = "The new table is red. The blue table is broken."
# Add tokens indicating the start and end of a sentence in the respective position
# (two markers on each side, so the first and last words have a full trigram context)
text3 = "<s> <s> The new table is red. </s> </s> <s> <s> The blue table is broken. </s> </s>"
words_processed = ['<s>', '<s>', 'the', 'new', 'table', 'is', 'red', '</s>', '</s>', '<s>', '<s>', 'the',
                   'blue', 'table', 'is', 'broken', '</s>', '</s>']
bigrams = ngrams(words_processed, 2)  # Compute the bigrams in the text
# Compute frequency distribution for all the bigrams in the text
bigram_freq = FreqDist(bigrams).items()
print("Bigram counts:", bigram_freq, "\n")
trigrams = ngrams(words_processed, 3)  # Compute the trigrams in the text
# Compute frequency distribution for all the trigrams in the text
trigram_freq = FreqDist(trigrams).items()
print("Trigram counts:", trigram_freq, "\n")
# Create a dictionary that will return 0 for unknown bigrams
bgf = defaultdict(lambda: 0, bigram_freq)
# Create a dictionary that will return 0 for unknown trigrams
tgf = defaultdict(lambda: 0, trigram_freq)


# Create function to compute trigram probability
def p_trig(trigram, trigram_frequencies, bigram_frequencies):
    """Return the maximum-likelihood trigram probability P(w3 | w1, w2).

    The estimate is count(w1, w2, w3) / count(w1, w2).

    Args:
        trigram: A 3-tuple ``(w1, w2, w3)``.
        trigram_frequencies: Mapping from trigram tuples to their counts.
        bigram_frequencies: Mapping from bigram tuples to their counts.

    Returns:
        0 when the trigram or its two-word history has no recorded count,
        otherwise the ratio of the two counts.
    """
    # .get() instead of indexing: indexing a defaultdict would insert every
    # queried unseen trigram into the count table, and a plain dict would
    # raise KeyError.
    trigram_count = trigram_frequencies.get(trigram, 0)
    history_count = bigram_frequencies.get((trigram[0], trigram[1]), 0)
    # A zero history count would otherwise divide by zero.
    if trigram_count == 0 or history_count == 0:
        return 0
    return trigram_count / history_count


# Sentence probability = product of the trigram probabilities along the
# sentence, boundary markers included
p_text1 = 1
for triple in [('<s>', '<s>', 'the'), ('<s>', 'the', 'new'),
               ('the', 'new', 'table'), ('new', 'table', 'is'),
               ('table', 'is', 'red'), ('is', 'red', '</s>'),
               ('red', '</s>', '</s>')]:
    p_text1 *= p_trig(triple, tgf, bgf)
p_text2 = 1
for triple in [('<s>', '<s>', 'the'), ('<s>', 'the', 'black'),
               ('the', 'black', 'table'), ('black', 'table', '</s>'),
               ('table', '</s>', '</s>')]:
    p_text2 *= p_trig(triple, tgf, bgf)
print("P(<s> <s> the new table is red </s> </s>)= %f" % p_text1)
print("P(<s> <s> the black table </s> </s>)= %f" % p_text2)

Bigram counts: dict_items([(('<s>', '<s>'), 2), (('<s>', 'the'), 2), (('the', 'new'), 1), (('new', 'table'), 1), (('table', 'is'), 2), (('is', 'red'), 1), (('red', '</s>'), 1), (('</s>', '</s>'), 2), (('</s>', '<s>'), 1), (('the', 'blue'), 1), (('blue', 'table'), 1), (('is', 'broken'), 1), (('broken', '</s>'), 1)]) 

Trigram counts: dict_items([(('<s>', '<s>', 'the'), 2), (('<s>', 'the', 'new'), 1), (('the', 'new', 'table'), 1), (('new', 'table', 'is'), 1), (('table', 'is', 'red'), 1), (('is', 'red', '</s>'), 1), (('red', '</s>', '</s>'), 1), (('</s>', '</s>', '<s>'), 1), (('</s>', '<s>', '<s>'), 1), (('<s>', 'the', 'blue'), 1), (('the', 'blue', 'table'), 1), (('blue', 'table', 'is'), 1), (('table', 'is', 'broken'), 1), (('is', 'broken', '</s>'), 1), (('broken', '</s>', '</s>'), 1)]) 

P(<s> <s> the new table is red </s> </s>)= 0.250000
P(<s> <s> the black table </s> </s>)= 0.000000


## 4.6 Exercises
### Exercise 4.1

In [13]:
def get_sentence_probability_unigram(words_list, unigram_frequencies):
    """Return the unsmoothed unigram probability of a sentence.

    Each word contributes count(word) / N, where N is the sum of all counts
    in ``unigram_frequencies``; the sentence probability is the product of
    these factors.

    Args:
        words_list: Sequence of words making up the sentence.
        unigram_frequencies: Mapping from words to their counts.

    Returns:
        The product of the word probabilities. 0.0 as soon as a word has no
        count (this also covers an empty count table, which would otherwise
        divide by zero). 1 for an empty sentence (the empty product).
    """
    total_words = sum(unigram_frequencies.values())
    sentence_probability = 1
    for word in words_list:
        word_count = unigram_frequencies.get(word, 0)
        if word_count == 0:
            # An unseen word zeroes the product; returning here also avoids
            # dividing by total_words == 0 when the count table is empty.
            return 0.0
        sentence_probability *= word_count / total_words
    return sentence_probability


def test_get_sentence_probability_unigram():
    # Test case 1
    words_list = ['the', 'new', 'table', 'is', 'red']
    unigram_frequencies = {'the': 2, 'new': 1, 'table': 1, 'is': 1, 'red': 1}
    expected_probability = 0.00032  # Updated expected probability
    get_sentence_probability_unigram(
        words_list, unigram_frequencies) == expected_probability

    # Test case 2
    words_list = ['the', 'blue', 'table', 'is', 'broken']
    unigram_frequencies = {'the': 2, 'blue': 1,
                           'table': 1, 'is': 1, 'broken': 1}
    expected_probability = 0.00032  # Updated expected probability
    get_sentence_probability_unigram(
        words_list, unigram_frequencies) == expected_probability

    # Test case 3
    words_list = ['the', 'new', 'table', 'is', 'blue']
    unigram_frequencies = {'the': 2, 'new': 1, 'table': 1, 'is': 1, 'red': 1}
    expected_probability = 0
    get_sentence_probability_unigram(
        words_list, unigram_frequencies) == expected_probability

    # Add more test cases as needed

    print("All test cases passed!")


# Run the Exercise 4.1 checks when this cell is executed
test_get_sentence_probability_unigram()

All test cases passed!
