In [2]:
from collections import Counter

# Training corpus and the sentence to score. "<s>" and "</s>" are explicit
# sentence-boundary tokens and are treated like ordinary words when counting.
corpus_sentences = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>",
]
target_sentence = "<s> I read a book by Danielle </s>"

# Tokenize each sentence separately so sentence boundaries are preserved
tokenized_sentences = [sentence.split() for sentence in corpus_sentences]

# Flat list of every corpus token (feeds the unigram counts and the vocabulary)
tokens = [
    word
    for sentence_tokens in tokenized_sentences
    for word in sentence_tokens
]

# Generate bigrams within each sentence only. Pairing tokens across the
# flattened corpus would create spurious ("</s>", "<s>") bigrams that join the
# end of one sentence to the start of the next.
bigrams = [
    pair
    for sentence_tokens in tokenized_sentences
    for pair in zip(sentence_tokens, sentence_tokens[1:])
]

# Count the occurrences of each bigram and each word in the corpus
bigram_counts = Counter(bigrams)
word_counts = Counter(tokens)

# Determine the vocabulary size for smoothing purposes
# (the boundary tokens "<s>" and "</s>" are included in the vocabulary)
vocabulary = set(tokens)
V = len(vocabulary)  # Vocabulary size

# Unsmoothed (maximum-likelihood) bigram probability
def calculate_unsmoothed_probability(bigram):
    """Return count(w1, w2) / count(w1) for ``bigram == (w1, w2)``.

    Counts are read from the module-level ``bigram_counts`` and
    ``word_counts``. When the first word has no recorded occurrences the
    result is 0, which avoids a division by zero.
    """
    context_count = word_counts[bigram[0]]
    if context_count <= 0:
        return 0
    return bigram_counts[bigram] / context_count

# Add-one (Laplace) smoothed bigram probability
def calculate_smoothed_probability(bigram, V):
    """Return (count(w1, w2) + 1) / (count(w1) + V) for ``bigram == (w1, w2)``.

    Counts are read from the module-level ``bigram_counts`` and
    ``word_counts``; a bigram or word that was never counted contributes 0.
    ``V`` is the value added to the denominator (the vocabulary size).
    """
    pair_count = bigram_counts.get(bigram, 0)
    context_count = word_counts.get(bigram[0], 0)
    return (pair_count + 1) / (context_count + V)

# Sentence probability as the product of its bigram probabilities
def calculate_sentence_probability(sentence, probability_function, V=None):
    """Multiply the probabilities of every adjacent word pair in ``sentence``.

    ``sentence`` is split on whitespace and each consecutive pair of words is
    passed to ``probability_function`` as a 2-tuple. When ``V`` is not None it
    is forwarded as a second positional argument (the smoothing parameter);
    otherwise the function is called with the bigram alone. A sentence with
    fewer than two words has no bigrams, so the result is 1.0.
    """
    words = sentence.split()
    # Forward V only when the caller supplied one
    extra_args = () if V is None else (V,)
    result = 1.0
    for pair in zip(words, words[1:]):
        result *= probability_function(pair, *extra_args)
    return result

# Score the target sentence under the unsmoothed model (no V is passed) ...
unsmoothed_probability = calculate_sentence_probability(
    target_sentence,
    calculate_unsmoothed_probability,
)
# ... and under the add-one smoothed model, which takes the vocabulary size V
smoothed_probability = calculate_sentence_probability(
    target_sentence,
    calculate_smoothed_probability,
    V,
)

# Report both results
print("Unsmoothed Sentence Probability:", unsmoothed_probability)
print("Smoothed Sentence Probability:", smoothed_probability)


Unsmoothed Sentence Probability: 0.07407407407407407
Smoothed Sentence Probability: 1.0101357919757919e-05
