In [38]:
from preprocess import *
from collections import Counter

In [39]:
def uni_probability(lamda: float, unigram_word: int, total_word: int, k: float, v: int,):
    return (lamda * (unigram_word + k)) / (total_word + (k * v))

def ngram_probability(lamda: float, ngram_count: int, prev_ngram_count: int, k: float, v: int,):
    return (lamda * (ngram_count + k)) / (prev_ngram_count + (k * v))

def addk_probability_interpolation(
    word: str,
    previous_words: tuple[str, str, str],
    unigram_counts: dict[str, int],
    bigram_counts: dict[str, int],
    trigram_counts: dict[str, int],
    fourgram_counts: dict[str, int],
    k=0.1,  # Smoothing parameter
    lambda1=0.1,
    lambda2=0.2,
    lambda3=0.3,
    lambda4=0.4,
):
    """
    Estimate the probability of a word being the next word after given previous words using linear interpolation with add-k smoothing.

    Args:
        word: The word for which to calculate the next word probability.
        previous_words: A tuple containing the two previous words.
        unigram_counts: A dictionary with counts of unigrams.
        bigram_counts: A dictionary with counts of bigrams.
        trigram_counts: A dictionary with counts of trigrams.
        fourgram_counts: A dictionary with counts of fourgrams.
        k: Smoothing parameter (default is 1).
        lambda1: Weight for unigram model.
        lambda2: Weight for bigram model.
        lambda3: Weight for trigram model.
        lambda4: Weight for fourgram model.

    Returns:
        The estimated probability of 'word' being the next word after 'previous_words' using linear interpolation with add-k smoothing.
    """
    # Create the unigram, bigram, trigram, and fourgram tuples
    unigram = (word,)
    bigram = (previous_words[2], word)
    trigram = (previous_words[1], previous_words[2], word)
    fourgram = (previous_words[0], previous_words[1], previous_words[2], word)

    # Get counts from respective count dictionaries
    unigram_count = unigram_counts.get(unigram, 0)
    bigram_count = bigram_counts.get(bigram, 0)
    trigram_count = trigram_counts.get(trigram, 0)
    fourgram_count = fourgram_counts.get(fourgram, 0)

    # Total counts for normalization
    total_word = len(tokenized_words['sentences']) 

    # Total unique word
    unique_word = len(set(tokenized_words['sentences']))

    unigram_prob = uni_probability(
        lambda1, unigram_count, total_word, k, unique_word
    )
    bigram_prob = ngram_probability(
        lambda2, bigram_count, unigram_count, k, unique_word
    )
    trigram_prob = ngram_probability(
        lambda3, trigram_count, bigram_count, k, unique_word
    )
    fourgram_prob = ngram_probability(
        lambda4, fourgram_count, trigram_count, k, unique_word
    )

    probability = unigram_prob + bigram_prob + trigram_prob + fourgram_prob

    return probability;

In [40]:


test = addk_probability_interpolation(
    "test", 
    ("this", "is", "a"),
    Counter(freq_uni),
    Counter(freq_bi),
    Counter(freq_tri),
    Counter(freq_four),
)

print(test)

0.596638655462185
