In [119]:
# get the preprocessed data from the preprocess file
from preprocess import *

## Model Building

In [120]:

def unigram_addk_probability(
    freq_uni: dict,
    current_uni: tuple,
    k: float = 0.1
) -> float:
    """
    Estimate the add-k smoothed probability of a single unigram.

    The estimate is (count + k) / (N + V * k), where ``count`` is the
    frequency of ``current_uni``, N is the total number of counted words and
    V is the number of distinct unigrams. Both N and V are derived from
    ``freq_uni`` itself, so the result depends only on the arguments and the
    probabilities of all keys of ``freq_uni`` sum to 1.

    Args:
        freq_uni: Mapping from unigram tuple, e.g. ``('this',)``, to its count.
        current_uni: The unigram tuple to score; unseen unigrams count as 0.
        k: Add-k smoothing constant (default 0.1).

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero
        (an empty frequency table combined with k == 0).
    """
    uni_gram_count = freq_uni.get(current_uni, 0)
    # Total number of words is the sum of all unigram counts. Reading it from
    # the table (instead of a notebook-level global) keeps numerator and
    # denominator consistent for whatever table the caller passes in.
    n_total_words = sum(freq_uni.values())
    n_unique_words = len(freq_uni)

    denominator = n_total_words + n_unique_words * k
    if denominator == 0:
        # Nothing was counted and no smoothing mass was added.
        return 0.0
    return (uni_gram_count + k) / denominator

In [121]:
def n_gram_addk_probability(
    word: str,
    given_gram: tuple,
    freq_uni: dict,
    freq_previous: dict, 
    freq_current: dict, 
    k = 0.1,
    verbose: bool = True,
):
    """
    Estimate the add-k smoothed conditional probability p(word | given_gram).

    The estimate is (c(given_gram + word) + k) / (c(given_gram) + V * k),
    where V is the number of distinct unigrams.

    Args:
        word: The candidate next word.
        given_gram: Tuple of context words that precede ``word``.
        freq_uni: Unigram count table; only its size (V) is used here.
        freq_previous: Count table looked up with ``given_gram``.
        freq_current: Count table looked up with ``given_gram + (word,)``.
        k: Add-k smoothing constant (default 0.1).
        verbose: When True (the default, matching the previous behaviour)
            print a trace of the counts and the resulting probability.
            Pass False to silence the trace, e.g. when scoring many words.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero
        (unseen context, or empty vocabulary, combined with k == 0).
    """
    # Extend the context with the candidate word to form the higher-order gram.
    n_gram = tuple(given_gram) + (word,)

    current_gram_count = freq_current.get(n_gram, 0)
    previous_gram_count = freq_previous.get(given_gram, 0)
    unique_word_count = len(freq_uni)

    denominator = previous_gram_count + unique_word_count * k
    if denominator == 0:
        # Without smoothing mass an unseen context has no defined estimate;
        # report zero probability instead of raising ZeroDivisionError.
        probability = 0.0
    else:
        probability = (current_gram_count + k) / denominator

    if verbose:
        print(f'p({word} | {given_gram}) = {probability}')
        print(f'current:{n_gram}: {current_gram_count}')
        print(f'previous: {given_gram}: {previous_gram_count}')
        print(unique_word_count)

    return probability

In [122]:

def addk_probability_interpolation(
    word: str,
    previous_tri_gram: tuple,
    freq_uni: dict,
    freq_bi: dict,
    freq_tri: dict,
    freq_four: dict,
    k=0.1,  # Smoothing parameter
    lambda1=0.1,
    lambda2=0.2,
    lambda3=0.3,
    lambda4=0.4,
):
    """
    Score `word` as the continuation of a three-word context by linearly
    interpolating add-k smoothed unigram, bigram, trigram and four-gram estimates.

    Args:
        word: The candidate next word.
        previous_tri_gram: The context; elements 0, 1 and 2 are read, with
            element 2 being the word immediately before `word`.
        freq_uni: Count table keyed by unigram tuples.
        freq_bi: Count table keyed by bigram tuples.
        freq_tri: Count table keyed by trigram tuples.
        freq_four: Count table keyed by four-gram tuples.
        k: Add-k smoothing constant forwarded to every estimator (default 0.1).
        lambda1: Weight of the unigram estimate (default 0.1).
        lambda2: Weight of the bigram estimate (default 0.2).
        lambda3: Weight of the trigram estimate (default 0.3).
        lambda4: Weight of the four-gram estimate (default 0.4).

    Returns:
        lambda1 * P_uni + lambda2 * P_bi + lambda3 * P_tri + lambda4 * P_four.

    The function also prints the four n-gram tuples it builds and the four
    component probabilities.
    """
    first = previous_tri_gram[0]
    second = previous_tri_gram[1]
    third = previous_tri_gram[2]

    # Build the longest gram once; the shorter ones are its suffixes.
    four_gram = (first, second, third, word)
    tri_gram = four_gram[1:]
    bi_gram = four_gram[2:]
    uni_gram = four_gram[3:]

    for gram in (uni_gram, bi_gram, tri_gram, four_gram):
        print(gram)

    def conditional(gram, context_counts, gram_counts):
        # p(last word of gram | preceding words of gram) with add-k smoothing.
        return n_gram_addk_probability(
            word=gram[-1],
            given_gram=gram[:-1],
            freq_uni=freq_uni,
            freq_previous=context_counts,
            freq_current=gram_counts,
            k=k,
        )

    unigram_prob = unigram_addk_probability(freq_uni=freq_uni, current_uni=uni_gram, k=k)
    bigram_prob = conditional(bi_gram, freq_uni, freq_bi)
    trigram_prob = conditional(tri_gram, freq_bi, freq_tri)
    fourgram_prob = conditional(four_gram, freq_tri, freq_four)

    components = (
        ('uni_prob', unigram_prob),
        ('bi_prob', bigram_prob),
        ('tri_prob', trigram_prob),
        ('four_prob', fourgram_prob),
    )
    for label, value in components:
        print(f'{label}: {value}')

    # Weighted sum, accumulated left to right from the unigram term upward.
    return (
        (lambda1 * unigram_prob)
        + (lambda2 * bigram_prob)
        + (lambda3 * trigram_prob)
        + (lambda4 * fourgram_prob)
    )

In [123]:
# Smoke test: score 'test' after the context ('this', 'is', 'a') using the
# default smoothing constant and interpolation weights.
addk_probability_interpolation(
    word='test',
    previous_tri_gram=('this', 'is', 'a'),
    freq_uni=freq_uni,
    freq_bi=freq_bi,
    freq_tri=freq_tri,
    freq_four=freq_four,
)

('test',)
('a', 'test')
('is', 'a', 'test')
('this', 'is', 'a', 'test')
p(test | ('a',)) = 0.6470588235294118
current:('a', 'test'): 1
previous: ('a',): 1
7
p(test | ('is', 'a')) = 0.6470588235294118
current:('is', 'a', 'test'): 1
previous: ('is', 'a'): 1
7
p(test | ('this', 'is', 'a')) = 0.6470588235294118
current:('this', 'is', 'a', 'test'): 1
previous: ('this', 'is', 'a'): 1
7
uni_prob: 0.14285714285714288
bi_prob: 0.6470588235294118
tri_prob: 0.6470588235294118
four_prob: 0.6470588235294118


0.596638655462185

In [124]:
# Look up the count of the bigram ('is', 'test'), falling back to 0 when it is not a key of freq_bi.
freq_bi.get(('is', 'test'),0)

0

In [125]:
# Number of distinct unigrams, trigrams and four-grams, one per line.
print(len(freq_uni), len(freq_tri), len(freq_four), sep='\n')

7
5
4


In [126]:
# Recompute the bigram estimate p(test | a) on its own; the unigram table
# doubles as the context-count table for a one-word context.
print(
    n_gram_addk_probability(
        word='test',
        given_gram=('a',),
        freq_uni=freq_uni,
        freq_previous=freq_uni,
        freq_current=freq_bi,
    )
)

p(test | ('a',)) = 0.6470588235294118
current:('a', 'test'): 1
previous: ('a',): 1
7
0.6470588235294118


In [127]:
# Hand check of the add-k bigram estimate printed above:
# (count + k) / (context count + V * k) with count = 1, k = 0.1,
# context count = 1 and V * k = 7 * 0.1.
(1 + 0.1) / (1 + 0.7)

0.6470588235294118

In [128]:
# Add-k unigram estimate for 'this' with the default k.
unigram_addk_probability(freq_uni=freq_uni, current_uni=('this',))

0.14285714285714288

In [129]:
# Hand check of the add-k unigram estimate: (count + k) / (N + V * k)
# with count = 1, k = 0.1, N = 7 total words and V = 7 distinct words.
(1 + 0.1) / (7 + 0.7)

0.14285714285714288

In [130]:
# unigram_addk_probability(freq_uni, ('iNtelligence'.lower(),))

In [131]:
# from collections import Counter
# unigram_counts = Counter(unigrams)
# bigram_counts = Counter(bigrams)
# trigram_counts = Counter(trigrams)
# fourgram_counts = Counter(fourgrams)


# test = addk_probability_interpolation(
#     "it",
#     ("for", "testing", "purposes"),
#     unigram_counts,
#     bigram_counts,
#     trigram_counts,
#     fourgram_counts,
# )

# print(test)

In [132]:
# def generate_text(
#     seed_words,
#     length,
#     interpolation_function,
#     unigram_counts,
#     bigram_counts,
#     trigram_counts,
#     fourgram_counts,
# ):
#     generated_text = list(seed_words)

#     for _ in range(length):
#         previous_words = tuple(generated_text[-3:])  # Get the last three words
#         next_word = generate_next_word(
#             interpolation_function,
#             previous_words,
#             unigram_counts,
#             bigram_counts,
#             trigram_counts,
#             fourgram_counts,
#         )
#         generated_text.append(next_word)

#     return " ".join(generated_text)


# def generate_next_word(
#     interpolation_function,
#     previous_words,
#     unigram_counts,
#     bigram_counts,
#     trigram_counts,
#     fourgram_counts,
# ):
#     # Create a list to store predicted words and their probabilities
#     predictions = []

#     # Iterate through all possible unigrams
#     for word in unigram_counts.keys():
#         # Calculate the probability of the word being the next word
#         probability = interpolation_function(
#             word,
#             previous_words,
#             unigram_counts,
#             bigram_counts,
#             trigram_counts,
#             fourgram_counts,
#         )

#         # Add the word and its probability to the list
#         predictions.append((word, probability))

#     # Sort the predictions by probability in descending order
#     predictions.sort(key=lambda x: x[1], reverse=True)

#     # Return the word with the highest probability
#     return str(predictions[0][0])
#     # return predictions[0][0]


# seed_words = ["for", "testing", "purposes",]  # Seed sequence
# generated_text = generate_text(
#     seed_words,
#     100,
#     addk_probability_interpolation,
#     unigram_counts,
#     bigram_counts,
#     trigram_counts,
#     fourgram_counts,
# )
# print(generated_text)

In [133]:
# build a model using interpolation
def language_model_interpolation():
    """Placeholder for the interpolation-based language model.

    TODO: not implemented yet; takes no arguments and returns None.
    """
    return None

## Model Evaluation

In [134]:
def perplexity_interpolation():
    """Placeholder for the perplexity evaluation step.

    TODO: not implemented yet; takes no arguments and returns None.
    """
    return None

## Text Generation