In [4]:
from preprocess import *

with open('corpus/test.txt', 'r') as infile:
    file_content = infile.read().replace('\n', '')

text_preprocessor = TextPreprocessor()

## Model Building

In [5]:
class AddKInterpolation:
    def __init__(self) -> None:
        # preprocess
        text_preprocessor = TextPreprocessor()
        
        # initialize necessary fields
        self.freq_uni = text_preprocessor.freq_uni
        self.freq_bi = text_preprocessor.freq_bi
        self.freq_tri = text_preprocessor.freq_tri
        self.freq_four = text_preprocessor.freq_four
    
    def addk_interpolation_probability(
        self,
        word: str,
        given_tri_gram: tuple,
        k=0.1,  # Smoothing parameter
        lambda1=0.1,
        lambda2=0.2,
        lambda3=0.3,
        lambda4=0.4,
    ):
        """
        Estimate the probability of a word being the next word after given previous words using linear interpolation with add-k smoothing.

        Args:
            word: The word for which to calculate the next word probability.
            previous_words: A tuple containing the two previous words.
            unigram_counts: A dictionary with counts of unigrams.
            bigram_counts: A dictionary with counts of bigrams.
            trigram_counts: A dictionary with counts of trigrams.
            fourgram_counts: A dictionary with counts of fourgrams.
            k: Smoothing parameter (default is 1).
            lambda1: Weight for unigram model.
            lambda2: Weight for bigram model.
            lambda3: Weight for trigram model.
            lambda4: Weight for fourgram model.

        Returns:
            The estimated probability of 'word' being the next word after 'previous_words' using linear interpolation with add-k smoothing.
        """
        # Create the unigram, bigram, trigram, and fourgram tuples
        uni_gram = (word,)
        bi_gram = (given_tri_gram[2], word)
        tri_gram = (given_tri_gram[1], given_tri_gram[2], word)
        four_gram = (given_tri_gram[0], given_tri_gram[1], given_tri_gram[2], word)

        # Calculate probabilities for each n-gram model with add-k smoothing
        unigram_prob = self.unigram_addk_probability(
            current_uni=uni_gram, 
            k=k,
        )
        
        bigram_prob = self.n_gram_addk_probability(
            word=bi_gram[1], 
            given_gram=bi_gram[:1], 
            freq_previous=self.freq_uni, 
            freq_current=self.freq_bi,
            k=k,
        )
        
        trigram_prob = self.n_gram_addk_probability(
            word=tri_gram[2],
            given_gram=tri_gram[:2],
            freq_previous=self.freq_bi, 
            freq_current=self.freq_tri,
            k=k,
        )
        
        fourgram_prob = self.n_gram_addk_probability(
            word=four_gram[3], 
            given_gram=(four_gram[:3]), 
            freq_previous=self.freq_tri, 
            freq_current=self.freq_four,
            k=k,
        )

        # Calculate interpolated probability
        probability = (lambda1*unigram_prob) + (lambda2*bigram_prob) + (lambda3*trigram_prob) + (lambda4*fourgram_prob)
        print(f'probability: {probability}')

        return probability
    
    # ---------- Add-k probability of unigram ----------
    def unigram_addk_probability(
        self,
        current_uni: tuple,
        k = 0.1
    ):
        def get_n_total_words():
            # tokenized_sent = tokenize_sentences(sentences)
            return len(text_preprocessor.training_data)
        
        uni_gram_count = self.freq_uni.get(current_uni, 0)
        n_total_words = get_n_total_words()
        n_unique_words = len(self.freq_uni)
        
        probability = (uni_gram_count + k) / (n_total_words + n_unique_words * k) 
        return probability
    
    # ---------- Add-k probability of n-gram, starting from bi-gram ----------
    def n_gram_addk_probability(
        self,
        word: str,
        given_gram: tuple,
        freq_previous: dict, 
        freq_current: dict, 
        k = 0.1
    ):
        # new n-gram
        n_gram = list(given_gram)
        n_gram.append(word)
        n_gram = tuple(n_gram)
        
        current_gram_count = freq_current.get(n_gram, 0)
        previous_gram_count = freq_previous.get(given_gram, 0)
        unique_word_count = len(self.freq_uni)
        
        probability = (current_gram_count + k) / (previous_gram_count + unique_word_count * k)
        
        return probability
    
    # ---------- Predict the next word ----------
    def predict(
        self,
        previous_word: tuple[str, str, str],
    ):
        predictions = []

        for word in self.freq_uni.keys():
            probability = self.addk_interpolation_probability(word[0], previous_word)

            predictions.append((word, probability)) 

        predictions.sort(key=lambda x: x[1], reverse=True)
        return predictions[0][0][0]
    

In [6]:
model = AddKInterpolation()
model.predict(('this', 'is', 'a'))

probability: 0.09803921568627452
probability: 0.15686274509803919
probability: 0.15686274509803919
probability: 0.15686274509803919
probability: 0.8235294117647058
probability: 0.15686274509803919
probability: 0.09803921568627452


'test'

In [7]:
def unigram_addk_probability(
    freq_uni: dict,
    current_uni: tuple,
    k = 0.1
):
    
    def get_n_total_words():
        # tokenized_sent = tokenize_sentences(sentences)
        return len(text_preprocessor.training_data)
    
    uni_gram_count = freq_uni.get(current_uni, 0)
    n_total_words = get_n_total_words()
    n_unique_words = len(freq_uni)
    
    probability = (uni_gram_count + k) / (n_total_words + n_unique_words * k) 
    return probability

In [8]:
def n_gram_addk_probability(
    word: str,
    given_gram: tuple,
    freq_uni: dict,
    freq_previous: dict, 
    freq_current: dict, 
    k = 0.1
):
    # print(f'p({word} | {given_gram})', end=' = ')
    # new n-gram
    n_gram = list(given_gram)
    n_gram.append(word)
    n_gram = tuple(n_gram)
    
    current_gram_count = freq_current.get(n_gram, 0)
    previous_gram_count = freq_previous.get(given_gram, 0)
    unique_word_count = len(freq_uni)
    
    probability = (current_gram_count + k) / (previous_gram_count + unique_word_count * k)
    
    # print(f'{probability}')
    
    # print(f'current:{n_gram}: {current_gram_count}')
    # print(f'previous: {given_gram}: {previous_gram_count}')
    # print(unique_word_count)
    
    return probability

In [9]:

def addk_interpolation_probability(
    word: str,
    previous_tri_gram: tuple,
    freq_uni: dict,
    freq_bi: dict,
    freq_tri: dict,
    freq_four: dict,
    k=0.1,  # Smoothing parameter
    lambda1=0.1,
    lambda2=0.2,
    lambda3=0.3,
    lambda4=0.4,
):
    """
    Estimate the probability of a word being the next word after given previous words using linear interpolation with add-k smoothing.

    Args:
        word: The word for which to calculate the next word probability.
        previous_words: A tuple containing the two previous words.
        unigram_counts: A dictionary with counts of unigrams.
        bigram_counts: A dictionary with counts of bigrams.
        trigram_counts: A dictionary with counts of trigrams.
        fourgram_counts: A dictionary with counts of fourgrams.
        k: Smoothing parameter (default is 1).
        lambda1: Weight for unigram model.
        lambda2: Weight for bigram model.
        lambda3: Weight for trigram model.
        lambda4: Weight for fourgram model.

    Returns:
        The estimated probability of 'word' being the next word after 'previous_words' using linear interpolation with add-k smoothing.
    """
    # Create the unigram, bigram, trigram, and fourgram tuples
    uni_gram = (word,)
    bi_gram = (previous_tri_gram[2], word)
    tri_gram = (previous_tri_gram[1], previous_tri_gram[2], word)
    four_gram = (previous_tri_gram[0], previous_tri_gram[1], previous_tri_gram[2], word)
    
    # print(uni_gram)
    # print(bi_gram)
    # print(tri_gram)
    # print(four_gram)

    # Calculate probabilities for each n-gram model with add-k smoothing
    unigram_prob = unigram_addk_probability(
        freq_uni=freq_uni, 
        current_uni=uni_gram, 
        k=k,
    )
    
    bigram_prob = n_gram_addk_probability(
        word=bi_gram[1], 
        given_gram=bi_gram[:1], 
        freq_uni=freq_uni, 
        freq_previous=freq_uni, 
        freq_current=freq_bi,
        k=k,
    )
    
    trigram_prob = n_gram_addk_probability(
        word=tri_gram[2],
        given_gram=tri_gram[:2],
        freq_uni=freq_uni, 
        freq_previous=freq_bi, 
        freq_current=freq_tri,
        k=k,
    )
    
    fourgram_prob = n_gram_addk_probability(
        word=four_gram[3], 
        given_gram=(four_gram[:3]),
        freq_uni=freq_uni, 
        freq_previous=freq_tri, 
        freq_current=freq_four,
        k=k,
    )
    
    # print(len(freq_uni))
    # print(len(freq_tri))
    # print(len(freq_four))
    
    # print(f'uni_prob: {unigram_prob}')
    # print(f'bi_prob: {bigram_prob}')
    # print(f'tri_prob: {trigram_prob}')
    # print(f'four_prob: {fourgram_prob}')

    # Calculate interpolated probability
    probability = (lambda1*unigram_prob) + (lambda2*bigram_prob) + (lambda3*trigram_prob) + (lambda4*fourgram_prob)
    print(f'probability: {probability}')

    return probability

In [10]:
addk_interpolation_probability(
    'test', 
    ('this', 'is', 'a'),
    text_preprocessor.freq_uni,
    text_preprocessor.freq_bi,
    text_preprocessor.freq_tri,
    text_preprocessor.freq_four,
)

probability: 0.8235294117647058


0.8235294117647058

In [11]:
# build a model using interpolation
def language_model_interpolation(
    previous_word: tuple[str, str, str],
    freq_uni: dict[tuple, int],
    freq_bi: dict[tuple, int],
    freq_tri: dict[tuple, int],
    freq_four: dict[tuple, int],
):
    predictions = []

    for word in freq_uni.keys():
        probability = addk_interpolation_probability(word[0], previous_word, freq_uni, freq_bi, freq_tri, freq_four,)

        predictions.append((word, probability)) 

    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[0][0][0]

In [12]:
next_word = language_model_interpolation(('this', 'is', 'a'), text_preprocessor.freq_uni, text_preprocessor.freq_bi, text_preprocessor.freq_tri, text_preprocessor.freq_four)
print("Next word: " + next_word)

probability: 0.09803921568627452
probability: 0.15686274509803919
probability: 0.15686274509803919
probability: 0.15686274509803919
probability: 0.8235294117647058
probability: 0.15686274509803919
probability: 0.09803921568627452
Next word: test


In [13]:
text_preprocessor.freq_bi.get(('is', 'test'),0)

0

In [14]:
print(n_gram_addk_probability('test', ('a',), text_preprocessor.freq_uni, text_preprocessor.freq_uni, text_preprocessor.freq_bi))

0.7777777777777778


In [15]:
(1 + 0.1) / (1 + 0.7)

0.6470588235294118

In [16]:
unigram_addk_probability(text_preprocessor.freq_uni, ('this',))

1.2352941176470587

In [17]:
(1 + 0.1) / (7 + 0.7)

0.14285714285714288

In [18]:
print(text_preprocessor.freq_four.items())
print(addk_interpolation_probability('test', ('this', 'is', 'a'), text_preprocessor.freq_uni, text_preprocessor.freq_bi, text_preprocessor.freq_tri, text_preprocessor.freq_four,))

dict_items([(('<s>', 'this', 'is', 'a'), 1), (('this', 'is', 'a', 'test'), 2), (('is', 'a', 'test', 'corpus'), 2), (('a', 'test', 'corpus', 'this'), 1), (('test', 'corpus', 'this', 'is'), 1), (('corpus', 'this', 'is', 'a'), 1), (('a', 'test', 'corpus', '</s>'), 1)])
probability: 0.8235294117647058
0.8235294117647058


## Model Evaluation

In [19]:
def perplexity_interpolation():
    return

## Text Generation