In [7]:
from preprocess import *

text_preprocessor = TextPreprocessor()

## Model Building

In [8]:
class InterpolationAddK:
    def __init__(
        self,
        k=0.1,  # Smoothing parameter
        lambda1=0.1,
        lambda2=0.2,
        lambda3=0.3,
        lambda4=0.4,
    ) -> None:
        # preprocess
        text_preprocessor = TextPreprocessor()
        
        # initialize necessary fields
        self.freq_uni = text_preprocessor.freq_uni
        self.freq_bi = text_preprocessor.freq_bi
        self.freq_tri = text_preprocessor.freq_tri
        self.freq_four = text_preprocessor.freq_four
        
        # k and lambda
        self.k = k
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.lambda4 = lambda4
    
    # ---------- interpolation with Add-k probability of unigram ----------
    def probability(
        self,
        word: str,
        given_tri_gram: tuple,
    ):
        """
        Estimate the probability of a word being the next word after given previous words using linear interpolation with add-k smoothing.

        Args:
            word: The word for which to calculate the next word probability.
            give_tri_gram: A tuple containing the three previous words.

        Returns:
            The estimated probability of 'word' being the next word after 'given_tri_gram' using linear interpolation with add-k smoothing.
        """
        
        # Create the unigram, bigram, trigram, and fourgram tuples
        uni_gram = (word,)
        bi_gram = (given_tri_gram[2], word)
        tri_gram = (given_tri_gram[1], given_tri_gram[2], word)
        four_gram = (given_tri_gram[0], given_tri_gram[1], given_tri_gram[2], word)

        # Calculate probabilities for each n-gram model with add-k smoothing
        unigram_prob = self.unigram_addk_probability(
            current_uni=uni_gram, 
            k=self.k,
        )
        
        bigram_prob = self.n_gram_addk_probability(
            word=bi_gram[1], 
            given_gram=bi_gram[:1], 
            freq_previous=self.freq_uni, 
            freq_current=self.freq_bi,
            k=self.k,
        )
        
        trigram_prob = self.n_gram_addk_probability(
            word=tri_gram[2],
            given_gram=tri_gram[:2],
            freq_previous=self.freq_bi, 
            freq_current=self.freq_tri,
            k=self.k,
        )
        
        fourgram_prob = self.n_gram_addk_probability(
            word=four_gram[3], 
            given_gram=(four_gram[:3]), 
            freq_previous=self.freq_tri, 
            freq_current=self.freq_four,
            k=self.k,
        )

        # Calculate interpolated probability
        probability = (self.lambda1*unigram_prob) + (self.lambda2*bigram_prob) + (self.lambda3*trigram_prob) + (self.lambda4*fourgram_prob)
        
        # print(f'probability of {word}: {probability}')

        return probability
    
    # ---------- Add-k probability of unigram ----------
    def unigram_addk_probability(
        self,
        current_uni: tuple,
        k = 0.1
    ):
        uni_gram_count = self.freq_uni.get(current_uni, 0)
        n_total_words = len(text_preprocessor.training_data)
        n_unique_words = len(self.freq_uni)
        
        probability = (uni_gram_count + k) / (n_total_words + n_unique_words * k) 
        return probability
    
    # ---------- Add-k probability of n-gram, starting from bi-gram ----------
    def n_gram_addk_probability(
        self,
        word: str,
        given_gram: tuple,
        freq_previous: dict, 
        freq_current: dict, 
        k = 0.1
    ):
        # new n-gram
        n_gram = list(given_gram)
        n_gram.append(word)
        n_gram = tuple(n_gram)
        
        current_gram_count = freq_current.get(n_gram, 0)
        previous_gram_count = freq_previous.get(given_gram, 0)
        unique_word_count = len(self.freq_uni)
        
        probability = (current_gram_count + k) / (previous_gram_count + unique_word_count * k)
        
        return probability
    
    # ---------- Predict the next word ----------
    def predict(
        self,
        previous_word: tuple[str, str, str],
    ):
        predictions = []
        for word in self.freq_uni.keys():
            # if (word[0] != '<s>' and word[0] != '</s>'):
            probability = self.probability(word[0], previous_word)
            predictions.append((word, probability)) 

        predictions.sort(key=lambda x: x[1], reverse=True)
        # print(predictions)
        return predictions[0][0][0]
    

In [9]:
model = InterpolationAddK(
    k=pow(10, -2), 
    lambda1=pow(10, -5), 
    lambda2=pow(10, -5),
    lambda3=pow(10, -5),
    lambda4=pow(10, -1),
)
model.predict(('<s>', 'computer', 'science'))

'is'

## Model Evaluation for Validation Set

In [10]:
interpolation_model = InterpolationAddK()

def perplexity_interpolation_for_val():
    pp = 1
    tokenized_validation = text_preprocessor.tokenize_words(text_preprocessor.validation_data)
    tokens = tokenized_validation['sentences']
    for i in range(3, len(tokens)):  # Adjusted loop range for fourgram
        current_word = tokens[i]  # Adjusted indexing for fourgram
        previous_four_gram = (tokens[i - 3], tokens[i - 2], tokens[i - 1], tokens[i])  # Construct fourgram tuple

        probability = interpolation_model.probability(
            word=current_word,
            given_tri_gram=previous_four_gram[:-1],
            
        )  # Calculate probability

        pp *= (1 / probability) ** (1 / len(tokens))  # Update perplexity

    perplexity = pp ** (1 / (len(tokens) - 3))  # Take the nth root of the product of inverse probabilities
    print(perplexity)

perplexity_interpolation_for_val()

1.0029782746726499


## Model Evaluation for Test Set

In [11]:
interpolation_model = InterpolationAddK()

def perplexity_interpolation_for_test():
    pp = 1
    tokenized_validation = text_preprocessor.tokenize_words(text_preprocessor.test_data)
    tokens = tokenized_validation['sentences']
    for i in range(3, len(tokens)):  # Adjusted loop range for fourgram
        current_word = tokens[i]  # Adjusted indexing for fourgram
        previous_four_gram = (tokens[i - 3], tokens[i - 2], tokens[i - 1], tokens[i])  # Construct fourgram tuple

        probability = interpolation_model.probability(
            word=current_word,
            given_tri_gram=previous_four_gram[:-1],
            
        )  # Calculate probability

        pp *= (1 / probability) ** (1 / len(tokens))  # Update perplexity

    perplexity = pp ** (1 / (len(tokens) - 3))  # Take the nth root of the product of inverse probabilities
    print(perplexity)

perplexity_interpolation_for_test()

1.0015364723172278


## Text Generation

In [12]:
model = InterpolationAddK(
    k=pow(10, -2), 
    lambda1=pow(10, -5), 
    lambda2=pow(10, -5),
    lambda3=pow(10, -5),
    lambda4=pow(10, -1),
)

def text_generator(tri_gram):

    generated_text = ' '
    for word in tri_gram:
        generated_text = generated_text + word + ' '
    for i in range(0, 100):
        next_word = model.predict(tri_gram)
        tri_gram = tri_gram[1:]
        tri_gram = list(tri_gram)
        tri_gram.append(next_word)
        tri_gram = tuple(tri_gram)
        generated_text = generated_text + next_word + ' '
    return generated_text
    
generated = text_generator(('<s>', 'computer', 'science'))
generated

' <s> computer science is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of computer science and information technology </s> <s> it is the study of '