In [1]:
# get the preprocessed data from the preprocess file
from preprocess import *

## Model Building

In [2]:
class FourGramWithBackOff:        
    def __init__(self):

        # preprocess
        text_preprocessor = TextPreprocessor()
        
        # initialize necessary fields
        self.unigram_freq = text_preprocessor.freq_uni
        self.bigram_freq = text_preprocessor.freq_bi
        self.trigram_freq = text_preprocessor.freq_tri
        self.fourgram_freq = text_preprocessor.freq_four

    def probability_uni(self, word):
        uni_tuple = (word,)
        total_unigrams = sum(self.unigram_freq.values())

        return self.unigram_freq.get(uni_tuple, 0) / total_unigrams if total_unigrams != 0 else 0

    def probability_bi(self, word1, word2):
        bi_tuple = (word1, word2)
        word = (word1,)

        unigram_count = self.unigram_freq.get(word, 0)
        return self.bigram_freq.get(bi_tuple, 0) / unigram_count if unigram_count != 0 else 0

    def probability_tri(self, word1, word2, word3):
        tri_tuple = (word1, word2, word3)
        bi_tuple = (word1, word2)

        bigram_count = self.bigram_freq.get(bi_tuple, 0)
        return self.trigram_freq.get(tri_tuple, 0) / bigram_count if bigram_count != 0 else 0

    def probability_four(self, word1, word2, word3, word4):
        four_tuple = (word1, word2, word3, word4)
        tri_tuple = (word1, word2, word3)

        trigram_count = self.trigram_freq.get(tri_tuple, 0)
        return self.fourgram_freq.get(four_tuple, 0) / trigram_count if trigram_count != 0 else 0

    def probability_backoff(self, word1, word2, word3, word4):
        fourgram_prob = self.probability_four(word1, word2, word3, word4)
        
        if fourgram_prob != 0 and fourgram_prob != float('inf'):
            return fourgram_prob
        else:
            trigram_prob = self.probability_tri(word2, word3, word4)
            
            if trigram_prob != 0 and trigram_prob != float('inf'):
                return trigram_prob
            else:
                bigram_prob = self.probability_bi(word3, word4)
                
                if bigram_prob != 0 and bigram_prob != float('inf'):
                    return bigram_prob
                else:
                    unigram_prob = self.probability_uni(word4)
                    return unigram_prob
                
    def predict_next_word(self, word1, word2, word3):
        predictions = []
        
        for i in range(1, 5):
            # Perform backoff starting from fourgram down to unigram
            if(i == 1):
                for word in self.unigram_freq.keys():
                    probability = self.probability_four(word1, word2, word3, word[0])
                    predictions.append((word, probability))
                predictions.sort(key=lambda x: x[1], reverse=True)
                if predictions[0][1] == 0.0:
                    predictions = []
                    continue
                else:
                    break
            elif(i == 2):
                for word in self.unigram_freq.keys():
                    probability = self.probability_tri(word2, word3, word[0])
                    predictions.append((word, probability))
                predictions.sort(key=lambda x: x[1], reverse=True)
                if predictions[0][1] == 0.0:
                    predictions = []
                    continue
                else:
                    break
            elif(i == 3):
                for word in self.unigram_freq.keys():
                    probability = self.probability_bi(word3, word[0])
                    predictions.append((word, probability))
                predictions.sort(key=lambda x: x[1], reverse=True)
                if predictions[0][1] == 0.0:
                    predictions = []
                    continue
                else:
                    break
            else:
                for word in self.unigram_freq.keys():
                    probability = self.probability_bi(word[0])
                    predictions.append((word, probability))
                predictions.sort(key=lambda x: x[1], reverse=True)
                if predictions[0][1] == 0.0:
                    predictions = []
                    continue
                else:
                    break

        # Return the word with the highest probability
        return predictions[0][0][0]
        
    def generate_text(self, initial_words, num_words=10):
        generated_text = list(initial_words)

        for _ in range(num_words):
            if len(generated_text) >= 3:
                word1, word2, word3 = generated_text[-3:]
            elif len(generated_text) == 2:
                word1, word2, word3 = generated_text[0], generated_text[0], generated_text[1]
            elif len(generated_text) == 1:
                word1, word2, word3 = generated_text[0], generated_text[0], generated_text[0]
            else:
                # Handle the case where there are fewer than 3 initial words
                raise ValueError("Insufficient initial words for text generation.")

            next_word = self.predict_next_word(word1, word2, word3)
            generated_text.append(next_word)

        return generated_text


## Text Generation

In [3]:
model = FourGramWithBackOff()

In [4]:
backoff_prop = model.probability_backoff('computer', 'science', 'is', 'the')
backoff_prop

0.4

In [5]:
model.predict_next_word('computer', 'science', 'is')

'the'

In [6]:
initial_words = ['computer', 'science', 'is']
generated_text = model.generate_text(initial_words, num_words=50)
print(" ".join(generated_text))

computer science is the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology


## Model Evaluation for Validation Set

In [7]:
def perplexity_back_off():
    return

In [10]:
text_preprocessor = TextPreprocessor()
backoff_model = FourGramWithBackOff()

def perplexity_backoff_four_val(backoff_model):
    pp = 1
    tokenized_validation = text_preprocessor.tokenize_words(text_preprocessor.validation_data)
    tokens = tokenized_validation['sentences']
    for i in range(3, len(tokens)):  # Adjusted loop range for fourgram
        current_word = tokens[i]  # Adjusted indexing for fourgram
        previous_three_gram = (tokens[i - 3], tokens[i - 2], tokens[i - 1])  # Construct threegram tuple

        probability = backoff_model.probability_backoff(
            word1=tokens[i-3],
            word2=tokens[i-2],
            word3=tokens[i-1],
            word4=current_word
            
        )  # Calculate probability using backoff model

        if probability != 0:  # Exclude zero probabilities
            pp *= (1 / probability) ** (1 / len(tokens))  # Update perplexity

    perplexity = pp ** (1 / (len(tokens) - 3))  # Take the nth root of the product of inverse probabilities
    return perplexity

perplexity_backoff_four_val(backoff_model)

1.002155920517299

## Model Evaluation for Test Set

In [11]:
text_preprocessor = TextPreprocessor()
backoff_model = FourGramWithBackOff()

def perplexity_backoff_four_val(backoff_model):
    pp = 1
    tokenized_validation = text_preprocessor.tokenize_words(text_preprocessor.test_data)
    tokens = tokenized_validation['sentences']
    for i in range(3, len(tokens)):  # Adjusted loop range for fourgram
        current_word = tokens[i]  # Adjusted indexing for fourgram
        previous_three_gram = (tokens[i - 3], tokens[i - 2], tokens[i - 1])  # Construct threegram tuple

        probability = backoff_model.probability_backoff(
            word1=tokens[i-3],
            word2=tokens[i-2],
            word3=tokens[i-1],
            word4=current_word
            
        )  # Calculate probability using backoff model

        if probability != 0:  # Exclude zero probabilities
            pp *= (1 / probability) ** (1 / len(tokens))  # Update perplexity

    perplexity = pp ** (1 / (len(tokens) - 3))  # Take the nth root of the product of inverse probabilities
    return perplexity

perplexity_backoff_four_val(backoff_model)

1.0010125547047657

In [12]:
model = FourGramWithBackOff()

def text_generator(tri_gram):

    generated_text = ' '
    for word in tri_gram:
        generated_text = generated_text + word + ' '
    for i in range(0, 100):
        next_word = model.predict_next_word(tri_gram[0], tri_gram[1], tri_gram[2])
        tri_gram = tri_gram[1:]
        tri_gram = list(tri_gram)
        tri_gram.append(next_word)
        tri_gram = tuple(tri_gram)
        generated_text = generated_text + next_word + ' '
    return generated_text
    
generated = text_generator(('<s>', 'computer', 'science'))
generated

' <s> computer science is the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the study of computer science and information technology </s> <s> computational science refers to the '