In [1]:
# get the preprocessed data from the preprocess file
from preprocess import *

## Model Building

In [2]:
class FourGramWithBackOff:
    def __init__(self):
        self.unigram_freq = {}
        self.bigram_freq = {}
        self.trigram_freq = {}
        self.fourgram_freq = {}
        
    def train(self):
        unigram_freq = FreqDist()
        bigram_freq = FreqDist()
        trigram_freq = FreqDist()
        fourgram_freq = FreqDist()

        for sentence in tokenized_sent['uni_grams']:
            for word in sentence:
                unigram_freq[word] += 1
        

        for sentence in tokenized_sent['bi_grams']:
            for bigram in sentence:
                bigram_freq[bigram] += 1

        for sentence in tokenized_sent['tri_grams']:
            for trigram in sentence:
                trigram_freq[trigram] += 1

        for sentence in tokenized_sent['four_grams']:
            for fourgram in sentence:
                fourgram_freq[fourgram] += 1
                
    def unique_n_gram_occurance(self, unique_n_grams, flat_n_grams, unique_word): 
        word_count_i = 0
        for i in unique_n_grams: 
            if unique_word == i: 
                for j in flat_n_grams: 
                    if i == j: 
                        word_count_i +=1
                return word_count_i
        return 0
    
    def unigram_word_count(self, unique_word): 
        uni_grams = tokenized_sent['uni_grams']
        
        unique_word_tuple = (unique_word,)
        # Flatten the list of lists
        flat_uni_grams = [word for sublist in uni_grams for word in sublist]
        
        # Convert the flattened list to a set to remove duplicates
        unique_words_set_uni_grams = set(flat_uni_grams)
        
        word_count_i = 0
        for i in unique_words_set_uni_grams: 
            if unique_word_tuple == i: 
                for j in flat_uni_grams: 
                    if i == j: 
                        word_count_i +=1
                return word_count_i
        
        return 0 

    def probability_uni(self, unique_word):
        uni_grams = tokenized_sent['uni_grams']
        
        unique_word_tuple = (unique_word,)
        # Flatten the list of lists
        flat_uni_grams = [word for sublist in uni_grams for word in sublist]
        
        num_flat_uni_grams = len(flat_uni_grams)
        
        # Convert the flattened list to a set to remove duplicates
        unique_words_set_uni_grams = set(flat_uni_grams)
        
        # Count the number of unique words
        num_unique_words = len(unique_words_set_uni_grams)
        
        #Create dictionary that store each unique word occurance
        unique_uni_grams_count = self.unique_n_gram_occurance(unique_words_set_uni_grams, flat_uni_grams, unique_word_tuple)
        uni_gram_probability = unique_uni_grams_count/num_flat_uni_grams

        
        return uni_gram_probability
    
    def probability_bi(self, word1, word2):
        bi_grams = tokenized_sent['bi_grams']
        uni_grams = tokenized_sent['uni_grams']
        
        unique_word_tuple = (word1, word2)
        
        flat_uni_grams = [word for sublist in uni_grams for word in sublist]
        flat_bi_grams = [word for sublist in bi_grams for word in sublist]
        
        # num_flat_bi_grams = len(flat_bi_grams)
        
        unique_words_set_uni_grams = set(flat_uni_grams)
        unique_words_set_bi_grams = set(flat_bi_grams)
        
        # num_unique_words_set_bi_grams = len(unique_words_set_bi_grams)
        
        unique_bi_grams_count = self.unique_n_gram_occurance(unique_words_set_bi_grams, flat_bi_grams, unique_word_tuple)
        uni_gram_word = (unique_word_tuple[0],)
        uni_gram_word_count = self.unique_n_gram_occurance(unique_words_set_uni_grams, flat_uni_grams, uni_gram_word)
        if uni_gram_word_count != 0: 
            bi_gram_probability = unique_bi_grams_count/uni_gram_word_count
        else: 
            bi_gram_probability = float('inf')  # Set to infinity
        
        
        return bi_gram_probability
    
    def probability_tri(self, word1, word2, word3):
        tri_grams = tokenized_sent['tri_grams']
        bi_grams = tokenized_sent['bi_grams']
        uni_grams = tokenized_sent['uni_grams']
        
        unique_word_tuple = (word1, word2, word3)
        
        flat_uni_grams = [word for sublist in uni_grams for word in sublist]
        flat_bi_grams = [word for sublist in bi_grams for word in sublist]
        flat_tri_grams = [word for sublist in tri_grams for word in sublist]
        
        unique_words_set_uni_grams = set(flat_uni_grams)
        unique_words_set_bi_grams = set(flat_bi_grams)
        unique_words_set_tri_grams = set(flat_tri_grams)
        
        unique_tri_grams_count = self.unique_n_gram_occurance(unique_words_set_tri_grams, flat_tri_grams, unique_word_tuple)
        bi_gram_word = (unique_word_tuple[0], unique_word_tuple[1])
        bi_gram_word_count = self.unique_n_gram_occurance(unique_words_set_bi_grams, flat_bi_grams, bi_gram_word)
        if bi_gram_word_count != 0:
            tri_gram_probability = unique_tri_grams_count / bi_gram_word_count
        else:
            tri_gram_probability = float('inf')  # Set to infinity
        
        return tri_gram_probability

    def probability_four(self, word1, word2, word3, word4):
        four_grams = tokenized_sent['four_grams']
        tri_grams = tokenized_sent['tri_grams']
        bi_grams = tokenized_sent['bi_grams']
        uni_grams = tokenized_sent['uni_grams']
        
        unique_word_tuple = (word1, word2, word3, word4)
        
        flat_uni_grams = [word for sublist in uni_grams for word in sublist]
        flat_bi_grams = [word for sublist in bi_grams for word in sublist]
        flat_tri_grams = [word for sublist in tri_grams for word in sublist]
        flat_four_grams = [word for sublist in four_grams for word in sublist]
        
        unique_words_set_uni_grams = set(flat_uni_grams)
        unique_words_set_bi_grams = set(flat_bi_grams)
        unique_words_set_tri_grams = set(flat_tri_grams)
        unique_words_set_four_grams = set(flat_four_grams)
        
        unique_four_grams_count = self.unique_n_gram_occurance(unique_words_set_four_grams, flat_four_grams, unique_word_tuple)
        tri_gram_word = (unique_word_tuple[0], unique_word_tuple[1], unique_word_tuple[2])
        tri_gram_word_count = self.unique_n_gram_occurance(unique_words_set_tri_grams, flat_tri_grams, tri_gram_word)
        if tri_gram_word_count != 0:
            four_gram_probability = unique_four_grams_count / tri_gram_word_count
        else:
            four_gram_probability = float('inf')  # Set to infinity
        
        return four_gram_probability

    def probability_backoff(self, word1, word2, word3, word4):
        fourgram_prob = self.probability_four(word1, word2, word3, word4)
        
        if fourgram_prob != 0 and fourgram_prob != float('inf'):
            return fourgram_prob
        else:
            trigram_prob = self.probability_tri(word2, word3, word4)
            
            if trigram_prob != 0 and trigram_prob != float('inf'):
                return trigram_prob
            else:
                bigram_prob = self.probability_bi(word3, word4)
                
                if bigram_prob != 0 and bigram_prob != float('inf'):
                    return bigram_prob
                else:
                    unigram_prob = self.probability_uni(word4)
                    return unigram_prob
                
    def predict_next_word(self, word1, word2, word3):
        uni_grams = tokenized_sent['uni_grams']
        possible_next_words = set(word for sublist in uni_grams for word in sublist)
        max_probability = 0
        predicted_word = ""

        for next_word in possible_next_words:
            fourgram_prob = self.probability_backoff(word1, word2, word3, next_word)
            
            if fourgram_prob > max_probability:
                max_probability = fourgram_prob
                predicted_word = next_word

        return predicted_word


In [3]:
model = FourGramWithBackOff()

In [4]:
testing_word_prob = model.probability_uni('computer')
testing_word_prob

0.004057543341940244

In [5]:
bi_prop = model.probability_bi('is', 'a')
bi_prop

0.11956521739130435

In [6]:
tri_prop = model.probability_tri('science', 'is', 'a')
tri_prop

0.0

In [7]:
four_prop = model.probability_four('computer', 'science', 'is', 'the')
four_prop

0.3333333333333333

In [8]:
backoff_prop = model.probability_backoff('computer', 'science', 'is', 'the')
backoff_prop

0.3333333333333333

In [9]:
result = model.predict_next_word('computer', 'science', 'a')
print(result)




## Model Evaluation

In [10]:
def perplexity_back_off():
    return

## Text Generation