In [319]:
# get the preprocessed data from the preprocess file
from preprocess import *

## Model Building

In [320]:
class FourGramWithBackOff:
    """Four-gram language model that backs off to shorter n-grams.

    N-gram counts are collected once, at construction time. Every
    conditional probability is a plain relative frequency
    (count of the n-gram divided by count of its context); no smoothing
    or discounting is applied. ``probability_backoff`` returns the estimate
    of the longest n-gram order that is non-zero.
    """

    def __init__(self, data):
        """Count every n-gram found in ``data``.

        Args:
            data: Mapping with the keys ``'uni_grams'``, ``'bi_grams'``,
                ``'tri_grams'`` and ``'four_grams'``. Each value is iterated
                as a collection of sentences and each sentence as a
                collection of hashable n-grams. The probability methods look
                counts up by tuple of words (a 1-tuple for unigrams), so the
                n-grams are expected to be such tuples.
        """
        self.uni_grams = data['uni_grams']
        self.bi_grams = data['bi_grams']
        self.tri_grams = data['tri_grams']
        self.four_grams = data['four_grams']

        self.unigram_freq = self._count_ngrams(self.uni_grams)
        self.bigram_freq = self._count_ngrams(self.bi_grams)
        self.trigram_freq = self._count_ngrams(self.tri_grams)
        self.fourgram_freq = self._count_ngrams(self.four_grams)

    @staticmethod
    def _count_ngrams(sentences):
        """Return a FreqDist counting each n-gram across all sentences."""
        counts = FreqDist()
        for sentence in sentences:
            for ngram in sentence:
                counts[ngram] += 1
        return counts

    @staticmethod
    def _relative_frequency(count, context_count):
        """Return ``count / context_count``, or 0.0 for an unseen context.

        Dividing by a zero count raises ZeroDivisionError in Python (it does
        not produce ``inf``), so the unseen-context case is handled here to
        let callers treat it as "no evidence" and back off.
        """
        if not context_count:
            return 0.0
        return count / context_count

    def get(self):
        """Print every four-gram and its count, one ``sample: count`` per line."""
        for sample, frequency in self.fourgram_freq.items():
            print(f"{sample}: {frequency}")

    def probability_uni(self, word):
        """Return count(word) / total unigram count (0.0 for an empty model)."""
        total = sum(self.unigram_freq.values())
        return self._relative_frequency(self.unigram_freq.get((word,), 0), total)

    def probability_bi(self, word1, word2):
        """Return count(word1 word2) / count(word1); 0.0 if word1 is unseen."""
        return self._relative_frequency(
            self.bigram_freq.get((word1, word2), 0),
            self.unigram_freq.get((word1,), 0),
        )

    def probability_tri(self, word1, word2, word3):
        """Return count(word1 word2 word3) / count(word1 word2).

        Returns 0.0 if the bigram context is unseen.
        """
        return self._relative_frequency(
            self.trigram_freq.get((word1, word2, word3), 0),
            self.bigram_freq.get((word1, word2), 0),
        )

    def probability_four(self, word1, word2, word3, word4):
        """Return count(word1..word4) / count(word1 word2 word3).

        Returns 0.0 if the trigram context is unseen.
        """
        return self._relative_frequency(
            self.fourgram_freq.get((word1, word2, word3, word4), 0),
            self.trigram_freq.get((word1, word2, word3), 0),
        )

    def probability_backoff(self, word1, word2, word3, word4):
        """Estimate P(word4 | word1 word2 word3) with back-off.

        Tries the four-gram estimate first, then the trigram estimate on the
        last three words, then the bigram estimate on the last two, and
        returns the first one that is non-zero. If all of them are zero the
        unigram probability of ``word4`` is returned (which is itself 0.0
        when ``word4`` never occurred).
        """
        fourgram_prob = self.probability_four(word1, word2, word3, word4)
        if fourgram_prob != 0:
            return fourgram_prob

        trigram_prob = self.probability_tri(word2, word3, word4)
        if trigram_prob != 0:
            return trigram_prob

        bigram_prob = self.probability_bi(word3, word4)
        if bigram_prob != 0:
            return bigram_prob

        return self.probability_uni(word4)
                
    

In [321]:
# Build the back-off model from the n-gram data.
# NOTE(review): `tokenized_sent` is not defined in the visible cells; presumably it
# comes from the star import of `preprocess` — confirm.
model = FourGramWithBackOff(tokenized_sent)

('computer', 'science', 'is', 'the'): 1
('science', 'is', 'the', 'study'): 1
('is', 'the', 'study', 'of'): 1
('the', 'study', 'of', 'computation'): 1
('study', 'of', 'computation', 'information'): 1
('of', 'computation', 'information', 'and'): 1
('computation', 'information', 'and', 'automation'): 1
('computer', 'science', 'spans', 'theoretical'): 1
('science', 'spans', 'theoretical', 'disciplines'): 1
('spans', 'theoretical', 'disciplines', 'such'): 1
('theoretical', 'disciplines', 'such', 'as'): 1
('disciplines', 'such', 'as', 'algorithms'): 1
('such', 'as', 'algorithms', 'theory'): 1
('as', 'algorithms', 'theory', 'of'): 1
('algorithms', 'theory', 'of', 'computation'): 1
('theory', 'of', 'computation', 'and'): 1
('of', 'computation', 'and', 'information'): 1
('computation', 'and', 'information', 'theory'): 1
('and', 'information', 'theory', 'to'): 1
('information', 'theory', 'to', 'applied'): 1
('theory', 'to', 'applied', 'disciplines'): 1
('to', 'applied', 'disciplines', 'including

In [322]:
# Spot-check: unigram relative frequency of 'science'.
testing_word_prob = model.probability_uni('science')
testing_word_prob

0.0020902496003934587

In [323]:
# Spot-check: bigram estimate, count('computer science') / count('computer').
bi_prop = model.probability_bi('computer', 'science')
bi_prop

0.3939393939393939

In [324]:
# Spot-check: trigram estimate, count('is the study') / count('is the').
tri_prop = model.probability_tri('is', 'the', 'study')
tri_prop

0.125

In [325]:
# Spot-check: four-gram estimate, count('computer science is the') / count('computer science is').
four_prop = model.probability_four('computer', 'science', 'is', 'the')
four_prop

0.3333333333333333

In [326]:
# Spot-check: back-off estimate for the same four words; when the four-gram
# estimate is non-zero, probability_backoff returns it unchanged.
backoff_prop = model.probability_backoff('computer', 'science', 'is', 'the')
backoff_prop

0.3333333333333333

## Model Evaluation

In [328]:
def perplexity_back_off():
    """Placeholder for the back-off model's perplexity evaluation.

    TODO: not implemented yet — takes no input and always returns None.
    """
    return None

## Text Generation