In [1]:
from preprocess import *

# Notebook-wide preprocessor instance: later cells read its validation_data,
# test_data and freq_tri attributes and call its tokenize_words() method.
text_preprocessor = TextPreprocessor()

## Model Building

In [2]:
class InterpolationAddK:
    """Next-word model that linearly interpolates add-k smoothed 1- to 4-gram estimates.

    N-gram counts are read from a ``TextPreprocessor`` (``freq_uni``, ``freq_bi``,
    ``freq_tri`` and ``freq_four``); every table maps a tuple of words to its count.

    The interpolation weights are used exactly as given (they are not normalised),
    so ``probability`` only returns a proper probability when
    ``lambda1 + lambda2 + lambda3 + lambda4 == 1``.
    """

    def __init__(
        self,
        k=0.1,  # Smoothing parameter
        lambda1=0.1,
        lambda2=0.2,
        lambda3=0.3,
        lambda4=0.4,
        preprocessor=None,
    ) -> None:
        """
        Args:
            k: Add-k smoothing constant.
            lambda1: Interpolation weight of the unigram estimate.
            lambda2: Interpolation weight of the bigram estimate.
            lambda3: Interpolation weight of the trigram estimate.
            lambda4: Interpolation weight of the four-gram estimate.
            preprocessor: Optional object exposing ``freq_uni``, ``freq_bi``,
                ``freq_tri`` and ``freq_four``. When omitted, a fresh
                ``TextPreprocessor()`` is created (the original behaviour).
        """
        # Reuse the caller's preprocessor when supplied instead of always building a new one.
        source = TextPreprocessor() if preprocessor is None else preprocessor

        # n-gram count tables
        self.freq_uni = source.freq_uni
        self.freq_bi = source.freq_bi
        self.freq_tri = source.freq_tri
        self.freq_four = source.freq_four

        # Total token count N for the unigram estimate. It is derived from the
        # unigram counts themselves so that sum_w P(w) == 1 over the vocabulary and
        # the model no longer depends on a notebook-level global.
        self.n_total_words = sum(self.freq_uni.values())

        # k and lambda
        self.k = k
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.lambda4 = lambda4

    # ---------- interpolation with Add-k probability of unigram ----------
    def probability(
        self,
        word: str,
        given_tri_gram: tuple,
    ):
        """
        Estimate the probability of a word following three previous words using
        linear interpolation of add-k smoothed n-gram estimates.

        Args:
            word: The candidate next word.
            given_tri_gram: A tuple containing the three previous words
                (oldest first). Only the first three elements are used.

        Returns:
            ``lambda1*P(word) + lambda2*P(word | w3) + lambda3*P(word | w2, w3)
            + lambda4*P(word | w1, w2, w3)`` where every term is add-k smoothed
            with ``self.k``.

        Raises:
            IndexError: If ``given_tri_gram`` has fewer than three elements.
        """
        first, second, third = given_tri_gram[0], given_tri_gram[1], given_tri_gram[2]

        unigram_prob = self.unigram_addk_probability(
            current_uni=(word,),
            k=self.k,
        )

        bigram_prob = self.n_gram_addk_probability(
            word=word,
            given_gram=(third,),
            freq_previous=self.freq_uni,
            freq_current=self.freq_bi,
            k=self.k,
        )

        trigram_prob = self.n_gram_addk_probability(
            word=word,
            given_gram=(second, third),
            freq_previous=self.freq_bi,
            freq_current=self.freq_tri,
            k=self.k,
        )

        fourgram_prob = self.n_gram_addk_probability(
            word=word,
            given_gram=(first, second, third),
            freq_previous=self.freq_tri,
            freq_current=self.freq_four,
            k=self.k,
        )

        # Weighted sum of the four estimates.
        return (
            (self.lambda1 * unigram_prob)
            + (self.lambda2 * bigram_prob)
            + (self.lambda3 * trigram_prob)
            + (self.lambda4 * fourgram_prob)
        )

    # ---------- Add-k probability of unigram ----------
    def unigram_addk_probability(
        self,
        current_uni: tuple,
        k=0.1,
    ):
        """
        Add-k smoothed unigram probability: ``(count + k) / (N + V * k)``.

        Args:
            current_uni: One-element tuple holding the word, e.g. ``('data',)``.
            k: Add-k smoothing constant.

        Returns:
            The smoothed probability, or ``0.0`` when the denominator is zero
            (empty count table together with ``k == 0``).
        """
        uni_gram_count = self.freq_uni.get(current_uni, 0)
        n_unique_words = len(self.freq_uni)

        denominator = self.n_total_words + n_unique_words * k
        if denominator == 0:
            return 0.0
        return (uni_gram_count + k) / denominator

    # ---------- Add-k probability of n-gram, starting from bi-gram ----------
    def n_gram_addk_probability(
        self,
        word: str,
        given_gram: tuple,
        freq_previous: dict,
        freq_current: dict,
        k=0.1,
    ):
        """
        Add-k smoothed conditional probability of ``word`` given ``given_gram``:
        ``(count(given_gram + word) + k) / (count(given_gram) + V * k)``.

        Args:
            word: The candidate next word.
            given_gram: The preceding words (the context).
            freq_previous: Count table for n-grams of the context's length.
            freq_current: Count table for n-grams one word longer than the context.
            k: Add-k smoothing constant.

        Returns:
            The smoothed probability, or ``0.0`` when the denominator is zero
            (unseen context together with ``k == 0``).
        """
        # Accept any sequence as context; the count tables are keyed by tuples.
        context = tuple(given_gram)
        n_gram = context + (word,)

        current_gram_count = freq_current.get(n_gram, 0)
        previous_gram_count = freq_previous.get(context, 0)
        unique_word_count = len(self.freq_uni)

        denominator = previous_gram_count + unique_word_count * k
        if denominator == 0:
            return 0.0
        return (current_gram_count + k) / denominator

    # ---------- Predict the next word ----------
    def predict(
        self,
        previous_word: tuple[str, str, str],
    ):
        """
        Return the vocabulary word with the highest interpolated probability
        after the three words in ``previous_word``.

        Every key of ``freq_uni`` is a candidate, including the sentence markers
        ``'<s>'`` and ``'</s>'`` when they are present in the table. On ties the
        word that comes first in ``freq_uni`` wins.

        Raises:
            IndexError: If the vocabulary is empty (exception type kept from the
                original implementation).
        """
        if not self.freq_uni:
            raise IndexError("cannot predict: the unigram table is empty")

        # max() returns the first maximal element, matching a stable descending sort,
        # without sorting the whole vocabulary.
        best_uni_gram = max(
            self.freq_uni,
            key=lambda uni_gram: self.probability(uni_gram[0], previous_word),
        )
        return best_uni_gram[0]
    

In [3]:
# Hand-picked hyperparameters.
ADD_K = 10 ** -2
# Relative weights for the unigram, bigram, trigram and four-gram estimates.
RAW_LAMBDAS = (10 ** -5, 10 ** -5, 10 ** -5, 10 ** -1)

# Linear interpolation only yields a probability distribution when the weights
# sum to 1, so rescale them. Scaling every weight by the same positive factor
# does not change which word has the highest score.
LAMBDAS = [weight / sum(RAW_LAMBDAS) for weight in RAW_LAMBDAS]

model = InterpolationAddK(
    k=ADD_K,
    lambda1=LAMBDAS[0],
    lambda2=LAMBDAS[1],
    lambda3=LAMBDAS[2],
    lambda4=LAMBDAS[3],
)
model.predict(('computer', 'data', 'engineer'))

'and'

In [4]:
# List every unigram with its count, most frequent first.
print('Count each word')

unigram_counts = list(model.freq_uni.items())
unigram_counts.sort(key=lambda pair: pair[1], reverse=True)
for uni_gram, count in unigram_counts:
    print(f'{uni_gram}: {count}')

Count each word
('and',): 583
('the',): 539
('<s>',): 470
('</s>',): 470
('data',): 421
('of',): 373
('to',): 261
('in',): 225
('a',): 200
('computer',): 135
('is',): 135
('systems',): 117
('for',): 114
('as',): 98
('are',): 95
('that',): 80
('software',): 77
('on',): 77
('or',): 73
('with',): 71
('it',): 69
('they',): 67
('by',): 62
('from',): 59
('be',): 54
('this',): 53
('science',): 51
('information',): 50
('can',): 47
('an',): 46
('system',): 43
('such',): 42
('database',): 40
('s',): 39
('may',): 37
('design',): 36
('business',): 34
('warehouse',): 34
('processing',): 33
('used',): 32
('which',): 31
('big',): 31
('algorithms',): 29
('applications',): 28
('programming',): 27
('development',): 27
('these',): 27
('into',): 27
('etl',): 27
('more',): 26
('often',): 26
('analysis',): 26
('management',): 25
('computing',): 25
('databases',): 25
('was',): 24
('have',): 23
('also',): 23
('all',): 23
('engineering',): 22
('modeling',): 22
('model',): 22
('using',): 21
('various',): 21
('s

In [5]:
# Tri-grams of the validation data; a later cell feeds them to model.predict() as contexts.
validating_tri_gram = text_preprocessor.tokenize_words(text_preprocessor.validation_data)['tri_grams']

In [6]:
# Inspect the four-grams produced from the validation data.
text_preprocessor.tokenize_words(text_preprocessor.validation_data)['four_grams']

[('<s>', 'a', 'proof', 'consists'),
 ('a', 'proof', 'consists', 'of'),
 ('proof', 'consists', 'of', 'a'),
 ('consists', 'of', 'a', 'succession'),
 ('of', 'a', 'succession', 'of'),
 ('a', 'succession', 'of', 'applications'),
 ('succession', 'of', 'applications', 'of'),
 ('of', 'applications', 'of', 'deductive'),
 ('applications', 'of', 'deductive', 'rules'),
 ('of', 'deductive', 'rules', 'to'),
 ('deductive', 'rules', 'to', 'already'),
 ('rules', 'to', 'already', 'established'),
 ('to', 'already', 'established', 'results'),
 ('already', 'established', 'results', '</s>'),
 ('established', 'results', '</s>', '<s>'),
 ('results', '</s>', '<s>', 'these'),
 ('</s>', '<s>', 'these', 'results'),
 ('<s>', 'these', 'results', 'include'),
 ('these', 'results', 'include', 'previously'),
 ('results', 'include', 'previously', 'proved'),
 ('include', 'previously', 'proved', 'theorems'),
 ('previously', 'proved', 'theorems', 'axioms'),
 ('proved', 'theorems', 'axioms', 'and—in'),
 ('theorems', 'axioms

In [7]:
# Inspect the four-gram keys stored in the model's count table.
model.freq_four.keys()

dict_keys([('<s>', 'computer', 'science', 'is'), ('computer', 'science', 'is', 'the'), ('science', 'is', 'the', 'study'), ('is', 'the', 'study', 'of'), ('the', 'study', 'of', 'computation'), ('study', 'of', 'computation', 'information'), ('of', 'computation', 'information', 'and'), ('computation', 'information', 'and', 'automation'), ('information', 'and', 'automation', '</s>'), ('and', 'automation', '</s>', '<s>'), ('automation', '</s>', '<s>', 'computer'), ('</s>', '<s>', 'computer', 'science'), ('<s>', 'computer', 'science', 'spans'), ('computer', 'science', 'spans', 'theoretical'), ('science', 'spans', 'theoretical', 'disciplines'), ('spans', 'theoretical', 'disciplines', 'such'), ('theoretical', 'disciplines', 'such', 'as'), ('disciplines', 'such', 'as', 'algorithms'), ('such', 'as', 'algorithms', 'theory'), ('as', 'algorithms', 'theory', 'of'), ('algorithms', 'theory', 'of', 'computation'), ('theory', 'of', 'computation', 'and'), ('of', 'computation', 'and', 'information'), ('com

In [8]:
# Spot-check the predicted next word for a single three-word context.
model.predict(('true', 'starting', 'points'))

'and'

In [9]:
# Predicted next word for each of the first 100 validation tri-grams.
for context in validating_tri_gram[:100]:
    next_word = model.predict(context)
    print(f'{context}: {next_word}')

('<s>', 'a', 'proof'): and
('a', 'proof', 'consists'): and


('proof', 'consists', 'of'): the
('consists', 'of', 'a'): and
('of', 'a', 'succession'): and
('a', 'succession', 'of'): the
('succession', 'of', 'applications'): and
('of', 'applications', 'of'): the
('applications', 'of', 'deductive'): and
('of', 'deductive', 'rules'): and
('deductive', 'rules', 'to'): the
('rules', 'to', 'already'): and
('to', 'already', 'established'): and
('already', 'established', 'results'): and
('established', 'results', '</s>'): <s>
('results', '</s>', '<s>'): physicists
('</s>', '<s>', 'these'): systems
('<s>', 'these', 'results'): and
('these', 'results', 'include'): and
('results', 'include', 'previously'): and
('include', 'previously', 'proved'): and
('previously', 'proved', 'theorems'): and
('proved', 'theorems', 'axioms'): and
('theorems', 'axioms', 'and—in'): and
('axioms', 'and—in', 'case'): and
('and—in', 'case', 'of'): the
('case', 'of', 'abstraction'): and
('of', 'abstraction', 'from'): the
('abstraction', 'from', 'nature—some'): and
('from', 'nature

In [10]:
# Tri-grams of the test data, then the predicted next word for the first 100 of them.
test_tokens = text_preprocessor.tokenize_words(text_preprocessor.test_data)
test_tri_gram = test_tokens['tri_grams']

for context in test_tri_gram[:100]:
    next_word = model.predict(context)
    print(f'{context}: {next_word}')

('<s>', 'a', 'machine'): and
('a', 'machine', 'learning'): and
('machine', 'learning', 'algorithm'): and
('learning', 'algorithm', 'for'): and
('algorithm', 'for', 'stock'): and
('for', 'stock', 'trading'): and
('stock', 'trading', 'may'): and
('trading', 'may', 'inform'): and
('may', 'inform', 'the'): and
('inform', 'the', 'trader'): and
('the', 'trader', 'of'): the
('trader', 'of', 'future'): and
('of', 'future', 'potential'): and
('future', 'potential', 'predictions'): and
('potential', 'predictions', '</s>'): <s>
('predictions', '</s>', '<s>'): the
('</s>', '<s>', 'as'): the
('<s>', 'as', 'a'): and
('as', 'a', 'scientific'): and
('a', 'scientific', 'endeavor'): and
('scientific', 'endeavor', 'machine'): and
('endeavor', 'machine', 'learning'): and
('machine', 'learning', 'grew'): and
('learning', 'grew', 'out'): and
('grew', 'out', 'of'): the
('out', 'of', 'the'): and
('of', 'the', 'quest'): and
('the', 'quest', 'for'): and
('quest', 'for', 'artificial'): and
('for', 'artificial', 

In [20]:
# Predict the next word for each of the first 500 tri-gram keys of the preprocessor
# and accumulate the predictions into one space-prefixed string.
tri_gram_keys = list(text_preprocessor.freq_tri.keys())
print(tri_gram_keys[0])

print('generated text: ')
predicted_words = []
for tri_gram in tri_gram_keys[:500]:
    predicted_words.append(model.predict(tri_gram))

# Same result as repeated `text + ' ' + word`: every word is preceded by one space.
text = ''.join(' ' + predicted for predicted in predicted_words)
    

('<s>', 'computer', 'science')
generated text: 
 is the study of computer information and automation </s> <s> computer science theoretical disciplines such as algorithms theory of computation and information theory to applied disciplines including the design and organization of hardware and software </s> <s> data used often considered an academic discipline computer science is related to computer programming </s> <s> computer and data structures are used to computer science </s> <s> the data of computation abstract models of computation and classes of problems that are be used using them </s> <s> the of computer and computer security involve studying the means for secure communication and data preventing security vulnerabilities </s> <s> computer and visualization geometry address the generation of images </s> <s> programming language theory considers different ways to describe computational processes and database theory concerns the management of repositories of data </s> <s> the inte

In [21]:
# Display the `text` string assembled from the model's predictions.
display(text)


' is the study of computer information and automation </s> <s> computer science theoretical disciplines such as algorithms theory of computation and information theory to applied disciplines including the design and organization of hardware and software </s> <s> data used often considered an academic discipline computer science is related to computer programming </s> <s> computer and data structures are used to computer science </s> <s> the data of computation abstract models of computation and classes of problems that are be used using them </s> <s> the of computer and computer security involve studying the means for secure communication and data preventing security vulnerabilities </s> <s> computer and visualization geometry address the generation of images </s> <s> programming language theory considers different ways to describe computational processes and database theory concerns the management of repositories of data </s> <s> the interaction investigates the interfaces through whi

## Model Evaluation

In [12]:
def perplexity_interpolation(model=None, four_grams=None):
    """
    Perplexity of an interpolated language model over a sequence of four-grams.

    For every four-gram ``(w1, w2, w3, w4)`` the model is asked for
    ``model.probability(w4, (w1, w2, w3))``; the result is
    ``exp(-(1/N) * sum(log p_i))`` over the ``N`` four-grams.

    Args:
        model: Object exposing ``probability(word, given_tri_gram)``.
        four_grams: Iterable of four-word tuples to evaluate on.

    Returns:
        The perplexity as a float, ``math.inf`` if any four-gram receives a
        non-positive probability, or ``None`` when no model or no four-grams are
        supplied (which keeps the original no-argument call returning ``None``).

    Note:
        The value is only a true perplexity when the model's interpolation
        weights sum to 1.
    """
    import math

    samples = list(four_grams) if four_grams is not None else []
    if model is None or not samples:
        return None

    log_prob_sum = 0.0
    for four_gram in samples:
        context, word = tuple(four_gram[:3]), four_gram[3]
        prob = model.probability(word, context)
        if prob <= 0:
            # log(0) is undefined; a zero-probability event means infinite perplexity.
            return math.inf
        log_prob_sum += math.log(prob)

    return math.exp(-log_prob_sum / len(samples))

## Text Generation