# N-Gram Models

In [71]:
from collections import Counter

from nltk import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize

## Normalizing the text

In [67]:
# Read the whole corpus into memory and lower-case it in one step.
text = ''
with open('sherlock.txt') as corpus_file:
    text = corpus_file.read().lower()

## Maximum Likelihood Estimation
- The maximum likelihood estimate (MLE) for an n-gram $$w_{i-n+1},\dots ,w_{i-2},w_{i-1},w_i$$ is the relative frequency with which $w_i$ follows the context $$w_{i-n+1},\dots ,w_{i-2},w_{i-1}$$ in the training text.
- Thus,
$$
P(w_i|w_{i-n+1},\dots ,w_{i-2},w_{i-1}) = \frac{
    C(w_{i-n+1},\dots ,w_{i-2},w_{i-1},w_i)
    }{
        \sum_w C(w_{i-n+1},\dots ,w_{i-2},w_{i-1},w)
    }
$$
- Now, the number of n-grams starting with $w_{i-n+1},\dots ,w_{i-2},w_{i-1}$ is the same as the number of times that context appears, thus $$  \sum_w C(w_{i-n+1},\dots ,w_{i-2},w_{i-1},w) = C(w_{i-n+1},\dots ,w_{i-2},w_{i-1}) $$
- Thus, the MLE becomes
$$
P(w_i|w_{i-n+1},\dots ,w_{i-2},w_{i-1}) = \frac{
    C(w_{i-n+1},\dots ,w_{i-2},w_{i-1},w_i)
    }{
        C(w_{i-n+1},\dots ,w_{i-2},w_{i-1})
    }
$$

In [147]:
def mle(word: str, prev_words: tuple[str, ...], n_grams: list[tuple[str, ...]], sentences: list[list[str]], n: int) -> float:
    """Return the maximum likelihood estimate P(word | prev_words).

    The estimate is C(prev_words + word) / C(prev_words), where the numerator
    is counted in ``n_grams`` and the denominator in the (n-1)-grams that
    ``make_ngrams`` (defined elsewhere in this notebook) builds from
    ``sentences``.

    Args:
        word: The word whose conditional probability is wanted.
        prev_words: The preceding context, as a tuple of words.
        n_grams: All n-grams of the corpus, as tuples.
        sentences: The tokenised sentences the n-grams were built from.
        n: The n-gram order; the context is counted among the (n-1)-grams.

    Returns:
        The relative frequency of ``word`` after ``prev_words``, or ``0.0``
        when the context never occurs (instead of dividing by zero).
    """
    count_prev = make_ngrams(sentences, n - 1).count(prev_words)
    if count_prev == 0:
        # Unseen context: there is no evidence for any continuation.
        return 0.0
    count_sequence = n_grams.count(prev_words + (word,))
    return count_sequence / count_prev

In [148]:
# MLE of 'I' given the context ('<s>',); n_grams, sentences and n must
# already be defined in the notebook session.
mle('I', ('<s>',), n_grams, sentences, n)

0.6666666666666666

In [151]:
# Print the MLE for a handful of (word, context) pairs, one value per line.
for word, context in [
    ('Sam', ('<s>',)),
    ('am', ('I',)),
    ('Sam', ('am',)),
    ('do', ('I',)),
]:
    print(mle(word, context, n_grams, sentences, n))

0.3333333333333333
0.6666666666666666
0.5
0.3333333333333333


In [172]:
class NGramModel:
    """Count-based n-gram model that answers maximum likelihood queries.

    The text is split into sentences and tokens with NLTK, every sentence is
    padded with ``n - 1`` ``'<s>'`` / ``'</s>'`` markers, and the resulting
    n-grams and (n-1)-grams are stored together with their frequency tables.

    Attributes:
        text: The raw text the model was built from.
        n: The n-gram order.
        n_grams: Every n-gram of the padded sentences, in corpus order.
        n1_grams: Every (n-1)-gram of the padded sentences, in corpus order.
    """

    def __init__(self, text: str, n: int, keep_punctuation: bool = False):
        """Tokenise ``text`` and collect its n-grams and (n-1)-grams.

        Args:
            text: Raw text to build the model from.
            n: The n-gram order.
            keep_punctuation: When false, the tokens '.', ',', '!' and '?'
                are dropped before the n-grams are built.
        """
        self.text = text
        self.n = n
        sentences, _ = self._tokenize(keep_punctuation)
        self.n_grams = self._make_n_grams(sentences, n)
        self.n1_grams = self._make_n_grams(sentences, n - 1)
        # Frequency tables built once, so a query is a dictionary lookup
        # rather than a scan over the whole n-gram list.
        self._n_gram_counts = Counter(self.n_grams)
        self._context_counts = Counter(self.n1_grams)

    def _tokenize(self, keep_punctuation: bool):
        """Return ``(sentences, words)`` for ``self.text``.

        ``sentences`` is a list of token lists, each padded with boundary
        markers; ``words`` is the same tokens flattened into one list.
        """
        # Pad with the model's own order (self.n) so the result does not
        # depend on a module-level variable that happens to be called ``n``.
        pad = self.n - 1
        sentences = [
            ['<s>'] * pad + word_tokenize(sentence) + ['</s>'] * pad
            for sentence in sent_tokenize(self.text)
        ]

        if not keep_punctuation:
            punctuation = ('.', ',', '!', '?')
            sentences = [
                [word for word in sentence if word not in punctuation]
                for sentence in sentences
            ]

        words = [word for sentence in sentences for word in sentence]
        return sentences, words

    def _make_n_grams(self, sentences: list[list[str]], n: int) -> list[tuple[str, ...]]:
        """Return the n-grams of every sentence as one flat list of tuples."""
        return [
            n_gram
            for sentence in sentences
            for n_gram in ngrams(sentence, n)
        ]

    def getMLE(self, word: str, prev_words: tuple[str, ...]) -> float:
        """Return the maximum likelihood estimate P(word | prev_words).

        Prints the two counts and the estimate before returning.

        Args:
            word: The word whose conditional probability is wanted.
            prev_words: The preceding context, as a tuple of words.

        Returns:
            C(prev_words + word) / C(prev_words), or ``0.0`` when the context
            never occurs (instead of dividing by zero).
        """
        sequence = prev_words + (word,)
        count_sequence = self._n_gram_counts[sequence]
        count_prev = self._context_counts[prev_words]
        # Unseen context: there is no evidence for any continuation.
        mle = count_sequence / count_prev if count_prev else 0.0
        print({
            'cs': count_sequence,
            'cp': count_prev,
            'mle': mle
        })
        return mle

In [178]:
# Small three-sentence corpus for checking the estimates by hand.
text = (
    'I am Sam. '
    'Sam I am. '
    'I do not like green eggs and ham'
)
n = 2  # n-gram order (2 = bigrams)

In [179]:
# Build a bigram model over `text` and query a few conditional probabilities.
# '<s>' and '</s>' are the sentence-boundary markers added during tokenisation.
ngm = NGramModel(text, 2)
# Each getMLE call prints its counts and the resulting estimate.
ngm.getMLE('Sam', ('<s>',))
ngm.getMLE('I', ('<s>',))
ngm.getMLE('</s>', ('Sam',))
ngm.getMLE('do', ('I',))
# Last expression of the cell: its return value is also echoed by the notebook.
ngm.getMLE('am', ('I',))

{'cs': 1, 'cp': 3, 'mle': 0.3333333333333333}
{'cs': 2, 'cp': 3, 'mle': 0.6666666666666666}
{'cs': 1, 'cp': 2, 'mle': 0.5}
{'cs': 1, 'cp': 3, 'mle': 0.3333333333333333}
{'cs': 2, 'cp': 3, 'mle': 0.6666666666666666}


0.6666666666666666