In [4]:
import nltk, re
from nltk.corpus import brown
from collections import Counter

nltk.download("brown")
nltk.download("punkt")
nltk.download("punkt_tab")

sents = brown.sents(categories="news")
txt = " ".join([" ".join(s) for s in sents])
txt = txt.lower()
txt = re.sub(r"[^a-z\s]", "", txt)

sentstk = nltk.sent_tokenize(txt)
wordstk = nltk.word_tokenize(txt)

print("Total sentences:", len(sentstk))
print("Total words:", len(wordstk))
print("Vocabulary size:", len(set(wordstk)))

def genngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

uni = genngrams(wordstk, 1)
bi = genngrams(wordstk, 2)
tri = genngrams(wordstk, 3)

bicount = Counter(bi)
tricount = Counter(tri)
unicount = Counter(uni)

print("\nTop 10 bigrams:")
for pair, freq in bicount.most_common(10):
    print(pair, ":", freq)

print("\nTop 10 trigrams:")
for triplet, freq in tricount.most_common(10):
    print(triplet, ":", freq)

def bipro(w1, w2):
    return bicount[(w1, w2)] / unicount[(w1,)] if unicount[(w1,)] > 0 else 0

def tripro(w1, w2, w3):
    return tricount[(w1, w2, w3)] / bicount[(w1, w2)] if bicount[(w1, w2)] > 0 else 0

print("\nP('the' | 'in') =", bipro("in", "the"))
print("P('president' | 'the','of') =", tripro("the", "of", "president"))

def sentprobi(sentence):
    words = nltk.word_tokenize(sentence.lower())
    prob = 1.0
    for i in range(len(words)-1):
        prob *= bipro(words[i], words[i+1])
    return prob

sentence = "the president of the company"
print("\nSentence:", sentence)
print("Probability (Bigram Model):", sentprobi(sentence))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total sentences: 1
Total words: 87019
Vocabulary size: 12131

Top 10 bigrams:
('of', 'the') : 850
('in', 'the') : 610
('to', 'the') : 279
('on', 'the') : 254
('for', 'the') : 223
('at', 'the') : 199
('will', 'be') : 157
('that', 'the') : 149
('with', 'the') : 142
('and', 'the') : 141

Top 10 trigrams:
('one', 'of', 'the') : 44
('mr', 'and', 'mrs') : 42
('the', 'united', 'states') : 37
('members', 'of', 'the') : 28
('president', 'of', 'the') : 22
('a', 'number', 'of') : 19
('the', 'white', 'house') : 19
('as', 'a', 'result') : 18
('some', 'of', 'the') : 18
('the', 'u', 's') : 17

P('the' | 'in') = 0.30198019801980197
P('president' | 'the','of') = 0

Sentence: the president of the company
Probability (Bigram Model): 9.05010204014703e-07
