In [2]:
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.util import bigrams
from nltk.util import everygrams
from nltk.util import ngrams
from nltk.util import pad_sequence

In [3]:
# Two toy corpora; each one is a list of already-tokenized sentences.
text1 = [
    ['a', 'b', 'c'],
    ['a', 'c', 'd', 'c', 'e', 'f'],
]
text2 = [
    ['I', 'am', 'Sam'],
    ['Sam', 'am', 'I'],
    ['I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham'],
]


In [4]:
# Print the bigrams (adjacent token pairs) of every sentence in text1,
# one list per sentence.
for sentence in text1:
    print(list(bigrams(sentence)))

[('a', 'b'), ('b', 'c')]
[('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f')]


In [5]:
# Print the trigrams of every sentence in text2; a three-token sentence
# yields exactly one trigram.
for sentence in text2:
    sentence_trigrams = ngrams(sentence, n=3)
    print(list(sentence_trigrams))

[('I', 'am', 'Sam')]
[('Sam', 'am', 'I')]
[('I', 'do', 'not'), ('do', 'not', 'like'), ('not', 'like', 'green'), ('like', 'green', 'eggs'), ('green', 'eggs', 'and'), ('eggs', 'and', 'ham')]


In [6]:
# The amount of padding follows the n-gram order: an order-n model gets
# n - 1 boundary symbols per side (one for bigrams, two for trigrams, ...).
pad_options = {
    "pad_left": True,
    "left_pad_symbol": "<s>",
    "pad_right": True,
    "right_pad_symbol": "</s>",
}
res = list(pad_sequence(text2[0], n=3, **pad_options))
print(res)

# NOTE(review): the sentence is padded for trigrams (n=3) but bigrams are
# extracted here, so pairs made only of padding symbols appear in the
# result -- confirm this mismatch is intentional.
padded_pairs = ngrams(res, n=2)
print(list(padded_pairs))

['<s>', '<s>', 'I', 'am', 'Sam', '</s>', '</s>']
[('<s>', '<s>'), ('<s>', 'I'), ('I', 'am'), ('am', 'Sam'), ('Sam', '</s>'), ('</s>', '</s>')]


In [7]:
# pad_both_ends adds the sentence-boundary symbols on both sides; with n=2
# that is a single "<s>" in front and a single "</s>" at the end.
res1 = [*pad_both_ends(text2[1], n=2)]
print(res1)

# Pad the third sentence the same way and show its bigrams as the cell result.
padded_third = pad_both_ends(text2[2], n=2)
list(bigrams(padded_third))

['<s>', 'Sam', 'am', 'I', '</s>']


[('<s>', 'I'),
 ('I', 'do'),
 ('do', 'not'),
 ('not', 'like'),
 ('like', 'green'),
 ('green', 'eggs'),
 ('eggs', 'and'),
 ('and', 'ham'),
 ('ham', '</s>')]

In [8]:
# Pad one sentence for a bigram model, then enumerate every n-gram it contains.
# max_len=10 exceeds the padded sentence length, so n-grams of every possible
# length (up to the whole sentence) are produced.
padded_bigrams = [*pad_both_ends(text2[1], n=2)]
all_orders = everygrams(padded_bigrams, max_len=10)
list(all_orders)

[('<s>',),
 ('<s>', 'Sam'),
 ('<s>', 'Sam', 'am'),
 ('<s>', 'Sam', 'am', 'I'),
 ('<s>', 'Sam', 'am', 'I', '</s>'),
 ('Sam',),
 ('Sam', 'am'),
 ('Sam', 'am', 'I'),
 ('Sam', 'am', 'I', '</s>'),
 ('am',),
 ('am', 'I'),
 ('am', 'I', '</s>'),
 ('I',),
 ('I', '</s>'),
 ('</s>',)]

In [9]:
# The vocabulary is built from one flat stream of words: each sentence is
# padded (exactly as for n-gram counting) and the padded sentences are then
# chained together.
padded_sentences = (pad_both_ends(sentence, n=2) for sentence in text2)
list(flatten(padded_sentences))

['<s>',
 'I',
 'am',
 'Sam',
 '</s>',
 '<s>',
 'Sam',
 'am',
 'I',
 '</s>',
 '<s>',
 'I',
 'do',
 'not',
 'like',
 'green',
 'eggs',
 'and',
 'ham',
 '</s>']

In [10]:
# padded_everygram_pipeline(order, sentences) does the padding, everygram
# extraction and flattening shown in the earlier cells in one call. It returns
# two iterables: per-sentence everygrams for training, and the flat stream of
# padded words for the vocabulary.
# (The import lives in the notebook's import cell at the top.)
train, vocab = padded_everygram_pipeline(2, text2)

# Materialize both results only to inspect them.
# NOTE(review): these look like single-use iterators, so they are presumably
# exhausted after printing -- rebuild them before fitting a model.
print(list(vocab))
print("###")
for ngram_sent in train:
    print(list(ngram_sent))
    print()

['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'am', 'I', '</s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']
###
[('<s>',), ('<s>', 'I'), ('I',), ('I', 'am'), ('am',), ('am', 'Sam'), ('Sam',), ('Sam', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'Sam'), ('Sam',), ('Sam', 'am'), ('am',), ('am', 'I'), ('I',), ('I', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'I'), ('I',), ('I', 'do'), ('do',), ('do', 'not'), ('not',), ('not', 'like'), ('like',), ('like', 'green'), ('green',), ('green', 'eggs'), ('eggs',), ('eggs', 'and'), ('and',), ('and', 'ham'), ('ham',), ('ham', '</s>'), ('</s>',)]



In [11]:
# Sample corpus: the abstract and one paragraph of the introduction of
# "A neural probabilistic language model",
# The Journal of Machine Learning Research, Volume 3, 3/1/2003, pp 1137–1155.
text = """A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.
A useful way to visualize how different learning algorithms generalize, inspired from the view of non-parametric density estimation, is to think of how probability mass that is initially concentrated on the training points (e.g., training sentences) is distributed in a larger volume, usually in some form of neighborhood around the training points. In high dimensions, it is crucial to distribute probability mass where it matters rather than uniformly in all directions around each training point. We will show in this paper that the way in which the approach proposed here generalizes is fundamentally different from the way in which previous state-of-the-art statistical language modeling approaches are generalizing."""
# Split the passage into sentences, tokenize each sentence into words and
# lowercase every token; the result is a list of token lists.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]
tokenized_text

[['a',
  'goal',
  'of',
  'statistical',
  'language',
  'modeling',
  'is',
  'to',
  'learn',
  'the',
  'joint',
  'probability',
  'function',
  'of',
  'sequences',
  'of',
  'words',
  'in',
  'a',
  'language',
  '.'],
 ['this',
  'is',
  'intrinsically',
  'difficult',
  'because',
  'of',
  'the',
  'curse',
  'of',
  'dimensionality',
  ':',
  'a',
  'word',
  'sequence',
  'on',
  'which',
  'the',
  'model',
  'will',
  'be',
  'tested',
  'is',
  'likely',
  'to',
  'be',
  'different',
  'from',
  'all',
  'the',
  'word',
  'sequences',
  'seen',
  'during',
  'training',
  '.'],
 ['traditional',
  'but',
  'very',
  'successful',
  'approaches',
  'based',
  'on',
  'n-grams',
  'obtain',
  'generalization',
  'by',
  'concatenating',
  'very',
  'short',
  'overlapping',
  'sequences',
  'seen',
  'in',
  'the',
  'training',
  'set',
  '.'],
 ['we',
  'propose',
  'to',
  'fight',
  'the',
  'curse',
  'of',
  'dimensionality',
  'by',
  'learning',
  'a',
  'distrib

In [12]:
# Build a statistical language model (SLM) over unigrams, bigrams and trigrams.
n = 3

# Everygrams up to order n for training, plus the flat stream of padded words
# from which the model builds its vocabulary.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

# MLE model of the same order as the training n-grams
# (MLE is imported in the notebook's import cell at the top).
model = MLE(n)

# fit() takes the training n-grams and the vocabulary text.
model.fit(train_data, padded_sents)

 The model application: Checking the unknown vocabulary

In [13]:
# Size of the vocabulary held by the fitted model.
vocab_size = len(model.vocab)
print("Vocabulary:", vocab_size)

# lookup() maps every word that is not in the vocabulary to `<UNK>`.
query_words = 'corpora showing allow'.split()
print(model.vocab.lookup(query_words))

Vocabulary: 177
('corpora', 'showing', '<UNK>')


 The model application: Checking counts

In [14]:
# Summary of the n-gram counter held by the fitted model.
print(model.counts)

# Unigram count: index the counter directly with the word.
unigram_count = model.counts['model']
print("Counts of 'model' = ", unigram_count)

# Bigram count: index first by the one-word context, then by the next word,
# e.g. how often "language" follows "statistical".
bigram_context = ['statistical']
bigram_count = model.counts[bigram_context]['language']
print("count(language|statistical) =", bigram_count)

# Trigram count: same pattern with a two-word context,
# e.g. how often "modeling" follows "statistical language".
trigram_context = ['statistical', 'language']
trigram_count = model.counts[trigram_context]['modeling']
print("count(modeling |statistical language) =", trigram_count)

<NgramCounter with 3 ngram orders and 1194 ngrams>
Counts of 'model' =  3
count(language|statistical) = 2
count(modeling |statistical language) = 2


Model used to score how probable words are in certain contexts.

In [15]:
# Unigram probability: score a word with no context.
no = model.score('language')
print("p(language) =", no)

# model.score(word, context) is the probability of `word` GIVEN `context`,
# so this call is P(representation | distributed). The label previously
# had the two words swapped.
no = model.score('representation', ['distributed'])
print("P(representation |distributed ) =", no)

# Trigram case: P(for | distributed representation).
no = model.score('for', ['distributed', 'representation'])
print("P(for |distributed representation ) =", no)

# Unknown words P = 0.0
model.score("<UNK>")

p(language) = 0.007334963325183374
P(distributed |representation ) = 0.6666666666666666
P(for |distributed representation ) = 1.0


0.0

The model evaluation (probability, log probability,
cross-entropy and perplexity with respect to sequences of n-grams).

In [16]:
# The two bigrams that are evaluated as a sequence by both metrics below.
eval_bigrams = [('language', 'modeling'), ('distributed', 'representation')]

# Log score of a single word with no context.
print("Log score for 'model' word:", model.logscore("model"))

# Cross-entropy and perplexity of the model over the bigram sequence.
print("Entropy score for ('language','modeling'),('distributed', 'representation'):",
      model.entropy(eval_bigrams))
print("Perplexity score for '('language','modeling'),('distributed', 'representation'):",
      model.perplexity(eval_bigrams))

Log score for 'model' word: -7.090994532220593
Entropy score for ('language','modeling'),('distributed', 'representation'): 0.5849625007211563
Perplexity score for '('language','modeling'),('distributed', 'representation'): 1.5000000000000002


N-gram models can also be used to generate text.

In [17]:
# Generate ten words; passing random_seed makes the generated text
# reproducible across runs.
free_words = model.generate(10, random_seed=10)
print(free_words)

# Generation can also be conditioned on some preceding text via text_seed.
seeded_words = model.generate(7, text_seed=['language'], random_seed=10)
print(seeded_words)

['n-grams', 'obtain', 'generalization', 'by', 'concatenating', 'very', 'short', 'overlapping', 'sequences', 'seen']
['modeling', 'approaches', 'are', 'generalizing', '.', '</s>', '</s>']


In [18]:
# Bound method that joins a list of tokens back into a single string
# (TreebankWordDetokenizer is imported in the notebook's import cell at the top).
detokenize = TreebankWordDetokenizer().detokenize

# Generate seven words conditioned on 'language' and render them as one string.
# Padding symbols such as '</s>' produced by the model are not stripped here.
detokenize(model.generate(7, text_seed=['language'], random_seed=10))

'modeling approaches are generalizing . </s> </s>'