In [1]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
# make all the bigrams for this entire corpus
# there's a method in nltk that does this for us!
review_bigrams = nltk.bigrams(movie_reviews.words())
list(review_bigrams)

[('plot', ':'),
 (':', 'two'),
 ('two', 'teen'),
 ('teen', 'couples'),
 ('couples', 'go'),
 ('go', 'to'),
 ('to', 'a'),
 ('a', 'church'),
 ('church', 'party'),
 ('party', ','),
 (',', 'drink'),
 ('drink', 'and'),
 ('and', 'then'),
 ('then', 'drive'),
 ('drive', '.'),
 ('.', 'they'),
 ('they', 'get'),
 ('get', 'into'),
 ('into', 'an'),
 ('an', 'accident'),
 ('accident', '.'),
 ('.', 'one'),
 ('one', 'of'),
 ('of', 'the'),
 ('the', 'guys'),
 ('guys', 'dies'),
 ('dies', ','),
 (',', 'but'),
 ('but', 'his'),
 ('his', 'girlfriend'),
 ('girlfriend', 'continues'),
 ('continues', 'to'),
 ('to', 'see'),
 ('see', 'him'),
 ('him', 'in'),
 ('in', 'her'),
 ('her', 'life'),
 ('life', ','),
 (',', 'and'),
 ('and', 'has'),
 ('has', 'nightmares'),
 ('nightmares', '.'),
 ('.', 'what'),
 ('what', "'"),
 ("'", 's'),
 ('s', 'the'),
 ('the', 'deal'),
 ('deal', '?'),
 ('?', 'watch'),
 ('watch', 'the'),
 ('the', 'movie'),
 ('movie', 'and'),
 ('and', '"'),
 ('"', 'sorta'),
 ('sorta', '"'),
 ('"', 'find'),
 ('f

In [4]:
# make all the bigrams for this entire corpus
# there's a method in nltk that does this for us too!
review_trigrams = nltk.trigrams(movie_reviews.words())
review_trigrams = list(review_trigrams)

In [5]:
review_trigrams[:10]

[('plot', ':', 'two'),
 (':', 'two', 'teen'),
 ('two', 'teen', 'couples'),
 ('teen', 'couples', 'go'),
 ('couples', 'go', 'to'),
 ('go', 'to', 'a'),
 ('to', 'a', 'church'),
 ('a', 'church', 'party'),
 ('church', 'party', ','),
 ('party', ',', 'drink')]

In [6]:
# let's start building a trigram model
# where we want to know the probability of a target token appearing after two context tokens
# to do this, we use nltk's conditional frequency distribution method

trigram_pairs = (((w0, w1), w2) for w0, w1, w2, in review_trigrams)
cfd_trigrams = nltk.ConditionalFreqDist(trigram_pairs)

bigram_pairs = (((w0, w1), w2) for w0, w1, w2, in review_bigrams)
cfd_bigrams = nltk.ConditionalFreqDist(bigram_pairs)

In [7]:
cfd_trigrams.conditions()

[('plot', ':'),
 (':', 'two'),
 ('two', 'teen'),
 ('teen', 'couples'),
 ('couples', 'go'),
 ('go', 'to'),
 ('to', 'a'),
 ('a', 'church'),
 ('church', 'party'),
 ('party', ','),
 (',', 'drink'),
 ('drink', 'and'),
 ('and', 'then'),
 ('then', 'drive'),
 ('drive', '.'),
 ('.', 'they'),
 ('they', 'get'),
 ('get', 'into'),
 ('into', 'an'),
 ('an', 'accident'),
 ('accident', '.'),
 ('.', 'one'),
 ('one', 'of'),
 ('of', 'the'),
 ('the', 'guys'),
 ('guys', 'dies'),
 ('dies', ','),
 (',', 'but'),
 ('but', 'his'),
 ('his', 'girlfriend'),
 ('girlfriend', 'continues'),
 ('continues', 'to'),
 ('to', 'see'),
 ('see', 'him'),
 ('him', 'in'),
 ('in', 'her'),
 ('her', 'life'),
 ('life', ','),
 (',', 'and'),
 ('and', 'has'),
 ('has', 'nightmares'),
 ('nightmares', '.'),
 ('.', 'what'),
 ('what', "'"),
 ("'", 's'),
 ('s', 'the'),
 ('the', 'deal'),
 ('deal', '?'),
 ('?', 'watch'),
 ('watch', 'the'),
 ('the', 'movie'),
 ('movie', 'and'),
 ('and', '"'),
 ('"', 'sorta'),
 ('sorta', '"'),
 ('"', 'find'),
 ('f

In [8]:
# examine the idividual "conditions" (two-token context)
cfd_trigrams[('based', 'on')]

FreqDist({'the': 117, 'a': 67, 'his': 13, 'an': 10, 'their': 6, '.': 4, 'what': 3, ',': 3, 'stephen': 3, 'some': 2, ...})

In [9]:
# we want to know how likely "fall on the" is,
# but "the" is never seen after "fall on" in the corpus
# so we back-off to the bigram model
cfd_bigrams['on']

FreqDist({})

In [10]:
# let's extract all the bigrams where the context token is "people"
from collections import Counter
# finall all the target tokens that appear after "people"
target_tokens = []
for bigram in review_bigrams:
    if bigram[0].lower() == 'people':
        target_token = bigram[1].lower()
        target_tokens.append(target_token)

people_bigram_lm = Counter(target_tokens)
# Print the top 10 most common bigrams that start with "people"
print(people_bigram_lm.most_common(10))

[]


In [11]:
# now we want to build a full bigram model for all context tokens
from collections import Counter

# build the bigrams again
review_bigrams = nltk.bigrams(movie_reviews.words())

# use a dictionary to keep track of all the context tokens
bigram_lm = {}
target_tokens = {}

# build the bigram model
for bigram in review_bigrams:
    context_token = bigram[0].lower()
    target_token = bigram[1].lower()
    if context_token in target_tokens:
        # if we have already started tallying target tokens for this particular context token
        target_tokens[context_token].append(target_token)
    else:
        #otherwise, we start a new list for this context token
        target_tokens[context_token] = [target_token]


In [12]:
# let's check out the content of "target_tokens"
k = 0
for context_token, target_tokens in target_tokens.items():
    if k > 9: break
    print(context_token, ':', target_tokens)
    k = k + 1

plot : [':', 'elements', 'is', 'to', 'of', '.', ':', 'problems', 'thread', '.', 'in', ',', 'is', 'elements', ':', ',', 'calls', '.', 'of', "'", 'points', 'takes', 'tongue', '.', 'description', 'is', 'isn', '"', 'only', 'is', 'to', '.', 'development', 'description', '.', "'", '-', 'point', '?', 'of', 'twists', '.', 'follows', 'is', 'but', ',', 'with', 'goes', ',', '.', 'in', 'and', ',', '.', '-', 'and', ',', 'is', 'gimmicks', 'is', 'moves', 'about', 'were', 'is', 'is', 'line', 'gives', 'goes', 'lines', 'meanders', 'is', 'holes', '.', 'is', '.', 'is', 'is', '.', '.', ',', ',', 'aside', 'turns', ':', 'filled', 'full', 'along', 'is', 'far', 'holes', '-', 'changing', 'and', 'presents', 'points', 'with', 'elements', '.', 'twists', 'dressed', 'is', 'lent', 'may', 'finally', 'holes', ':', ',', 'has', ',', ':', ',', ',', 'there', '?', ':', 'and', 'points', 'gets', ':', 'which', 'is', 'holes', 'is', '--', ',', 'devices', '.', ':', ',', 'holes', 'in', 'from', 'seemed', 'has', ',', 'is', ')', 'tha

In [13]:
# try writing a predicting machine that generates 10 tokens given an input token
starting_token = ('run')

import random

for i in range(10):
    print(starting_token, '\r')

    # first we only want the 10 most frequent candidates
    top_10 = bigram_lm[starting_token].most_common(10)
    top_10_tokens = [_[0] for _ in top_10]
    top_10_counts = [_[1] for _ in top_10]

    # randomly sampled the next token according to frequency accounts and used that as the next starting token
    starting_token = random.sample(top_10_tokens, k=1, counts=top_10_counts)[0]


run 


KeyError: 'run'

In [25]:
# let's build an algorithm that generates a sequence of tokens
# using the trigram model where possible and backing off to the bigrams when needed

# we need to specify a starting token
output_tokens = ['the', 'second']

#keep generating new tokens until the output list has 10 tokens
while len(output_tokens) < 20:

    top_10 = cfd_trigrams[(output_tokens[-2], output_tokens[-1])].most_common(10)
    top_10_tokens = [token[0] for token in top_10]
    top_10_counts = [token[1] for token in top_10]

    target_token = random.sample(top_10_tokens, k=1, counts=top_10_counts)[0]

    #print('the context tokens are:', (output_tokens[-2], output_tokens[-1]))
    #print('target candidates:', top_10_tokens)
    #print('target counts:', top_10_counts)
    #print('the target we pick:', target_token)

    # we need to put the target token that is sampled into the output tokens
    output_tokens.append(target_token)

    print('the new output tokens:', ' '.join(output_tokens))

the new output tokens: the second half
the new output tokens: the second half of
the new output tokens: the second half of the
the new output tokens: the second half of the characters
the new output tokens: the second half of the characters .
the new output tokens: the second half of the characters . this
the new output tokens: the second half of the characters . this film
the new output tokens: the second half of the characters . this film .
the new output tokens: the second half of the characters . this film . "
the new output tokens: the second half of the characters . this film . " it
the new output tokens: the second half of the characters . this film . " it doesn
the new output tokens: the second half of the characters . this film . " it doesn '
the new output tokens: the second half of the characters . this film . " it doesn ' t
the new output tokens: the second half of the characters . this film . " it doesn ' t be
the new output tokens: the second half of the characters . this