# Download books from Project Gutenberg

In [1]:
import numpy as np
import nltk
# Fetch the "gutenberg" corpus package into the local nltk_data directory so that
# nltk.corpus.gutenberg can be read by the cells below.
nltk.download('gutenberg')

from collections import defaultdict

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\TristramArmour\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# List the file ids of every text in NLTK's Project Gutenberg corpus; each id can
# be passed to nltk.corpus.gutenberg.words() to load that text's tokens.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# file ids of the three Shakespeare plays in the Gutenberg corpus
shakespeare_corpora = [
    "shakespeare-caesar.txt",
    "shakespeare-hamlet.txt",
    "shakespeare-macbeth.txt",
]

# map every file id to the token sequence (words and punctuation) of that play
corpora = dict(
    (file_id, nltk.corpus.gutenberg.words(file_id))
    for file_id in shakespeare_corpora
)

# preview a short window of consecutive tokens from one of the plays
some_n_tokens = corpora["shakespeare-caesar.txt"][1002:1050]
print(*some_n_tokens)

Set him before me , let me see his face Cassi . Fellow , come from the throng , look vpon Caesar Caes . What sayst thou to me now ? Speak once againe , Sooth . Beware the Ides of March Caes . He is a Dreamer


## Count Occurrences

In [4]:
# Build a bigram count table: for every token, how often each other token
# appears immediately after it in the corpora (tokens are lowercased first).

# example: from the text "the dog is under the table.", we want to obtain the dictionary
# {
#  "the": { "dog": 1, "table": 1 },
#  "dog": { "is": 1 },
#  "is": { "under": 1 },
#  "under": { "the": 1 },
#  "table": { ".": 1 }
# }

# from_token_to_next_token_counts = { token: { next_token: num_of_occurrences } }
from_token_to_next_token_counts = defaultdict(dict)

for corpus in corpora.values():
  lowered = [word.lower() for word in corpus]
  # pair each token with its successor; pairs never span two different corpora
  for current, following in zip(lowered, lowered[1:]):
    successors = from_token_to_next_token_counts[current]
    successors[following] = successors.get(following, 0) + 1

# show 10 of the tokens that followed "from" in the corpora, together with how
# many times each of them did
print(list(from_token_to_next_token_counts["from"].items())[:10])

[('the', 37), ('caesars', 1), ('your', 5), ('their', 3), ('brutus', 1), ('that', 2), ('seuerall', 1), ('qualitie', 1), ('bondage', 1), ('power', 1)]


## Estimate Probabilities

In [5]:
# Turn the successor counts into probabilities by dividing each count by the
# total number of successors observed for that token.

# example: from the text "the dog is under the table.", we want to obtain the dictionary
# {
#  "the": { "dog": 0.5, "table": 0.5 },
#  "dog": { "is": 1 },
#  "is": { "under": 1 },
#  "under": { "the": 1 },
#  "table": { ".": 1 }
# }

# from_token_to_next_token_probs = { token: { next_token: probability } }
from_token_to_next_token_probs = {}

for token, next_token_counts in from_token_to_next_token_counts.items():
  total = sum(next_token_counts.values())
  probs = {}
  for next_token, count in next_token_counts.items():
    probs[next_token] = count / total
  from_token_to_next_token_probs[token] = probs

# show 10 of the tokens that followed "from" in the corpora, together with
# their estimated probabilities
print(list(from_token_to_next_token_probs["from"].items())[:10])

[('the', 0.19170984455958548), ('caesars', 0.0051813471502590676), ('your', 0.025906735751295335), ('their', 0.015544041450777202), ('brutus', 0.0051813471502590676), ('that', 0.010362694300518135), ('seuerall', 0.0051813471502590676), ('qualitie', 0.0051813471502590676), ('bondage', 0.0051813471502590676), ('power', 0.0051813471502590676)]


## How to sample

In [7]:
# sample the next token according to the computed probabilities
def sample_next_token(token, from_token_to_next_token_probs, rng=None):
  """Draw one successor of `token` at random, weighted by its probability.

  Args:
    token: the current token; must be a key of `from_token_to_next_token_probs`.
    from_token_to_next_token_probs: mapping { token: { next_token: probability } }.
      The probabilities of each inner mapping must sum to 1.
    rng: optional `numpy.random.Generator` used for the draw, for reproducible
      sampling. When omitted, NumPy's global random state is used (it can be
      seeded with `np.random.seed`).

  Returns:
    The sampled next token, exactly as stored in the inner mapping.

  Raises:
    KeyError: if `token` has no entry in `from_token_to_next_token_probs`.
    ValueError: if the entry for `token` is empty, or if NumPy rejects the
      probabilities (e.g. they do not sum to 1).
  """
  try:
    next_token_probs = from_token_to_next_token_probs[token]
  except KeyError:
    raise KeyError(
        "token {!r} has no recorded successors".format(token)
    ) from None
  if not next_token_probs:
    raise ValueError(
        "token {!r} has an empty successor table".format(token)
    )

  next_tokens = list(next_token_probs)
  next_tokens_probs = list(next_token_probs.values())

  # Sample an index instead of the token itself: this avoids converting the
  # tokens to a NumPy array (which would return numpy.str_ values and fail for
  # non-string tokens such as n-gram tuples).
  sampler = np.random if rng is None else rng
  sampled_index = sampler.choice(len(next_tokens), p=next_tokens_probs)
  return next_tokens[sampled_index]

# draw one random successor of "from"; the cells shown set no random seed, so
# the printed token can differ from run to run
print(sample_next_token("from", from_token_to_next_token_probs))

fiffe


In [8]:
# repeatedly sample tokens to generate long text
def generate_text_from_token(token, from_token_to_next_token_probs, n_words_to_generate):
  """Generate text by walking the bigram model starting from `token`.

  Args:
    token: the first token of the generated text.
    from_token_to_next_token_probs: mapping { token: { next_token: probability } }.
    n_words_to_generate: maximum number of tokens to sample after `token`.

  Returns:
    The starting token followed by the sampled tokens, joined by single spaces.
    The text is shorter than requested if the walk reaches a token that has no
    recorded successor.

  Raises:
    Whatever `sample_next_token` raises when the starting token itself has no
    entry in `from_token_to_next_token_probs` (and at least one token is
    requested).
  """
  generated_tokens = [token]
  for step in range(n_words_to_generate):
    # Dead end: a token seen only as the very last token of a corpus has no
    # successors. Stop early rather than crash part-way through generation.
    # The starting token (step 0) is deliberately not guarded, so an unseen or
    # misspelt start token still fails loudly.
    if step > 0 and not from_token_to_next_token_probs.get(token):
      break
    token = sample_next_token(token, from_token_to_next_token_probs)
    generated_tokens.append(token)
  return " ".join(generated_tokens)

first_token = "from"
n_words_to_generate = 50
generated_text = generate_text_from_token(first_token, from_token_to_next_token_probs, n_words_to_generate)
print(generated_text)

from her coronet weeds : tis an hundred ducates a beere - day , i am bent for them againe to heart : it offends mee , dar ' d into the age dotes on his name of them i know your nature , but change to england , no other


### Generally we should not use more than 3-grams, and probably only 2-grams: even with 3 books of text, longer n-grams occur too rarely to estimate their probabilities!

In [9]:
# merge the tokens of all the corpora into a single lowercased list
all_corpora_tokens = [token.lower() for corpus in corpora.values() for token in corpus]
# the same tokens as one space-separated string
all_corpora_text = " ".join(all_corpora_tokens)

def count_ngram(tokens, ngram):
  """Count how many times the token sequence `ngram` occurs in `tokens`.

  Whole tokens are compared, so an n-gram is never matched inside a longer
  token (a substring search for "from " would also match "therefrom ").

  Args:
    tokens: list of tokens to search.
    ngram: sequence of consecutive tokens to look for.

  Returns:
    The number of positions in `tokens` where `ngram` starts; 0 for an empty
    `ngram`.
  """
  ngram = tuple(ngram)
  if not ngram:
    return 0
  # zip the token list with shifted copies of itself to get every window of
  # len(ngram) consecutive tokens
  windows = zip(*(tokens[offset:] for offset in range(len(ngram))))
  return sum(window == ngram for window in windows)

# see how many specific 1-grams, 2-grams, 3-grams can be found in the corpus
print(count_ngram(all_corpora_tokens, ["from"])) # 1-grams
print(count_ngram(all_corpora_tokens, ["from", "the"])) # 2-grams
print(count_ngram(all_corpora_tokens, ["from", "the", "streets"])) # 3-grams

193
37
1
