# Parsing text into words

In [1]:
# the text: three short quotes, one string per list entry
text = [
    'All that we are is the result of what we have thought',
    'To be or not to be that is the question',
    'Be yourself everyone else is already taken',
]

text

['All that we are is the result of what we have thought',
 'To be or not to be that is the question',
 'Be yourself everyone else is already taken']

In [3]:
# separate the first sentence into words by splitting at every space
import re
first_sentence = text[0]
re.split(' ', first_sentence)

['All',
 'that',
 'we',
 'are',
 'is',
 'the',
 'result',
 'of',
 'what',
 'we',
 'have',
 'thought']

In [4]:
# joining the pieces with the same separator restores the original sentence
first_sentence_words = re.split(' ', text[0])
' '.join(first_sentence_words)

'All that we are is the result of what we have thought'

In [5]:
# merge all sentences into one lower-case string, then split it into words
joined_lower = ' '.join(text).lower()
allwords = re.split(" ", joined_lower)
allwords

['all',
 'that',
 'we',
 'are',
 'is',
 'the',
 'result',
 'of',
 'what',
 'we',
 'have',
 'thought',
 'to',
 'be',
 'or',
 'not',
 'to',
 'be',
 'that',
 'is',
 'the',
 'question',
 'be',
 'yourself',
 'everyone',
 'else',
 'is',
 'already',
 'taken']

# Create a vocabulary

In [6]:
# find the unique words: drop duplicates, then order them alphabetically
vocab = list(set(allwords))
vocab.sort()
vocab

['all',
 'already',
 'are',
 'be',
 'else',
 'everyone',
 'have',
 'is',
 'not',
 'of',
 'or',
 'question',
 'result',
 'taken',
 'that',
 'the',
 'thought',
 'to',
 'we',
 'what',
 'yourself']

In [7]:
# compare the total word count with the number of distinct words
size_report = f'There are {len(allwords)} words in the text, and {len(vocab)} words in the vocabulary'
print(size_report)

There are 29 words in the text, and 21 words in the vocabulary


# Create an encoder and decoder

In [8]:
# encoder: a python dict mapping each vocabulary word to its position in vocab
word2idx = {word: i for i, word in enumerate(vocab)}

word2idx

{'all': 0,
 'already': 1,
 'are': 2,
 'be': 3,
 'else': 4,
 'everyone': 5,
 'have': 6,
 'is': 7,
 'not': 8,
 'of': 9,
 'or': 10,
 'question': 11,
 'result': 12,
 'taken': 13,
 'that': 14,
 'the': 15,
 'thought': 16,
 'to': 17,
 'we': 18,
 'what': 19,
 'yourself': 20}

In [9]:
# decoder: the reverse lookup, from position in vocab back to the word
idx2word = dict(enumerate(vocab))

idx2word

{0: 'all',
 1: 'already',
 2: 'are',
 3: 'be',
 4: 'else',
 5: 'everyone',
 6: 'have',
 7: 'is',
 8: 'not',
 9: 'of',
 10: 'or',
 11: 'question',
 12: 'result',
 13: 'taken',
 14: 'that',
 15: 'the',
 16: 'thought',
 17: 'to',
 18: 'we',
 19: 'what',
 20: 'yourself'}

In [10]:
# spot-check one lookup in each direction (word -> index, index -> word)
print(f'The word "to" has index {word2idx["to"]}',
      f'The index "7" maps to the word "{idx2word[7]}"',
      sep='\n')

The word "to" has index 17
The index "7" maps to the word "is"


# Make fake quotes

In [11]:
# select random words from the dict
import numpy as np

# Use a seeded generator so the same "random" words are drawn on every
# Restart & Run All; change the seed to get a different fake quote.
rng = np.random.default_rng(42)

# `high` is exclusive, so the draws lie in 0 .. len(vocab)-1
rand_idx = rng.integers(0, len(vocab), size=5)

# decode each sampled index back into its word
[idx2word[i] for i in rand_idx]

['all', 'is', 'be', 'we', 'to']

In [12]:
# glue the sampled words together into a single fake quote
' '.join(idx2word[i] for i in rand_idx)

'all is be we to'

# A peek at tokenization

In [13]:
# text to numbers: look up the index of every word, in reading order
text_as_int = [word2idx[w] for w in allwords]
text_as_int

[0,
 14,
 18,
 2,
 7,
 15,
 12,
 9,
 19,
 18,
 6,
 16,
 17,
 3,
 10,
 8,
 17,
 3,
 14,
 7,
 15,
 11,
 3,
 20,
 5,
 4,
 7,
 1,
 13]

In [14]:
# numbers back to text: decode each token with idx2word and print it
# next to its index (the `:2` pads the index to width 2 so the columns align)
for token_i in text_as_int:
    print(f"Token {token_i:2}: {idx2word[token_i]}")

Token  0: all
Token 14: that
Token 18: we
Token  2: are
Token  7: is
Token 15: the
Token 12: result
Token  9: of
Token 19: what
Token 18: we
Token  6: have
Token 16: thought
Token 17: to
Token  3: be
Token 10: or
Token  8: not
Token 17: to
Token  3: be
Token 14: that
Token  7: is
Token 15: the
Token 11: question
Token  3: be
Token 20: yourself
Token  5: everyone
Token  4: else
Token  7: is
Token  1: already
Token 13: taken
