In [13]:
import collections
import random
import re
import torch
from d2l import torch as d2l

Preprocessing pipeline for text:
- Load text as strings into memory.

- Split the strings into tokens (e.g., words or characters).

- Build a vocabulary dictionary to associate each vocabulary element with a numerical index.

- Convert the text into sequences of numerical indices.

Here, we will work with H. G. Wells’ The Time Machine, a book containing just over 30000 words.

In [14]:
class TimeMachine(d2l.DataModule): #@save
    """Dataset wrapper for the text of the book "The Time Machine"."""
    def _download(self):
        """Fetch the book through ``d2l.download`` and return its text.

        Returns:
            The entire contents of the downloaded file as a single string.
        """
        source_url = d2l.DATA_URL + 'timemachine.txt'
        # Presumably the checksum d2l.download uses to validate the file --
        # confirm against the d2l API.
        expected_hash = '090b5e7e70c295757f55df93cb0a180b9691891a'
        local_path = d2l.download(source_url, self.root, expected_hash)
        with open(local_path) as book:
            contents = book.read()
        return contents
# Instantiate the dataset and read the whole book into one string.
data = TimeMachine()
raw_text = data._download()
# Preview the first 60 characters of the raw text as the cell output.
raw_text[:60]

'The Time Machine, by H. G. Wells [1898]\n\n\n\n\nI\n\n\nThe Time Tra'

For simplicity, we ignore punctuation and capitalization when preprocessing the raw text.

In [15]:
@d2l.add_to_class(TimeMachine)  #@save
def _preprocess(self, text):
    """Normalize text: non-letter runs become one space, then lowercase.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    return letters_only.lower()

# Clean the raw text (letters and single spaces only, lowercased) and
# preview its first 60 characters.
text = data._preprocess(raw_text)
text[:60]

'the time machine by h g wells i the time traveller for so it'

### Tokenization

In [16]:
@d2l.add_to_class(TimeMachine)  #@save
def _tokenize(self, text):
    """Split ``text`` into a list with one token per element.

    For a string input this yields one token per character.
    """
    return [char for char in text]

# Tokenize the cleaned text character by character and preview the first
# 30 tokens, comma-separated.
tokens = data._tokenize(text)
','.join(tokens[:30])

't,h,e, ,t,i,m,e, ,m,a,c,h,i,n,e, ,b,y, ,h, ,g, ,w,e,l,l,s, '

### Vocabulary

These tokens are still strings. However, the inputs to our models must ultimately be numerical. Next, we introduce a class for constructing vocabularies, i.e., objects that associate each distinct token value with a unique index. First, we determine the set of unique tokens in our training corpus. We then assign a numerical index to each unique token. Rare vocabulary elements are often dropped for convenience. Whenever we encounter a token at training or test time that had not been previously seen or was dropped from the vocabulary, we represent it by a special “`<unk>`” token, signifying that this is an unknown value.

In [17]:
class Vocab:  #@save
    """Vocabulary for text: a two-way mapping between tokens and indices."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Build the vocabulary from a corpus of tokens.

        Args:
            tokens: A flat list of tokens, or a list of lists of tokens
                (which is flattened before counting).
            min_freq: Corpus tokens that occur fewer than ``min_freq`` times
                are left out of the vocabulary.
            reserved_tokens: List of tokens that are always part of the
                vocabulary, regardless of their frequency.
        """
        # NOTE: the list defaults are only read, never mutated, so sharing
        # them between calls is harmless here.
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        # (token, frequency) pairs ordered from most to least frequent
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens: set() removes duplicates and sorted()
        # fixes a deterministic order, so a token's index is its position
        # in the sorted list.
        self.idx_to_token = sorted(set(
            ['<unk>'] + reserved_tokens +
            [token for token, freq in self.token_freqs if freq >= min_freq]))
        # Inverse mapping: token -> index
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of unique tokens in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        A single token yields a single index; tokens that are not in the
        vocabulary map to the index of '<unk>'. A list or tuple yields a
        list of the same length.
        """
        if not isinstance(tokens, (list, tuple)):
            # Index of the token if known, otherwise the '<unk>' index
            return self.token_to_idx.get(tokens, self.unk)
        # Recurse so that every element is converted to its index
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a collection of indices, back to token strings.

        A list or tuple of any length (including empty or single-element)
        yields a list, so ``to_tokens`` inverts ``__getitem__`` for known
        tokens. Any other sized object with more than one element also
        yields a list; everything else is used directly as an index.
        """
        if isinstance(indices, (list, tuple)):
            # Previously [] and [i] fell through to idx_to_token[indices]
            # and raised TypeError because a list is not a valid index.
            return [self.idx_to_token[int(index)] for index in indices]
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']
        


In [20]:
# Build the character vocabulary, convert the first ten tokens to indices,
# and map those indices back to tokens.
vocab = Vocab(tokens)
indices = vocab[tokens[:10]]
print('indices:', indices)
# The attribute access was left unfinished (`vocab.`), which is a
# SyntaxError; complete it with the index -> token lookup.
print('tokens:', vocab.to_tokens(indices))




# Build a vocabulary from the character tokens, then round-trip the first
# ten tokens: tokens -> indices -> tokens.
vocab = Vocab(tokens)
indices = vocab[tokens[:10]]
print('indices:', indices)
print('words:', vocab.to_tokens(indices))

indices: [21, 9, 6, 0, 21, 10, 14, 6, 0, 14]
words: ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm']
