In [41]:
import collections
import re
from d2l import torch as d2l
import torch

In [42]:
# Register the Time Machine text under the key 'time_machine' in d2l.DATA_HUB
# as a (url, hex-string) pair. The hex string is presumably the file's
# checksum used by d2l.download -- TODO confirm against the d2l documentation.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
    """Load the Time Machine dataset into a list of cleaned text lines.

    Each line of the downloaded file is normalised as follows: every run of
    characters that is not an ASCII letter is replaced by a single space,
    leading/trailing whitespace is stripped, and the result is lower-cased.
    The returned strings therefore contain only the 26 lowercase letters and
    spaces (blank lines become empty strings).

    Returns:
        list[str]: one cleaned string per line of the source file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (the default when `encoding` is omitted).
    with open(d2l.download('time_machine'), 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

# Load the cleaned lines once, then spot-check: total number of lines,
# the first line, and the line at index 10.
lines = read_time_machine()
print(f'# 文本总行数: {len(lines)}')
print(lines[0])
print(lines[10])

# 文本总行数: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [43]:
# A list of strings: one string per line of the text
lines

['the time machine by h g wells',
 '',
 '',
 '',
 '',
 'i',
 '',
 '',
 'the time traveller for so it will be convenient to speak of him',
 'was expounding a recondite matter to us his grey eyes shone and',
 'twinkled and his usually pale face was flushed and animated the',
 'fire burned brightly and the soft radiance of the incandescent',
 'lights in the lilies of silver caught the bubbles that flashed and',
 'passed in our glasses our chairs being his patents embraced and',
 'caressed us rather than submitted to be sat upon and there was that',
 'luxurious after dinner atmosphere when thought roams gracefully',
 'free of the trammels of precision and he put it to us in this',
 'way marking the points with a lean forefinger as we sat and lazily',
 'admired his earnestness over this new paradox as we thought it',
 'and his fecundity',
 '',
 'you must follow me carefully i shall have to controvert one or two',
 'ideas that are almost universally accepted the geometry for',
 'instance the

## 每个文本序列又被拆分成一个标记列表

In [44]:
# Split every line into individual words or individual characters.
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens.

    Args:
        lines: iterable of strings, one per text line.
        token: 'word' to split each line on whitespace, or 'char' to split
            each line into its individual characters (spaces included).

    Returns:
        list[list[str]]: one token list per input line; an empty line yields
        an empty list.

    Raises:
        ValueError: if `token` is neither 'word' nor 'char'. Failing here is
            deliberate: returning None would only surface later as an
            unrelated error in the caller.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # str() keeps the message buildable even when `token` is not a string.
    raise ValueError('错误：未知令牌类型：' + str(token))

# Word-level tokenization (the default mode); print the token lists of the
# first 11 lines.
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


## 构建一个字典，通常也叫做词汇表（vocabulary），用来将字符串类型的标记映射到从0开始的数字索引中

In [45]:
class Vocab:
    """Vocabulary that maps tokens to integer indices and back.

    Index 0 is always the unknown token '<unk>', followed by any reserved
    tokens, followed by the corpus tokens in order of decreasing frequency.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a token collection.

        Args:
            tokens: 1D list of tokens or 2D list of token lists (passed to
                count_corpus). None is treated as an empty corpus.
            min_freq: tokens occurring fewer than `min_freq` times are left
                out of the index mappings.
            reserved_tokens: optional list or tuple of tokens placed
                directly after '<unk>'.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort by frequency, most frequent first,
        # e.g. [('the', 2261), ('i', 1267), ('and', 1245), ...]
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # The unknown token gets index 0. list() lets callers pass a tuple of
        # reserved tokens without breaking the concatenation.
        self.idx_to_token = ['<unk>'] + list(reserved_tokens)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

        for token, freq in self._token_freqs:
            # The list is sorted by descending frequency, so once one token is
            # too rare every remaining token is too rare as well.
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or an arbitrarily nested list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the '<unk>' index.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or an arbitrarily nested list/tuple of indices, to tokens.

        Recursion mirrors __getitem__, so the result of vocab[nested_tokens]
        can be passed straight back in.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]

    @property
    def unk(self):
        """Index of the unknown token (always 0)."""
        return 0

    @property
    def token_freqs(self):
        """List of (token, count) tuples sorted by descending count."""
        return self._token_freqs

def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: either a flat (1D) list of tokens or a 2D list whose elements
            are lists of tokens. An empty input is handled as the 2D case.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    needs_flattening = len(tokens) == 0 or isinstance(tokens[0], list)
    if not needs_flattening:
        return collections.Counter(tokens)
    # Collapse the list of token lists into a single flat list before counting.
    flat = []
    for line in tokens:
        flat.extend(line)
    return collections.Counter(flat)

In [48]:
# Build the word-level vocabulary and show the first 10 (token, index) pairs.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [52]:
# For lines 0 and 10, print the tokens next to the indices the vocabulary
# assigns to them.
for i in [0, 10]:
    print('文本:', tokens[i])
    print('索引:', vocab[tokens[i]])

文本: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 19, 50, 40, 2183, 2184, 400]
文本: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
索引: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [8]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and vocabulary of the Time Machine text.

    Args:
        max_tokens: when positive, the corpus is truncated to its first
            `max_tokens` entries; otherwise the full corpus is returned.

    Returns:
        (corpus, vocab): `corpus` is a flat list of token indices and `vocab`
        is the Vocab built from the character tokens.
    """
    char_lines = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_lines)
    # A text line is not necessarily a sentence or a paragraph, so all lines
    # are flattened into one long index sequence.
    corpus = []
    for chars in char_lines:
        for ch in chars:
            corpus.append(vocab[ch])
    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

# NOTE: this rebinds the global `vocab` from the word-level vocabulary built
# earlier to the character-level vocabulary returned here.
corpus, vocab = load_corpus_time_machine()
len(corpus), len(vocab)

(170580, 28)

## 尝试自己实现

In [None]:
# Flatten the per-line token lists in `tokens` into a single flat token list.
all_tokens = [token for each in tokens for token in each]

class My_Vocab:
    """Hand-written vocabulary built from a flat list of tokens.

    Index 0 is reserved for the unknown token '<unk>'; the remaining tokens
    are indexed in the order returned by Counter.most_common(), i.e. by
    decreasing frequency.
    """
    def __init__(self, all_tokens, min_freq=0):
        """Count the tokens and build both lookup tables.

        Args:
            all_tokens: flat iterable of hashable tokens.
            min_freq: minimum number of occurrences a token needs in order to
                be indexed; tokens seen fewer than `min_freq` times are
                dropped (a token seen exactly `min_freq` times is kept).
        """
        self.token_freq = collections.Counter(all_tokens).most_common()
        self.token = ['<unk>']
        self.token_idx = {'<unk>': 0}
        for token, freq in self.token_freq:
            # most_common() is sorted by descending count, so the first
            # too-rare token means all remaining ones are too rare as well.
            if freq < min_freq:
                break
            # Skip tokens that are already indexed (e.g. a literal '<unk>' in
            # the corpus) so index 0 keeps pointing at the unknown token.
            if token in self.token_idx:
                continue
            self.token.append(token)
            self.token_idx[token] = len(self.token) - 1

    def get_token_freq(self, return_format='list'):
        """Get all the tokens with freq, not including '<unk>'

        Args:
            return_format: 'list' for a list of (token, count) tuples sorted
                by descending count, or 'dict' for a {token: count} dict.
        """
        assert return_format in ('list', 'dict')
        if return_format == 'dict':
            return dict(self.token_freq)
        return self.token_freq

    def __len__(self):
        """Return the number of tokens in the vocabulary, including '<unk>'"""
        return len(self.token)

    def token_to_index(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to 0, the '<unk>' index.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_idx.get(tokens, 0)
        return [self.token_to_index(token) for token in tokens]

    def index_to_token(self, indices):
        """Map an index, or a (nested) list/tuple of indices, to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.token[indices]
        return [self.index_to_token(index) for index in indices]

    @property
    def get_token(self):
        """Get all the tokens, including '<unk>'"""
        return self.token

    @property
    def get_token_idx(self):
        """Get all the tokens with indices, including 'unk'"""
        return self.token_idx

In [115]:
# Build the hand-written vocabulary from the flat token list (default min_freq).
my_vocab = My_Vocab(all_tokens)

In [116]:
# Token frequencies as a {token: count} dict.
my_vocab.get_token_freq('dict')

{'the': 2261,
 'i': 1267,
 'and': 1245,
 'of': 1155,
 'a': 816,
 'to': 695,
 'was': 552,
 'in': 541,
 'that': 443,
 'my': 440,
 'it': 437,
 'had': 354,
 'me': 281,
 'as': 270,
 'at': 243,
 'for': 221,
 'with': 216,
 'but': 204,
 'time': 200,
 'were': 158,
 'this': 152,
 'you': 137,
 'on': 137,
 'then': 134,
 'his': 129,
 'there': 127,
 'he': 123,
 'have': 122,
 'they': 122,
 'from': 122,
 'one': 120,
 'all': 118,
 'not': 114,
 'into': 114,
 'upon': 113,
 'little': 113,
 'so': 112,
 'is': 106,
 'came': 105,
 'by': 103,
 'some': 94,
 'be': 93,
 'no': 92,
 'could': 92,
 'their': 91,
 'said': 89,
 'saw': 88,
 'down': 87,
 'them': 86,
 'machine': 85,
 'which': 85,
 'very': 85,
 'or': 84,
 'an': 84,
 'we': 82,
 'now': 79,
 'what': 77,
 'been': 75,
 'these': 74,
 'like': 74,
 'her': 74,
 'out': 73,
 'seemed': 72,
 'up': 71,
 'man': 70,
 'about': 70,
 's': 70,
 'its': 69,
 'thing': 66,
 'again': 62,
 'traveller': 61,
 'would': 60,
 'more': 59,
 'white': 59,
 'our': 57,
 'thought': 57,
 'felt':

In [117]:
# get_token is a property, so it is read without parentheses.
my_vocab.get_token

['<unk>',
 'the',
 'i',
 'and',
 'of',
 'a',
 'to',
 'was',
 'in',
 'that',
 'my',
 'it',
 'had',
 'me',
 'as',
 'at',
 'for',
 'with',
 'but',
 'time',
 'were',
 'this',
 'you',
 'on',
 'then',
 'his',
 'there',
 'he',
 'have',
 'they',
 'from',
 'one',
 'all',
 'not',
 'into',
 'upon',
 'little',
 'so',
 'is',
 'came',
 'by',
 'some',
 'be',
 'no',
 'could',
 'their',
 'said',
 'saw',
 'down',
 'them',
 'machine',
 'which',
 'very',
 'or',
 'an',
 'we',
 'now',
 'what',
 'been',
 'these',
 'like',
 'her',
 'out',
 'seemed',
 'up',
 'man',
 'about',
 's',
 'its',
 'thing',
 'again',
 'traveller',
 'would',
 'more',
 'white',
 'our',
 'thought',
 'felt',
 'when',
 'over',
 'weena',
 'still',
 'world',
 'myself',
 'even',
 'must',
 'through',
 'if',
 'hand',
 'went',
 'first',
 'are',
 'before',
 'last',
 'towards',
 'only',
 'people',
 'she',
 'morlocks',
 'see',
 'too',
 'found',
 'how',
 'here',
 'light',
 'great',
 'under',
 'did',
 'him',
 'any',
 'began',
 'back',
 'night',
 'face

In [118]:
# get_token_idx is a property as well; it exposes the token -> index dict.
my_vocab.get_token_idx

{'<unk>': 0,
 'the': 1,
 'i': 2,
 'and': 3,
 'of': 4,
 'a': 5,
 'to': 6,
 'was': 7,
 'in': 8,
 'that': 9,
 'my': 10,
 'it': 11,
 'had': 12,
 'me': 13,
 'as': 14,
 'at': 15,
 'for': 16,
 'with': 17,
 'but': 18,
 'time': 19,
 'were': 20,
 'this': 21,
 'you': 22,
 'on': 23,
 'then': 24,
 'his': 25,
 'there': 26,
 'he': 27,
 'have': 28,
 'they': 29,
 'from': 30,
 'one': 31,
 'all': 32,
 'not': 33,
 'into': 34,
 'upon': 35,
 'little': 36,
 'so': 37,
 'is': 38,
 'came': 39,
 'by': 40,
 'some': 41,
 'be': 42,
 'no': 43,
 'could': 44,
 'their': 45,
 'said': 46,
 'saw': 47,
 'down': 48,
 'them': 49,
 'machine': 50,
 'which': 51,
 'very': 52,
 'or': 53,
 'an': 54,
 'we': 55,
 'now': 56,
 'what': 57,
 'been': 58,
 'these': 59,
 'like': 60,
 'her': 61,
 'out': 62,
 'seemed': 63,
 'up': 64,
 'man': 65,
 'about': 66,
 's': 67,
 'its': 68,
 'thing': 69,
 'again': 70,
 'traveller': 71,
 'would': 72,
 'more': 73,
 'white': 74,
 'our': 75,
 'thought': 76,
 'felt': 77,
 'when': 78,
 'over': 79,
 'weena':

In [119]:
# Look up the indices for a list of tokens.
my_vocab.token_to_index(['<unk>', 'i'])

[0, 2]

In [120]:
# Map a list of indices back to their tokens.
my_vocab.index_to_token([0, 2])

['<unk>', 'i']

In [None]:
|