In [1]:
import collections
import re
from d2l import torch as d2l

In [2]:
# Download the dataset: register the Time Machine text under the key
# 'time_machine' in d2l's data hub as a (url, hash) tuple.
# NOTE(review): the second element is presumably the file's SHA-1 checksum
# used by d2l.download for verification — confirm against the d2l source.
d2l.DATA_HUB['time_machine'] =  (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():
    """Read the Time Machine text and return it as a list of cleaned lines.

    The file path comes from ``d2l.download('time_machine')``. In every line,
    each run of non-letter characters is replaced by a single space, then the
    line is stripped of surrounding whitespace and lower-cased.

    Returns:
        list[str]: one cleaned string per line of the file (blank lines
        become empty strings).
    """
    path = d2l.download('time_machine')
    with open(path) as handle:
        raw_lines = handle.readlines()

    cleaned = []
    for raw in raw_lines:
        # Keep only letters; everything else collapses into one space.
        letters_only = re.sub('[^A-Za-z]+',' ',raw)
        cleaned.append(letters_only.strip().lower())
    return cleaned
    
# Load the cleaned text once; later cells reuse `lines`.
lines = read_time_machine()

Downloading ..\data\timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...


In [3]:
# Peek at two cleaned lines. Only the last expression of a cell is displayed,
# so the value of lines[1] is evaluated but not shown.
lines[1]
lines[10]

'twinkled and his usually pale face was flushed and animated the'

In [6]:
# Tokenization
def tokenize(lines,token = 'word'):
    """Split each text line into a list of tokens.

    Args:
        lines: iterable of strings, one per text line.
        token: 'word' to split each line on whitespace, or 'char' to split
            each line into its individual characters.

    Returns:
        list[list[str]]: one token list per input line (an empty line yields
        an empty list).

    Raises:
        ValueError: if ``token`` is neither 'word' nor 'char'. Previously the
            function printed a message and returned None, which only failed
            later at the caller with an unrelated error.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    raise ValueError(
        f"error token! unknown token type {token!r}; expected 'word' or 'char'")

# Tokenize the cleaned lines into words and print the token lists of the
# first 11 lines (blank lines show up as empty lists).
tokens = tokenize(lines,'word')
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [10]:
def count_corpus(tokens):
    """Count how often each token occurs.

    Args:
        tokens: either a flat sequence of tokens (e.g. ``['a', 'b', 'a']``)
            or a sequence of token lists, one per line
            (e.g. ``[['a', 'b'], ['a']]``). ``None`` is treated as empty.

    Returns:
        collections.Counter: mapping token -> number of occurrences.
    """
    if tokens is None:
        return collections.Counter()
    # Flatten only when the elements themselves are token lists. Checking the
    # outer container (which is always a list) would also "flatten" a flat
    # list of word tokens into individual characters.
    if len(tokens) > 0 and isinstance(tokens[0], (list, tuple)):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [17]:
# Vocabulary class
class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 is always '<unk>', followed by the reserved tokens, followed by
    the corpus tokens in order of decreasing frequency.
    """
    def __init__(self,tokens=None,min_freq=0,resereverd_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: flat list of tokens or list of token lists; passed to
                ``count_corpus``. ``None`` means an empty corpus.
            min_freq: corpus tokens occurring fewer than this many times are
                left out of the vocabulary.
            resereverd_tokens: extra tokens placed right after '<unk>'.
                ``None`` means no reserved tokens.
        """
        # Replace the None defaults on the local names so the code below can
        # use them directly, whether or not the caller supplied values.
        if tokens is None:
            tokens = []
        if resereverd_tokens is None:
            resereverd_tokens = []
        # Copy so later changes to the caller's list do not affect the vocab.
        self.resereverd_tokens = list(resereverd_tokens)

        counter = count_corpus(tokens)
        # (token, frequency) pairs, most frequent first.
        self._token_freq = sorted(counter.items(),key=lambda x:x[1],reverse=True)
        self.idx_to_token = ['<unk>'] + self.resereverd_tokens
        self.token_to_idx = {token:idx for idx,token in enumerate(self.idx_to_token)}

        for token,freq in self._token_freq:
            if freq<min_freq:
                # Sorted by descending frequency: everything after is rarer.
                break
            # Skip tokens that already have an index ('<unk>' or a reserved
            # token that also appears in the corpus) to avoid duplicates.
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self,tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the '<unk>' index.
        """
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self,indices):
        """Map an index, or a (nested) list/tuple of indices, back to tokens."""
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]

    @property
    def unk(self):
        """Index of the unknown token '<unk>'."""
        return 0

    @property
    def token_freqs(self):
        """List of (token, frequency) pairs sorted by descending frequency."""
        return self._token_freq


In [19]:
# Build a vocabulary from the word tokens and print the first 10
# (token, index) pairs in insertion order.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [20]:
# For each of the first 10 lines, print its tokens and then the matching
# vocabulary indices.
for i in range(10):
    print(tokens[i])
    print(vocab[tokens[i]])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[1, 19, 50, 40, 2183, 2184, 400]
[]
[]
[]
[]
[]
[]
[]
[]
['i']
[2]
[]
[]
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
[1, 19, 71, 16, 37, 11, 115, 42, 680, 6, 586, 4, 108]
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
[7, 1420, 5, 2185, 587, 6, 126, 25, 330, 127, 439, 3]


In [22]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and vocabulary of the text.

    The cleaned lines are tokenized into characters, a ``Vocab`` is built
    from them, and all lines are flattened into one list of indices.

    Args:
        max_tokens: when greater than 0, keep only the first ``max_tokens``
            indices; otherwise keep the whole corpus.

    Returns:
        tuple: ``(corpus, vocab)`` where ``corpus`` is a flat list of
        character indices and ``vocab`` is the ``Vocab`` built from the text.
    """
    char_lines = tokenize(read_time_machine(),'char')
    vocab = Vocab(char_lines)

    # Flatten line by line; indexing the vocab with a list of characters
    # yields the list of their indices.
    corpus = []
    for chars in char_lines:
        corpus.extend(vocab[chars])

    if max_tokens>0:
        corpus = corpus[:max_tokens]
    return corpus,vocab

In [24]:
# Load the character-level corpus and vocabulary and display their sizes.
# NOTE: this rebinds `vocab`, which earlier cells built from word tokens.
corpus,vocab = load_corpus_time_machine()
len(corpus),len(vocab)

(170580, 28)