In [1]:
import math
import torch
from torch import nn
from torch.nn import functional as F
import random
import hashlib
import os
import requests
import re
import collections
# Base URL under which the dataset files are hosted.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# Registry of downloadable datasets: name -> (download URL, expected SHA-1 hex digest).
DATA_HUB = {
    'time_machine': (
        DATA_URL + 'timemachine.txt',
        '090b5e7e70c295757f55df93cb0a180b9691891a',
    ),
}

def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local file name.

    If a file with the expected name already exists in ``cache_dir`` and its
    SHA-1 digest matches the one registered in DATA_HUB, the cached copy is
    returned without touching the network.

    Args:
        name: Key into DATA_HUB identifying the dataset.
        cache_dir: Directory used to store downloaded files (created if missing).

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully loaded in memory.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, verify=True) as r:
        # Fail loudly on 4xx/5xx instead of caching the error page as the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body; `r.content` would buffer it all at once.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
def read_time_machine():
    """Load the time-machine text as a list of normalised lines.

    Each line has every run of non-letter characters replaced by a single
    space, is stripped of surrounding whitespace and is lower-cased.
    """
    path = download('time_machine')
    with open(path, 'r') as fp:
        raw_lines = fp.readlines()

    cleaned = []
    for raw in raw_lines:
        letters_only = re.sub('[^A-Za-z]+', ' ', raw)
        cleaned.append(letters_only.strip().lower())
    return cleaned
def tokenize(lines,token='word'):
    """Split each text line into tokens.

    Args:
        lines: Iterable of strings.
        token: ``'word'`` to split on whitespace, ``'char'`` to split into
            single characters.

    Returns:
        A list with one token list per line, or ``None`` (after printing a
        message) when ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'char':
        return list(map(list, lines))
    if token == 'word':
        tokenized = []
        for line in lines:
            tokenized.append(line.split())
        return tokenized
    # Unknown token type: report it and fall through to an implicit None.
    print('错误')
    return None
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is the unknown token ``'<unk>'``, followed by the reserved tokens
    in the order given, followed by corpus tokens in descending frequency
    order (ties keep first-seen order).

    Attributes set by the constructor:
        token_freqs: List of ``(token, count)`` pairs sorted by count, descending.
        unk: Index returned for tokens that are not in the vocabulary (0).
        idx_to_token: List mapping index -> token.
        token_to_idx: Dict mapping token -> index.
    """
    def __init__(self,tokens=None,min_freq=0,reserved_token=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists; ``None`` means empty.
            min_freq: Corpus tokens occurring fewer times than this are dropped.
            reserved_token: Extra tokens placed right after ``'<unk>'``.
        """
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []
        counter = self.count_corpus(tokens)
        self.token_freqs = sorted(counter.items(),key=lambda x :x[1],reverse=True)

        self.unk = 0
        self.idx_to_token = ['<unk>'] + list(reserved_token)
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }
        for token, freq in self.token_freqs:
            # Skip rare tokens and anything already present as '<unk>'/reserved.
            if freq >= min_freq and token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)

    def count_corpus(self,tokens):
        """Count token frequencies; a list of token lists is flattened first."""
        if len(tokens)==0 or isinstance(tokens[0],list):
            tokens = [token for line in tokens
                      for token in line]
        return collections.Counter(tokens)

    def __len__(self):
        """Return the number of entries, including '<unk>' and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self,indices):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to ``self.unk`` instead of
        raising ``KeyError``.
        """
        if not isinstance(indices,(list,tuple)):
            return self.token_to_idx.get(indices, self.unk)
        return [self.__getitem__(index) for index in indices]

    def to_tokens(self,indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
# Hyper-parameters passed to load_data_time_machine further down in this file.
# NOTE(review): `bath_size` looks like a typo for `batch_size`; it is referenced
# by this exact name when train_iter is built, so rename both places together.
bath_size ,num_step = 32,35
def seq_data_iter_random(corpus,bathch_size,num_step):
    """Yield mini-batches of subsequences drawn in random order.

    Args:
        corpus: Sequence of token indices.
        bathch_size: Number of subsequences per mini-batch.
        num_step: Length of every subsequence.

    Yields:
        ``(X, Y)`` tensors of shape ``(bathch_size, num_step)`` where ``Y`` is
        ``X`` shifted one position to the right in the corpus.
    """
    # Drop a random prefix (0 .. num_step-1 tokens) so partitions vary per call.
    start = random.randint(0, num_step - 1)
    corpus = corpus[start:]

    # The "- 1" keeps one trailing token available as the last label.
    subseq_count = (len(corpus) - 1) // num_step
    starts = [k * num_step for k in range(subseq_count)]
    random.shuffle(starts)

    batch_count = subseq_count // bathch_size
    for b in range(batch_count):
        chosen = starts[b * bathch_size:(b + 1) * bathch_size]
        X = [corpus[p:p + num_step] for p in chosen]
        Y = [corpus[p + 1:p + 1 + num_step] for p in chosen]
        yield torch.tensor(X),torch.tensor(Y)

def seq_data_iter_sequential(corpus,bath_size,num_step):
    """Yield mini-batches whose subsequences are adjacent across iterations.

    Args:
        corpus: Sequence of token indices.
        bath_size: Number of rows (parallel streams) per mini-batch.
        num_step: Number of columns (time steps) per mini-batch.

    Yields:
        ``(x, y)`` tensors of shape ``(bath_size, num_step)`` where ``y`` is
        ``x`` shifted one position to the right in the corpus.
    """
    # random.randint is inclusive on both ends, so offset lies in [0, num_step].
    offset = random.randint(0, num_step)
    # Largest multiple of bath_size that still leaves one token for the last label.
    usable = (len(corpus) - offset - 1) // bath_size * bath_size

    inputs = torch.tensor(corpus[offset:offset + usable])
    # The extra list nesting only adds a leading axis, removed by the reshape below.
    labels = torch.tensor([corpus[offset + 1: offset + 1 + usable]])
    inputs = inputs.reshape(bath_size, -1)
    labels = labels.reshape(bath_size, -1)

    batch_count = inputs.shape[-1] // num_step
    for k in range(batch_count):
        lo = k * num_step
        hi = lo + num_step
        yield inputs[:, lo:hi], labels[:, lo:hi]
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level corpus of the time-machine text and its vocabulary.

    Args:
        max_tokens: If positive, the corpus is truncated to this many indices.

    Returns:
        ``(corpus, vocab)`` where ``corpus`` is a flat list of token indices.
    """
    char_lines = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_lines)

    # Flatten all lines into one long index sequence.
    corpus = []
    for line in char_lines:
        for ch in line:
            corpus.append(vocab[ch])

    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus,vocab

class SeqDataloader:
    """Iterable that loads the time-machine corpus and serves sequence mini-batches."""

    def __init__(self,batch_size,num_step,use_random_split,max_token):
        """Load the corpus and choose the sampling strategy.

        Args:
            batch_size: Number of subsequences per mini-batch.
            num_step: Length of each subsequence.
            use_random_split: True for random sampling, False for sequential partitioning.
            max_token: Forwarded to load_corpus_time_machine as ``max_tokens``.
        """
        self.data_iter_fn = (
            seq_data_iter_random if use_random_split else seq_data_iter_sequential
        )
        loaded = load_corpus_time_machine(max_tokens=max_token)
        self.corpus, self.vocab = loaded
        self.batch_size = batch_size
        self.num_step = num_step

    def __iter__(self):
        """Start a fresh pass over the corpus with the chosen iterator function."""
        make_iter = self.data_iter_fn
        return make_iter(self.corpus, self.batch_size, self.num_step)
def load_data_time_machine(batch_size,
                           num_step,
                           use_random_split=False,
                           max_token=1000):
    """Create the time-machine data iterator together with its vocabulary.

    Args:
        batch_size: Number of subsequences per mini-batch.
        num_step: Length of each subsequence.
        use_random_split: True for random sampling, False for sequential partitioning.
        max_token: Upper bound on the number of corpus tokens to keep.

    Returns:
        ``(data_iter, vocab)``.
    """
    loader = SeqDataloader(
        batch_size,
        num_step,
        use_random_split=use_random_split,
        max_token=max_token,
    )
    return loader, loader.vocab

# Build the training iterator and vocabulary with the defaults of
# load_data_time_machine: sequential partitioning and max_token=1000.
train_iter,vocab = load_data_time_machine(batch_size=bath_size,num_step=num_step)

In [13]:
# One-hot encode indices 0 and 2, using one class per vocabulary entry.
F.one_hot(torch.tensor([0,2]),len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [6]:
# Transposing the (2, 5) index matrix before one-hot encoding with 28 classes
# gives a result of shape (5, 2, 28).
X = torch.arange(10).reshape((2,5))
F.one_hot(X.T,28).shape

torch.Size([5, 2, 28])

In [None]:
def get_params(vocab_size,num_hiddens,devices):
    """Create the trainable parameters of a single-layer RNN.

    Args:
        vocab_size: Used as both the input and the output dimension.
        num_hiddens: Number of hidden units.
        devices: Device on which the tensors are allocated.

    Returns:
        ``[W_xh, W_hh, b_h, W_hq, b_q]``, all with ``requires_grad=True``.
        Weights are drawn from a normal distribution scaled by 0.01; biases
        are zero.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # randn samples N(0, 1); torch.rand would give uniform [0, 1) values,
        # i.e. strictly non-negative weights.
        return torch.randn(size=shape,device=devices) * 0.01

    # Hidden-layer parameters.
    W_xh = normal((num_inputs,num_hiddens))
    W_hh = normal((num_hiddens,num_hiddens))
    b_h = torch.zeros(num_hiddens,device=devices)

    # Output-layer parameters.
    W_hq = normal((num_hiddens,num_outputs))
    b_q = torch.zeros(num_outputs,device=devices)

    params = [W_xh,W_hh,b_h,W_hq,b_q]

    for param in params:
        param.requires_grad_(True)

    return params

In [None]:
def init_rnn_stats(bath_size,num_hidden,device):
    """Return the initial RNN state: a 1-tuple holding an all-zero hidden state.

    The hidden state has shape ``(bath_size, num_hidden)`` and lives on ``device``.
    """
    hidden = torch.zeros(size=(bath_size, num_hidden), device=device)
    return (hidden,)

In [None]:
def rnn(inputs,state,params):
    """Forward computation of the RNN (the definition looks unfinished in this view).

    NOTE(review): within the visible lines nothing is appended to `outputs`
    and nothing is returned.
    """
    # Unpack in the order in which get_params builds the list.
    W_xh , W_hh ,b_h,W_hq,b_q = params

    # `state` is unpacked as a 1-tuple, matching what init_rnn_stats returns.
    H, =state
    outputs = []

    # NOTE(review): `input` is the Python builtin, not the `inputs` parameter;
    # iterating it raises TypeError. Presumably `inputs` was intended.
    for X in input:
        # NOTE(review): this adds the `torch` module itself to a tensor, which
        # fails at runtime. Presumably the recurrent term (H with W_hh, plus
        # b_h) was intended here; confirm with the author.
        H = torch.tanh(torch.mm(X,W_xh) + torch)