In [15]:
import math
import torch
from torch import nn
from torch.nn import functional as F
import random
import hashlib
import os
import requests
import re
import collections
# Registry of downloadable datasets: name -> (url, sha1 hex digest).
# download() unpacks each entry as `url, sha1_hash`.
DATA_HUB = dict()
# Common URL prefix for the files registered below.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

# Text corpus used by read_time_machine().
DATA_HUB['time_machine'] = (DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    If a file with the expected name already exists in ``cache_dir`` and its
    SHA-1 digest matches the registered one, the cached copy is reused.
    Otherwise the file is (re-)downloaded and written to ``cache_dir``.

    Args:
        name: key of the dataset in ``DATA_HUB``.
        cache_dir: directory used to store downloaded files; created if missing.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: if ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully in memory.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, verify=True) as r:
        # Fail loudly instead of caching an HTTP error page as the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body to disk rather than buffering it whole.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
def read_time_machine():
    """Load the 'time_machine' text as a list of cleaned, lower-cased lines.

    Every run of non-letter characters is replaced by a single space, then each
    line is stripped and lower-cased. Empty lines are kept as empty strings.

    Returns:
        list[str]: one cleaned string per line of the file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(download('time_machine'),'r',encoding='utf-8') as fp:
        lines = fp.readlines()
    return [re.sub('[^A-Za-z]+',' ',line).strip().lower() for line in lines]
def tokenize(lines,token='word'):
    """Split text lines into word or character tokens.

    Args:
        lines: iterable of strings.
        token: ``'word'`` to split on whitespace, ``'char'`` to split into
            single characters.

    Returns:
        list[list[str]]: one token list per input line.

    Raises:
        ValueError: if ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # Previously this only printed a message and returned None, which made
    # callers fail later with an unrelated error.
    raise ValueError(f'错误: 未知词元类型: {token!r}')
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always ``'<unk>'``; the reserved tokens follow, then the corpus
    tokens ordered by decreasing frequency.
    """

    def __init__(self,tokens=None,min_freq=0,reserved_token=None):
        """Build the vocabulary.

        Args:
            tokens: flat list of tokens or list of token lists (``None`` = empty).
            min_freq: corpus tokens counted fewer times than this are left out.
            reserved_token: extra tokens placed directly after ``'<unk>'``.
        """
        if tokens is None:
            tokens = []
        if reserved_token is None:
            reserved_token = []
        counter = self.count_corpus(tokens)
        # Most frequent first; the sort is stable, so ties keep first-seen order.
        self.token_freqs = sorted(counter.items(),key=lambda x: x[1],reverse=True)

        # Index of the unknown token, used as the fallback for failed lookups.
        self.unk = 0
        uniq_token = ['<unk>'] + reserved_token
        # Set membership avoids a linear scan of the list for every corpus token.
        already_present = set(uniq_token)
        uniq_token += [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in already_present
        ]
        self.idx_to_token = list(uniq_token)
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }

    def count_corpus(self,tokens):
        """Return a ``collections.Counter`` of token frequencies.

        A list of lists (or an empty list) is flattened first.
        """
        if len(tokens)==0 or isinstance(tokens[0],list):
            tokens = [token for line in tokens
                      for token in line]
        return collections.Counter(tokens)

    def __len__(self):
        """Number of distinct entries, including ``'<unk>'`` and reserved tokens."""
        return len(self.idx_to_token)

    def __getitem__(self,indices):
        """Map a token, or a list/tuple of tokens, to index/indices.

        Tokens that are not in the vocabulary map to ``self.unk`` instead of
        raising ``KeyError``.
        """
        if not isinstance(indices,(list,tuple)):
            return self.token_to_idx.get(indices,self.unk)
        return [self.token_to_idx.get(token,self.unk) for token in indices]

    def to_tokens(self,indices):
        """Map an index, or a list/tuple of indices, back to token(s)."""
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
# Minibatch size and number of time steps per subsequence; both are passed to
# load_data_time_machine further down in this cell.
bath_size ,num_step = 32,35
def seq_data_iter_random(corpus,bathch_size,num_step):
    """Yield minibatches of randomly ordered subsequences.

    Args:
        corpus: sequence of token indices.
        bathch_size: number of subsequences per minibatch.
        num_step: length of every subsequence.

    Yields:
        (X, Y) tensors of shape (bathch_size, num_step); Y holds the tokens of
        X shifted one position to the right in the corpus.
    """
    # Discard a random prefix shorter than one subsequence so that the
    # partition boundaries vary between calls.
    start = random.randint(0,num_step-1)
    corpus = corpus[start:]
    # One token is held back because every input needs a following label.
    num_subseqs = (len(corpus) - 1) // num_step
    window_starts = [k * num_step for k in range(num_subseqs)]
    random.shuffle(window_starts)

    num_batches = num_subseqs // bathch_size
    for b in range(num_batches):
        chosen = window_starts[b * bathch_size:(b + 1) * bathch_size]
        X = [corpus[p:p + num_step] for p in chosen]
        Y = [corpus[p + 1:p + 1 + num_step] for p in chosen]
        yield torch.tensor(X),torch.tensor(Y)

def seq_data_iter_sequential(corpus,bath_size,num_step):
    """Yield minibatches whose rows continue from one batch to the next.

    Args:
        corpus: sequence of token indices.
        bath_size: number of rows (parallel streams) per minibatch.
        num_step: number of time steps per minibatch.

    Yields:
        (x, y) tensors of shape (bath_size, num_step); y holds the tokens of x
        shifted one position to the right in the corpus.
    """
    # Random starting point in [0, num_step].
    offset = random.randint(0,num_step)
    # Largest multiple of bath_size that still leaves one token for the labels.
    usable = len(corpus) - offset - 1
    num_tokens = usable - usable % bath_size

    inputs = torch.tensor(corpus[offset:offset + num_tokens]).reshape(bath_size,-1)
    labels = torch.tensor(corpus[offset + 1:offset + 1 + num_tokens]).reshape(bath_size,-1)

    num_batches = inputs.shape[-1] // num_step
    for b in range(num_batches):
        lo = b * num_step
        hi = lo + num_step
        yield inputs[:,lo:hi],labels[:,lo:hi]
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level index corpus and vocabulary of the text.

    Args:
        max_tokens: when positive, keep only the first ``max_tokens`` indices.

    Returns:
        (corpus, vocab): flat list of token indices and the ``Vocab`` built
        from the character tokens.
    """
    tokens = tokenize(read_time_machine(),'char')
    vocab = Vocab(tokens)
    # Flatten all lines into a single stream of indices.
    corpus = []
    for line in tokens:
        corpus.extend(vocab[token] for token in line)
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus,vocab

class SeqDataloader:
    """Re-iterable wrapper around the sequence minibatch generators."""

    def __init__(self,batch_size,num_step,use_random_split,max_token):
        """Load the corpus and choose the sampling strategy.

        Args:
            batch_size: number of subsequences per minibatch.
            num_step: number of time steps per subsequence.
            use_random_split: random sampling if truthy, else sequential.
            max_token: forwarded to ``load_corpus_time_machine`` as ``max_tokens``.
        """
        self.data_iter_fn = (
            seq_data_iter_random if use_random_split else seq_data_iter_sequential
        )
        corpus_and_vocab = load_corpus_time_machine(max_tokens=max_token)
        self.corpus, self.vocab = corpus_and_vocab
        self.batch_size = batch_size
        self.num_step = num_step

    def __iter__(self):
        """Start a fresh pass over the corpus with the chosen generator."""
        make_batches = self.data_iter_fn
        return make_batches(self.corpus,self.batch_size,self.num_step)
def load_data_time_machine(batch_size,
                           num_step,
                           use_random_split=False,
                           max_token=100000):
    """Create the minibatch iterator for the text together with its vocabulary.

    Args:
        batch_size: number of subsequences per minibatch.
        num_step: number of time steps per subsequence.
        use_random_split: random sampling if truthy, else sequential.
        max_token: cap on the number of corpus tokens that are loaded.

    Returns:
        (data_iter, vocab): a ``SeqDataloader`` and its ``vocab`` attribute.
    """
    loader = SeqDataloader(batch_size,num_step,use_random_split,max_token)
    return loader,loader.vocab

# Build the training iterator and vocabulary; use_random_split is left at its
# default (False), so sequential partitioning is used.
train_iter,vocab = load_data_time_machine(batch_size=bath_size,num_step=num_step)

In [16]:
# One-hot encode token indices 0 and 2 as vectors of length len(vocab).
F.one_hot(torch.tensor([0,2]),len(vocab))

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [17]:
# Toy minibatch of shape (2, 5). NOTE: X is reused by the later cell that
# instantiates RNNModelScratch.
X = torch.arange(10).reshape((2,5))
# Transposing first puts the 5-long axis in front of the 2-long one, so the
# one-hot result has shape (5, 2, 28).
F.one_hot(X.T,28).shape

torch.Size([5, 2, 28])

In [18]:
def get_params(vocab_size,num_hiddens,devices):
    """Create the trainable parameters of a single-layer RNN.

    Args:
        vocab_size: size of the one-hot input and of the output layer.
        num_hiddens: number of hidden units.
        devices: device on which the tensors are allocated.

    Returns:
        list: ``[W_xh, W_hh, b_h, W_hq, b_q]``, all with ``requires_grad=True``.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        # Zero-mean Gaussian with std 0.01. (torch.rand would draw uniformly
        # from [0, 1), making every weight positive.)
        return torch.randn(size=shape,device=devices) * 0.01

    # Hidden-layer parameters.
    W_xh = normal((num_inputs,num_hiddens))
    W_hh = normal((num_hiddens,num_hiddens))
    b_h = torch.zeros(num_hiddens,device=devices)

    # Output-layer parameters.
    W_hq = normal((num_hiddens,num_outputs))
    b_q = torch.zeros(num_outputs,device=devices)

    params = [W_xh,W_hh,b_h,W_hq,b_q]
    for param in params:
        param.requires_grad_(True)

    return params


In [19]:
def init_rnn_stats(bath_size,num_hidden,device):
    """Return the initial RNN state: a 1-tuple with a zero (bath_size, num_hidden) tensor."""
    hidden = torch.zeros((bath_size,num_hidden),device=device)
    return (hidden,)

In [20]:
def rnn(inputs,state,params):
    """Run the RNN over every time step of ``inputs``.

    Args:
        inputs: iterable over time steps; each item is a 2-D tensor that is
            matrix-multiplied with ``W_xh``.
        state: 1-tuple holding the current hidden state.
        params: ``[W_xh, W_hh, b_h, W_hq, b_q]``.

    Returns:
        (outputs, (H,)): the per-step outputs concatenated along dim 0, and the
        final hidden state wrapped in a 1-tuple.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    step_outputs = []

    for x_t in inputs:
        pre_activation = torch.mm(x_t,W_xh) + torch.mm(hidden,W_hh) + b_h
        hidden = torch.tanh(pre_activation)
        step_outputs.append(torch.mm(hidden,W_hq) + b_q)

    return torch.cat(step_outputs,dim=0),(hidden,)

In [21]:
class RNNModelScratch():
    """RNN language model assembled from plain functions and tensors."""

    def __init__(self,vocab_size,num_hidden,device,get_params,init_state,forward_fn):
        """Store the sizes, create the parameters and keep the two callables.

        Args:
            vocab_size: width of the one-hot encoding applied in ``__call__``.
            num_hidden: number of hidden units.
            device: device handed to ``get_params``.
            get_params: callable ``(vocab_size, num_hidden, device) -> params``.
            init_state: callable ``(batch_size, num_hiddens, device) -> state``.
            forward_fn: callable ``(inputs, state, params) -> (outputs, state)``.
        """
        self.vocab_size = vocab_size
        self.num_hiddens = num_hidden
        self.params = get_params(vocab_size,num_hidden,device)
        self.init_state = init_state
        self.forward_dn = forward_fn

    def __call__(self,X,state):
        """One-hot encode the transposed ``X`` as float32 and run the forward function."""
        one_hot = F.one_hot(X.T,self.vocab_size)
        encoded = one_hot.type(torch.float32)
        return self.forward_dn(encoded,state,self.params)

    def begin_state(self,batch_size,device):
        """Return a fresh initial state for ``batch_size`` sequences."""
        return self.init_state(batch_size,self.num_hiddens,device)
    


In [22]:
# Smoke test of the from-scratch model on the toy (2, 5) batch X defined in an
# earlier cell.
# NOTE(review): 'cuda' is hard-coded here, so this cell needs a CUDA device.
num_hidden = 512 
net = RNNModelScratch(len(vocab),num_hidden,'cuda',get_params,init_rnn_stats,rnn)
state = net.begin_state(X.shape[0],'cuda')
Y,new_state = net(X.to('cuda'),state)
# Output shape, number of state tensors, and shape of the hidden state.
Y.shape,len(new_state),new_state[0].shape

(torch.Size([10, 28]), 1, torch.Size([2, 512]))

In [23]:
def predict_ch8(prefix,num_preds,net,vocab,device):
    """Generate ``num_preds`` tokens that continue ``prefix``.

    The prefix is first fed through the network token by token to warm up the
    hidden state; after that each step feeds the previous argmax prediction
    back in.

    Args:
        prefix: non-empty sequence of tokens (e.g. a string for a char model).
        num_preds: number of tokens to generate after the prefix.
        net: model exposing ``begin_state(batch_size, device)`` and
            ``net(X, state) -> (y, state)``.
        vocab: object supporting ``vocab[token]`` and ``vocab.idx_to_token``.
        device: device on which the input tensors are created.

    Returns:
        str: the prefix followed by the generated tokens, joined together.
    """
    state = net.begin_state(batch_size=1,device=device)
    outputs = [vocab[prefix[0]]]

    def last_token():
        # Most recent token index as a (1, 1) input tensor.
        return torch.tensor([outputs[-1]],device=device).reshape((1,1))

    # Inference only: skip building an autograd graph that would otherwise
    # keep growing through the carried hidden state.
    with torch.no_grad():
        for y in prefix[1:]:  # warm-up: true prefix tokens are appended
            _,state = net(last_token(),state)
            outputs.append(vocab[y])
        for _ in range(num_preds):  # generation: greedy argmax
            y,state = net(last_token(),state)
            outputs.append(int(torch.argmax(y,dim=1).reshape(1)))

    return ''.join([vocab.idx_to_token[i] for i in outputs])
# Prediction with the freshly initialised (untrained) net created above.
predict_ch8("time traveller ",10,net,vocab,'cuda')

'time traveller vvvvvvvvvv'

In [24]:
def grad_clipping(net,theta):
    """Rescale gradients in place so that their global L2 norm is at most ``theta``.

    Args:
        net: an ``nn.Module`` (its parameters with ``requires_grad`` are used)
            or any object exposing a ``params`` list of tensors.
        theta: maximum allowed global gradient norm.
    """
    if isinstance(net,nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params

    # Parameters that did not take part in the backward pass have grad None;
    # skip them instead of failing on `None ** 2`.
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        return

    norm = torch.sqrt(sum(torch.sum(g**2) for g in grads))
    if norm > theta:
        scale = theta/norm
        for g in grads:
            # mul_ also works for 0-dim gradients, which cannot be sliced.
            g.mul_(scale)

In [25]:
class Accumulator:  #@save
    """Keep running sums for n variables."""

    def __init__(self, n):
        # One float slot per tracked variable.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional value (converted to float) to its matching slot."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every slot back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running sum stored at position ``idx``."""
        return self.data[idx]

In [26]:
def train_epoch_ch8(net,train_iter,loss,updater,device,use_random_iter):
    """Train ``net`` for one pass over ``train_iter`` and return the perplexity.

    Args:
        net: model called as ``net(X, state) -> (y_hat, state)`` and exposing
            ``begin_state(batch_size, device)``.
        train_iter: iterable of ``(X, Y)`` minibatches.
        loss: callable ``loss(y_hat, y)``; its result is averaged with ``.mean()``.
        updater: a ``torch.optim.Optimizer`` or a callable accepting ``batch_size``.
        device: device the minibatches are moved to.
        use_random_iter: if truthy, the state is re-initialised for every batch.

    Returns:
        float: ``exp`` of the token-weighted average loss.
    """
    hidden = None
    totals = Accumulator(2)  # (sum of loss * tokens, number of tokens)
    net_is_module = isinstance(net,nn.Module)
    uses_optimizer = isinstance(updater,torch.optim.Optimizer)

    for X,Y in train_iter:
        if hidden is None or use_random_iter:
            # First batch, or batches are not contiguous: start from scratch.
            hidden = net.begin_state(batch_size=X.shape[0],device=device)
        elif net_is_module and not isinstance(hidden,tuple):
            # Single-tensor state: cut the graph but keep the values.
            hidden.detach_()
        else:
            for piece in hidden:
                piece.detach_()

        # Flatten the labels in time-major order.
        targets = Y.T.reshape(-1)
        X,targets = X.to(device),targets.to(device)
        y_hat,hidden = net(X,hidden)
        batch_loss = loss(y_hat,targets.long()).mean()

        if uses_optimizer:
            updater.zero_grad()
        batch_loss.backward()
        grad_clipping(net,1)
        if uses_optimizer:
            updater.step()
        else:
            # The loss is already a mean, so no further division by batch size.
            updater(batch_size=1)

        num_tokens = targets.numel()
        totals.add(batch_loss * num_tokens,num_tokens)

    return math.exp(totals[0] / totals[1])

In [27]:
def sgd(params, lr, batch_size):  #@save
    """Minibatch stochastic gradient descent step.

    Every parameter is updated in place by ``lr * grad / batch_size`` and its
    gradient is then reset to zero.
    """
    with torch.no_grad():
        for tensor in params:
            step = lr * tensor.grad / batch_size
            tensor.sub_(step)
            tensor.grad.zero_()

In [28]:
def train_ch8(net,train_iter,vocab,lr,num_epochs,device,use_random_iter=False):
    """Train the model and print sample predictions along the way.

    Args:
        net: an ``nn.Module`` or a from-scratch model exposing ``params``.
        train_iter: iterable of ``(X, Y)`` minibatches, re-iterated every epoch.
        vocab: vocabulary used when generating the sample predictions.
        lr: learning rate.
        num_epochs: number of passes over ``train_iter``.
        device: device used for training and prediction.
        use_random_iter: forwarded to ``train_epoch_ch8``.
    """
    loss = nn.CrossEntropyLoss()

    if isinstance(net,nn.Module):
        updater = torch.optim.SGD(net.parameters(),lr)
    else:
        def updater(batch_size):
            return sgd(net.params,lr,batch_size)

    def predict(prefix):
        # Always generate 50 tokens after the prefix.
        return predict_ch8(prefix,50,net,vocab,device)

    for epoch in range(num_epochs):
        ppl = train_epoch_ch8(net,train_iter,loss,updater,device,use_random_iter)
        # Show a sample every tenth epoch.
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))

    print(f"困惑度{ppl:.1f}")
    for prefix in ('time traveller','traveller'):
        print(predict(prefix))


# Train the from-scratch model for 500 epochs with learning rate 1.
# NOTE(review): 'cuda' is hard-coded here, so this cell needs a CUDA device.
num_epoch ,lr =500,1

train_ch8(net,train_iter,vocab,lr,num_epoch,'cuda')

time travellere the the the the the the the the the the the the 
time traveller and the the the the the the the the the the the t
time traveller and the that the the the the the the the the the 
time traveller a could some in the laig the laight the thing the
time traveller a self hand the time traveller a self hand the ti
time traveller s all of the wild i to the siledily flack and the
time traveller a seemed to me in the sard the lawh the waller a 
time traveller s grew the lang the was along the ered a strange 
time traveller a shigthen i deach deach a stoudded a said the ti
time traveller was stoud and the little people were great distin
time traveller s were strange and in a paranex about the time tr
time traveller have been sound of my eyes the enith on a matthe 
time traveller clunce of the stare had sumper worns said i to my
time traveller and intellecture to mather that the time machine 
time traveller s lookly said the time traveller suinal solight o
time traveller i want som