In [152]:
import torch
from matplotlib.pyplot import xlabel, yscale, xscale
from torch import nn
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader
import random
import collections
import re
import requests
import random 
from torch.nn import functional as F

In [153]:
## Eagerly load the raw text lines into the notebook-level `content` variable.
## (read_time_machine() below re-reads the file on its own and does not use it.)
with open('../data/time_machine.txt','r') as f:
    content=list(f)
## Read the article
def read_time_machine(path='../data/time_machine.txt'):
    """Load a text file and return its lines in cleaned form.

    Every character that is not an ASCII letter is replaced by a single
    space (one space per character, so runs are NOT collapsed), then the
    line is stripped of surrounding whitespace and lower-cased.

    Args:
        path: Location of the text file. Defaults to the Time Machine
            text used throughout this notebook.

    Returns:
        list[str]: One cleaned string per line of the file; blank lines
        become empty strings.
    """
    non_letter=re.compile('[^A-Za-z]')
    with open(path,'r') as f:
        # The trailing newline is itself a non-letter, so it turns into a
        # space and is removed by strip(); no separate replace is needed.
        return [non_letter.sub(' ',line).strip().lower() for line in f]

## Split text lines into tokens; the result is one flat list of tokens
def tokenize(content,token='word'):
    """Tokenize an iterable of text lines into a single flat token list.

    Args:
        content: Iterable of strings (text lines).
        token: 'word' splits every line on whitespace; any other value
            splits every line into individual characters.

    Returns:
        list[str]: Lower-cased tokens of all lines, in order.
    """
    if token=='word':
        # split() without an argument collapses runs of whitespace and
        # yields nothing for blank lines, so no empty-string tokens leak
        # into the vocabulary (split(' ') produced '' for both cases).
        return [word.lower() for line in content for word in line.split()]
    return [char.lower() for line in content for char in line]

## Count token frequencies; accepts a flat (1D) or nested (2D) token list
def count_corpus(token_list):
    """Return a ``collections.Counter`` of token frequencies.

    Args:
        token_list: A flat list of tokens, a list of token lists, or an
            empty list / None.

    Returns:
        collections.Counter: token -> number of occurrences.
    """
    # Flatten nested input first; lists are unhashable and cannot be counted.
    if token_list and isinstance(token_list[0],(list,tuple)):
        token_list=[token for line in token_list for token in line]
    return collections.Counter(token_list)

class Vocal():
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token ``'<unk>'``; it is followed by the
    reserved tokens and then by the corpus tokens in descending frequency.

    Attributes:
        token_list: The token list the vocabulary was built from.
        idx_to_token: list where position i holds the token with index i.
        token_to_idx: dict mapping each token to its index.
    """
    def __init__(self,token_list=None,min_feq=0,reserved_tokens=None):
        """Build the vocabulary.

        Args:
            token_list: Flat or nested list of tokens (None means empty).
            min_feq: Minimum frequency; tokens occurring fewer than
                ``min_feq`` times are left out of the vocabulary.
            reserved_tokens: Extra tokens that receive indices right after
                ``'<unk>'``. The list passed in is never modified.
        """
        if token_list is None:
            token_list=[]
        if reserved_tokens is None:
            reserved_tokens=[]
        self.token_list=token_list
        counter_info=count_corpus(token_list)
        ## Keep only the tokens that are frequent enough
        self._token_feq=[]
        for items in sorted(counter_info.items(),key=lambda x:x[1],reverse=True):
            # Sorted by descending count, so everything after the first
            # too-rare token is too rare as well.
            if items[1]<min_feq:
                break
            self._token_feq.append(items)

        # '<unk>' must sit at index 0 because `unk` and __getitem__ use 0 as
        # the fallback index. dict.fromkeys de-duplicates while keeping
        # order and builds a new list, so the caller's list is not aliased.
        self.idx_to_token=list(dict.fromkeys(['<unk>',*reserved_tokens]))
        self.token_to_idx={token:i for i,token in enumerate(self.idx_to_token)}
        for token,_ in self._token_feq:
            # A corpus token may coincide with a reserved token; keep the
            # index it already has instead of adding a second entry.
            if token not in self.token_to_idx:
                self.token_to_idx[token]=len(self.idx_to_token)
                self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of distinct tokens, including '<unk>'."""
        return len(self.token_to_idx)

    ## Index by token (or list/tuple of tokens); unknown tokens map to 0
    def __getitem__(self,tokens):
        """Return the index of a token, or a list of indices for a list/tuple."""
        if not isinstance(tokens,(tuple,list)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(i) for i in tokens]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def tokens_freq(self):
        """List of (token, count) pairs kept in the vocabulary, most frequent first."""
        return self._token_feq

def load_corpus_time_machine(max_tokens=-1):  #@save
    """Return the token-index list and the vocabulary of the Time Machine dataset.

    The text is tokenized at character level. A positive ``max_tokens``
    truncates the index list to that many entries; any other value keeps
    the whole corpus.
    """
    char_tokens=tokenize(read_time_machine(),'char')
    vocal=Vocal(char_tokens)

    # tokenize() already returns one flat list of single characters, so the
    # whole list can be looked up in one call (Vocal indexes lists element-wise).
    indices=vocal[char_tokens]
    if max_tokens>0:
        indices=indices[:max_tokens]
    return indices,vocal

## Generate minibatches of randomly sampled subsequences
def seq_data_iter_random(corpus,batch_size,num_steps):
    """Yield (X, Y) minibatches of subsequences using random sampling.

    X holds ``batch_size`` subsequences of length ``num_steps``; Y holds the
    same subsequences shifted one position to the right.
    """
    ## randint includes both ends; offsets 0 and num_steps would give the
    ## same partition, hence the upper bound num_steps-1
    offset=random.randint(0,num_steps-1)
    corpus=corpus[offset:]
    ## Y needs one extra element, so one position is held back
    num_subseqs=(len(corpus)-1)//num_steps

    ## Start index of every subsequence; the range ends at
    ## num_steps*num_subseqs rather than at len(corpus)
    starts=list(range(0,num_steps*num_subseqs,num_steps))
    ## Shuffle the start indices
    random.shuffle(starts)

    ## Number of complete batches
    num_batch=num_subseqs//batch_size
    ## Each batch takes the next batch_size shuffled start indices
    for lo in range(0,num_batch*batch_size,batch_size):
        batch_starts=starts[lo:lo+batch_size]
        x=[corpus[s:s+num_steps] for s in batch_starts]
        y=[corpus[s+1:s+1+num_steps] for s in batch_starts]
        yield torch.tensor(x),torch.tensor(y)

# def seq_data_iter_sequential(corpus,batch_size,num_steps):
#     random.seed(42)
#     corpus=corpus[random.randint(0,num_steps):]
#     batch_num=(len(corpus)-1)//(batch_size*num_steps)
#     Xs=torch.tensor(corpus[:batch_num*batch_size*num_steps]).reshape(-1,batch_size,num_steps)
#     Ys=torch.tensor(corpus[1:batch_num*batch_size*num_steps+1]).reshape(-1,batch_size,num_steps)
#     for i in range(batch_num):
#         yield Xs[i],Ys[i]
# s1=seq_data_iter_sequential(corpus,batch_size,num_steps)
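# The approach above is wrong: "sequential" requires continuity ACROSS consecutive batches (sample i of one batch continues in sample i of the next batch), not continuity between the samples inside a single batch.
# Computing batch_num=(len(corpus)-1)//(batch_size*num_steps) is also undesirable, because it couples batch_size and num_steps in the batch count.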


def seq_data_iter_sequential(corpus,batch_size,num_steps,seed=0):
    """Yield (X, Y) minibatches of subsequences using sequential partitioning.

    The corpus is cut into ``batch_size`` long rows; every minibatch takes
    the next ``num_steps`` columns of all rows, so row i of one minibatch is
    continued by row i of the following minibatch. Y is X shifted one
    position to the right.

    Args:
        corpus: Sequence of token indices.
        batch_size: Number of rows (samples) per minibatch.
        num_steps: Number of time steps per sample.
        seed: Seed for the start offset. The default 0 gives the same
            offset as seeding the global generator with 0 would. Pass None
            to draw the offset from the global ``random`` generator instead.

    Yields:
        tuple: (X, Y) tensors, each of shape (batch_size, num_steps).
    """
    # A private generator keeps the offset reproducible without resetting
    # the global `random` state, which other code (e.g. the shuffle in the
    # random sampler) relies on to stay random.
    rng=random if seed is None else random.Random(seed)
    corpus=corpus[rng.randint(0,num_steps):]
    num_tokens=(len(corpus)-1)//batch_size*batch_size ## keep it a multiple of batch_size
    if num_tokens<=0:
        # Too few tokens for a single row; reshape(batch_size,-1) would fail.
        return
    ### batch_size is the outermost dimension, as usual for sample axes
    Xs=torch.tensor(corpus[:num_tokens]).reshape(batch_size,-1)
    Ys=torch.tensor(corpus[1:num_tokens+1]).reshape(batch_size,-1)
    ## Dimension 1 is the number of tokens per row
    batch_num=Xs.shape[1]//num_steps
    for i in range(batch_num):
        lo=i*num_steps
        yield Xs[:,lo:lo+num_steps],Ys[:,lo:lo+num_steps]


In [154]:
## Build the character-level corpus and its vocabulary, then pick a small
## batch shape for experimenting with the iterators.
corpus,vocal=load_corpus_time_machine()
batch_size=5
num_steps=10

In [164]:
class SeqDataLoader:  #@save
    """Iterable that produces minibatches of the Time Machine corpus.

    Holds the corpus, its vocabulary and the sampling function; every
    iteration over the object starts a fresh pass of that sampling function.
    """
    def __init__(self,batch_size,num_steps,use_random_iter,max_token):
        """Select the sampler and load at most ``max_token`` corpus tokens.

        A truthy ``use_random_iter`` selects random sampling, otherwise
        sequential partitioning is used.
        """
        self.data_iter_fn=(
            seq_data_iter_random if use_random_iter else seq_data_iter_sequential
        )
        loaded=load_corpus_time_machine(max_token)
        self.corpus=loaded[0]
        self.vocal=loaded[1]
        self.batch_size=batch_size
        self.num_steps=num_steps

    ## __iter__ makes instances of the class iterable
    def __iter__(self):
        """Return a new minibatch iterator over the stored corpus."""
        sampler=self.data_iter_fn
        return sampler(self.corpus,self.batch_size,self.num_steps)

In [165]:
def load_data_time_machine(batch_size,num_steps,use_random_iter=True,max_token=-1):
    """Return a minibatch iterator over the Time Machine corpus and its vocabulary.

    Returns:
        tuple: (SeqDataLoader instance, the vocabulary stored on it).
    """
    loader=SeqDataLoader(
        batch_size=batch_size,
        num_steps=num_steps,
        use_random_iter=use_random_iter,
        max_token=max_token,
    )
    return loader,loader.vocal

In [166]:
## Smoke-test the loader; `s` is the (data iterator, vocabulary) pair.
s=load_data_time_machine(batch_size=10,num_steps=5)