In [14]:
import torch
import random
import zipfile
with zipfile.ZipFile(r'D:\CS\MachineLearning\Dive-into-DL-PyTorch-master\data\jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [15]:
#replace the \n and \r by space
corpus_chars=corpus_chars.replace("\n"," ").replace("\r"," ")
corpus_chars=corpus_chars[0:10000]

In [16]:
#establish the index
idx_to_char=list(set(corpus_chars))
char_to_idx=dict([(char,i) for i,char in enumerate(idx_to_char)])
vocab_size=len(char_to_idx)
vocab_size

1027

In [17]:
#find the correspongding index of each char of the dataset
corpus_indices=[char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [560, 463, 916, 681, 185, 73, 249, 560, 463, 887, 701, 402, 56, 866, 807, 490, 249, 560, 463, 887]


In [23]:
#time series sampling
#1.random sampling
def data_iter_random(corpus_indices,batch_size,num_steps,device=None):
    num_examples=(len(corpus_indices)-1)//num_steps
    epoch_size=num_examples//batch_size
    example_indices=list(range(num_examples))
    random.shuffle(example_indices)
    
    #返回从pos开始的长为num_steps的序列
    def _data(pos):
        return corpus_indices[pos:pos+num_steps]
    
#     if device is None:
#         device=torch.device('cuba' if torch.cuba.is_available() else 'cpu')

    for i in range(epoch_size):
        #read batch_size samples every time
        i=i*batch_size
        batch_indices=example_indices[i:i+batch_size]
        X=[_data(j*num_steps) for j in batch_indices]
        Y=[_data(j*num_steps + 1) for j in batch_indices]
        yield torch.tensor(X,dtype=torch.float32,device=device), torch.tensor(Y,dtype=torch.float32,device=device)

In [24]:
#test
my_seq=list(range(30))
for X,Y in data_iter_random(my_seq,batch_size=2,num_steps=6):
    print("X:",X,"\nY:",Y,"\n")

X: tensor([[18., 19., 20., 21., 22., 23.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]]) 
Y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]]) 

X: tensor([[12., 13., 14., 15., 16., 17.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y: tensor([[13., 14., 15., 16., 17., 18.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 



In [48]:
#2.adjacent sampling
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    
    corpus_indices=torch.tensor(corpus_indices,dtype=torch.float32,device=device)
    
    data_len=len(corpus_indices)
    print("data_len:",data_len)
    batch_len=data_len//batch_size
    print("data_len:",batch_len)
    indices=corpus_indices[0:batch_size*batch_len].view(batch_size,batch_len)
    print("indices:",indices)
    epoch_size=(batch_len-1)//num_steps
    print("epoch_size:",epoch_size)
    for i in range(epoch_size):
        print(i)
        i=i*num_steps
        X=indices[:,i:i+num_steps]
        Y=indices[:,i+1:i+1+num_steps]
        yield X,Y

In [49]:
for X,Y in data_iter_consecutive(my_seq,batch_size=2,num_steps=6):
    print("X:",X,"\nY:",Y,"\n")

data_len: 30
data_len: 15
indices: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
         14.],
        [15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
         29.]])
epoch_size: 2
0
X: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

1
X: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 



In [45]:
my_seq

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]