In [1]:
import torch
import random
import zipfile
from pathlib import Path

In [2]:
with zipfile.ZipFile(Path('deep learning/data/jaychou_lyrics.txt.zip')) as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [3]:
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', '')
corpus_chars = corpus_chars[:10000]

In [4]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1027

In [5]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: 想要有直升机 想要和你飞到宇宙去 想要和
indices: [776, 854, 167, 324, 475, 772, 253, 776, 854, 1, 322, 996, 988, 498, 386, 341, 253, 776, 854, 1]


In [8]:
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    num_examples = (len(corpus_indices) - 1) // num_steps  # 总样本数
    epoch_size = num_examples // batch_size  # 小批量读取次数
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)  # 打乱读取顺序
    
    def _data(pos):
        return corpus_indices[pos:pos + num_steps]  # 按下标返回一个样本
    
    for i in range(epoch_size):
        i *= batch_size  # 已读取的样本数
        batch_indices = example_indices[i: i + batch_size]  # 一个小批量中的所有样本第一个下标
        X = [_data(j * num_steps) for j in batch_indices]  # 按下标生成对应的样本X
        Y = [_data(j * num_steps + 1) for j in batch_indices]  # 按下标生成对应的样本Y
        if device is not None:
            yield torch.FloatTensor(X).to(device), torch.FloatTensor(Y).to(device)
        else:
            yield torch.FloatTensor(X), torch.FloatTensor(Y)

In [11]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, 2, 6):
    print('X:', X, '\nY:', Y, '\n')

X: tensor([[12., 13., 14., 15., 16., 17.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]]) 
Y: tensor([[13., 14., 15., 16., 17., 18.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]]) 

X: tensor([[18., 19., 20., 21., 22., 23.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 



In [12]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is not None:
        corpus_indices = torch.FloatTensor(corpus_indices).to(device)
    else:
        corpus_indices = torch.FloatTensor(corpus_indices)
    data_len = len(corpus_indices)  # 总长度
    batch_len = data_len // batch_size  
    # 将data变成矩阵
    indices = corpus_indices[0: batch_size * batch_len].view(batch_size, batch_len)
    epoch_size = (batch_len - 1) // num_steps  # 减1是为了防止取Y的时候导致下标越界
    for i in range(epoch_size):
        i *= num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [13]:
for X, Y in data_iter_consecutive(my_seq, 2, 6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y:  tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 

