In [1]:
%matplotlib inline
import torch
import torchvision
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import random

In [53]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [15]:
# Load the lyrics corpus. The encoding is passed explicitly so decoding does
# not depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('./data/jaychou_lyrics.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
# Flatten the text onto one line: newlines and ideographic (U+3000) spaces
# both become ordinary spaces. Done after the file has been closed.
corpus = corpus.replace('\n', ' ').replace('\u3000', ' ')

In [23]:
# Work with only the first 10,000 characters of the corpus.
corpus = corpus[:10000]
# Vocabulary: one entry per distinct character. Sorting makes the position of
# each character reproducible; iterating a bare set yields an arbitrary order
# that can differ from one kernel session to the next.
idx_to_char = sorted(set(corpus))
len(idx_to_char)

1027

In [24]:
# Inverse of idx_to_char: look up a character to get its integer index.
# (Keys are the characters, values are their positions in idx_to_char.)
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}

### 时序数据的采样

不同的采样方式（随机采样与相邻采样）在训练实现上会略有不同。

In [54]:
# Random sampling: the hidden state has to be re-initialised before every sampled batch.
def data_iter_random(corpus_indices, batch_size, window, device):
    """Yield shuffled minibatches of (input, target) windows.

    The sequence is cut into non-overlapping windows of ``window`` elements,
    which are visited in random order, ``batch_size`` windows at a time.
    Windows left over after the last full batch are dropped.

    Args:
        corpus_indices: sliceable sequence of numeric indices (e.g. a list).
        batch_size: number of windows per minibatch.
        window: number of time steps in each window.
        device: device on which the yielded tensors are created.

    Yields:
        Tuple ``(X, Y)`` of float32 tensors, one row per window. ``Y`` holds
        the elements located one position after the ones in ``X``.
    """
    # One element is held back so the last window still has a shifted target.
    n_windows = (len(corpus_indices) - 1) // window
    n_batches = n_windows // batch_size

    order = list(range(n_windows))
    random.shuffle(order)

    def _take(start):
        # Slice of `window` consecutive elements beginning at `start`.
        return corpus_indices[start: start + window]

    for b in range(n_batches):
        starts = [k * window for k in order[b * batch_size: (b + 1) * batch_size]]
        inputs = [_take(s) for s in starts]
        targets = [_take(s + 1) for s in starts]
        yield (
            torch.tensor(inputs, dtype=torch.float32, device=device),
            torch.tensor(targets, dtype=torch.float32, device=device),
        )

In [56]:
# Try the random sampler on a toy sequence 0..29.
my_seq = list(range(30))
print(my_seq)
# Each printed Y row is the matching X row shifted one position ahead.
for X, Y in data_iter_random(my_seq, batch_size=2, window=6, device=device):
    print('X: ', X, '\nY:', Y, '\n')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
X:  tensor([[12., 13., 14., 15., 16., 17.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]]) 
Y: tensor([[13., 14., 15., 16., 17., 18.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]]) 

X:  tensor([[18., 19., 20., 21., 22., 23.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 



In [59]:
# Consecutive (adjacent) sampling
def data_iter_consecutive(corpus_indices, batch_size, window, device):
    """Yield minibatches whose rows continue from one batch to the next.

    The sequence is truncated to a multiple of ``batch_size`` and laid out as
    ``batch_size`` rows. Successive minibatches are adjacent column ranges of
    that layout, so row ``r`` of a batch picks up where row ``r`` of the
    previous batch stopped.

    Args:
        corpus_indices: sequence of numeric indices (e.g. a list).
        batch_size: number of rows (parallel sub-sequences).
        window: number of time steps per minibatch.
        device: device on which the underlying tensor is created.

    Yields:
        Tuple ``(X, Y)`` of float32 tensors sliced from the same underlying
        tensor; ``Y`` is ``X`` shifted one column to the right.
    """
    row_len = len(corpus_indices) // batch_size
    flat = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    # Drop the tail that does not fit, then give every row `row_len` columns.
    grid = flat[0:row_len * batch_size].view(batch_size, row_len)

    start = 0
    # One column per row is held back so the last target slice stays in range.
    for _ in range((row_len - 1) // window):
        stop = start + window
        yield grid[:, start:stop], grid[:, start + 1:stop + 1]
        start = stop
    

In [60]:
# Try the consecutive sampler on the same toy sequence 0..29.
my_seq = list(range(30))
print(my_seq)
# Row r of each batch continues where row r of the previous batch ended.
for X, Y in data_iter_consecutive(my_seq, batch_size=2, window=6, device=device):
    print('X: ', X, '\nY:', Y, '\n')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 



In [71]:
def one_hot(x, n_class, dtype=torch.float32):
    """One-hot encode a tensor of class indices.

    Args:
        x: tensor of class indices; it is cast to integers and its first
            dimension sets the number of output rows.
        n_class: number of classes, i.e. the width of each encoded row.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``(x.shape[0], n_class)`` on the same device as ``x``,
        with a 1 in column ``x[i]`` of row ``i`` and 0 elsewhere.
    """
    # scatter_ needs the column indices as an integer column vector.
    columns = x.long().view(-1, 1)
    encoded = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    return encoded.scatter_(1, columns, 1)

In [73]:
# Quick check: encode the class indices 1 and 2 over 3 classes.
x = torch.tensor([1.,2.])
one_hot(x, 3)

tensor([[0., 1., 0.],
        [0., 0., 1.]])

In [79]:
# Same indices through PyTorch's built-in encoder. The index tensor is
# reshaped to (N, 1) here, so the result keeps that extra middle axis.
torch.nn.functional.one_hot(x.long().view(-1,1), 3)

tensor([[[0, 1, 0]],

        [[0, 0, 1]]])