# 8.3 语言模型和数据集

In [1]:
import random
import torch

# 读取长序列数据 -- 随机采样
随机采样时, 每个样本都是在原始序列上任意截取的子序列; 相邻两个batch中对应位置的子序列在原始序列上不一定相邻 (相互独立)

In [14]:
def seq_data_iter_random(corpus: list, batch_size: int, num_steps: int):
	"""Yield minibatches of subsequences taken from shuffled positions in a corpus.

	The corpus is cut into non-overlapping windows of ``num_steps`` tokens,
	the window order is shuffled, and the windows are grouped into batches.
	Windows that do not fill a complete batch are dropped.

	Args:
		corpus (list): Sequence of token indices.
		batch_size (int): Number of subsequences per minibatch.
		num_steps (int): Number of tokens in each subsequence.

	Yields:
		tuple: ``(X, Y)`` tensors, each of shape ``(batch_size, num_steps)``.
		``Y`` holds the same windows as ``X`` shifted one token to the right.
	"""
	# Drop a random-length prefix so the window boundaries differ between epochs.
	corpus = corpus[random.randint(0, num_steps - 1):]
	# Reserve one token at the end because every label window is shifted by one.
	num_subseqs = (len(corpus) - 1) // num_steps
	# Start index of every window, visited in random order.
	initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
	random.shuffle(initial_indices)

	def data(pos):
		"""Return the window of ``num_steps`` tokens starting at ``pos``."""
		return corpus[pos: pos + num_steps]

	num_batches = num_subseqs // batch_size
	# Step through exactly ``num_batches`` full groups of start indices.
	# The bound must be batch_size * num_batches; adding the two instead
	# yields too few batches or a trailing short/empty one.
	for i in range(0, batch_size * num_batches, batch_size):
		initial_indices_per_batch = initial_indices[i: i + batch_size]
		x = [data(j) for j in initial_indices_per_batch]
		y = [data(j + 1) for j in initial_indices_per_batch]
		yield torch.tensor(x), torch.tensor(y)

In [16]:
# Demo: iterate over a toy corpus 0..34 with 2 samples per batch, 5 tokens each.
my_seq = [*range(35)]
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
	print("---batch---")
	print("X:", X, '\n', "Y:", Y)

---batch---
X: tensor([[10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]]) 
 Y: tensor([[11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20]])
---batch---
X: tensor([[5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4]]) 
 Y: tensor([[ 6,  7,  8,  9, 10],
        [ 1,  2,  3,  4,  5]])
---batch---
X: tensor([[20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) 
 Y: tensor([[21, 22, 23, 24, 25],
        [26, 27, 28, 29, 30]])


# 读取长序列数据 -- 顺序分区
顺序分区时, 相邻两个batch中对应位置的子序列在原始序列上是首尾相连的

In [17]:
def seq_data_iter_sequential(corpus: list, batch_size: int, num_steps: int):
	"""Yield minibatches whose rows continue from one batch to the next.

	After skipping a random offset, the corpus is trimmed to a multiple of
	``batch_size`` tokens and laid out as ``batch_size`` rows. Consecutive
	column windows of width ``num_steps`` are then yielded left to right.

	Args:
		corpus (list): Sequence of token indices.
		batch_size (int): Number of rows (subsequences) per minibatch.
		num_steps (int): Number of tokens in each subsequence.

	Yields:
		tuple: ``(X, Y)`` tensors sliced from the row layout, where ``Y`` is
		``X`` shifted one token to the right in the corpus.
	"""
	# Random starting point in the corpus.
	start = random.randint(0, num_steps)
	# Largest multiple of batch_size that still leaves one token for the labels.
	token_count = ((len(corpus) - start - 1) // batch_size) * batch_size
	stop = start + token_count
	inputs = torch.tensor(corpus[start: stop])
	targets = torch.tensor(corpus[start + 1: stop + 1])
	# One row per batch element; the second dimension is the tokens per row.
	inputs = inputs.reshape(batch_size, -1)
	targets = targets.reshape(batch_size, -1)

	# Only full windows of num_steps columns are emitted.
	window_count = inputs.shape[1] // num_steps
	for k in range(window_count):
		cols = slice(k * num_steps, (k + 1) * num_steps)
		yield inputs[:, cols], targets[:, cols]

In [18]:
# Demo: iterate over a toy corpus 0..34 with 2 rows per batch, 5 tokens each.
my_seq = [*range(35)]
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
	print("---batch---")
	print("X:", X, '\n', "Y:", Y)

---batch---
X: tensor([[ 1,  2,  3,  4,  5],
        [17, 18, 19, 20, 21]]) 
 Y: tensor([[ 2,  3,  4,  5,  6],
        [18, 19, 20, 21, 22]])
---batch---
X: tensor([[ 6,  7,  8,  9, 10],
        [22, 23, 24, 25, 26]]) 
 Y: tensor([[ 7,  8,  9, 10, 11],
        [23, 24, 25, 26, 27]])
---batch---
X: tensor([[11, 12, 13, 14, 15],
        [27, 28, 29, 30, 31]]) 
 Y: tensor([[12, 13, 14, 15, 16],
        [28, 29, 30, 31, 32]])
