## dataset

In [None]:
# 测试下处理数据集的方法
import os
import sys

# 处理import data报module not found的错误
# os.getcwd()
# os.chdir('D:/project/nlp-project/word_language_model')  # 将工作目录更改为项目文件夹的路径
# 也可以使用 sys.path.append("../") 父目录下的module可以python解释器找到

sys.path.append("../")

import data
print(data.__file__)

corpus = data.Corpus('./data/wikitext')
dic = corpus.dictionary

#文本中包含的词汇和其对应的编号 
print(dic.idx2word)
print(len(dic.idx2word)) 
print(dic.word2idx)

# 每段文本以0和开始和结束
print(corpus.train[0:100])

# 打印每个数据集的形状
print("train:{}, test:{}, valid:{}".format(corpus.train.shape, corpus.test.shape, corpus.valid.shape))

: 

### torch vs tensorflow处理tensor张量

In [None]:
# torch

import torch

data = [ 0,  1,  2,  3,  4,  1,  0,  0,  5,  6,  2,  7,  8,  9,  3, 10, 11,  8,
        12, 13, 14, 15,  2, 16, 17, 18,  7, 19, 13, 20, 21, 22, 23,  2,  3,  4,
        24, 25, 13, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 17,
        40, 41, 15, 42, 43, 44, 45, 43, 25, 13, 46, 26, 17, 47, 33, 43, 17,  2,
        48, 15,  9, 17, 49, 50, 16, 28, 37, 51, 30, 52, 53, 23, 54, 55, 13, 17,
        56, 57, 58, 22, 17, 59, 33, 37, 60, 17, 1000]

print(len(data))

# list -> tensor
data = torch.tensor(data)
print(data.size(0))

# 设置batch size, 并计算每个batch中的样本数量
bsz = 20
nbatch = data.size(0) // bsz
print(nbatch)

# narrow(切片操作的维度, 切片操作的起始位置, 长度)
data = data.narrow(0, 0, nbatch * bsz) # 101 - 100 = 1 排除了最后的1000
print(data)

# view: 对数据集安装bsz进行切分 
data = data.view(bsz, -1).t().contiguous()
print(data)

# 对a-x进行batchify操作, batch size = 4, 则共分为6个batch, h0为a的输入参数, 则h1就为b的输入,正好适配RNN处理时间序列的逻辑 
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.


# tensorflow

: 

### 实验数据

In [7]:
def batchify(data, bsz = 20):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

# bptt表示seq length
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [9]:
# 再打印下形状
print("train:{}, test:{}, valid:{}".format(corpus.train.shape, corpus.test.shape, corpus.valid.shape))

data = batchify(corpus.train)
print(data.shape)

train_data = batchify(corpus.train, 20)

print(bptt)

# 训练前最后一次处理数据
for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):  # 由于我们已经把train_data整理成batch构成的数组了，这里直接进行迭代即可。
    data, targets = get_batch(train_data, i)
    print(data.shape)
    print(targets.shape)

train:torch.Size([2088628]), test:torch.Size([245569]), valid:torch.Size([217646])
torch.Size([104431, 20])
35
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size([700])
torch.Size([35, 20])
torch.Size

## Embedding && Module

### torch.nn.Embedding

主要是将离线向量映射到低维的稠密向量空间，作用类似于word2vec，本质上就是一个lookup table

In [3]:
# Embedding UseCase
import torch
import torch.nn as nn

# num_embeddings为词汇表长度, embedding_dim表示希望映射成的词向量的维度
embedding = nn.Embedding(num_embeddings=10,embedding_dim=3)

# 输入数据, 相当于一个batch里包含两个数据, 每组数据seq_length为4, 每组数据有4个indices, indices为对文本编码的字典的索引
input = torch.LongTensor([[1,2,4,5], [4,3,2,9]])

# 每一个token被映射3维的向量, 所以每个样本被映射为4 * 3的向量
embedding(input)

tensor([[[-0.1031, -0.0581, -1.1876],
         [ 0.9900,  1.0118,  1.2321],
         [-0.0879,  0.1124, -0.2345],
         [-1.1876,  0.4612, -0.8028]],

        [[-0.0879,  0.1124, -0.2345],
         [-0.9365, -0.0478,  0.6572],
         [ 0.9900,  1.0118,  1.2321],
         [ 0.6726, -0.2314, -0.4354]]], grad_fn=<EmbeddingBackward0>)

### torch.nn.Module

Base class for all neural network modules.
需要重写forward方法, 定义网络正向传播的方式

In [7]:
# Module UseCase from chatgpt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 定义一个虚构的数据集类
class CustomDataset(Dataset):
    def __init__(self, data_size, input_size, output_size):
        self.data = torch.randn(data_size, input_size)
        self.labels = torch.randint(0, output_size, (data_size,))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# 浅层神经网络
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    # 必须
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 创建一个CustomDataset实例
data_size = 1000
input_size = 10
output_size = 5
dataset = CustomDataset(data_size, input_size, output_size)

# 使用DataLoader加载数据
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 创建一个SimpleNN实例
model = SimpleNN(input_size, hidden_size=20, output_size=output_size)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 训练模型
num_epochs = 5
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        # 前向传播
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# 模型训练完成，可以在实际应用中使用该模型进行预测


Epoch 1/5, Loss: 1.6980171203613281
Epoch 2/5, Loss: 1.4891269207000732
Epoch 3/5, Loss: 1.603451132774353
Epoch 4/5, Loss: 1.5915875434875488
Epoch 5/5, Loss: 1.6517460346221924
