## 词向量训练

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm

#### 数据为：NLTK 中提供的 Reuters 语料库

在训练语言模型的过程中需要先定义一些变量，比如句首标记、句尾标记、以及构建批次时用于补齐序列长度的标记

In [2]:
# Constants
BOS_TOKEN = "<bos>" ## 句首标记
EOS_TOKEN = "<eos>" ## 句尾标记
PAD_TOKEN = "<pad>" ## 补齐标记

加载 Reuters 语料库并构建数据集，同时建立词表，这里需要用到前面的vocab

#### 1、词表映射

In [3]:
from collections import defaultdict, Counter

class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items() \
                        if freq >= min_freq and token != "<unk>"]
        
        return cls(uniq_tokens)   ## 返回cls 对象，到时候就可以通过这个cls 来调用 Vocab 类中的其他方法
                                  ## 后面调用 build ，并且将返回值设为 vocab = cls(uniq_tokens) ，之后调用vocab，会默认带有uniq_tokens参数
                                 ## 此 uniq_tokens 参数会传给 __init__ 中的 tokens。
    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, token):
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        return [self[token] for token in tokens] 

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]

#### 下载数据

In [None]:
import nltk
nltk.download('reuters')
nltk.download('punkt')

#### 加载数据

In [4]:
from nltk.corpus import reuters

reuters.sents()

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]

In [5]:
# len(reuters.sents()) ## 54716

In [6]:
def load_reuters():
    text = reuters.sents()[:1000]
    # lowercase (optional)
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]

    return corpus, vocab

In [None]:
corpus, vocab = load_reuters()

#### 创建前馈神经网络语言模型处理类

In [None]:
from torch.utils.data import Dataset

class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # 插入句首句尾符号
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # 模型输入：长为context_size的上文
                context = sentence[i-context_size:i]
                # 模型输出：当前词
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        print('调用了我')
        return self.data[i]

    def collate_fn(self, examples):
        # 从独立样本集合中构建batch输入输出
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)

In [None]:
## 设置权重
# def init_weights(model):
#     for name, param in model.named_parameters():
#         if "embedding" not in name:
#             torch.nn.init.uniform_(
#                 param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE
#             )

#### 创建前馈神经网络语言模型

词向量层中的向量会随着模型的优化进行更新

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F

class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        # 词嵌入层
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 线性变换：词嵌入层->隐含层
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # 线性变换：隐含层->输出层
        self.linear2 = nn.Linear(hidden_dim, vocab_size) ## 最后输出神经元数量为词表大小数量，最后需要预测下一个词可能出现的概率
        # 使用ReLU激活函数
        self.activate = F.relu
#         init_weights(self)

    def forward(self, inputs):
        ## embeddings(inputs).shape 为 torch.Size([1024, 2, 64]) ，embeds shape 为 torch.Size([1024, 128]) 
        ## 所以上面隐藏层输入神经元维度为 context_size * embedding_dim 
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # 根据输出层（logits）计算概率分布并取对数，以便于计算对数似然
        # 这里采用PyTorch库的log_softmax实现
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

#### 将数据转为 dataloader 格式

In [7]:
from torch.utils.data import Dataset, DataLoader

def get_loader(dataset, batch_size, shuffle=True):
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader

In [None]:
from tqdm.auto import tqdm

## 定义超参数
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# 读取文本数据，构建FFNNLM训练数据集（n-grams）
corpus, vocab = load_reuters()                  ## corpus 是将 文字 转 idx 之后得到的值。

## 为了给 dataLoader 提供dataset 参数，第一步需要先将数据传到 NGramDataset 中
dataset = NGramDataset(corpus, vocab, context_size)
## 然后再将 dataset 作为参数传到 get_loader 中，创建dataLoader对象
data_loader = get_loader(dataset, batch_size)

In [None]:
corpus[1]

In [None]:
# 负对数似然损失函数
nll_loss = nn.NLLLoss()
# 构建FFNNLM，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)

In [None]:
# 使用Adam优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
    total_losses.append(total_loss)

In [None]:
def save_pretrained(vocab, embeds, save_path):
    """
    Save pretrained token vectors in a unified format, where the first line
    specifies the `number_of_tokens` and `embedding_dim` followed with all
    token vectors, one token per line.
    """
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")

In [None]:
# 保存词向量（model.embeddings）
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

## ------------------------------------------------------- 调试部分 ------------------------------------------------------------ 

In [None]:
for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
    inputs, targets = [x for x in batch]
    break

In [None]:
embeddings = nn.Embedding(31081, 64)
embeds = embeddings(inputs).view((inputs.shape[0], -1))

In [None]:
embeddings(inputs).shape

In [None]:
embeds.shape

In [None]:
inputs[2]

In [None]:
targets[2]

# RNNNLM

In [8]:
from torch.nn.utils.rnn import pad_sequence


class RnnlmDataset(Dataset):
    def __init__(self, corpus, vocab):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # 模型输入：BOS_TOKEN, w_1, w_2, ..., w_n
            input = [self.bos] + sentence
            # 模型输出：w_1, w_2, ..., w_n, EOS_TOKEN
            target = sentence + [self.eos]
            self.data.append((input, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # 从独立样本集合中构建batch输入输出
        inputs = [torch.tensor(ex[0]) for ex in examples]
        targets = [torch.tensor(ex[1]) for ex in examples]
        # 对batch内的样本进行padding，使其具有相同长度
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad)
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad)
        return (inputs, targets)

In [9]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # 词嵌入层
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 循环神经网络：这里使用LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # 输出层
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        # 计算每一时刻的隐含层表示
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs

In [10]:
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 64
num_epoch = 100

# 读取文本数据，构建FFNNLM训练数据集（n-grams）
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = get_loader(dataset, batch_size)




In [13]:
for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
    inputs, targets = [x for x in batch]
    break




In [14]:
inputs.shape

torch.Size([64, 170])

In [16]:
targets.view(-1).shape

torch.Size([10880])

In [None]:
inputs[0]

In [None]:
' '.join(vocab.convert_ids_to_tokens(inputs[0]))

In [None]:
targets[0]

In [None]:
torch.unsqueeze(inputs[6],dim=0)

In [None]:
' '.join(vocab.convert_ids_to_tokens(dataset[1][0]))

In [None]:
' '.join(vocab.convert_ids_to_tokens(dataset[1][1]))

In [11]:
# 负对数似然损失函数，忽略pad_token处的损失
## 在使用nllloss时，需要有两个张量，一个是预测向量，一个是label
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
# 构建RNNLM，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
model.to(device)

RNNLM(
  (embeddings): Embedding(4168, 64)
  (rnn): LSTM(64, 128, batch_first=True)
  (output): Linear(in_features=128, out_features=4168, bias=True)
)

In [17]:
# 使用Adam优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
#         print(log_probs.view(-1, log_probs.shape[-1]).shape)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]).argmax(dim=1), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec")




ValueError: Expected 2 or more dimensions (got 1)

In [21]:
nll_loss(output.view(-1, log_probs.shape[-1]), targets.view(-1))

tensor(2.2695, device='cuda:0')

In [None]:
inputs.shape

In [None]:
with torch.no_grad():
    output = model(torch.unsqueeze(inputs[0],dim=0))

In [None]:
inputs = inputs.to(torch.device('cuda'))

In [19]:
with torch.no_grad():
    output = model(inputs)

In [None]:
output.shape

In [None]:
output[0].argmax(dim=1)

In [None]:
output.view(-1, output.shape[-1]).shape

In [None]:
targets.view(-1).shape

In [None]:
vocab.convert_ids_to_tokens(torch.squeeze(output[0].argmax(dim=1)))

In [None]:
inputs[1]

In [None]:
targets[1]

## ------------------------------------------------------- 调试部分 ------------------------------------------------------------ 

In [None]:
data = []
bos = vocab[BOS_TOKEN]
eos = vocab[EOS_TOKEN]
pad = vocab[PAD_TOKEN]
for sentence in tqdm(corpus, desc="Dataset Construction"):
    # 模型输入：BOS_TOKEN, w_1, w_2, ..., w_n
    input = ['bos'] + sentence
    # 模型输出：w_1, w_2, ..., w_n, EOS_TOKEN
    target = sentence + ['eos']
    data.append((input, target))
    break