## 词向量训练

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm

#### 数据为：NLTK 中提供的 Reuters 语料库

在训练语言模型的过程中需要先定义一些变量，比如句首标记、句尾标记、以及构建批次时用于补齐序列长度的标记

In [2]:
# Constants
BOS_TOKEN = "<bos>" ## 句首标记
EOS_TOKEN = "<eos>" ## 句尾标记
PAD_TOKEN = "<pad>" ## 补齐标记

加载 Reuters 语料库并构建数据集，同时建立词表，这里需要用到前面的vocab

#### 1、词表映射

In [3]:
from collections import defaultdict, Counter

class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items() \
                        if freq >= min_freq and token != "<unk>"]
        
        return cls(uniq_tokens)   ## 返回cls 对象，到时候就可以通过这个cls 来调用 Vocab 类中的其他方法
                                  ## 后面调用 build ，并且将返回值设为 vocab = cls(uniq_tokens) ，之后调用vocab，会默认带有uniq_tokens参数
                                 ## 此 uniq_tokens 参数会传给 __init__ 中的 tokens。
    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, token):
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        return [self[token] for token in tokens] 

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]

#### 下载数据

In [None]:
import nltk
nltk.download('reuters')
nltk.download('punkt')

#### 加载数据

In [4]:
from nltk.corpus import reuters

reuters.sents()

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]

In [5]:
# len(reuters.sents()) ## 54716

In [5]:
def load_reuters():
    text = reuters.sents()[:1000]
    # lowercase (optional)
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]

    return corpus, vocab

In [7]:
corpus, vocab = load_reuters()

#### 创建前馈神经网络语言模型处理类

In [None]:
from torch.utils.data import Dataset

class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # 插入句首句尾符号
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # 模型输入：长为context_size的上文
                context = sentence[i-context_size:i]
                # 模型输出：当前词
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        print('调用了我')
        return self.data[i]

    def collate_fn(self, examples):
        # 从独立样本集合中构建batch输入输出
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)

In [None]:
## 设置权重
# def init_weights(model):
#     for name, param in model.named_parameters():
#         if "embedding" not in name:
#             torch.nn.init.uniform_(
#                 param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE
#             )

#### 创建前馈神经网络语言模型

词向量层中的向量会随着模型的优化进行更新

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F

class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        # 词嵌入层
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 线性变换：词嵌入层->隐含层
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # 线性变换：隐含层->输出层
        self.linear2 = nn.Linear(hidden_dim, vocab_size) ## 最后输出神经元数量为词表大小数量，最后需要预测下一个词可能出现的概率
        # 使用ReLU激活函数
        self.activate = F.relu
#         init_weights(self)

    def forward(self, inputs):
        ## embeddings(inputs).shape 为 torch.Size([1024, 2, 64]) ，embeds shape 为 torch.Size([1024, 128]) 
        ## 所以上面隐藏层输入神经元维度为 context_size * embedding_dim 
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # 根据输出层（logits）计算概率分布并取对数，以便于计算对数似然
        # 这里采用PyTorch库的log_softmax实现
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

#### 将数据转为 dataloader 格式

In [6]:
from torch.utils.data import Dataset, DataLoader

def get_loader(dataset, batch_size, shuffle=True):
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader

In [None]:
from tqdm.auto import tqdm

## 定义超参数
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# 读取文本数据，构建FFNNLM训练数据集（n-grams）
corpus, vocab = load_reuters()                  ## corpus 是将 文字 转 idx 之后得到的值。

## 为了给 dataLoader 提供dataset 参数，第一步需要先将数据传到 NGramDataset 中
dataset = NGramDataset(corpus, vocab, context_size)
## 然后再将 dataset 作为参数传到 get_loader 中，创建dataLoader对象
data_loader = get_loader(dataset, batch_size)

In [None]:
corpus[1]

In [None]:
# 负对数似然损失函数
nll_loss = nn.NLLLoss()
# 构建FFNNLM，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)

In [None]:
# 使用Adam优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
    total_losses.append(total_loss)

In [12]:
def save_pretrained(vocab, embeds, save_path):
    """
    Save pretrained token vectors in a unified format, where the first line
    specifies the `number_of_tokens` and `embedding_dim` followed with all
    token vectors, one token per line.
    """
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")

In [None]:
# 保存词向量（model.embeddings）
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

## ------------------------------------------------------- 调试部分 ------------------------------------------------------------ 

In [None]:
for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
    inputs, targets = [x for x in batch]
    break

In [None]:
embeddings = nn.Embedding(31081, 64)
embeds = embeddings(inputs).view((inputs.shape[0], -1))

In [None]:
embeddings(inputs).shape

In [None]:
embeds.shape

In [None]:
inputs[2]

In [None]:
targets[2]

# RNNNLM

In [7]:
from torch.nn.utils.rnn import pad_sequence


class RnnlmDataset(Dataset):
    def __init__(self, corpus, vocab):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # 模型输入：BOS_TOKEN, w_1, w_2, ..., w_n
            input = [self.bos] + sentence
            # 模型输出：w_1, w_2, ..., w_n, EOS_TOKEN
            target = sentence + [self.eos]
            self.data.append((input, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # 从独立样本集合中构建batch输入输出
        inputs = [torch.tensor(ex[0]) for ex in examples]
        targets = [torch.tensor(ex[1]) for ex in examples]
        # 对batch内的样本进行padding，使其具有相同长度
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad)
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad)
        return (inputs, targets)

In [8]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # 词嵌入层
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 循环神经网络：这里使用LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # 输出层
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        # 计算每一时刻的隐含层表示
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs

In [9]:
embedding_dim = 64
hidden_dim = 128
batch_size = 64
num_epoch = 500

# 读取文本数据，构建FFNNLM训练数据集（n-grams）
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = get_loader(dataset, batch_size)




In [11]:
for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
    inputs, targets = [x for x in batch]
    break

NameError: name 'epoch' is not defined

In [45]:
inputs.shape

torch.Size([64, 71])

In [46]:
targets.view(-1).shape

torch.Size([4544])

In [47]:
inputs[0]

tensor([   2,  121, 1715,  167, 1716,  211,  102,   19, 1717,   47,   19,   16,
         910,   39,   64,  122,  126,   97, 1718,  228,  211,   57, 1719, 1720,
           8, 1489, 1721,   26, 1722,   31,  328,   57, 1723,   59, 1131,   20,
        1698, 1311,  128,  283,   42,   10,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])

In [48]:
' '.join(vocab.convert_ids_to_tokens(inputs[0]))

'<bos> " there are obvious problems at the moment in the trade area , but we do not wish those problems to divert attention from important areas of cooperation that continue to exist on security and political issues ," he said . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [49]:
targets[0]

tensor([ 121, 1715,  167, 1716,  211,  102,   19, 1717,   47,   19,   16,  910,
          39,   64,  122,  126,   97, 1718,  228,  211,   57, 1719, 1720,    8,
        1489, 1721,   26, 1722,   31,  328,   57, 1723,   59, 1131,   20, 1698,
        1311,  128,  283,   42,   10,    3,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])

In [50]:
torch.unsqueeze(inputs[6],dim=0)

tensor([[  2, 121, 122, 989,  57, 990, 119, 127, 991, 128, 892,  42,  10,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1]])

In [51]:
' '.join(vocab.convert_ids_to_tokens(dataset[1][0]))

'<bos> they told reuter correspondents in asian capitals a u . s . move against japan might boost protectionist sentiment in the u . s . and lead to curbs on american imports of their products .'

In [34]:
' '.join(vocab.convert_ids_to_tokens(dataset[1][1]))

'they told reuter correspondents in asian capitals a u . s . move against japan might boost protectionist sentiment in the u . s . and lead to curbs on american imports of their products . <eos>'

In [10]:
# 负对数似然损失函数，忽略pad_token处的损失
## 在使用nllloss时，需要有两个张量，一个是预测向量，一个是label
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
# 构建RNNLM，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
model.to(device)

RNNLM(
  (embeddings): Embedding(4168, 64)
  (rnn): LSTM(64, 128, batch_first=True)
  (output): Linear(in_features=128, out_features=4168, bias=True)
)

In [11]:
# 使用Adam优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
#         print(log_probs.view(-1, log_probs.shape[-1]).shape)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec")


Loss: 131.98



Loss: 114.35



Loss: 103.17



Loss: 101.69



Loss: 100.54



Loss: 99.41



Loss: 98.18



Loss: 96.97



Loss: 95.60



Loss: 94.40



Loss: 93.08



Loss: 91.83



Loss: 90.66



Loss: 89.48



Loss: 88.49



Loss: 87.23



Loss: 86.28



Loss: 85.18



Loss: 84.03



Loss: 83.23



Loss: 82.21



Loss: 81.23



Loss: 80.29



Loss: 79.34



Loss: 78.52



Loss: 77.52



Loss: 76.80



Loss: 75.97



Loss: 75.15



Loss: 74.35



Loss: 73.52



Loss: 72.85



Loss: 72.01



Loss: 71.30



Loss: 70.44



Loss: 69.73



Loss: 69.18



Loss: 68.50



Loss: 67.68



Loss: 67.14



Loss: 66.52



Loss: 65.82



Loss: 65.08



Loss: 64.45



Loss: 63.88



Loss: 63.25



Loss: 62.64



Loss: 61.95



Loss: 61.40



Loss: 60.69



Loss: 60.23



Loss: 59.61



Loss: 59.06



Loss: 58.50



Loss: 57.93



Loss: 57.34



Loss: 56.82



Loss: 56.26



Loss: 55.75



Loss: 55.23



Loss: 54.66



Loss: 54.14



Loss: 53.60



Loss: 53.13



Loss: 52.53



Loss: 52.09



Loss: 51.55



Loss: 51.02



Loss: 50.62



Loss: 50.04



Loss: 49.72



Loss: 49.14



Loss: 48.76



Loss: 48.25



Loss: 47.75



Loss: 47.36



Loss: 46.90



Loss: 46.45



Loss: 45.99



Loss: 45.61



Loss: 45.19



Loss: 44.84



Loss: 44.34



Loss: 44.00



Loss: 43.60



Loss: 43.11



Loss: 42.74



Loss: 42.42



Loss: 42.01



Loss: 41.62



Loss: 41.22



Loss: 40.88



Loss: 40.48



Loss: 40.19



Loss: 39.85



Loss: 39.43



Loss: 39.09



Loss: 38.70



Loss: 38.39



Loss: 38.04



Loss: 37.70



Loss: 37.38



Loss: 37.11



Loss: 36.69



Loss: 36.45



Loss: 36.10



Loss: 35.75



Loss: 35.51



Loss: 35.16



Loss: 34.88



Loss: 34.56



Loss: 34.20



Loss: 33.96



Loss: 33.62



Loss: 33.38



Loss: 33.12



Loss: 32.85



Loss: 32.51



Loss: 32.26



Loss: 32.00



Loss: 31.73



Loss: 31.43



Loss: 31.15



Loss: 30.85



Loss: 30.61



Loss: 30.36



Loss: 30.14



Loss: 29.85



Loss: 29.57



Loss: 29.33



Loss: 29.09



Loss: 28.88



Loss: 28.59



Loss: 28.35



Loss: 28.15



Loss: 27.95



Loss: 27.66



Loss: 27.46



Loss: 27.23



Loss: 26.91



Loss: 26.70



Loss: 26.57



Loss: 26.32



Loss: 26.18



Loss: 26.00



Loss: 25.68



Loss: 25.41



Loss: 25.16



Loss: 24.95



Loss: 24.74



Loss: 24.55



Loss: 24.31



Loss: 24.09



Loss: 23.89



Loss: 23.68



Loss: 23.50



Loss: 23.40



Loss: 23.29



Loss: 23.00



Loss: 22.81



Loss: 22.57



Loss: 22.43



Loss: 22.18



Loss: 21.96



Loss: 21.79



Loss: 21.62



Loss: 21.44



Loss: 21.26



Loss: 21.09



Loss: 20.92



Loss: 20.77



Loss: 20.61



Loss: 20.48



Loss: 20.37



Loss: 20.19



Loss: 19.97



Loss: 19.74



Loss: 19.61



Loss: 19.38



Loss: 19.31



Loss: 19.05



Loss: 18.91



Loss: 18.75



Loss: 18.58



Loss: 18.47



Loss: 18.31



Loss: 18.15



Loss: 18.04



Loss: 17.90



Loss: 17.80



Loss: 17.64



Loss: 17.47



Loss: 17.34



Loss: 17.26



Loss: 17.07



Loss: 16.91



Loss: 16.79



Loss: 16.67



Loss: 16.56



Loss: 16.37



Loss: 16.20



Loss: 16.07



Loss: 15.92



Loss: 15.81



Loss: 15.67



Loss: 15.60



Loss: 15.47



Loss: 15.44



Loss: 15.34



Loss: 15.16



Loss: 15.07



Loss: 14.95



Loss: 14.77



Loss: 14.67



Loss: 14.50



Loss: 14.37



Loss: 14.22



Loss: 14.18



Loss: 14.05



Loss: 13.90



Loss: 13.83



Loss: 13.71



Loss: 13.60



Loss: 13.57



Loss: 13.45



Loss: 13.39



Loss: 13.28



Loss: 13.16



Loss: 13.02



Loss: 12.92



Loss: 12.81



Loss: 12.69



Loss: 12.77



Loss: 13.05



Loss: 13.00



Loss: 12.63



Loss: 12.47



Loss: 12.32



Loss: 12.13



Loss: 12.02



Loss: 11.86



Loss: 11.77



Loss: 11.70



Loss: 11.61



Loss: 11.50



Loss: 11.45



Loss: 11.35



Loss: 11.27



Loss: 11.20



Loss: 11.14



Loss: 11.06



Loss: 11.00



Loss: 11.06



Loss: 10.90



Loss: 10.80



Loss: 10.71



Loss: 10.64



Loss: 10.52



Loss: 10.43



Loss: 10.36



Loss: 10.27



Loss: 10.18



Loss: 10.13



Loss: 10.08



Loss: 10.00



Loss: 10.19



Loss: 10.19



Loss: 10.07



Loss: 9.91



Loss: 10.02



Loss: 9.86



Loss: 9.74



Loss: 9.57



Loss: 9.47



Loss: 9.36



Loss: 9.28



Loss: 9.20



Loss: 9.13



Loss: 9.07



Loss: 8.98



Loss: 8.92



Loss: 8.87



Loss: 8.81



Loss: 8.76



Loss: 8.70



Loss: 8.66



Loss: 8.60



Loss: 8.58



Loss: 8.52



Loss: 8.46



Loss: 8.42



Loss: 8.34



Loss: 8.28



Loss: 8.24



Loss: 8.19



Loss: 8.12



Loss: 8.08



Loss: 8.04



Loss: 7.99



Loss: 7.95



Loss: 7.94



Loss: 7.89



Loss: 7.82



Loss: 7.78



Loss: 7.71



Loss: 7.66



Loss: 7.62



Loss: 7.58



Loss: 7.51



Loss: 7.48



Loss: 7.47



Loss: 7.48



Loss: 7.39



Loss: 7.37



Loss: 7.30



Loss: 7.26



Loss: 7.20



Loss: 7.14



Loss: 7.09



Loss: 7.05



Loss: 7.00



Loss: 6.98



Loss: 6.94



Loss: 6.92



Loss: 6.88



Loss: 6.84



Loss: 6.80



Loss: 6.75



Loss: 6.74



Loss: 6.76



Loss: 6.75



Loss: 6.69



Loss: 6.66



Loss: 6.61



Loss: 6.55



Loss: 6.52



Loss: 6.81



Loss: 6.98



Loss: 6.67



Loss: 6.66



Loss: 6.54



Loss: 6.45



Loss: 6.37



Loss: 6.30



Loss: 6.24



Loss: 6.18



Loss: 6.15



Loss: 6.11



Loss: 6.07



Loss: 6.04



Loss: 6.01



Loss: 5.98



Loss: 5.97



Loss: 5.92



Loss: 5.93



Loss: 5.89



Loss: 5.88



Loss: 5.85



Loss: 5.82



Loss: 5.82



Loss: 5.78



Loss: 5.88



Loss: 5.92



Loss: 5.87



Loss: 5.78



Loss: 5.73



Loss: 5.70



Loss: 5.66



Loss: 5.61



Loss: 5.59



Loss: 5.56



Loss: 5.53



Loss: 5.51



Loss: 5.52



Loss: 5.50



Loss: 5.46



Loss: 5.41



Loss: 5.40



Loss: 5.37



Loss: 5.36



Loss: 5.34



Loss: 5.31



Loss: 5.30



Loss: 5.28



Loss: 5.28



Loss: 5.41



Loss: 5.35



Loss: 5.31



Loss: 5.28



Loss: 5.27



Loss: 5.24



Loss: 5.22



Loss: 5.20



Loss: 5.14



Loss: 5.11



Loss: 5.10



Loss: 5.06



Loss: 5.04



Loss: 5.08



Loss: 5.02



Loss: 5.01



Loss: 5.00



Loss: 4.99



Loss: 4.96



Loss: 4.94



Loss: 4.93



Loss: 4.90



Loss: 4.89



Loss: 4.88



Loss: 4.86



Loss: 4.85



Loss: 4.86



Loss: 4.84



Loss: 4.84



Loss: 4.83



Loss: 4.81



Loss: 4.80



Loss: 4.79



Loss: 4.77



Loss: 4.77



Loss: 4.77



Loss: 4.74



Loss: 4.72



Loss: 4.69



Loss: 4.68



Loss: 4.67



Loss: 4.66



Loss: 4.65



Loss: 4.64



Loss: 4.65



Loss: 4.61



Loss: 4.61



Loss: 4.58



Loss: 4.58



Loss: 4.58



Loss: 4.56



Loss: 4.54



Loss: 4.55



Loss: 4.55



Loss: 4.54



Loss: 4.53



Loss: 4.51



Loss: 4.51



Loss: 4.51



Loss: 4.51



Loss: 4.54



Loss: 4.53



Loss: 4.53



Loss: 4.56



Loss: 4.76



Loss: 5.86



Loss: 6.87



Loss: 6.04



Loss: 5.33



Loss: 4.97



Loss: 4.74



Loss: 4.64



Loss: 4.57



Loss: 4.52



Loss: 4.48



Loss: 4.45



Loss: 4.43



Loss: 4.41



Loss: 4.40



Loss: 4.38



Loss: 4.37



Loss: 4.36



Loss: 4.36



Loss: 4.34



Loss: 4.33



Loss: 4.32



Loss: 4.31



Loss: 4.31



Loss: 4.30



Loss: 4.28



Loss: 4.28



Loss: 4.27



Loss: 4.25



Loss: 4.26



Loss: 4.25



Loss: 4.23



Loss: 4.23



Loss: 4.24



Loss: 4.22



Loss: 4.21



Loss: 4.20



Loss: 4.22



Loss: 4.21



Loss: 4.19



Loss: 4.18



Loss: 4.19



Loss: 4.20



Loss: 4.17



Loss: 4.17



Loss: 4.14



Loss: 4.15



Loss: 4.14



Loss: 4.16



Loss: 4.13



Loss: 4.14


In [15]:
def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # Optional: depending on the specific format of pretrained vector file
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds

In [16]:
vocab,embeds = load_pretrained('./rnnlm.vec')

In [23]:
embeds

tensor([[ 2.1060, -0.9033,  0.7795,  ..., -0.0314,  0.7060, -2.9022],
        [ 0.4234, -0.6294, -0.7711,  ..., -0.4676,  3.0436,  0.0335],
        [ 0.7837,  1.4697, -0.7131,  ...,  0.2140,  0.9701, -0.5033],
        ...,
        [-1.2039, -0.8122,  0.6347,  ..., -0.0931,  0.3163,  0.0917],
        [ 0.5013, -0.4576,  0.6905,  ...,  1.6615, -1.2897,  1.1877],
        [ 1.3486,  0.6406,  0.1452,  ..., -1.5763, -0.8749, -0.0463]])

In [20]:
class RNNLM2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,embeds):
        super(RNNLM2, self).__init__()
        # 词嵌入层
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight.data.copy_(embeds)
        self.embeddings.weight.requires_grad = False
        # 循环神经网络：这里使用LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # 输出层
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        # 计算每一时刻的隐含层表示
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs

In [21]:
# 负对数似然损失函数，忽略pad_token处的损失
## 在使用nllloss时，需要有两个张量，一个是预测向量，一个是label
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
# 构建RNNLM，并加载至device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2 = RNNLM2(len(vocab), embedding_dim, hidden_dim,embeds)
model2.to(device)

RNNLM2(
  (embeddings): Embedding(4168, 64)
  (rnn): LSTM(64, 128, batch_first=True)
  (output): Linear(in_features=128, out_features=4168, bias=True)
)

In [24]:
# 使用Adam优化器
optimizer = optim.Adam(model2.parameters(), lr=0.001)

model2.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model2(inputs)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec")


Loss: 131.81



Loss: 116.11



Loss: 102.09



Loss: 99.52



Loss: 97.15



Loss: 94.83



Loss: 92.68



Loss: 90.60



Loss: 88.73



Loss: 86.97



Loss: 85.41



Loss: 83.89



Loss: 82.52



Loss: 81.14



Loss: 80.03



Loss: 78.86



Loss: 77.73



Loss: 76.70



Loss: 75.70



Loss: 74.82



Loss: 73.87



Loss: 73.00



Loss: 72.15



Loss: 71.29



Loss: 70.55



Loss: 69.68



Loss: 68.98



Loss: 68.20



Loss: 67.41



Loss: 66.73



Loss: 66.11



Loss: 65.39



Loss: 64.66



Loss: 63.90



Loss: 63.36



Loss: 62.59



Loss: 62.01



Loss: 61.44



Loss: 60.85



Loss: 60.14



Loss: 59.64



Loss: 59.03



Loss: 58.42



Loss: 57.83



Loss: 57.33



Loss: 56.69



Loss: 56.15



Loss: 55.62



Loss: 55.14



Loss: 54.58



Loss: 54.03



Loss: 53.55



Loss: 53.03



Loss: 52.45



Loss: 52.00



Loss: 51.55



Loss: 51.03



Loss: 50.53



Loss: 50.14



Loss: 49.74



Loss: 49.15



Loss: 48.82



Loss: 48.35



Loss: 47.83



Loss: 47.40



Loss: 46.97



Loss: 46.50



Loss: 46.15



Loss: 45.71



Loss: 45.29



Loss: 44.94



Loss: 44.45



Loss: 44.10



Loss: 43.73



Loss: 43.36



Loss: 42.94



Loss: 42.54



Loss: 42.13



Loss: 41.83



Loss: 41.45



Loss: 41.12



Loss: 40.74



Loss: 40.47



Loss: 40.10



Loss: 39.70



Loss: 39.45



Loss: 39.05



Loss: 38.70



Loss: 38.44



Loss: 38.16



Loss: 37.81



Loss: 37.46



Loss: 37.14



Loss: 36.95



Loss: 36.47



Loss: 36.22



Loss: 36.01



Loss: 35.72



Loss: 35.40



Loss: 35.09


In [39]:
' '.join(vocab.convert_ids_to_tokens(targets[30]))

'marble financial corp & lt ; mrbl > 1st qtr net oper shr 26 cts vs not given oper net 866 , 000 vs 480 , 000 note : 1987 net excludes 157 , 000 dlr gain from termination of pension plan . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [23]:
a = targets[30][:10]

In [27]:
inputs.shape

torch.Size([40, 63])

In [28]:
inputs = inputs.to(torch.device('cuda'))

In [24]:
with torch.no_grad():
    output = model(torch.unsqueeze(a,dim=0))


In [None]:
with torch.no_grad():
    output2 = model2(torch.unsqueeze(inputs[30],dim=0))

In [25]:
output.shape

torch.Size([1, 10, 4168])

In [26]:
targets[30]

tensor([1561,   39,  114, 1428,  112,    9,   10,   11,   10, 2518,  252,  249,
        3545,   59,  352, 1512, 2869, 1171,  363,   57, 3546, 3547, 2240,   39,
        3548,   19, 3549,   20, 3550, 3494, 1032,   10,    3,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1], device='cuda:0')

In [27]:
output.argmax(dim=2)

tensor([[2069,   39,  519,  830,    9,   10,   11,   10, 3483,  116]],
       device='cuda:0')

In [17]:
# output2.argmax(dim=2)

In [28]:
output.view(-1, output.shape[-1]).shape

torch.Size([10, 4168])

In [29]:
targets.view(-1).shape

torch.Size([2480])

In [30]:
vocab.convert_ids_to_tokens(torch.squeeze(output.argmax(dim=2)))

['form',
 ',',
 'prices',
 'about',
 'u',
 '.',
 's',
 '.',
 'petrochemical',
 'exports']

In [35]:
vocab.convert_ids_to_tokens(torch.squeeze(output2.argmax(dim=2)))

['the',
 'financial',
 'corp',
 '&',
 'lt',
 ';',
 'rbd',
 '>',
 '1st',
 'qtr',
 'net',
 'shr',
 'shr',
 '1',
 'cts',
 'vs',
 'loss',
 'given',
 'net',
 'net',
 '866',
 ',',
 '000',
 'vs',
 'loss',
 ',',
 '000',
 'revs',
 ':',
 '1987',
 'net',
 'excludes',
 '157',
 ',',
 '000',
 'dlrs',
 ',',
 'of',
 'termination',
 'of',
 'pension',
 '.',
 '.',
 '<eos>',
 'mln',
 'to',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.']

In [None]:
inputs[1]

In [60]:
targets[30]

tensor([  19,  448,   57,  206, 3191,  284, 1555,  118,   19, 4157,  589, 4164,
         192,   19, 1749, 3696, 4156,  287,   39,   19,  129,   42,   10,    3,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])

## ------------------------------------------------------- 调试部分 ------------------------------------------------------------ 

In [None]:
data = []
bos = vocab[BOS_TOKEN]
eos = vocab[EOS_TOKEN]
pad = vocab[PAD_TOKEN]
for sentence in tqdm(corpus, desc="Dataset Construction"):
    # 模型输入：BOS_TOKEN, w_1, w_2, ..., w_n
    input = ['bos'] + sentence
    # 模型输出：w_1, w_2, ..., w_n, EOS_TOKEN
    target = sentence + ['eos']
    data.append((input, target))
    break