## CBOW

In [33]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader

In [2]:
# Constants: special marker tokens shared by the vocabulary and dataset code.
# BOS/EOS are wrapped around every sentence by CbowDataset; PAD, BOS and EOS
# are registered as reserved vocabulary entries in load_reuters().
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
# NOTE(review): BOW/EOW are not referenced in the visible cells -- presumably
# begin/end-of-word markers used by other notebooks; confirm before removing.
BOW_TOKEN = "<bow>"
EOW_TOKEN = "<eow>"

In [3]:
from collections import defaultdict, Counter

class Vocab:
    """Bidirectional mapping between tokens and contiguous integer ids.

    Attributes:
        idx_to_token: list where position ``i`` holds the token with id ``i``.
        token_to_idx: dict mapping each token to its id.
        unk: id of the ``"<unk>"`` token, or ``None`` for an empty vocabulary
            (one created without ``tokens``).
    """

    def __init__(self, tokens=None):
        """Create a vocabulary from an iterable of tokens.

        Args:
            tokens: iterable of tokens in the desired id order. ``"<unk>"`` is
                appended if missing. Repeated tokens keep the id of their
                first occurrence. ``None`` creates an empty vocabulary.
        """
        self.idx_to_token = list()
        self.token_to_idx = dict()
        # Always defined, so a lookup on an empty vocabulary fails with a
        # clear KeyError instead of an AttributeError on ``self.unk``.
        self.unk = None

        if tokens is not None:
            # Work on a copy: the caller's sequence is never modified and
            # tuples / generators are accepted as well as lists.
            tokens = list(tokens)
            if "<unk>" not in tokens:
                tokens.append("<unk>")
            for token in tokens:
                # Skip repeats; a duplicate would claim an extra id and make
                # len(vocab) larger than the number of distinct tokens.
                if token in self.token_to_idx:
                    continue
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)
            self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        """Build a vocabulary from tokenised text.

        Args:
            text: iterable of sentences, each an iterable of tokens.
            min_freq: tokens occurring fewer than this many times are dropped
                (they will map to ``"<unk>"`` on lookup).
            reserved_tokens: optional tokens placed right after ``"<unk>"``,
                regardless of their corpus frequency.

        Returns:
            A new ``Vocab`` whose ids are ordered as: ``"<unk>"``, the
            reserved tokens, then corpus tokens in order of first appearance.
        """
        token_freqs = Counter(token for sentence in text for token in sentence)

        uniq_tokens = ["<unk>"]
        # "<unk>" always gets id 0; do not add it a second time if the caller
        # also listed it among the reserved tokens.
        uniq_tokens += [token for token in (reserved_tokens or [])
                        if token != "<unk>"]
        already_listed = set(uniq_tokens)
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token not in already_listed]

        # cls(...) forwards uniq_tokens to __init__ as ``tokens``.
        return cls(uniq_tokens)

    def __len__(self):
        """Return the number of distinct tokens (including ``"<unk>"``)."""
        return len(self.idx_to_token)

    def __getitem__(self, token):
        """Return the id of ``token``, falling back to the ``"<unk>"`` id.

        Raises:
            KeyError: if the token is unknown and the vocabulary is empty
                (so there is no ``"<unk>"`` id to fall back to).
        """
        idx = self.token_to_idx.get(token, self.unk)
        if idx is None:
            raise KeyError(
                f"{token!r} is not in the vocabulary and no '<unk>' entry exists")
        return idx

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its id; ``self[token]`` goes through __getitem__."""
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        """Map each id in ``indices`` back to its token."""
        return [self.idx_to_token[index] for index in indices]

In [4]:
def load_reuters(max_sentences=4000, lowercase=True):
    """Load sentences from the NLTK Reuters corpus and index them.

    Args:
        max_sentences: number of leading sentences to keep; ``None`` keeps
            the whole corpus. Defaults to 4000.
        lowercase: whether to lowercase every token before building the
            vocabulary. Defaults to True.

    Returns:
        ``(corpus, vocab)`` where ``vocab`` is a ``Vocab`` built from the
        selected sentences (with PAD/BOS/EOS reserved) and ``corpus`` is a
        list of sentences, each a list of token ids.
    """
    # Imported lazily so the notebook can be loaded without NLTK installed.
    from nltk.corpus import reuters

    sentences = reuters.sents()
    if max_sentences is not None:
        sentences = sentences[:max_sentences]

    if lowercase:
        text = [[word.lower() for word in sentence] for sentence in sentences]
    else:
        text = [list(sentence) for sentence in sentences]

    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]

    return corpus, vocab

In [5]:
def get_loader(dataset, batch_size, shuffle=True):
    """Wrap ``dataset`` in a DataLoader that batches with the dataset's own
    ``collate_fn`` method.

    Args:
        dataset: dataset object exposing a ``collate_fn`` method.
        batch_size: number of examples per batch.
        shuffle: whether the loader shuffles examples (default True).

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=dataset.collate_fn,
    )
    return DataLoader(dataset, **loader_options)

In [6]:
class CbowDataset(Dataset):
    """CBOW training examples: (context ids, target id) pairs.

    Each sentence is wrapped with the BOS and EOS ids; every position that
    has ``context_size`` neighbours on both sides yields one example.
    """

    def __init__(self, corpus, vocab, context_size=2):
        """Build all examples eagerly.

        Args:
            corpus: list of sentences, each a list of token ids.
            vocab: mapping used to look up the BOS/EOS ids.
            context_size: number of context tokens taken on each side.
        """
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        min_length = context_size * 2 + 1
        for token_ids in tqdm(corpus, desc="Dataset Construction"):
            padded = [self.bos] + token_ids + [self.eos]
            # Only sentences long enough to hold one full window contribute.
            if len(padded) >= min_length:
                self.data.extend(
                    (
                        # Input: context_size ids to the left and to the right.
                        padded[centre - context_size:centre]
                        + padded[centre + 1:centre + context_size + 1],
                        # Output: the id at the centre position.
                        padded[centre],
                    )
                    for centre in range(context_size, len(padded) - context_size)
                )

    def __len__(self):
        """Number of (context, target) examples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the ``i``-th (context, target) pair."""
        return self.data[i]

    def collate_fn(self, examples):
        """Turn a list of (context, target) pairs into two tensors."""
        context_batch = torch.tensor([pair[0] for pair in examples])
        target_batch = torch.tensor([pair[1] for pair in examples])
        return (context_batch, target_batch)

In [7]:
class CbowModel(nn.Module):
    """Continuous bag-of-words model: averaged context embeddings followed by
    a bias-free linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: number of rows in the embedding table and number of
                output classes.
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        # Word embedding layer.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear map: hidden layer -> output layer (no bias term).
        self.output = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        The embeddings looked up for ``inputs`` are averaged over dim 1 (the
        context positions) to form the hidden layer, projected by
        ``self.output`` and normalised with ``log_softmax`` over dim 1.
        """
        context_mean = self.embeddings(inputs).mean(dim=1)
        scores = self.output(context_mean)
        return F.log_softmax(scores, dim=1)

In [8]:
def save_pretrained(vocab, embeds, save_path):
    """
    Save pretrained token vectors in a unified format, where the first line
    specifies the `number_of_tokens` and `embedding_dim` followed with all
    token vectors, one token per line.

    Args:
        vocab: object whose ``idx_to_token`` list gives the token for each
            row index of ``embeds``.
        embeds: 2-D array-like with a ``shape`` attribute (e.g. a torch
            tensor or numpy array); row ``i`` is the vector of token ``i``.
        save_path: destination file path; the file is overwritten.
    """
    # Convert the whole matrix to nested Python lists once. Formatting
    # individual tensor elements would otherwise trigger one device-to-host
    # transfer per value when the embeddings live on the GPU.
    rows = embeds.tolist() if hasattr(embeds, "tolist") else embeds

    # Explicit UTF-8 so tokens with non-ASCII characters are written the same
    # way on every platform, independent of the locale's default encoding.
    with open(save_path, "w", encoding="utf-8") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join("{:.4f}".format(x) for x in rows[idx])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")

In [9]:
# Hyperparameters for the CBOW experiment.
embedding_dim = 64   # size of each word vector (passed to CbowModel)
context_size = 2     # context tokens taken on each side of the target word
# NOTE(review): hidden_dim is not referenced by any visible cell -- confirm it
# is still needed.
hidden_dim = 128
batch_size = 64      # examples per training batch
num_epoch = 10       # number of passes over the dataset

In [10]:
# Load the text data and build the training dataset for the CBOW model.
corpus, vocab = load_reuters()
dataset = CbowDataset(corpus, vocab, context_size=context_size)
# shuffle defaults to True in get_loader, so batch order differs between runs.
data_loader = get_loader(dataset, batch_size)




In [11]:
dataset[1]

([4, 5, 7, 8], 6)

In [12]:
# NLLLoss pairs with the log-probabilities that CbowModel.forward returns.
nll_loss = nn.NLLLoss()
# Build the CBOW model and move it to `device` (GPU if available, else CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CbowModel(len(vocab), embedding_dim)
model.to(device)

CbowModel(
  (embeddings): Embedding(8785, 64)
  (output): Linear(in_features=64, out_features=8785, bias=False)
)

In [13]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE: later cells read the loop variables left behind by this cell
# (`epoch`, `batch`, `inputs`, `targets`), so their names must stay as they are.
model.train()
for epoch in range(num_epoch):
    # Sum of per-batch mean losses over the epoch (not an average per batch).
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        # Move both tensors of the (inputs, targets) batch to the model's device.
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Save the word vectors (model.embeddings)
save_pretrained(vocab, model.embeddings.weight.data, "cbow.vec")


Loss: 13024.38



Loss: 10536.20



Loss: 9604.98



Loss: 8964.89



Loss: 8466.59



Loss: 8050.31



Loss: 7690.84



Loss: 7373.39



Loss: 7089.69



Loss: 6830.82
Pretrained embeddings saved to: cbow.vec


In [18]:
# Iterate over the whole loader without training; after the loop `inputs` and
# `targets` hold the final batch, which the following cells inspect.
# NOTE(review): the progress-bar label still says "Training Epoch" and relies
# on `epoch` left over from the training cell -- no training happens here.
for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
    inputs, targets = [x.to(device) for x in batch]




In [19]:
inputs

tensor([[  33,  202, 2993,   92],
        [6661,   42,   49, 1963],
        [ 147,   26,  154, 1057],
        [  19,  836,   57,  195],
        [ 856, 2620,   72, 1159],
        [  19,  455,   19, 1641],
        [  19,  255,   10,    3],
        [  49, 4399,   26, 4378],
        [  49, 3869, 5235, 4251],
        [ 287,  173, 3299,  511],
        [4182,   39, 4183,   47],
        [2112,   57,  112, 4432],
        [ 188,  189,   19,  458]], device='cuda:0')

In [25]:
vocab.convert_ids_to_tokens(inputs[1])

['belgrade', 'said', 'a', 'field']

In [49]:
# Hand-picked context token ids used to probe the trained model.
# Built directly as an int64 tensor on `device` (the same device the model was
# moved to) instead of a hard-coded 'cuda', so the cell also runs on machines
# without a GPU.
a = torch.tensor([6661, 42, 49, 1963, 7, 9], dtype=torch.long, device=device)
a

tensor([6661,   42,   49, 1963,    7,    9], device='cuda:0')

In [29]:
inputs[1]

tensor([6661,   42,   49, 1963], device='cuda:0')

In [28]:
targets[1]

tensor(47, device='cuda:0')

In [32]:
torch.unsqueeze(targets[1],dim=0)

tensor([47], device='cuda:0')

In [31]:
# convert_ids_to_tokens iterates over its argument, so the 0-dim target id is
# first expanded to a 1-element tensor.
vocab.convert_ids_to_tokens(torch.unsqueeze(targets[1],dim=0))

['in']

In [50]:
# Forward pass only, so gradient tracking is switched off. `a` is 1-D; a
# leading batch dimension is added because the model averages over dim 1.
with torch.no_grad():
    out = model(a.unsqueeze(0))

In [52]:
out.shape

torch.Size([1, 8785])

In [56]:
# Index of the largest log-probability in each row, i.e. the predicted token id.
out.argmax(dim=1)

tensor([39], device='cuda:0')