In [1]:
import collections
import math
import random
import numpy as np
import time
import os
import torch.nn as nn
import torch
import sys
sys.path.append("..")
from torch_utils.utils import *

# Process Penn Tree Bank DataSet

In [2]:
# Sanity check: the PTB training split must be present before any cell tries to read it.
assert "ptb.train.txt" in os.listdir("../data/ptb/")

In [3]:
# Read the PTB training split; each line of the file is one sentence.
with open("../data/ptb/ptb.train.txt",'r') as fin:
    lines=fin.readlines()
# Tokenise on whitespace once the file handle has been released.
raw_dataset=list(map(str.split,lines))
# Peek at the first five tokens of the first sentence.
raw_dataset[0][:5]

['aer', 'banknote', 'berlitz', 'calloway', 'centrust']

# construct index

In [4]:
# Count how often every token occurs across the whole corpus.
counter=collections.Counter(token for sentence in raw_dataset for token in sentence)
# Keep only the tokens that occur at least five times.
counter={token:freq for token,freq in counter.items() if freq>=5}
len(counter)

# Vocabulary lookups in both directions: index -> token and token -> index.
index2word=list(counter)
word2index={token:index for index,token in enumerate(index2word)}
# Re-encode every sentence as vocabulary indices; tokens outside the vocabulary are dropped.
dataset=[[word2index[word] for word in sentence if word in word2index] for sentence in raw_dataset]

# Total number of in-vocabulary token occurrences.
num_tokens=sum(map(len,dataset))
"num_tokens:%d"%num_tokens

'num_tokens:887100'

# subsampling(二次采样)
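通常来说，在一个背景窗口中，一个词（如“chip”）和较低频词（如“microprocessor”）同时出现比和较高频词（如“the”）同时出现对训练词嵌入模型更有益。因此，训练词嵌入模型时可以对词进行二次采样。具体来说，数据集中每个被索引词 $w_i$ 将以一定概率被丢弃，丢弃概率为 $P(w_i)=\max\left(1-\sqrt{t/f(w_i)},\,0\right)$，其中 $f(w_i)$ 是词 $w_i$ 的出现次数与数据集总词数之比，$t$ 是超参数（下面的代码中取 $10^{-4}$）。越高频的词越容易被丢弃。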

In [5]:
def discard(index):
    """Decide at random whether one token occurrence is dropped by subsampling.

    ``index`` is a vocabulary index. The occurrence is dropped with probability
    ``1 - sqrt(1e-4 / f)``, where ``f = counter[token] / num_tokens`` is the
    token's relative frequency. When that value is negative (rare tokens) the
    comparison can never succeed, so the token is always kept.

    Reads the module-level ``counter``, ``index2word`` and ``num_tokens``.
    """
    draw=random.uniform(0,1)
    drop_probability=1-math.sqrt(1e-4/counter[index2word[index]]*num_tokens)
    return draw<drop_probability

# Run every token occurrence through discard(); sentence order and token order are preserved.
subsampled_dataset = [list(filter(lambda token: not discard(token), sentence)) for sentence in dataset]
# Number of token occurrences that survived subsampling.
'# tokens: %d' % sum(map(len, subsampled_dataset))

'# tokens: 375451'

In [6]:
def compare_counts(token):
    """Report how often ``token`` occurs before and after subsampling.

    The token is looked up in the module-level ``word2index`` and its index is
    counted in ``dataset`` (before) and ``subsampled_dataset`` (after).
    Returns the two counts formatted into a single string.
    """
    def occurrences(corpus):
        # Total occurrences of the token's index over all sentences of the corpus.
        return sum(sentence.count(word2index[token]) for sentence in corpus)

    before=occurrences(dataset)
    after=occurrences(subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

# 'the' is a very frequent token, so most of its occurrences are expected to be dropped.
compare_counts('the') 

'# the: before=50770, after=2076'

# 提取中心词和背景词

In [7]:
# A fixed window_size is used here: every center word gets the same window radius.
def get_centers_and_contexts(datasets,window_size):
    """Extract (center word, context words) pairs from a tokenised corpus.

    Args:
        datasets: iterable of sentences, each a list of token indices.
        window_size: number of positions taken on each side of the center word.

    Returns:
        ``(centers, contexts)``: ``centers`` is the flat list of all tokens from
        sentences with at least two tokens, and ``contexts[i]`` lists the tokens
        within ``window_size`` positions of ``centers[i]`` in its sentence,
        excluding the center position itself.
    """
    centers,contexts=[],[]
    for sentence in datasets:
        length=len(sentence)
        # A sentence needs at least two tokens to yield a (center, context) pair.
        if length<2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            lo=max(0,pos-window_size)
            hi=min(length,pos+window_size+1)
            neighbours=list(range(lo,hi))
            # Drop the center position itself.
            neighbours.remove(pos)
            contexts.append([sentence[j] for j in neighbours])
    return centers,contexts

# Toy corpus of two sentences (lengths 7 and 3) to eyeball the window logic.
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
# The helper returns (centers, contexts); zip(*...) pairs each center with its context list.
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2, 3]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [2, 3, 5, 6]
center 5 has contexts [3, 4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [8]:
# Build (center, contexts) pairs for the subsampled corpus with a window of 5 positions on each side.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

# 采用负采样进行近似训练

In [9]:
def get_negatives(all_contexts,sampling_weight,K):
    """Draw K noise words per context word for every training example.

    Args:
        all_contexts: list of context-index lists, one per center word.
        sampling_weight: one non-negative weight per vocabulary index; an index
            is drawn with probability proportional to its weight.
        K: number of noise words to draw for each context word.

    Returns:
        A list parallel to ``all_contexts``; entry ``i`` holds
        ``len(all_contexts[i]) * K`` vocabulary indices, none of which occurs
        in ``all_contexts[i]``.

    Note:
        Candidates are rejected until enough have been accepted, so the loop
        does not terminate if every index with positive weight appears in a
        context list.
    """
    all_negatives,neg_candidates,i=[],[],0
    # Read the weights from the argument, not from a notebook-level global.
    population = list(range(len(sampling_weight)))
    for contexts in all_contexts:
        # Build the membership set once per example instead of once per candidate.
        context_set = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Candidate buffer exhausted: draw a large batch of indices at once
                # so the sampling cost is amortised over many candidates.
                i, neg_candidates = 0, random.choices(population, sampling_weight, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in context_set:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

In [10]:
# Noise distribution: token counts raised to the power 0.75, listed in vocabulary-index order.
sampling_weights = [counter[w]**0.75 for w in index2word]
# Draw 5 noise words for every context word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

# 数据读取

In [15]:
class MyDataSet(torch.utils.data.Dataset):
    """Dataset of (center, contexts, negatives) triples for skip-gram training.

    The three sequences are parallel: item ``i`` combines ``centers[i]``,
    ``contexts[i]`` and ``negatives[i]``. They must all have the same length.
    """

    def __init__(self,centers,contexts,negatives):
        assert len(centers)==len(contexts)==len(negatives)
        self.centers,self.contexts,self.negatives=centers,contexts,negatives

    def __len__(self):
        # One example per center word.
        return len(self.centers)

    def __getitem__(self,index):
        center=self.centers[index]
        context=self.contexts[index]
        negative=self.negatives[index]
        return (center,context,negative)

In [16]:
# Collate function for mini-batch loading
def batchify(data):
    """Collate a list of (center, context, negative) examples into padded tensors.

    Intended as the ``collate_fn`` argument of a DataLoader: ``data`` is a list
    with one entry per example in the batch, each entry being what the
    Dataset's ``__getitem__`` returned.

    Returns a 4-tuple of tensors:
        centers: the center words reshaped to a single column.
        contexts_negatives: context words followed by noise words, right-padded
            with 0 to the longest combined length in the batch.
        masks: 1 for real entries, 0 for padding.
        labels: 1 for context-word positions, 0 everywhere else.
    """
    # Longest combined context + noise length within this batch.
    max_len=max(len(context)+len(negative) for _,context,negative in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center,context,negative in data:
        num_context=len(context)
        num_real=num_context+len(negative)
        num_pad=max_len-num_real
        centers.append(center)
        # Concatenate context and noise words, then pad with 0 up to max_len.
        contexts_negatives.append(context + negative + [0] * num_pad)
        # The mask separates real entries from padding.
        masks.append([1] * num_real + [0] * num_pad)
        # Context words are the positive class; noise words and padding get 0.
        labels.append([1] * num_context + [0] * (max_len - num_context))
    centers_tensor=torch.tensor(centers).view(-1, 1)
    return (centers_tensor, torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

In [17]:
batch_size=32
num_workers=4

# Use a dedicated name for the torch Dataset. Rebinding ``dataset`` here would
# replace the token-index corpus that the subsampling cell and compare_counts
# read, so re-running those cells after this one would break.
skipgram_dataset=MyDataSet(all_centers,all_contexts,all_negatives)

# batchify pads every example of a mini-batch to a common length.
data_iter=torch.utils.data.DataLoader(skipgram_dataset,batch_size,shuffle=True,collate_fn=batchify,num_workers=num_workers)

# Inspect the tensor shapes of a single mini-batch, then stop.
for batch in data_iter:
    for name,data in zip(["centers","contexts_negatives","masks","labels"],batch):
        print(name,data.shape)
    break

centers torch.Size([32, 1])
contexts_negatives torch.Size([32, 60])
masks torch.Size([32, 60])
labels torch.Size([32, 60])


# skip-gram model

In [24]:
# Initialise a small embedding layer (20 rows, 4 dimensions) for a quick shape check.
embed=nn.Embedding(num_embeddings=20,embedding_dim=4)

x=torch.tensor([[1,2,3],[4,5,6]],dtype=torch.long)
# (2, 3) indices -> (2, 3, 4) embeddings
print(embed(x).shape)
# permute(0,2,1) swaps the last two axes -> (2, 4, 3)
embed(x).permute(0,2,1).shape

torch.Size([2, 3, 4])


torch.Size([2, 4, 3])

In [22]:
# Batched matrix multiplication: (2, 1, 4) times (2, 4, 6) gives (2, 1, 6).
X=torch.ones((2,1,4))
Y=torch.ones((2,4,6))
torch.bmm(X,Y).shape

torch.Size([2, 1, 6])

In [25]:
# Skip-gram forward pass
def skip_gram(center,context_and_negatives,embed_v,embed_u):
    """Score every center word against its context/noise candidates.

    ``embed_v`` embeds ``center`` and ``embed_u`` embeds
    ``context_and_negatives``. The result is the batched matrix product of the
    center vectors with the candidate vectors (last two axes swapped), i.e. one
    dot product per (center, candidate) pair.
    """
    return torch.bmm(embed_v(center),embed_u(context_and_negatives).permute(0,2,1))

In [26]:
# Binary cross-entropy loss on logits, with an optional mask
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Per-example binary cross-entropy computed directly from logits."""

    def __init__(self):
        super().__init__()

    def forward(self,inputs,targets,mask=None):
        """Return the mean element-wise loss of every row.

        Args:
            inputs: logits of shape (batch_size, len).
            targets: labels with the same shape as ``inputs``.
            mask: optional element-wise weights with the same shape; entries
                with weight 0 contribute nothing to the sum. When omitted,
                every element is weighted equally.

        Returns:
            Tensor of shape (batch_size,). The mean is taken over the full row
            length, masked entries included, so callers that want the mean
            over unmasked entries only have to rescale the result.
        """
        inputs,targets=inputs.float(),targets.float()
        # ``mask`` defaults to None, so only convert it when one was supplied;
        # weight=None leaves the element-wise losses unweighted.
        weight=None if mask is None else mask.float()
        res=nn.functional.binary_cross_entropy_with_logits(inputs,targets,reduction='none',weight=weight)
        return res.mean(dim=1)
# Shared loss instance; train() reads this module-level name.
loss=SigmoidBinaryCrossEntropyLoss()

In [27]:
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# In label, 1 marks a context word and 0 a noise word; the last entry of the second row is padding.
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])  # mask: 0 marks a padded position
# loss() averages over the full row length, so rescale to get the mean over unmasked entries only.
loss(pred, label, mask) * mask.shape[1] / mask.float().sum(dim=1)

tensor([0.8740, 1.2100])

In [28]:
# Dimensionality of every word vector.
embed_size=100

# Two embedding tables of identical shape: train() passes net[0] to skip_gram
# as the center-word embedding and net[1] as the context/noise-word embedding.
net=nn.Sequential(
    nn.Embedding(num_embeddings=len(index2word),embedding_dim=embed_size),
    nn.Embedding(num_embeddings=len(index2word),embedding_dim=embed_size)
)


# model Train

In [33]:
def train(net,lr,num_epochs,device):
    """Train the skip-gram embeddings with negative sampling.

    Args:
        net: module whose items 0 and 1 are the center-word and
            context/noise-word embedding layers.
        lr: learning rate for the Adam optimiser.
        num_epochs: number of full passes over ``data_iter``.
        device: device the model and every batch are moved to.

    Reads the module-level ``data_iter``, ``loss`` and ``skip_gram``. After
    each epoch it prints the mean batch loss and the elapsed time.
    """
    print("train on:",device)
    net=net.to(device)
    optimizer=torch.optim.Adam(net.parameters(),lr=lr)

    for epoch in range(num_epochs):
        start=time.time()
        l_sum,n=0.0,0
        for batch in data_iter:
            center,context_negative,mask,label=(d.to(device) for d in batch)

            pred=skip_gram(center,context_negative,net[0],net[1])

            # loss() averages over the padded row length; rescale by the number
            # of unmasked entries so padding does not dilute an example's loss.
            per_example=loss(pred.view(label.shape),label,mask)
            valid_counts=mask.float().sum(dim=1)
            l=(per_example*mask.shape[1]/valid_counts).mean()

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            l_sum+=l.cpu().item()
            n+=1
        mean_loss=l_sum/n
        elapsed=time.time()-start
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, mean_loss, elapsed))

In [34]:
# Prefer the GPU when one is available.
device="cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the recorded run took about 180 s per epoch on CPU and was interrupted
# after one epoch; 100 epochs may be impractical without a GPU.
train(net,lr=0.01,num_epochs=100,device=device)

train on: cpu
epoch 1, loss 0.97, time 180.62s


KeyboardInterrupt: 

# 使用训练好的 word embedding

In [41]:
def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` tokens whose embeddings are closest to ``query_token``.

    Closeness is cosine similarity between rows of ``embed.weight``. The
    top ``k + 1`` rows are retrieved and the first one is skipped, since the
    best match is expected to be the query token itself. Uses the
    module-level ``word2index`` and ``index2word``.
    """
    weights = embed.weight.data
    query_vec = weights[word2index[query_token]]
    dot_products = torch.matmul(weights, query_vec)
    # The 1e-9 term is added for numerical stability.
    norms = (torch.sum(weights * weights, dim=1) * torch.sum(query_vec * query_vec) + 1e-9).sqrt()
    cos = dot_products / norms
    _, topk = torch.topk(cos, k=k+1)
    nearest = topk.cpu().numpy()
    # Skip the first hit, which is the query token itself.
    for i in nearest[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (index2word[i])))

# Show the 3 nearest neighbours of 'chip' in the center-word embedding table (net[0]).
get_similar_tokens('chip', 3, net[0])

cosine sim=0.439: compaq
cosine sim=0.419: disk
cosine sim=0.412: wis.
