In [1]:
import torch
import numpy as np

# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [2]:
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud


from collections import Counter
import random
import math
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# True when a CUDA device is available; later cells read this flag to decide
# whether to seed the CUDA generator and whether to move the model to the GPU.
USE_CUDA = torch.cuda.is_available()

In [3]:
# Pin every random number generator used by this notebook so that repeated
# runs are as similar as possible.
# (Author's note: preferred seed values are 769, 131 and 1331.)
SEED = 769

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if USE_CUDA:
    # Seed the generator of every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(SEED)

In [4]:
# Hyperparameters.
# Context window radius: the `c` words on each side of the center word are
# treated as its positive (context) words.
c = 3
k = 100 # negative samples drawn per positive (context) word
# Number of passes over the corpus.
NUM_EPOCHS = 2
# Vocabulary cap; one of these slots is taken by the "<unk>" entry.
MAX_VOCAB_SIZE = 30000
# Number of (center, positives, negatives) items per mini-batch.
BATCH_SIZE = 128
# Learning rate passed to the optimizer.
LEARNING_RATE = 0.2
# Dimensionality of each word vector.
EMBEDDING_SIZE = 100
# File that training progress lines are appended to.
LOG_FILE = "word_embedding.log"

In [5]:
# text8 already has punctuation and extra spacing cleaned up, so plain
# whitespace splitting is all the tokenisation the corpus needs.
def word_tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [6]:
# Build the vocabulary: only the most frequent words get their own entry;
# every other word is folded into a single "<unk>" entry.
# An explicit encoding keeps decoding independent of the platform default.
with open("text8.train.txt", "r", encoding="utf-8") as fin:
    # `text` is the whole corpus as a list of tokens; later cells consume it.
    text = word_tokenize(fin.read())

# The MAX_VOCAB_SIZE - 1 most frequent words, mapped to their counts.
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# "<unk>" is inserted last; its count is the number of tokens not covered by
# the words kept above.
vocab["<unk>"] = len(text) - sum(vocab.values())

In [7]:
# Build the two-way mapping between words and integer ids.

# idx_to_word[i] is the word with id i; ids follow vocab's insertion order.
idx_to_word = list(vocab)
# word_to_idx is the inverse lookup: word -> id.
word_to_idx = dict(zip(idx_to_word, range(len(idx_to_word))))

In [8]:
# Raw occurrence counts, in the same order as the entries of vocab.
word_counts = np.array(list(vocab.values()), dtype=np.float32)
# Sampling distribution for negatives: the unigram frequencies raised to the
# 3/4 power (as described in the paper), then renormalised so they sum to 1.
word_freqs = np.power(word_counts / word_counts.sum(), 3.0 / 4.0)
word_freqs /= word_freqs.sum()

In [9]:
# Vocabulary size: the number of entries in idx_to_word.
VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

30000

In [10]:
# Implement the dataset that feeds the DataLoader.
# __len__ must return the total number of items in the dataset.
# __getitem__ returns the item at the given index.

class WordEmbeddingDataset(tud.Dataset):
    """Skip-gram training items with negative sampling.

    Item ``idx`` is the triple ``(center_word, pos_words, neg_words)``:

    * ``center_word`` -- id of the token at position ``idx`` (0-dim long tensor).
    * ``pos_words``   -- ids of the ``window`` tokens before and the ``window``
      tokens after position ``idx`` (long tensor of length ``2 * window``).
      Positions that fall outside the corpus wrap around to the other end.
    * ``neg_words``   -- ``num_neg * len(pos_words)`` ids drawn with replacement
      from ``word_freqs``. They are drawn independently of the context, so a
      negative can coincide with the center word or one of the positives.

    Args:
        text: iterable of word strings (the tokenised corpus).
        word_to_idx: mapping from word to integer id.
        idx_to_word: sequence mapping integer id back to word.
        word_freqs: sampling weights for negatives, one per vocabulary entry.
        word_counts: raw occurrence counts, one per vocabulary entry.
        window_size: number of context words taken on each side. ``None``
            (the default) falls back to the module-level ``c`` at access time.
        num_neg_samples: negatives drawn per positive word. ``None`` (the
            default) falls back to the module-level ``k`` at access time.
    """

    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts,
                 window_size=None, num_neg_samples=None):
        super(WordEmbeddingDataset, self).__init__()

        # Words missing from the vocabulary map to "<unk>". If there is no such
        # entry, use the last id, which is where the vocabulary-building code
        # places it.
        unk_idx = word_to_idx.get("<unk>", len(idx_to_word) - 1)

        # Build the ids directly as int64 instead of round-tripping through a
        # float32 tensor, which costs extra memory and is inexact above 2**24.
        self.text_encoded = torch.tensor(
            [word_to_idx.get(word, unk_idx) for word in text], dtype=torch.long
        )
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.tensor(word_freqs, dtype=torch.float32)
        self.word_counts = torch.tensor(word_counts, dtype=torch.float32)
        self.window_size = window_size
        self.num_neg_samples = num_neg_samples

    def __len__(self):
        """Return the number of tokens in the encoded corpus."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(center_word, pos_words, neg_words)`` for position ``idx``."""
        # Resolve the fallbacks lazily so that callers relying on the
        # module-level ``c`` / ``k`` keep seeing their current values.
        window = c if self.window_size is None else self.window_size
        num_neg = k if self.num_neg_samples is None else self.num_neg_samples

        center_word = self.text_encoded[idx]

        # Offsets -window..-1 and 1..window around idx; the modulo wraps
        # positions past either end of the corpus instead of going out of range.
        corpus_len = len(self.text_encoded)
        pos_indices = [
            (idx + offset) % corpus_len
            for offset in range(-window, window + 1)
            if offset != 0
        ]
        pos_words = self.text_encoded[pos_indices]

        # torch.multinomial returns indices into word_freqs, i.e. word ids;
        # larger weights are drawn more often, and sampling is with replacement.
        neg_words = torch.multinomial(
            self.word_freqs, num_neg * pos_words.shape[0], replacement=True
        )

        return center_word, pos_words, neg_words

In [11]:
# Wrap the tokenised corpus and vocabulary tables in the skip-gram dataset.
dataset = WordEmbeddingDataset(text,word_to_idx,idx_to_word,word_freqs,word_counts)

# Serve shuffled mini-batches of BATCH_SIZE items; num_workers = 0 keeps data
# loading in the main process.
dataloader = tud.DataLoader(dataset,batch_size=BATCH_SIZE,shuffle = True,num_workers = 0)

In [12]:
# Sanity check: print the first seven batches (indices 0 through 6) and stop.
for i, batch in enumerate(dataloader):
    center_word, pos_words, neg_words = batch
    print(center_word, pos_words, neg_words)
    if i >= 6:
        break

tensor([  199, 29999,  3511,   233,    24,   280,     7,     5,    37,    32,
         4738,    16,   520, 29999,    14,  1206,     0,    25,  3263,  1505,
          839,  2316,  4584,  3992,    21,     1,   392,  1136,    16,     2,
           54,   827,    65,     1,     0,     4,    76,     7,  4041, 29999,
           21,     0,     4,  1585,     2,  1394,     1, 16495,  1065,   101,
            1,     1,  3970,  2001,     6,     7,     0,   147,     3,    14,
           45,   282,    48, 29999,    36,  1477,    31,     4,  2555,     1,
         2376,     7,   248,    57,   818,    35,  2997,     1,  1893,  2053,
         4080,   341,  4373,   139,    66,    45, 29999,  3207,  1464,    11,
        11933,    26,    12,   308,  1468,   465,     8,     0, 29999, 29999,
           10,   131,   418,     3, 29999,  2573,  1483,   853,   701,     8,
          243,    24,   910,     2,   464,    17,     4,    28,    76,    14,
         7796,    50,    23,  4619, 24416,    13,  2562,     0])

        29999,    34,    24,  1509, 12969,   229,   112,     0]) tensor([[29999,   116,   470,     1,   116,  4782],
        [    3,    22,    12,   179,  4950,    70],
        [29999,    11, 29999, 29999,   230, 29999],
        [  935,  3336,    38,  1113,     6, 18138],
        [ 1068,     6,  1456,   349,     0,   889],
        [    0,    16, 27888, 29999,     1,     0],
        [ 7473,     2,   122,    25,   160,   121],
        [    3,    12,     8,     0,  1254,    17],
        [    1,  2433,  5082,   667,    23,     0],
        [  195,   106,     4,   248,  5657,    19],
        [    7, 29999,  4150,     8,    15,     8],
        [ 1903, 15793,    70,     5,   330,   558],
        [    8,    12,    15, 29999,    61,   835],
        [18132,   125,   414,     8,     8,    22],
        [ 8408,     2,  1004,    43,  1212,    17],
        [ 3976,  1486,  1081,  9512,  6499,     2],
        [ 7452, 29999,     4,     8,    12,    19],
        [    3,     8,    12,  2658, 29999,  2711],

In [13]:
class EmbeddingModel(nn.Module):
    """Skip-gram model trained with negative sampling.

    Holds two embedding tables of shape ``(vocab_size, embed_size)``:
    ``in_embed`` is looked up for center words, ``out_embed`` for context
    (positive) and sampled (negative) words.
    """

    def __init__(self, vocab_size, embed_size):
        """Create both tables, initialised uniformly in +/- 0.5 / embed_size."""
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        initrange = 0.5 / self.embed_size
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)

        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_labels, pos_labels, neg_labels):
        """Return the negative-sampling loss for each example in the batch.

        Args:
            input_labels: center word ids, shape ``(B,)``.
            pos_labels: context word ids, shape ``(B, P)``.
            neg_labels: sampled negative word ids, shape ``(B, N)``.

        Returns:
            Tensor of shape ``(B,)`` holding, per example,
            ``-(sum_p log sigmoid(u_p . v) + sum_n log sigmoid(-u_n . v))``
            where ``v`` is the center word's ``in_embed`` vector and ``u`` are
            ``out_embed`` vectors.
        """
        input_embedding = self.in_embed(input_labels)  # (B, E)
        pos_embedding = self.out_embed(pos_labels)     # (B, P, E)
        neg_embedding = self.out_embed(neg_labels)     # (B, N, E)

        center = input_embedding.unsqueeze(2)          # (B, E, 1)

        # squeeze(2) removes only the trailing singleton produced by bmm.
        # A bare squeeze() would also drop the batch dimension when B == 1
        # (or the context dimension when P == 1), which breaks sum(1) below.
        pos_score = torch.bmm(pos_embedding, center).squeeze(2)   # (B, P)
        neg_score = torch.bmm(neg_embedding, -center).squeeze(2)  # (B, N)

        log_pos = F.logsigmoid(pos_score).sum(1)
        log_neg = F.logsigmoid(neg_score).sum(1)

        # Negate the summed log-likelihood so that minimising it maximises
        # the likelihood.
        return -(log_neg + log_pos)

    def input_embeddings(self):
        """Return the ``in_embed`` weight matrix as a NumPy array on the CPU."""
        return self.in_embed.weight.detach().cpu().numpy()

In [15]:
# Instantiate the embedding model, moving it to the GPU when one is available.
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
model = model.cuda() if USE_CUDA else model

## Train the model

# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

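# NOTE(review): the training loop below is disabled. Before re-enabling it, check
# (1) that `dataloaders` is meant to be the `dataloader` created earlier, and
# (2) whether the three checkpoint-saving lines at the end should run once per
#     epoch; as indented they sit inside the batch loop and run on every batch.
# for e in range(NUM_EPOCHS):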
#     for i,(input_labels, pos_labels, neg_labels) in enumerate(dataloaders):
#         input_labels = input_labels.long()
#         pos_labels = pos_labels.long()
#         neg_labels = neg_labels.long()
#         if USE_CUDA:
#             input_labels = input_labels.cuda()
#             pos_labels = pos_labels.cuda()
#             neg_labels = neg_labels.cuda()
#         optimizer.zero_grad()
        
#         loss = model(input_labels, pos_labels, neg_labels).mean()
#         loss.backward()
#         optimizer.step()
        
#         if i % 10000 == 0:
#             with open(LOG_FILE, "a") as fout:
#                 fout.write("epoch: {}, iter: {}, loss: {}\n".format(e, i, loss.item()))
#                 print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))
        
        
#         embedding_weights = model.input_embeddings()
#         np.save("embedding-{}".format(EMBEDDING_SIZE), embedding_weights)
#         torch.save(model.state_dict(),"embedding-{}.th".format(EMBEDDING_SIZE))