In [62]:
import random
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import tqdm
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity


def set_random_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to its deterministic mode with
    autotuning disabled.

    Args:
        seed: integer seed passed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


set_random_seed(2020)
# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, which would select 'cuda:0' even on a CPU-only machine.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 数据读取与处理

In [43]:
class Corpus:
    """Whitespace-tokenised text corpus with a frequency-capped vocabulary.

    The file is read in full, lower-cased and split on whitespace. The
    ``vocab_size - 1`` most frequent tokens each keep their own vocabulary
    entry; all remaining occurrences are counted under ``'<unk>'``, which is
    always the last vocabulary entry.

    Attributes:
        text: list of lower-cased tokens in corpus order.
        vocab_size: the requested vocabulary size (``'<unk>'`` included).
        vocab: dict mapping token -> occurrence count, ``'<unk>'`` last.
        itos: list of vocabulary tokens (index -> token).
        stoi: dict mapping token -> index into ``itos``.
    """

    def __init__(self, corpus_path, vocab_size):
        """Read ``corpus_path`` and build the vocabulary.

        Args:
            corpus_path: path of a plain-text corpus file.
            vocab_size: maximum number of vocabulary entries, including the
                ``'<unk>'`` entry.
        """
        # Explicit encoding so tokenisation does not depend on the platform's
        # default locale encoding.
        with open(corpus_path, 'r', encoding='utf-8') as f:
            text = f.read()
        self.text = text.lower().split()
        self.vocab_size = vocab_size

        counts = Counter(self.text)
        # A literal '<unk>' token in the corpus must not occupy a regular
        # vocabulary slot: it would be overwritten by the remainder count
        # below, leaving the vocabulary one entry short and the counts
        # inconsistent. Fold it into the unknown bucket instead.
        counts.pop('<unk>', None)
        self.vocab = dict(counts.most_common(vocab_size - 1))
        # Everything that did not get its own entry is counted as unknown.
        self.vocab['<unk>'] = len(self.text) - sum(self.vocab.values())
        self.itos = list(self.vocab)
        self.stoi = {w: i for i, w in enumerate(self.itos)}

    def get_word_freqs(self, freq_coef=0.75):
        """Return the smoothed unigram distribution over the vocabulary.

        Relative frequencies are raised to the power ``freq_coef`` and then
        renormalised to sum to one.

        Args:
            freq_coef: exponent applied to the relative frequencies.

        Returns:
            A float32 NumPy array with one entry per vocabulary token, in the
            same order as ``itos``.
        """
        word_counts = np.array(list(self.vocab.values()), dtype=np.float32)
        word_freqs = word_counts / np.sum(word_counts)
        word_freqs = word_freqs ** freq_coef
        word_freqs = word_freqs / np.sum(word_freqs)
        return word_freqs
    
# Load the training text, cap the vocabulary at 30000 entries and peek at the
# sampling weights of the first ten vocabulary entries.
corpus_path = Path('/media/bnu/data/nlp-practice/language-model/text8.train.txt')
corpus = Corpus(corpus_path, vocab_size=30000)
word_freqs = corpus.get_word_freqs()
word_freqs[:10]

array([0.01623116, 0.01051   , 0.00803599, 0.00798072, 0.007389  ,
       0.00666238, 0.00653963, 0.0056647 , 0.00547045, 0.00447826],
      dtype=float32)

In [47]:
class Word2VecDataset(torch.utils.data.Dataset):
    """Skip-gram samples with negative sampling, built from a ``Corpus``.

    Sample ``i`` is centred on token ``i + win_size`` of the corpus, so every
    sample has a full window of ``win_size`` tokens on each side.

    Attributes:
        corpus: the corpus object the dataset was built from.
        win_size: number of context tokens taken on each side of the centre.
        neg_coef: number of negative samples drawn per positive context token.
        text: 1-D LongTensor of vocabulary indices for the whole corpus;
            tokens missing from ``corpus.stoi`` map to the ``'<unk>'`` index.
        word_freqs: 1-D float tensor with the negative-sampling distribution
            returned by ``corpus.get_word_freqs()``.
    """

    def __init__(self, corpus, win_size=3, neg_coef=20):
        """Index the corpus text and cache the negative-sampling weights.

        Args:
            corpus: object exposing ``text``, ``stoi`` (containing
                ``'<unk>'``) and ``get_word_freqs()``.
            win_size: context window radius; must be at least 1.
            neg_coef: negative samples drawn per positive context token.

        Raises:
            ValueError: if ``win_size`` is smaller than 1.
        """
        if win_size < 1:
            raise ValueError(f'win_size must be >= 1, got {win_size}')
        self.corpus = corpus
        self.win_size = win_size
        self.neg_coef = neg_coef

        unk_idx = corpus.stoi['<unk>']
        self.text = torch.LongTensor([corpus.stoi.get(w, unk_idx) for w in corpus.text])

        # Computed once from *this* dataset's corpus. Rebuilding it inside
        # __getitem__ repeated the whole computation for every sample and
        # relied on a notebook-global variable named `corpus`.
        self.word_freqs = torch.as_tensor(corpus.get_word_freqs(), dtype=torch.float32)

    def __len__(self):
        """Number of centre positions that have a full window on both sides."""
        return max(len(self.text) - 2 * self.win_size, 0)

    def __getitem__(self, i):
        """Return ``(center_word, pos_words, neg_words)`` for sample ``i``.

        Returns:
            center_word: 0-d LongTensor, vocabulary index of the centre token.
            pos_words: LongTensor of shape ``(2 * win_size,)`` with the
                context tokens left and right of the centre.
            neg_words: LongTensor of shape ``(neg_coef * 2 * win_size,)``
                drawn with replacement from ``word_freqs``. Draws are not
                filtered, so they may coincide with the centre or context
                tokens.
        """
        center_idx = i + self.win_size
        center_word = self.text[center_idx]

        pos_idxs = (list(range(center_idx - self.win_size, center_idx)) +
                    list(range(center_idx + 1, center_idx + self.win_size + 1)))
        pos_words = self.text[pos_idxs]

        neg_words = torch.multinomial(
            self.word_freqs, self.neg_coef * len(pos_idxs), replacement=True)

        return center_word, pos_words, neg_words
    
# Build the skip-gram dataset with its default window and negative-sample
# settings, and wrap it in a shuffling, multi-worker batch loader.
dataset = Word2VecDataset(corpus)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)

# Pull a single batch to check the tensor shapes.
center_word, pos_words, neg_words = next(iter(dataloader))
for batch_part in (center_word, pos_words, neg_words):
    print(batch_part.shape)

torch.Size([128])
torch.Size([128, 6])
torch.Size([128, 120])


# 模型定义

In [54]:
class Word2VecModel(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Two embedding tables are kept: ``inp_embed`` for centre words and
    ``out_embed`` for context and negative words.
    """

    def __init__(self, n_words, n_embed):
        """
        Args:
            n_words: number of rows in each embedding table (vocabulary size).
            n_embed: dimensionality of the embedding vectors.
        """
        super(Word2VecModel, self).__init__()
        self.inp_embed = nn.Embedding(n_words, n_embed)
        self.out_embed = nn.Embedding(n_words, n_embed)

    def forward(self, center_word, pos_words, neg_words):
        """Compute the negative-sampling loss averaged over the batch.

        Args:
            center_word: LongTensor of shape ``(batch_size,)``.
            pos_words: LongTensor of shape ``(batch_size, n_pos)``.
            neg_words: LongTensor of shape ``(batch_size, n_neg)``.

        Returns:
            0-d tensor: mean over the batch of the per-sample sum of
            ``-logsigmoid(pos . ctr)`` and ``-logsigmoid(-(neg . ctr))``.
        """
        # (batch_size, n_embed, 1)
        ctr_embed = self.inp_embed(center_word).unsqueeze(2)
        # (batch_size, n_pos, n_embed)
        pos_embed = self.out_embed(pos_words)
        # (batch_size, n_neg, n_embed)
        neg_embed = self.out_embed(neg_words)

        # Squeeze only the trailing singleton dimension. A bare `.squeeze()`
        # would also drop the batch dimension for a batch of one sample (or
        # the word dimension for a single positive/negative word), and the
        # `.sum(1)` below would then fail.
        # (batch_size, n_pos)
        pos_bmm = torch.bmm(pos_embed, ctr_embed).squeeze(2)
        # (batch_size, n_neg)
        neg_bmm = torch.bmm(neg_embed, -ctr_embed).squeeze(2)

        pos_loss = -F.logsigmoid(pos_bmm).sum(1)
        neg_loss = -F.logsigmoid(neg_bmm).sum(1)
        return (pos_loss + neg_loss).mean()

    def get_embeddings(self):
        """Return the centre-word embedding table as a NumPy array on the CPU."""
        return self.inp_embed.weight.detach().cpu().numpy()
        

# 模型训练与评估

In [73]:
def evaluate(filename, embeddings, stoi=None):
    """Spearman correlation between human and model word-pair similarities.

    The file is read with pandas (comma-separated when the name ends in
    ``.csv``, tab-separated otherwise). Its first two columns are taken as
    the word pair and its third column as the human similarity score. Pairs
    with a word missing from the vocabulary are skipped.

    Args:
        filename: path of the word-similarity file.
        embeddings: 2-D array indexed by vocabulary index along axis 0.
        stoi: optional mapping word -> vocabulary index. When omitted, the
            notebook-level ``corpus.stoi`` is used.

    Returns:
        The ``correlation`` field of ``scipy.stats.spearmanr`` computed over
        the pairs that were found in the vocabulary.
    """
    if stoi is None:
        stoi = corpus.stoi

    sep = ',' if str(filename).endswith('.csv') else '\t'
    data = pd.read_csv(filename, sep=sep)

    human_sims, model_sims = [], []
    # Walk the rows strictly by position. Feeding `data.index` labels into
    # `iloc` only works while the index happens to be the default 0..n-1
    # range.
    rows = zip(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2])
    for word1, word2, human_sim in rows:
        if word1 in stoi and word2 in stoi:
            # Double brackets keep each vector 2-D, as cosine_similarity
            # is called on (1, n_embed) inputs.
            embed1 = embeddings[[stoi[word1]]]
            embed2 = embeddings[[stoi[word2]]]

            model_sims.append(cosine_similarity(embed1, embed2)[0][0])
            human_sims.append(human_sim)
    return spearmanr(human_sims, model_sims).correlation
    
    
# Baseline: score a freshly initialised (untrained) model on SimLex-999.
eval_path = Path('/media/bnu/data/nlp-practice/word-vector')
model = Word2VecModel(n_words=dataset.corpus.vocab_size, n_embed=300)
embeddings = model.get_embeddings()

evaluate(eval_path / 'simlex-999.txt', embeddings)

-0.022000427249910234

In [None]:
# `import tqdm` alone does not load the `tqdm.notebook` submodule, so
# `tqdm.notebook.tqdm` can raise AttributeError; import it explicitly.
import tqdm.notebook

# Start from a fresh 100-dimensional model and train it with plain SGD.
torch.cuda.empty_cache()
model = Word2VecModel(dataset.corpus.vocab_size, 100)
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

for epoch in range(1):

    pbar = tqdm.notebook.tqdm(dataloader)
    pbar.set_description(f'Epoch {epoch+1} --> Train')

    # Last measured SimLex-999 correlation, shown in the progress bar.
    corr = 0.0
    for i, (center_word, pos_words, neg_words) in enumerate(pbar):
        center_word = center_word.to(device)
        pos_words = pos_words.to(device)
        neg_words = neg_words.to(device)

        loss = model(center_word, pos_words, neg_words)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Re-evaluate the embeddings every 100 batches.
        if i % 100 == 0:
            embeddings = model.get_embeddings()
            corr = evaluate(eval_path / 'simlex-999.txt', embeddings)

        pbar.set_postfix(loss=loss.item(), corr=corr)