# Word2Vec Implementation In Pytorch
This notebook is taken from this [blog post](https://nlpython.com/implemeting-word2vec-with-pytorch/); I do not claim to have written it myself.

We will divide this post into four parts:

* Loading and preparing dataset
* Creating dataset tuples
* Creating model
* Training it
## 1. Loading and preparing dataset
To create word vectors we will use the Wikipedia movie plot descriptions dataset, available at https://www.kaggle.com/jrobischon/wikipedia-movie-plots. We load and clean it with the following code:

In [1]:
import re
from string import punctuation

import pandas as pd

# Load the movie-plots CSV; only its 'Plot' column is used below.
df = pd.read_csv("data/wiki_movie_plots_deduped.csv")

# Character class matching every ASCII punctuation mark, digit, CR and LF.
# re.escape keeps characters such as '\', ']' and '^' literal inside the
# class (unescaped, the '\' in `punctuation` merely escaped the following ']'
# and was never removed itself); raw strings avoid the invalid '\d' escape.
clear_punct_regex = r"[" + re.escape(punctuation) + r"\d\r\n]"

# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text untouched.
corpus = df['Plot'].str.replace(clear_punct_regex, "", regex=True).str.lower()

# Concatenate all plots into one space-separated string and persist it.
corpus = " ".join(corpus)
with open("corpus2.txt", "w", encoding="utf8") as corpus_file:
    corpus_file.write(corpus)

FileNotFoundError: [Errno 2] File b'data/wiki_movie_plots_deduped.csv' does not exist: b'data/wiki_movie_plots_deduped.csv'

In [None]:
# Read the cleaned corpus and tokenise it on whitespace.
# NOTE(review): the preparation cell writes "corpus2.txt" while this cell
# reads "data/corpus.txt" - confirm the file is moved/renamed in between.
with open("data/corpus.txt", encoding="utf8") as corpus_file:
    # str.split() without an argument splits on any whitespace run (spaces and
    # line breaks alike) and never yields empty tokens; split(" ") produced a
    # "" token for every pair of consecutive spaces left by punctuation removal.
    corpus = corpus_file.read().split()

In [None]:
from collections import Counter
# Tally every token of the corpus in one pass.
vocab_cnt = Counter(corpus)
# Discard rare tokens: only words occurring more than 5 times are kept.
vocab_cnt = Counter(
    {word: count for word, count in vocab_cnt.items() if count > 5}
)

In [None]:
import numpy as np
import random
# Rank words by descending count; a word's rank doubles as its integer id.
ranked_vocab = vocab_cnt.most_common()
word2id = {word: idx for idx, (word, _) in enumerate(ranked_vocab)}
vocab = set(word2id)

# Raw counts, aligned with the word ids.
unigram_dist = np.array([count for _, count in ranked_vocab])
word_freq = unigram_dist / unigram_dist.sum()

# Negative-sampling distribution: counts raised to the 3/4 power, normalised.
w_freq_neg_samp = unigram_dist ** 0.75
w_freq_neg_samp /= w_freq_neg_samp.sum()

# Per-word drop probability used to subsample frequent words
# (values below zero simply mean the word is always kept).
w_drop_p = 1 - np.sqrt(0.00001 / word_freq)

# Training corpus: in-vocabulary tokens that survive the random subsampling.
train_corpus = []
for token in corpus:
    if token not in vocab:
        continue
    if random.random() > w_drop_p[word2id[token]]:
        train_corpus.append(token)

In [None]:
import torch
# Generate dataset: one (center_id, context_id) row for every context word
# found within `window_size` positions of each center word in train_corpus.

# USE_CUDA is read here but not defined by any cell shown in this notebook;
# keep an existing value if one was set earlier, otherwise auto-detect.
try:
    USE_CUDA
except NameError:
    USE_CUDA = torch.cuda.is_available()

dataset = list()
window_size = 5
for i, w in enumerate(train_corpus):
    window_start = max(i - window_size, 0)
    # Slice ends are exclusive, so +1 is needed for the window to reach
    # window_size words to the right (it previously stopped one word short).
    window_end = i + window_size + 1
    for c in train_corpus[window_start:window_end]:
        # Skips the center position itself (and any repeat of the same word).
        if c != w:
            dataset.append((word2id[w], word2id[c]))

dataset = torch.LongTensor(dataset)
if USE_CUDA:
    dataset = dataset.cuda()


In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F
# Number of words that received an id; passed to Word2Vec as the row count of its embedding tables.
VOCAB_SIZE = len(word2id)
# Length of each word vector (passed to Word2Vec as the embedding dimension).
EMBED_DIM = 128
class Word2Vec(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of identical shape: ``embed_in`` for center
    (input) words and ``embed_out`` for context (output) words.

    Args:
        vocabulary_size: number of rows in each embedding table.
        embedding_dimension: length of each embedding vector.
        sparse_grad: forwarded to ``nn.Embedding(sparse=...)``.
    """

    def __init__(self, vocabulary_size, embedding_dimension, sparse_grad=False):
        super().__init__()

        # Sparse gradients do not work with momentum
        self.embed_in = nn.Embedding(vocabulary_size, embedding_dimension, sparse=sparse_grad)
        self.embed_out = nn.Embedding(vocabulary_size, embedding_dimension, sparse=sparse_grad)

        # Start both tables from uniform noise in [-1, 1).
        nn.init.uniform_(self.embed_in.weight, -1, 1)
        nn.init.uniform_(self.embed_out.weight, -1, 1)

    def neg_samp_loss(self, in_idx, pos_out_idx, neg_out_idxs):
        """Return the mean negative-sampling loss of a batch as a scalar tensor.

        Args:
            in_idx: 1-D tensor of center-word ids, shape (batch,).
            pos_out_idx: 1-D tensor of true context-word ids, shape (batch,).
            neg_out_idxs: 2-D tensor of sampled negative ids, shape (batch, n_neg).

        Returns:
            ``-mean(log sigmoid(v_in . v_pos) + sum_k log sigmoid(-v_in . v_neg_k))``.
        """
        emb_in = self.embed_in(in_idx)         # (batch, dim)
        emb_out = self.embed_out(pos_out_idx)  # (batch, dim)

        # Row-wise dot product between center and positive-context embeddings.
        pos_score = torch.sum(emb_in * emb_out, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_emb_out = self.embed_out(neg_out_idxs)  # (batch, n_neg, dim)
        # Batched matrix product (batch, n_neg, dim) x (batch, dim, 1) gives the
        # dot product of each negative with its center word. Squeeze only the
        # trailing size-1 axis: a bare squeeze() would also drop the batch axis
        # when batch == 1 (or the negatives axis when n_neg == 1) and make the
        # sum over dim=1 below fail.
        neg_score = torch.bmm(-neg_emb_out, emb_in.unsqueeze(2)).squeeze(2)
        neg_loss = torch.sum(F.logsigmoid(neg_score), dim=1)

        total_loss = torch.mean(pos_loss + neg_loss)

        return -total_loss

    def forward(self, indices):
        """Look up the input (center-word) embeddings for ``indices``."""
        return self.embed_in(indices)
    
# Instantiate the model with dense (non-sparse) embedding gradients.
w2v = Word2Vec(
    vocabulary_size=VOCAB_SIZE,
    embedding_dimension=EMBED_DIM,
    sparse_grad=False,
)
# Move the parameters to the GPU when CUDA use is enabled.
if USE_CUDA:
    w2v.cuda()

In [None]:
def get_negative_samples(batch_size, n_samples):
    """Draw negative word ids for a batch.

    Ids are sampled independently (with replacement) from the smoothed
    unigram distribution ``w_freq_neg_samp``.

    Args:
        batch_size: number of training pairs in the batch.
        n_samples: number of negatives to draw per training pair.

    Returns:
        A ``torch.LongTensor`` of shape (batch_size, n_samples), moved to the
        GPU when ``USE_CUDA`` is true.
    """
    # replace=True: sampling without replacement forced all
    # batch_size * n_samples ids of a batch to be distinct, which skews the
    # distribution and raises once that product exceeds the vocabulary size.
    neg_samples = np.random.choice(
        len(vocab),
        size=(batch_size, n_samples),
        replace=True,
        p=w_freq_neg_samp,
    )

    if USE_CUDA:
        return torch.LongTensor(neg_samples).cuda()

    return torch.LongTensor(neg_samples)

In [None]:
# Adam over every parameter of the model (both embedding tables).
optimizer = optim.Adam(params=w2v.parameters(), lr=3e-3)

In [None]:
def get_batches(dataset, batch_size):
    """Yield consecutive slices of ``dataset`` holding ``batch_size`` items each.

    The final slice is shorter when ``len(dataset)`` is not a multiple of
    ``batch_size``; an empty dataset yields nothing.
    """
    offsets = range(0, len(dataset), batch_size)
    yield from (dataset[lo:lo + batch_size] for lo in offsets)

In [None]:
# `time` is used for the per-epoch timing below but was never imported.
import time

n_epochs = 5        # full passes over the dataset
n_neg_samples = 5   # negatives drawn per (center, context) pair
batch_size = 512
for epoch in range(n_epochs):  # loop over the dataset multiple times

    loss_values = []

    start_t = time.time()

    for dp in get_batches(dataset, batch_size):

        optimizer.zero_grad()  # zero the parameter gradients

        # Column 0 holds the center-word ids, column 1 the context-word ids.
        inputs, labels = dp[:, 0], dp[:, 1]

        # len(inputs) rather than batch_size: the last batch may be shorter.
        negatives = get_negative_samples(len(inputs), n_neg_samples)
        loss = w2v.neg_samp_loss(inputs, labels, negatives)
        loss.backward()

        optimizer.step()
        loss_values.append(loss.item())

    ellapsed_t = time.time() - start_t
    print("{}/{}\tLoss: {}\tEllapsed time: {}".format(epoch + 1, n_epochs, np.mean(loss_values), ellapsed_t))
print('Done')

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# Project the embeddings of the first `data_viz_len` word ids to 2-D with
# t-SNE and draw each word as a labelled point.
data_viz_len = 300
# Convert to a NumPy array so scikit-learn receives a plain ndarray.
viz_embedding = w2v.embed_in.weight.data.cpu()[:data_viz_len].numpy()
tsne = TSNE()
embed_tsne = tsne.fit_transform(viz_embedding)
plt.figure(figsize=(16,16))
# `vocab` is a set and cannot be sliced; select the words through their ids
# instead, so every plotted word has a row in `embed_tsne`.
for w, w_id in word2id.items():
    if w_id >= data_viz_len:
        continue

    x_pos, y_pos = embed_tsne[w_id, 0], embed_tsne[w_id, 1]
    plt.scatter(x_pos, y_pos)
    plt.annotate(w, (x_pos, y_pos), alpha=0.7)