In [None]:
# Re-import modified modules before each cell executes, so edits to local
# modules such as `lda2vec` are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from tqdm import tqdm

import sys
sys.path.append('src/')
from lda2vec import topic_embedding, negative_sampling_loss

# Load data

In [None]:
# Training windows; column 0 is the document index, column 1 the pivot word
# and the remaining columns the target words (see the training loop).
window_data = np.load('window_data.npy')

# The unigram distribution is a dict pickled inside a 0-d object array.
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed;
# .item() then extracts the dict from the 0-d array.
# NOTE(security): unpickling can execute arbitrary code - only load this
# file if it comes from a trusted source.
unigram_distribution_dict = np.load('unigram_distribution.npy', allow_pickle=True).item()

# Word embedding matrix (its shape is inspected in a later cell).
word_vectors = np.load('word_vectors.npy')

In [None]:
# number of distinct documents (column 0 of window_data holds the document index)
len(np.unique(window_data[:, 0]))

In [None]:
# number of unique words (the dict has one entry per word index)
len(unigram_distribution_dict)

In [None]:
# shape of the word embedding matrix
# (presumably (vocab_size, embedding_dim) - TODO confirm against the output)
word_vectors.shape

# Create iterator

In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class SimpleDataset(Dataset):
    """Map-style dataset whose samples are the slices of one tensor along dim 0."""

    def __init__(self, data_tensor):
        # The tensor is kept by reference; nothing is copied here.
        self.data_tensor = data_tensor

    def __len__(self):
        """Return the number of samples (size of the tensor's first dimension)."""
        return self.data_tensor.size(0)

    def __getitem__(self, index):
        """Return the sample stored at position `index` along the first dimension."""
        return self.data_tensor[index]

In [None]:
# mini-batch size handed to the DataLoader
batch_size = 128

In [None]:
# Expose the integer window matrix as a dataset of rows.
dataset = SimpleDataset(torch.LongTensor(window_data))

# Shuffled mini-batches, fetched by 4 worker processes into pinned host memory.
iterator = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# number of training samples (rows of window_data)
data_size = window_data.shape[0]
data_size

# Model

In [None]:
# params
# number of topics; also the width of each document's topic-weight vector
n_topics = 20
# embedding dimensionality passed to topic_embedding
embedding_dim = 300
# length of the dense unigram array; must exceed every word index in unigram_distribution_dict
vocab_size = 13812
# not referenced by any of the visible cells
window_size = 10
# rows of the document embedding; must exceed every document index in window_data[:, 0]
n_documents = 16116
# passed to negative_sampling_loss (presumably negatives drawn per target word - TODO confirm)
num_sampled = 10
# exponent applied to the unigram distribution before it is given to the loss
beta = 3.0/4.0

In [None]:
# Dense unigram table indexed by word id; ids missing from the dict keep weight 0.
word_distribution = np.zeros(vocab_size, dtype='float32')

for i, weight in unigram_distribution_dict.items():
    word_distribution[i] = weight

# Raise every entry to the power beta (the result is not renormalised here).
word_distribution = word_distribution**beta

In [None]:
# Convert the NumPy arrays to float32 torch tensors for the model.
# Note: `word_vectors` is rebound here from an ndarray to a tensor.
word_vectors = torch.FloatTensor(word_vectors)
unigram_distribution = torch.FloatTensor(word_distribution)

In [None]:
class loss(nn.Module):
    """lda2vec training objective: negative sampling loss plus a Dirichlet prior term.

    The module owns one trainable vector of unnormalised topic weights per
    document. `topics` turns those weights into document vectors, which are
    passed to the negative sampling loss together with the pivot and target
    words. The prior term is computed from the log-softmax of the weights.

    Args:
        topics: callable/module mapping document topic weights to document vectors.
        word_vectors: forwarded unchanged to `negative_sampling_loss`.
        unigram_distribution: forwarded unchanged to `negative_sampling_loss`.
        n_documents: number of rows in the document embedding.
        n_topics: number of topics (width of each document weight vector).
        num_sampled: forwarded unchanged to `negative_sampling_loss`.
        lambda_const: multiplier of the Dirichlet term (default 200, the value
            that used to be hard-coded in `forward`).
    """

    def __init__(self, topics, word_vectors, unigram_distribution,
                 n_documents, n_topics, num_sampled, lambda_const=200):
        super(loss, self).__init__()

        # Per-document topic weights, initialised uniformly in [-1, 1).
        self.doc_embedding = nn.Embedding(n_documents, n_topics)
        self.doc_embedding.weight = nn.Parameter(2.0*torch.rand(n_documents, n_topics) - 1.0)

        self.neg = negative_sampling_loss(word_vectors, unigram_distribution, num_sampled)
        self.topics = topics

        # Derived from the constructor argument so that forward() does not
        # depend on a notebook-level global named `n_topics`.
        self.alpha = 1.0/n_topics
        self.lambda_const = lambda_const

    def forward(self, doc_indices, pivot_words, target_words):
        """Return the total loss for one mini-batch.

        Args:
            doc_indices: integer tensor of document indices, one per sample.
            pivot_words: passed to the negative sampling loss as pivot words.
            target_words: passed to the negative sampling loss as target words.

        Returns:
            Negative sampling loss plus the Dirichlet term.
        """
        doc_weights = self.doc_embedding(doc_indices)
        doc_vectors = self.topics(doc_weights)

        neg_loss = self.neg(pivot_words, target_words, doc_vectors)

        # Normalise over the topic axis explicitly; relying on the implicit
        # dimension of log_softmax is deprecated.
        log_proportions = F.log_softmax(doc_weights, dim=1)

        # (1 - alpha) * sum(log p) is the negative Dirichlet(alpha) log-density
        # up to a constant; it is summed over the batch and averaged over topics.
        dirichlet_loss = self.lambda_const*(1.0 - self.alpha)*log_proportions.sum(0).mean(0)

        return neg_loss + dirichlet_loss

In [None]:
# Topic module from the project's lda2vec code; inside `loss` it maps
# document topic weights to document vectors.
topics = topic_embedding(n_topics, embedding_dim)

In [None]:
# Assemble the training objective; the instance owns the per-document topic
# weights and holds the topic module created above.
model = loss(
    topics, word_vectors, unigram_distribution, 
    n_documents, n_topics, num_sampled
)

In [None]:
# Move the model to the GPU; the trailing ';' hides the module repr in the cell output.
model.cuda();

In [None]:
# SGD with Nesterov momentum over every parameter registered on the model.
optimizer = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9, nesterov=True)

In [None]:
# number of full passes over the training data
n_epochs = 1

In [None]:
# mini-batches per epoch; rounded up because the last batch may be smaller than batch_size
n_batches = math.ceil(data_size/batch_size)
n_batches

In [None]:
# Loss value of every optimisation step, in order.
all_losses = []
model.train()
for epoch in range(0, n_epochs):
    # tqdm wraps the DataLoader itself (not the enumerate object) so it can
    # read the number of batches and show a total and an ETA.
    # `step` is a running batch counter across epochs, starting at 1.
    for step, batch in enumerate(tqdm(iterator), 1 + epoch*n_batches):

        # Tensors carry autograd state since PyTorch 0.4, so no Variable wrapper is needed.
        batch = batch.cuda()
        doc_indices = batch[:, 0]
        pivot_words = batch[:, 1]
        target_words = batch[:, 2:]

        total_loss = model(doc_indices, pivot_words, target_words)

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # .item() converts the scalar loss to a Python float; the old
        # `.data[0]` indexing raises IndexError on 0-dim tensors.
        all_losses.append(total_loss.item())